# lib/ex_crawlzy.ex

defmodule ExCrawlzy do
  @moduledoc """
  Entry points for fetching a page and extracting fields from it.

    * `crawl/2` performs a GET request through a Tesla client and returns the
      body of the response.
    * `parse/2` runs CSS selectors (via Floki) over a document and post-processes
      each match into the value stored under the corresponding key.
  """

  alias ExCrawlzy.Utils
  alias ExCrawlzy.BrowserClients

  @type result() :: :ok | :error
  @type map_key() :: String.t() | atom()
  @type post_processing() :: atom() | {module(), atom()} | (any() -> String.t())
  @type selector_tuple() :: {String.t(), post_processing()}

  @doc """
  Requests `link` and returns the raw content.

  `clients` is an optional list of client definitions. When it is empty the
  clients returned by `ExCrawlzy.BrowserClients.clients/0` are used; otherwise
  one entry is picked at random and turned into a client with
  `ExCrawlzy.BrowserClients.create_client/1`.

  ## Returns

    * `{:ok, content}` - the response status was 2xx; `content` is the body with
      newlines removed, passed through `ExCrawlzy.Utils.binary_to_string/1`.
    * `{:error, :failure_redirect}` - the response status was 301 (redirects are
      not followed).
    * `{:error, :failure_on_call}` - any other response, a failed request, or an
      exception raised while performing the request.

  ## Examples

      iex> ExCrawlzy.crawl("http://some.site")
      {:ok, "<!doctype html><html>  <head>    <title>the title</title>  </head>  <body>    <div id=\\\"the_body\\\">      the body      <div id=\\\"inner_field\\\">        inner field      </div>      <div id=\\\"inner_second_field\\\">        inner second field        <div id=\\\"the_number\\\">          2023        </div>      </div>      <div id=\\\"exist\\\">        this field exist      </div>      <a class=\\\"link_class\\\" href=\\\"http://some_external.link\\\"></a>      <img class=\\\"img_class\\\" src=\\\"http://some_external.link/image_path.jpg\\\" alt=\\\"some image\\\">    </div>  </body></html>"}

  """
  @spec crawl(String.t(), [BrowserClients.client()]) :: {:ok, String.t()} | {:error, atom()}
  def crawl(link, clients \\ []) do
    client = pick_client(clients)

    case Tesla.request(client, method: :get, url: link) do
      {:ok, %Tesla.Env{status: status, body: body}} when status >= 200 and status < 300 ->
        content =
          body
          |> String.replace("\n", "")
          |> Utils.binary_to_string()

        {:ok, content}

      {:ok, %Tesla.Env{status: 301}} ->
        # Redirection is not implemented yet; only 301 is reported as a
        # redirect, every other status falls through to the generic failure.
        {:error, :failure_redirect}

      _ ->
        {:error, :failure_on_call}
    end
  rescue
    # Best-effort boundary: anything raised while selecting the client or
    # performing the request is reported as a tagged error instead of `nil`,
    # so callers can always match on `{:ok, _}` / `{:error, _}`.
    _ ->
      {:error, :failure_on_call}
  end

  @doc """
  Extracts the fields described by `mapping` from a document.

  `mapping` associates each output key with a `{selector, post_processing}`
  tuple. The selector is handed to `Floki.find/2` and the matched nodes are then
  transformed by `post_processing`, which can be:

    * a function, called with the matched nodes;
    * a `{module, function}` tuple, applied with the matched nodes as the only
      argument;
    * an atom naming a function in `ExCrawlzy.Utils`, applied the same way.

  The second argument is either the raw HTML as a string, which is first parsed
  with `Floki.parse_document/1`, or an already parsed document.

  ## Returns

    * `{:ok, data}` - `data` has the same keys as `mapping`, each holding the
      post-processed value.
    * `{:error, nil}` - the raw HTML could not be parsed.

  ## Examples

      iex> raw_content = "<html><head><title>the title</title></head><body><div id=\\\"the_body\\\">the body</div></body></html>"
      iex> ExCrawlzy.parse(%{body: {"#the_body", :text}}, raw_content)
      {:ok, %{body: "the body"}}

  """
  @spec parse(
          %{map_key() => selector_tuple()},
          String.t() | Floki.html_tree() | Floki.html_node()
        ) :: {:ok, %{map_key() => String.t()}} | {:error, nil}
  def parse(mapping, raw_content) when is_binary(raw_content) do
    case Floki.parse_document(raw_content) do
      {:ok, document} ->
        parse(mapping, document)

      _ ->
        {:error, nil}
    end
  end

  def parse(mapping, document) do
    data =
      Map.new(mapping, fn {key, selector_tuple} ->
        # Matched inside the body so a malformed entry still raises MatchError.
        {selector, post_processing} = selector_tuple
        crawled_data = Floki.find(document, selector)

        {key, post_process(post_processing, crawled_data)}
      end)

    {:ok, data}
  end

  # Chooses the Tesla client used for a request: one of the default clients when
  # no definitions are given, otherwise a client built from a random definition.
  defp pick_client(clients) do
    if Enum.empty?(clients) do
      Enum.random(BrowserClients.clients())
    else
      # Only the selected definition is turned into a client.
      clients
      |> Enum.random()
      |> BrowserClients.create_client()
    end
  end

  # Applies one post-processing step to the nodes matched by a selector.
  defp post_process(post_processing, crawled_data) when is_function(post_processing) do
    post_processing.(crawled_data)
  end

  defp post_process({mod, func}, crawled_data) do
    apply(mod, func, [crawled_data])
  end

  defp post_process(post_processing, crawled_data) do
    apply(Utils, post_processing, [crawled_data])
  end
end