lib/client/json_list.ex

defmodule ExCrawlzy.Client.JsonList do
  @moduledoc """
  Module that helps to create clients to crawl some sites and extract the data as list of json/map

  The point its create other module that uses this module, add the list selector with `list_selector/1` and add fields
  using the macro `add_field/3`

  By default takes 20 elements but you can customize using `list_size/1`

  Then just use the client function `crawl/1` with your url and will crawl and parse the data and return the list of json/map

  Example:

  ```elixir
  defmodule GithubProfilePinnedRepos do
    use ExCrawlzy.Client.JsonList

    add_browser_client([
      {"referer", "https://your_site.com"},
      {"user-agent", "Custom User Agent"}
    ])

    list_selector("div.pinned-item-list-item")
    add_field(:name, "a.mr-1 span.repo", :text)
    add_field(:link, "a.mr-1", :link)
    add_field(:access, "span.Label", :text)
    add_field(:description, "p.pinned-item-desc", :text)
    add_field(:language, "span.d-inline-block span[itemprop=\"programmingLanguage\"]", :text)

    def link(doc) do
      path = ExCrawlzy.Utils.props("href", doc)
      "https://github.com\#{path}"
    end
  end

  GithubProfilePinnedRepos.crawl("https://github.com/nicolkill")
  ```
  """

  defmacro __using__(_) do
    quote do
      use ExCrawlzy.Client.Handlers.Fields
      use ExCrawlzy.Client.Handlers.List
      use ExCrawlzy.Client.Handlers.BrowserClients

      import ExCrawlzy.Client.JsonList

      @behaviour ExCrawlzy.Client.Crawler.Interface

      @impl ExCrawlzy.Client.Crawler.Interface
      def crawl(site) do
        browser_clients = browser_clients()

        case ExCrawlzy.crawl(site, browser_clients) do
          {:ok, content} ->
            list_size = list_size()
            list_selector = get_list_selector_selector()
            fields = fields()

            content
            |> Floki.find(list_selector)
            |> Enum.take(list_size)
            |> Enum.map(fn sub_document ->
              {:ok, data} = ExCrawlzy.parse(fields, sub_document)
              data
            end)
          error ->
            error
        end
      end
    end
  end
end