lib/crawly/middlewares/domain_filter.ex

defmodule Crawly.Middlewares.DomainFilter do
  @moduledoc """
  Filters out requests which are going outside of the crawled domain.

  The domain that is used to compare against the request url is the host of
  the url returned by the spider's `c:Crawly.Spider.base_url/0` callback.

  A request is kept when its host is equal to that host, or is a parent
  domain of it (for example `example.com` when the base url is
  `https://www.example.com`). Hosts are compared case-insensitively.
  Requests whose url has no host (such as relative urls) are dropped.

  Does not accept any options. Tuple-based configuration options will be ignored.

  ### Example Declaration
  ```
  middlewares: [
    Crawly.Middlewares.DomainFilter
  ]
  ```
  """

  @behaviour Crawly.Pipeline
  require Logger

  @doc """
  Keeps or drops `request` depending on the host of `request.url`.

  Returns `{request, state}` when the request stays within the domain of
  `state.spider_name.base_url()`, and `{false, state}` (after logging a debug
  message) otherwise. The third argument is ignored.
  """
  def run(request, state, _opts \\ []) do
    base_host = base_host(state.spider_name.base_url())
    request_host = normalize_host(URI.parse(request.url).host)

    if same_domain?(request_host, base_host) do
      {request, state}
    else
      Logger.debug(
        "Dropping request: #{inspect(request.url)} (domain filter)"
      )

      {false, state}
    end
  end

  # Extracts the lowercased host of the spider's base url. A base url written
  # without a scheme (e.g. "example.com") parses with no host, so it is
  # re-parsed as a network-path reference ("//example.com") in that case.
  defp base_host(base_url) do
    case normalize_host(URI.parse(base_url).host) do
      nil -> normalize_host(URI.parse("//" <> base_url).host)
      host -> host
    end
  end

  # Treats a missing and an empty host alike, and lowercases everything else
  # because host names are case-insensitive.
  defp normalize_host(nil), do: nil
  defp normalize_host(""), do: nil
  defp normalize_host(host), do: String.downcase(host)

  # Compares whole domain labels rather than raw substrings, so that
  # "ample.com" is not mistaken for a part of "example.com". The request host
  # must be the base host itself or one of its parent domains.
  defp same_domain?(nil, _base_host), do: false
  defp same_domain?(_host, nil), do: false
  defp same_domain?(host, host), do: true

  defp same_domain?(host, base_host) do
    String.ends_with?(base_host, "." <> host)
  end
end