lib/crawly/middlewares/unique_request.ex

defmodule Crawly.Middlewares.UniqueRequest do
  @moduledoc """
  Avoid scheduling multiple requests for the same page. Allow to set a hashing
  algorithm via options to reduce the memory footprint. Be aware of reduced collision
  resistance, depending on the chosen algorithm.
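
  For illustration, a SHA-1 digest is always 20 bytes, no matter how long the
  original URL is (the URL below is just an example):

  ```
  iex> :crypto.hash(:sha, "https://example.com/some/rather/long/url") |> byte_size()
  20
  ```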

  ### Example Declarations
  ```
  middlewares: [
    Crawly.Middlewares.UniqueRequest
  ]
  ```

  ```
  middlewares: [
    {Crawly.Middlewares.UniqueRequest, hash: :sha}
  ]
  ```

  See the [Erlang documentation for crypto](https://www.erlang.org/doc/man/crypto.html#type-sha1)
  for available algorithms.
  """
  require Logger

  def run(request, state, opts \\ []) do
    unique_request_seen_requests =
      Map.get(state, :unique_request_seen_requests, %{})

    # we assume that https://example.com/foo and https://example.com/foo/ refer
    # to the same content, in case both variants are accessible, so a trailing
    # slash is stripped before checking for duplicates
    normalised_url = request.url |> String.replace_suffix("/", "")

    # optionally hash the normalised URL; :crypto.hash/2 returns a fixed-size
    # binary digest, which is typically smaller than the URL itself
    unique_hash =
      if algo = opts[:hash] do
        :crypto.hash(algo, normalised_url)
      else
        normalised_url
      end

    case Map.get(unique_request_seen_requests, unique_hash) do
      nil ->
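        # first time this URL (or its hash) is seen: remember it and
        # let the request through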
        unique_request_seen_requests =
          Map.put(unique_request_seen_requests, unique_hash, true)

        new_state =
          Map.put(
            state,
            :unique_request_seen_requests,
            unique_request_seen_requests
          )

        {request, new_state}

      _ ->
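        # the URL was seen before: drop the request by returning false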
        Logger.debug(
          "Dropping request: #{request.url}, as it's already processed"
        )

        {false, state}
    end
  end
end