lib/crawly/middlewares/robotstxt.ex

defmodule Crawly.Middlewares.RobotsTxt do
  @moduledoc """
  Obey robots.txt

  A robots.txt file tells search engine crawlers which pages or files the
  crawler can or can't request from your site. This is used mainly to avoid
  overloading a site with requests!

  No options are required for this middleware. Any tuple-based configuration options passed will be ignored.

  ### Example Declaration
  ```
  middlewares: [
    Crawly.Middlewares.RobotsTxt
  ]
  ```
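
  ### Example Configuration

  A full setup is sketched below, assuming the middleware list lives under the
  usual `config :crawly` key in `config/config.exs`; the additional middlewares
  shown are purely illustrative:

  ```
  config :crawly,
    middlewares: [
      Crawly.Middlewares.RobotsTxt,
      Crawly.Middlewares.UniqueRequest,
      {Crawly.Middlewares.UserAgent, user_agents: ["my-bot"]}
    ]
  ```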
  """

  @behaviour Crawly.Pipeline
  require Logger

  def run(request, state, _opts \\ []) do
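    # Gollum fetches and caches the host's robots.txt, then reports whether
    # the "Crawly" user agent may request this URL. Only an explicit
    # :uncrawlable verdict is treated as disallowed; :crawlable and :undefined
    # results pass through untouched.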
    case Gollum.crawlable?("Crawly", request.url) do
      :uncrawlable ->
        Logger.debug("Dropping request: #{request.url} (robots.txt filter)")

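        # Returning `false` in place of the request tells the pipeline to
        # drop it.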
        {false, state}

      _ ->
        {request, state}
    end
  end
end