defmodule Crawly.Middlewares.DomainFilter do
  @moduledoc """
  Filters out requests which are going outside of the crawled domain.

  The domain that is used to compare against the request url is obtained from
  the spider's `c:Crawly.Spider.base_url/0` callback.

  A request is kept when the host of its url, compared case-insensitively,
  is either equal to the host of the base url or a parent domain of it
  (so a spider with base url `https://www.example.com` still follows links to
  `https://example.com`). Requests without a host, and requests to any other
  host, are dropped.

  Does not accept any options. Tuple-based configuration options will be ignored.

  ### Example Declaration
  ```
  middlewares: [
    Crawly.Middlewares.DomainFilter
  ]
  ```
  """

  @behaviour Crawly.Pipeline

  require Logger

  @doc """
  Passes `request` through when its url belongs to the spider's domain.

  Reads the base url from `state.spider_name.base_url()` and the target from
  `request.url`.

  Returns `{request, state}` when the request is kept and `{false, state}`
  (after logging a debug message) when it is dropped. The third argument is
  ignored.
  """
  @spec run(map(), map(), term()) :: {map() | false, map()}
  def run(request, state, _opts \\ []) do
    base_url = state.spider_name.base_url()
    request_host = URI.parse(request.url).host

    if allowed_host?(request_host, base_url) do
      {request, state}
    else
      Logger.debug(
        "Dropping request: #{inspect(request.url)} (domain filter)"
      )

      {false, state}
    end
  end

  # True when `host` is a non-empty string naming the base url's host or one
  # of its parent domains. Urls without a host (relative links, `mailto:`,
  # ...) and empty hosts are never allowed.
  defp allowed_host?(host, base_url) when is_binary(host) and host != "" do
    host = String.downcase(host)
    base = base_host(base_url)

    # Compare whole hosts / whole labels only. A plain substring test against
    # the full base url would also accept look-alike domains and hosts that
    # merely appear in the base url's path or query.
    host == base or String.ends_with?(base, "." <> host)
  end

  defp allowed_host?(_host, _base_url), do: false

  # Extracts the lower-cased host of the spider's base url.
  defp base_host(base_url) when is_binary(base_url) do
    case URI.parse(base_url) do
      %URI{host: host} when is_binary(host) and host != "" ->
        String.downcase(host)

      _ ->
        # Scheme-less base urls such as "example.com/path" are parsed without
        # a host; take everything before the first port/path/query/fragment
        # delimiter instead.
        base_url
        |> String.split(["/", "?", "#", ":"], parts: 2)
        |> hd()
        |> String.downcase()
    end
  end
end