defmodule Crawler.Fetcher.Policer do
  @moduledoc """
  Checks a series of conditions to determine whether it is okay to continue.

  Every check returns a `{check_name, boolean}` tuple. `police/1` runs the
  checks in order and stops at the first one that does not return `true`,
  using its `check_name` to build the warning message.
  """

  require Logger

  alias Crawler.Store

  # URI schemes that are allowed to be fetched.
  @uri_schemes ["http", "https"]

  # Additional depth granted to anything whose `:html_tag` is not `"a"`.
  @asset_extra_depth 2

  @doc """
  Checks a series of conditions to determine whether it is okay to continue,
  i.e. to allow `Crawler.Fetcher.fetch/1` to begin its tasks.

  The checks, in order:

    * `within_max_pages?` - `Crawler.Store.ops_count/0` is strictly below
      `:max_pages` (always passes when `:max_pages` is `:infinity`, missing,
      or not an integer).
    * `within_fetch_depth?` - `:depth` is strictly below `:max_depths`
      (plus an extra allowance when `:html_tag` is not `"a"`).
    * `acceptable_uri_scheme?` - `:url` starts with `http://` or `https://`.
    * `not_fetched_yet?` - `Crawler.Store.find/1` returns a falsy value for
      `{url, scope}`.
    * `perform_url_filtering` - `url_filter.filter(url, opts)` returns
      `{:ok, true}`.

  A check is skipped (treated as passing) when the keys it needs are absent
  from `opts`.

  ## Returns

    * `{:ok, opts}` when every check passes.
    * `{:warn, message}` naming the first check that failed.
  """
  @spec police(opts) :: {:ok, opts} | {:warn, String.t()} when opts: map()
  def police(opts) do
    with {_, true} <- within_max_pages?(opts),
         {_, true} <- within_fetch_depth?(opts),
         {_, true} <- acceptable_uri_scheme?(opts),
         {_, true} <- not_fetched_yet?(opts),
         {_, true} <- perform_url_filtering(opts) do
      {:ok, opts}
    else
      {fail_type, _} -> police_warn(fail_type, opts)
    end
  end

  # Page budget check. An unlimited (`:infinity`), missing or non-integer
  # `:max_pages` never blocks a fetch.
  defp within_max_pages?(%{max_pages: :infinity} = _opts), do: {:within_max_pages?, true}

  defp within_max_pages?(%{max_pages: max_pages} = _opts) when is_integer(max_pages) do
    # Strict comparison: once the operation count has reached `max_pages` the
    # budget is spent, so another fetch must not start (`<=` would let
    # `max_pages + 1` pages through).
    # NOTE(review): assumes `Store.ops_count/0` does not yet include the
    # request being policed - confirm against `Crawler.Store`.
    {:within_max_pages?, Store.ops_count() < max_pages}
  end

  defp within_max_pages?(_opts), do: {:within_max_pages?, true}

  # Depth check. Only `"a"` tags are held to `max_depths` exactly; every other
  # tag (including a missing `:html_tag`) gets `@asset_extra_depth` more.
  defp within_fetch_depth?(%{depth: depth, max_depths: max_depths} = opts) do
    max_depths =
      case opts[:html_tag] do
        "a" -> max_depths
        _ -> max_depths + @asset_extra_depth
      end

    {:within_fetch_depth?, depth < max_depths}
  end

  defp within_fetch_depth?(_opts), do: {:within_fetch_depth?, true}

  # Scheme check. The URL must contain a `://` separator and the text before
  # it must be one of `@uri_schemes`.
  defp acceptable_uri_scheme?(%{url: url} = _opts) do
    acceptable? =
      case String.split(url, "://", parts: 2) do
        [scheme, _rest] -> scheme in @uri_schemes
        # Without a separator the whole string would otherwise be mistaken
        # for the scheme, letting a bare "http" or "https" through.
        _no_separator -> false
      end

    {:acceptable_uri_scheme?, acceptable?}
  end

  defp acceptable_uri_scheme?(_opts), do: {:acceptable_uri_scheme?, true}

  # Duplicate check. Passes when the store has no entry for `{url, scope}`.
  defp not_fetched_yet?(%{url: url, scope: scope} = _opts) do
    {:not_fetched_yet?, !Store.find({url, scope})}
  end

  defp not_fetched_yet?(_opts), do: {:not_fetched_yet?, true}

  # Delegates to the configured `:url_filter` module. The filter is expected
  # to return `{:ok, boolean}`; any other shape raises a `MatchError`.
  defp perform_url_filtering(%{url_filter: url_filter, url: url} = opts) do
    {:ok, pass_through?} = url_filter.filter(url, opts)
    {:perform_url_filtering, pass_through?}
  end

  defp perform_url_filtering(_opts), do: {:perform_url_filtering, true}

  # Builds the `{:warn, message}` result naming the failed check.
  defp police_warn(fail_type, opts) do
    {:warn, "Fetch failed check '#{fail_type}', with opts: #{Kernel.inspect(opts)}."}
  end
end