%% @author Antoine Gagné <gagnantoine@gmail.com>
%% @copyright 2023 Antoine Gagné
%% @doc Parse and manipulate robots.txt files according to the specification (RFC 9309).
-module(robots).
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-endif.
%% API
-export([
parse/2,
sitemap/1,
is_allowed/3
]).
-export_type([agent_rules/0]).
%% HTTP status code of the response that carried the robots.txt body.
-type code() :: 100..599.
%% User-agent name. Inside the index the keys are lowercased and
%% byte-reversed (see to_agent/1).
-type agent() :: binary().
%% Path pattern taken from an `allow' or `disallow' line.
-type rule() :: binary().
-type rules() :: [rule()].
%% Raw robots.txt body, as a string or a binary.
-type content() :: string() | binary().
-type status() :: allowed | disallowed.
%% Marker stored for an agent that reached the end of the file without any rule.
-type allowed_all() :: {allowed, all}.
%% Parsed rules keyed by agent, plus the value of a `sitemap' line when present.
-type rules_index() :: #{
agent() := {Allowed :: rules(), Disallowed :: rules()} | allowed_all(),
sitemap => binary()
}.
-type sitemap() :: binary().
%% Either a blanket verdict derived from the status code alone, or the
%% per-agent index built from the file content.
-opaque agent_rules() :: {status(), all} | rules_index().
%% Key under which the rules of the wildcard user-agent are stored.
-define(ALL, <<"*">>).
%%%===================================================================
%%% API
%%%===================================================================
-spec parse(content(), code()) -> {ok, agent_rules()} | {error, term()}.
%% @doc Parses the content of the <em>robots.txt</em> and returns all the rules
%% indexed by their agents.
%%
%% The HTTP status code of the response decides how the content is used:
%% <ul>
%%   <li>5xx: the content is ignored and everything is disallowed;</li>
%%   <li>4xx: the content is ignored and everything is allowed;</li>
%%   <li>2xx: the content is parsed;</li>
%%   <li>anything else: `{error, {invalid_status_code, Code}}'.</li>
%% </ul>
parse(_Content, Code) when Code >= 500 andalso Code < 600 ->
    {ok, {disallowed, all}};
%% The upper bound matters: without it, codes >= 600 (and any non-numeric
%% term, which sorts above every number) would be reported as "allowed".
parse(_Content, Code) when Code >= 400 andalso Code < 500 ->
    {ok, {allowed, all}};
parse(Content, Code) when Code >= 200 andalso Code < 300 ->
    build_rules(Content);
parse(_Content, Code) ->
    {error, {invalid_status_code, Code}}.
-spec is_allowed(agent(), uri_string:uri_string(), agent_rules()) -> boolean().
%% @doc Verifies that the given URL is allowed for the specified agent.
%%
%% A blanket verdict (`{allowed, all}' or `{disallowed, all}') is returned
%% as is. Otherwise the rules of the agent are looked up in the index and
%% the URL is matched against them. The URL may be given as a binary or as
%% a string; strings are converted to a binary before matching.
is_allowed(_Agent, _Url, {allowed, all}) ->
    true;
is_allowed(_Agent, _Url, {disallowed, all}) ->
    false;
is_allowed(RawAgent, Url, RulesIndex) when is_list(Url) ->
    %% The matcher only understands binaries, but the spec also accepts
    %% strings: normalise instead of crashing with a function_clause.
    is_allowed(RawAgent, unicode:characters_to_binary(Url), RulesIndex);
is_allowed(RawAgent, Url, RulesIndex) ->
    Agent = to_agent(RawAgent),
    MaybeRules = find_agent_rules(Agent, RulesIndex),
    is_allowed(Url, MaybeRules).
-spec sitemap(agent_rules()) -> {ok, sitemap()} | {error, not_found}.
%% @doc Fetches the sitemap of the parsed index.
%%
%% Returns `{error, not_found}' when the file declared no sitemap, and also
%% when the rules are a blanket verdict (`{allowed, all}' or
%% `{disallowed, all}'), in which case no content was parsed at all.
sitemap({_Status, all}) ->
    %% Blanket verdicts are tuples, not maps: maps:find/2 would raise badmap.
    {error, not_found};
sitemap(RulesIndex) ->
    case maps:find(sitemap, RulesIndex) of
        error -> {error, not_found};
        V = {ok, _} -> V
    end.
%%%===================================================================
%%% Internal functions
%%%===================================================================
-spec find_agent_rules(binary(), agent_rules()) ->
    {error, not_found} | {ok, {rules(), rules()} | allowed_all()}.
%% Looks up the rules of an agent given in its reversed form.
%%
%% The agent is tried as is; on a miss its first byte is dropped and the
%% lookup is retried. Because the agent is reversed, this shortens the
%% original name from its end. Once nothing is left, the rules of the
%% wildcard agent are used, if there are any.
find_agent_rules(<<>>, RulesIndex) ->
    case maps:find(?ALL, RulesIndex) of
        {ok, _Rules} = Found ->
            Found;
        error ->
            {error, not_found}
    end;
find_agent_rules(Agent, RulesIndex) ->
    case maps:find(Agent, RulesIndex) of
        {ok, _Rules} = Found ->
            Found;
        error ->
            <<_Dropped, Shorter/binary>> = Agent,
            find_agent_rules(Shorter, RulesIndex)
    end.
-spec is_allowed(binary(), {ok, {rules(), rules()} | allowed_all()} | {error, term()}) -> boolean().
%% Decides whether the URL is allowed by the result of an agent lookup.
%%
%% When no rules were found for the agent, or the agent is allowed
%% everything, the URL is allowed. Otherwise the most specific matching rule
%% wins, i.e. the one with the most octets (RFC 9309, section 2.2.2). When
%% the longest matching allow and disallow rules have the same length, the
%% allow rule wins. A URL matched by no rule is allowed.
is_allowed(_Url, {ok, {allowed, all}}) ->
    true;
is_allowed(Url, {ok, {Allowed, Disallowed}}) ->
    longest_match(Url, Allowed) >= longest_match(Url, Disallowed);
is_allowed(_Url, {error, _}) ->
    true.

-spec longest_match(binary(), rules()) -> integer().
%% Size in bytes of the longest rule matching the URL, or -1 when none does.
longest_match(Url, Rules) ->
    lists:max([-1 | [byte_size(Rule) || Rule <- Rules, match(Url, Rule)]]).
-spec build_rules(binary() | string()) -> {ok, rules_index()}.
%% Builds the rules index out of the raw content of a robots.txt file.
%%
%% Strings are converted to a binary first. The content is split into
%% lines, comments and malformed lines are discarded, and the remaining
%% `{Key, Value}' pairs are folded into the index. An artificial `eof' pair
%% is appended so that the fold can close the last group of agents. Finally
%% the rules of every agent are sorted.
build_rules(Content) when is_list(Content) ->
    build_rules(unicode:characters_to_binary(Content));
build_rules(Content) ->
    Lines = string:lexemes(Content, ["\r\n", $\r, $\n]),
    Directives = lists:filtermap(fun sanitize/1, Lines) ++ [{<<"eof">>, <<"end">>}],
    InitialState = {[], false, #{}},
    {_Agents, _Flag, Index} = lists:foldl(fun build_rules/2, InitialState, Directives),
    Sorted = maps:map(fun sort_rules/2, Index),
    {ok, Sorted}.
-spec sanitize(binary()) -> false | {true, {binary(), binary()}}.
%% Prepares a single line for lists:filtermap/2.
%%
%% The line is trimmed and everything from the first `#' onwards is dropped.
%% When nothing is left the line is discarded (`false'); otherwise the
%% remaining text is handed over to handle_line/1.
sanitize(Line) ->
    {BeforeComment, _Comment} = string:take(trim(Line), "#", true),
    case BeforeComment of
        <<>> ->
            false;
        _ ->
            handle_line(BeforeComment)
    end.
-spec handle_line(binary()) -> {true, {binary(), binary()}} | false.
%% Splits a line into a lowercased key and a value around its first colon.
%%
%% Both parts are trimmed. Only the first colon separates, so values that
%% contain colons themselves are kept whole. Lines without a colon are
%% discarded (`false').
handle_line(Line) ->
    case string:split(Line, ":", leading) of
        [RawKey, RawValue | _] ->
            Key = string:lowercase(trim(RawKey)),
            {true, {Key, trim(RawValue)}};
        _NoColon ->
            false
    end.
-spec sort_rules(agent() | sitemap, {[rule()], [rule()]} | allowed_all() | binary()) ->
    binary() | {[rule()], [rule()]} | allowed_all().
%% Normalises one entry of the index; meant to be used with maps:map/2.
%%
%% The allowed and disallowed rules of an agent are each sorted in
%% descending term order. The `{allowed, all}' marker and the sitemap value
%% are returned untouched.
sort_rules(_, Value = {allowed, all}) ->
    Value;
sort_rules(_, {Allowed, Disallowed}) ->
    %% lists:sort/2 expects a "less than or equal" style ordering function,
    %% hence >= rather than a strict comparison for the descending order.
    Descending = fun erlang:'>='/2,
    {lists:sort(Descending, Allowed), lists:sort(Descending, Disallowed)};
sort_rules(sitemap, Value) ->
    Value.
-spec trim(unicode:chardata()) -> unicode:chardata().
%% Strips the leading and trailing whitespace of the given text.
trim(Text) ->
    string:trim(Text).
-spec build_rules({binary(), binary()}, {[agent()], SeenRules, rules_index()}) ->
    {[agent()], SeenRules, rules_index()}
when
    SeenRules :: boolean().
%% Folds one `{Key, Value}' line into the parsing state.
%%
%% The state is `{Agents, SeenRules, RulesIndex}': `Agents' is the current
%% group of user-agents and `SeenRules' tells whether a rule line was seen
%% since the last `user-agent' line.
%% <ul>
%%   <li>`user-agent': joins the current group, or starts a new group when
%%       the current one already received rules;</li>
%%   <li>`allow' / `disallow': adds the rule to every agent of the group. An
%%       empty value adds no rule but still registers the agents, so that a
%%       group such as `User-agent: x' + `Disallow:' gets its own (empty)
%%       rules instead of silently falling back to the wildcard agent;</li>
%%   <li>`eof': marks the agents of a group that never received a rule line
%%       as allowed everywhere;</li>
%%   <li>`sitemap': stores the value, replacing any previous one;</li>
%%   <li>anything else is ignored.</li>
%% </ul>
build_rules({<<"user-agent">>, RawAgent}, {Agents, false, RulesIndex}) ->
    Reversed = to_agent(RawAgent),
    {[Reversed | Agents], false, RulesIndex};
build_rules({<<"user-agent">>, RawAgent}, {_Agents, true, RulesIndex}) ->
    Reversed = to_agent(RawAgent),
    {[Reversed], false, RulesIndex};
build_rules({<<"allow">>, <<>>}, {Agents, _, RulesIndex}) ->
    {Agents, true, register_agents(Agents, RulesIndex)};
build_rules({<<"allow">>, Rule}, {Agents, _, RulesIndex}) ->
    {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{allowed, Rule}, RulesIndex}, Agents),
    {Agents, true, UpdatedIndex};
build_rules({<<"disallow">>, <<>>}, {Agents, _, RulesIndex}) ->
    {Agents, true, register_agents(Agents, RulesIndex)};
build_rules({<<"disallow">>, Rule}, {Agents, _, RulesIndex}) ->
    {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{disallowed, Rule}, RulesIndex}, Agents),
    {Agents, true, UpdatedIndex};
build_rules({<<"eof">>, _}, {Agents, false, RulesIndex}) ->
    {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{allowed, all}, RulesIndex}, Agents),
    {Agents, false, UpdatedIndex};
build_rules({<<"sitemap">>, Map}, {Agents, SeenRules, RulesIndex}) ->
    {Agents, SeenRules, RulesIndex#{sitemap => Map}};
build_rules({_Invalid, _Rule}, Acc) ->
    Acc.

-spec register_agents([agent()], rules_index()) -> rules_index().
%% Makes sure every agent has an entry in the index, without touching the
%% entries that already exist (the second map wins in maps:merge/2).
register_agents(Agents, RulesIndex) ->
    Empty = maps:from_list([{Agent, {[], []}} || Agent <- Agents]),
    maps:merge(Empty, RulesIndex).
-spec update_index(agent(), {{status(), rule() | all}, rules_index()}) ->
    {{status(), rule() | all}, rules_index()}.
%% Adds a rule to the entry of an agent; meant to be used with lists:foldl/3
%% over the agents of a group, the rule being threaded through the
%% accumulator.
%%
%% `{allowed, all}' only applies to agents that have no entry yet: rules
%% collected earlier for the same agent are kept rather than overwritten.
%% Conversely, adding a concrete rule to an agent currently marked
%% `{allowed, all}' starts a fresh pair of rule lists instead of treating the
%% marker as a pair of lists.
update_index(Agent, {{allowed, all} = Rule, RulesIndex}) ->
    Keep = fun(Existing) -> Existing end,
    {Rule, maps:update_with(Agent, Keep, Rule, RulesIndex)};
update_index(Agent, {{allowed, Rule}, RulesIndex}) ->
    Update = fun
        ({allowed, all}) -> {[Rule], []};
        ({Allowed, Disallowed}) -> {[Rule | Allowed], Disallowed}
    end,
    UpdatedIndex = maps:update_with(Agent, Update, {[Rule], []}, RulesIndex),
    {{allowed, Rule}, UpdatedIndex};
update_index(Agent, {{disallowed, Rule}, RulesIndex}) ->
    Update = fun
        ({allowed, all}) -> {[], [Rule]};
        ({Allowed, Disallowed}) -> {Allowed, [Rule | Disallowed]}
    end,
    UpdatedIndex = maps:update_with(Agent, Update, {[], [Rule]}, RulesIndex),
    {{disallowed, Rule}, UpdatedIndex}.
-spec match(binary(), rule()) -> boolean().
%% Tells whether the URL matches the rule.
%%
%% A rule matches when it is a prefix of the URL, where `*' stands for any
%% sequence of bytes (possibly empty) and a trailing `$' anchors the rule to
%% the end of the URL. A `$' anywhere else is an ordinary character. The
%% comparison is case-sensitive and the empty rule matches everything.
%%
%% The rule is cut into literal segments around its wildcards. The first
%% segment must start the URL and each following one is searched at its
%% earliest position after the previous one, which is sufficient for
%% patterns that only contain `*' and never requires backtracking.
match(Url, Rule) ->
    {Pattern, Anchored} = split_anchor(Rule),
    [Prefix | Segments] = binary:split(Pattern, <<"*">>, [global]),
    PrefixSize = byte_size(Prefix),
    case Url of
        <<Head:PrefixSize/binary, Rest/binary>> when Head =:= Prefix ->
            match_segments(Rest, Segments, Anchored);
        _ ->
            false
    end.

-spec split_anchor(binary()) -> {binary(), boolean()}.
%% Strips the trailing `$' of a rule and reports whether there was one.
split_anchor(<<>>) ->
    {<<>>, false};
split_anchor(Rule) ->
    case binary:last(Rule) of
        $$ -> {binary:part(Rule, 0, byte_size(Rule) - 1), true};
        _ -> {Rule, false}
    end.

-spec match_segments(binary(), [binary()], boolean()) -> boolean().
%% Matches the segments that follow a wildcard against the rest of the URL.
match_segments(Rest, [], Anchored) ->
    %% The rule had no wildcard: only the anchor is left to check.
    (not Anchored) orelse Rest =:= <<>>;
match_segments(Rest, [Last], Anchored) ->
    match_last(Rest, Last, Anchored);
match_segments(Rest, [<<>> | More], Anchored) ->
    %% Consecutive wildcards produce empty segments, which match anywhere.
    match_segments(Rest, More, Anchored);
match_segments(Rest, [Segment | More], Anchored) ->
    case binary:match(Rest, Segment) of
        nomatch ->
            false;
        {Start, Length} ->
            Skip = Start + Length,
            <<_:Skip/binary, Tail/binary>> = Rest,
            match_segments(Tail, More, Anchored)
    end.

-spec match_last(binary(), binary(), boolean()) -> boolean().
%% Matches the final segment: anywhere in the rest of the URL, or exactly at
%% its end when the rule is anchored.
match_last(_Rest, <<>>, _Anchored) ->
    %% The rule ends with a wildcard, which swallows whatever is left.
    true;
match_last(Rest, Last, false) ->
    binary:match(Rest, Last) =/= nomatch;
match_last(Rest, Last, true) ->
    Skip = byte_size(Rest) - byte_size(Last),
    Skip >= 0 andalso binary:part(Rest, Skip, byte_size(Last)) =:= Last.
-spec to_agent(Raw :: binary()) -> binary().
%% Normalises a user-agent name into the form used as a key of the index:
%% lowercased, then byte-reversed.
%%
%% The lowercasing must happen before the reversal: reversing the bytes of
%% a multi-byte UTF-8 character yields an invalid sequence, on which
%% string:lowercase/1 raises `badarg'. For ASCII names both orders give the
%% same result.
to_agent(Raw) ->
    Lowercased = string:lowercase(Raw),
    reverse(Lowercased).
-spec reverse(binary()) -> binary().
%% Returns the bytes of the given binary in the opposite order.
reverse(Binary) ->
    Bytes = binary_to_list(Binary),
    list_to_binary(lists:reverse(Bytes)).
%%%===================================================================
%%% EUnit Tests
%%%===================================================================
-ifdef(TEST).
%% A plain path matches every URL it is a prefix of, case-sensitively.
simple_path_test_() ->
    Rule = <<"/fish">>,
    Matching = [
        <<"/fish">>,
        <<"/fish.html">>,
        <<"/fish/salmon.html">>,
        <<"/fishheads">>,
        <<"/fishheads/yummy.html">>,
        <<"/fish.php?id=anything">>
    ],
    NotMatching = [
        <<"/Fish.asp">>,
        <<"/catfish">>,
        <<"/?id=fish">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% A trailing wildcard behaves exactly like the plain path without it.
trailing_wildcard_test_() ->
    Rule = <<"/fish*">>,
    Matching = [
        <<"/fish">>,
        <<"/fish.html">>,
        <<"/fish/salmon.html">>,
        <<"/fishheads">>,
        <<"/fishheads/yummy.html">>,
        <<"/fish.php?id=anything">>
    ],
    NotMatching = [
        <<"/Fish.asp">>,
        <<"/catfish">>,
        <<"/?id=fish">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% A trailing slash restricts the rule to what lies inside the folder.
trailing_slash_test_() ->
    Rule = <<"/fish/">>,
    Matching = [
        <<"/fish/">>,
        <<"/fish/?id=anything">>,
        <<"/fish/salmon.htm">>
    ],
    NotMatching = [
        <<"/fish">>,
        <<"/fish.html">>,
        <<"/Fish/Salmon.asp">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% A wildcard inside the rule stands for any run of characters.
nested_wildcard_test_() ->
    Rule = <<"/*.php">>,
    Matching = [
        <<"/filename.php">>,
        <<"/folder/filename.php">>,
        <<"/folder/filename.php?parameters">>,
        <<"/folder/any.php.file.html">>,
        <<"/filename.php/">>
    ],
    NotMatching = [
        <<"/">>,
        <<"/windows.PHP">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% A trailing `$' requires the URL to stop where the rule stops.
nested_wildcard_with_ending_test_() ->
    Rule = <<"/*.php$">>,
    Matching = [
        <<"/filename.php">>,
        <<"/folder/filename.php">>
    ],
    NotMatching = [
        <<"/filename.php?parameters">>,
        <<"/filename.php/">>,
        <<"/filename.php5">>,
        <<"/windows.PHP">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% A literal prefix can be combined with a wildcard further down the rule.
simple_path_with_nested_wildcard_test_() ->
    Rule = <<"/fish*.php">>,
    Matching = [
        <<"/fish.php">>,
        <<"/fishheads/catfish.php?parameters">>
    ],
    NotMatching = [
        <<"/Fish.PHP">>
    ],
    [?_assert(match(Url, Rule)) || Url <- Matching] ++
        [?_assertNot(match(Url, Rule)) || Url <- NotMatching].
%% The longest known agent name that starts the requested one wins; unknown
%% agents fall back to the wildcard agent, and to nothing in an empty index.
user_agent_matching_test_() ->
    NewsRules = {[<<"/news">>], []},
    AllRules = {[<<"/all">>], []},
    GenericRules = {[<<"/generic">>], []},
    Index = #{
        reverse(<<"googlebot-news">>) => NewsRules,
        <<"*">> => AllRules,
        reverse(<<"googlebot">>) => GenericRules
    },
    Expectations = [
        {NewsRules, <<"googlebot-news/1.0.0">>},
        {GenericRules, <<"googlebot-web*">>},
        {GenericRules, <<"googlebot-images*">>},
        {AllRules, <<"otherbot-web/1.2.0">>},
        {AllRules, <<"otherbot-news/1.2.0">>}
    ],
    Found = [
        ?_assertEqual({ok, Rules}, find_agent_rules(reverse(Agent), Index))
     || {Rules, Agent} <- Expectations
    ],
    Missing = ?_assertEqual(
        {error, not_found},
        find_agent_rules(reverse(<<"non-existent/1.0.0">>), #{})
    ),
    Found ++ [Missing].
-endif.