%% src/treewalker.erl

%%------------------------------------------------------------------------------
%% @doc This OTP application is used for crawling websites while respecting `robots.txt'.
%%
%% This module exposes some high level functions to be able to add new crawlers and start/stop them.
%%
%% While most of the configuration is per crawler, this application is also configurable globally
%% via the following `sys.config' settings:
%%
%%  ```
%%  {treewalker, [
%%                %% The minimum delay to wait before retrying a failed request
%%                {min_retry_delay, pos_integer()},
%%                %% The maximum delay to wait before retrying a failed request
%%                {max_retry_delay, pos_integer()},
%%                %% The maximum number of retries of a failed request
%%                {max_retries, pos_integer()},
%%                %% The maximum delay before starting a request (in seconds)
%%                {max_worker_delay, pos_integer()},
%%                %% The maximum number of concurrent workers making HTTP requests
%%                {max_concurrent_worker, pos_integer()},
%%                %% The user agent making the HTTP requests
%%                {user_agent, binary()}]},
%%  '''
%%
%% @copyright 2020 Antoine Gagné
%% @author Antoine Gagné <gagnantoine@gmail.com>
%% @end
%%------------------------------------------------------------------------------
-module(treewalker).

%% API: crawler registration
-export([add_crawler/2, add_crawler/3, remove_crawler/1]).

%% API: crawler lifecycle
-export([start_crawler/1, stop_crawler/1]).

%% Crawler child as defined by `treewalker_crawlers_sup'; this is the value
%% carried in the `{ok, ...}' results declared by the add_crawler/2,3 specs.
-type child() :: treewalker_crawlers_sup:child().
%% Per-crawler configuration accepted by add_crawler/3. Every key is optional.
%% The `*_options' values are opaque to this module: they are documented as
%% being handed to the corresponding behaviour module, so they are typed as
%% `term()' (including `fetcher_options', which was previously mistyped as
%% `module()').
-type options() :: #{scraper => module(),
                     scraper_options => term(),
                     fetcher => module(),
                     fetcher_options => term(),
                     max_depth => pos_integer(),
                     store => module(),
                     store_options => term(),
                     link_filter => module()}.
%% URL as defined by `treewalker_page'; add_crawler/2,3 store it under the
%% `url' key of the crawler configuration.
-type url() :: treewalker_page:url().

%%%===================================================================
%%% API
%%%===================================================================

%%------------------------------------------------------------------------------
%% @doc
%% Add a new crawler named `Name' for `Url', using the default configuration.
%%
%% The configuration handed to `treewalker_crawlers_sup:add_crawler/2' contains
%% only the `url' key; no other option is set by this function.
%% @end
%%------------------------------------------------------------------------------
-spec add_crawler(term(), url()) -> {ok, child()} | {ok, child(), term()} | {error, term()}.
add_crawler(Name, Url) ->
    Config = #{url => Url},
    treewalker_crawlers_sup:add_crawler(Name, Config).

%%------------------------------------------------------------------------------
%% @doc
%% Remove the crawler identified by `Name'.
%%
%% This is a direct delegation to `treewalker_crawlers_sup:remove_crawler/1';
%% whatever that call returns is returned unchanged (declared as `ok' by the
%% spec below).
%% @end
%%------------------------------------------------------------------------------
-spec remove_crawler(term()) -> ok.
remove_crawler(Name) ->
    treewalker_crawlers_sup:remove_crawler(Name).

%%------------------------------------------------------------------------------
%% @doc
%% Add a new crawler named `Name' for `Url' with a custom configuration.
%%
%% `Custom' must be a map. `Url' is stored in it under the `url' key (replacing
%% any `url' entry already present) before the result is handed to
%% `treewalker_crawlers_sup:add_crawler/2'.
%%
%% The available options are as follows:
%%
%% - `scraper': Module implementing the {@link treewalker_scraper} behaviour.
%%
%% - `scraper_options': The options passed to the module implementing the
%%                      {@link treewalker_scraper} behaviour.
%%
%% - `fetcher': Module implementing the {@link treewalker_fetcher} behaviour.
%%
%% - `fetcher_options': The options passed to the module implementing the
%%                      {@link treewalker_fetcher} behaviour.
%%
%% - `max_depth': The maximum depth that the crawler will crawl.
%%
%% - `store': Module implementing the {@link treewalker_store} behaviour.
%%
%% - `store_options': The options passed to the module implementing the
%%                    {@link treewalker_store} behaviour.
%%
%% - `link_filter': Module implementing the {@link treewalker_link_filter} behaviour.
%% @end
%%------------------------------------------------------------------------------
-spec add_crawler(term(), url(), options()) ->
    {ok, child()} | {ok, child(), term()} | {error, term()}.
add_crawler(Name, Url, Custom) ->
    Config = Custom#{url => Url},
    treewalker_crawlers_sup:add_crawler(Name, Config).

%%------------------------------------------------------------------------------
%% @doc
%% Start the crawler identified by `Name'.
%%
%% This is a direct delegation to `treewalker_crawlers_sup:start_crawler/1';
%% whatever that call returns is returned unchanged (declared as `ok' by the
%% spec below).
%% @end
%%------------------------------------------------------------------------------
-spec start_crawler(term()) -> ok.
start_crawler(Name) ->
    treewalker_crawlers_sup:start_crawler(Name).

%%------------------------------------------------------------------------------
%% @doc
%% Stop the crawler identified by `Name'.
%%
%% This is a direct delegation to `treewalker_crawlers_sup:stop_crawler/1';
%% whatever that call returns is returned unchanged (declared as `ok' by the
%% spec below).
%% @end
%%------------------------------------------------------------------------------
-spec stop_crawler(term()) -> ok.
stop_crawler(Name) ->
    treewalker_crawlers_sup:stop_crawler(Name).

%%%===================================================================
%%% Internal functions
%%%===================================================================