%% src/treewalker_scraper.erl

%%------------------------------------------------------------------------------
%% @doc Scrapes a page for useful data and links.
%%
%% @copyright 2020 Antoine Gagné
%% @author Antoine Gagné <gagnantoine@gmail.com>
%% @end
%%------------------------------------------------------------------------------
-module(treewalker_scraper).

%% API
-export([scrap/3,
         scrap_links/3]).

-export_type([page_data/0,
              options/0]).

%% URL of a page; alias of the type owned by `treewalker_page'.
-type url() :: treewalker_page:url().
%% Page data handed to a scraper; always a binary.
-type page_data() :: binary().
%% Result of scraping a page; alias of the type owned by `treewalker_page'.
-type content() :: treewalker_page:content().
%% Scraper options; deliberately unconstrained (any term), so their shape is
%% left to each scraper implementation.
-type options() :: term().

%%%===================================================================
%%% Callbacks
%%%===================================================================

%% Behaviour callback: produce the scraped content for one page.
%% Receives the page URL, the page data (a binary) and the scraping options,
%% and returns `{ok, Scrapped}' on success or `{error, Reason}' on failure.
-callback scrap(Url :: url(), PageData :: page_data(), ScrappingOptions :: options()) ->
    {ok, Scrapped :: content()} | {error, Reason :: term()}.

%% Behaviour callback: produce the list of links found in one page.
%% Receives the same arguments as `scrap/3' and returns `{ok, Links}', where
%% `Links' is a list of URLs, or `{error, Reason}' on failure.
-callback scrap_links(Url :: url(), PageData :: page_data(), ScrappingOptions :: options()) ->
    {ok, Links :: [url()]} | {error, Reason :: term()}.

%%%===================================================================
%%% API
%%%===================================================================

%% @doc Default `scrap' implementation: hands the page data back unchanged as
%% the scraped content. The URL and the options are ignored, and this
%% implementation never returns an error.
-spec scrap(url(), page_data(), options()) -> {ok, content()}.
scrap(_Url, PageData, _Options) ->
    Scrapped = PageData,
    {ok, Scrapped}.

%% @doc Default `scrap_links' implementation: looks up every `a' element of
%% the page with `ecureuil:find/2' and returns the `href' value of each element
%% that has one. Elements for which the attribute lookup fails are skipped.
%%
%% The URL and the options are ignored, so links are returned exactly as they
%% appear in the page (they are not resolved against the page URL). An error
%% reported by `ecureuil:find/2' is returned to the caller untouched.
-spec scrap_links(url(), page_data(), options()) -> {ok, [url()]} | {error, term()}.
scrap_links(_Url, PageData, _Options) ->
    Found = ecureuil:find(<<"a">>, PageData),
    case Found of
        {error, _} = Failure ->
            Failure;
        {ok, Anchors} ->
            Links = lists:filtermap(fun extract_link/1, Anchors),
            {ok, Links}
    end.

%%%===================================================================
%%% Internal functions
%%%===================================================================

%% Predicate/mapper for `lists:filtermap/2': keeps a node's `href' value when
%% the attribute lookup succeeds and drops the node when the lookup fails.
extract_link(Node) ->
    Href = ecureuil_html:attribute(Node, <<"href">>),
    case Href of
        {error, _} -> false;
        {ok, Target} -> {true, Target}
    end.