src/z_url_fetch.erl

% @author Marc Worrell
%% @copyright 2014-2024 Marc Worrell
%% @doc Fetch (part of) the data of an Url, including its headers.

%% Copyright 2014-2024 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

-module(z_url_fetch).

-author("Marc Worrell <marc@worrell.nl>").

%% Maximum nmber of bytes fetched for metadata extraction
-define(HTTPC_LENGTH, 64*1024).
-define(HTTPC_MAX_LENGTH, 1024*1024*100).  % Max 100MB

%% Number of redirects followed before giving up
-define(HTTPC_REDIRECT_COUNT, 10).

%% Total request timeout
-define(HTTPC_TIMEOUT, 20000).

%% Connect timeout, server has to respond before this
-define(HTTPC_TIMEOUT_CONNECT, 10000).

%% Some url shorteners return HTML+Javascript, except for simple text-only browsers
-define(CURL_UA, "curl/7.21.4 (universal-apple-darwin11.0) libcurl/7.21.4 OpenSSL/0.9.8r zlib/1.2.5").

%% Use our own user agent string. Sites sometimes handle well known user agents like 
%% twitterbot or apple messages badly. Using our own user agent string works better.
%% Picky websites to test this on: asos.com, hm.com.
-define(HTTPC_UA, "ZStdLib/1.0").

% Default Accept header
-define(HTTP_ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8").

-export([
    fetch/2,
    fetch/4,
    fetch_partial/1,
    fetch_partial/2,
    fetch_partial/4,

    profile/1,
    ensure_profiles/0,
    periodic_cleanup/0
    ]).

-type options() :: list(option()).

-type option() :: {device, pid()}
                | {timeout, pos_integer()}
                | {max_length, pos_integer()}
                | {authorization, binary() | string()}
                | {accept, binary() | string()}
                | {user_agent, binary() | string()}
                | {language, atom()}
                | {content_type, binary() | string()}
                | insecure.

-type fetch_result() :: {ok, {string(), list({string(), string()}), pos_integer(), binary()}} | {error, term()}.

-export_type([
    options/0,
    option/0,
    fetch_result/0
]).

-define(is_method(M), (M =:= get orelse M =:= post orelse M =:= delete orelse M =:= put orelse M =:= patch)).

%% @doc Fetch the data and headers from an url
-spec fetch(Url, Options) -> fetch_result() when
    Url :: string() | binary(),
    Options :: options().
fetch(Url, Options) ->
    fetch_partial(get, Url, <<>>, Options).

%% @doc Fetch the data and headers from an url
-spec fetch(Method, Url, Payload, Options) -> fetch_result() when
    Method :: get | post | put | delete | patch,
    Url :: string()|binary(),
    Payload :: binary(),
    Options :: options().
fetch(Method, Url, Payload, Options) when is_binary(Payload), ?is_method(Method) ->
    fetch_partial(Method, Url, Payload, Options).


%% @doc Fetch the first kilobytes of data and headers from an url
-spec fetch_partial(Url) -> fetch_result() when
    Url :: string() | binary().
fetch_partial(Url) ->
    fetch_partial(get, Url, <<>>, [{max_length, ?HTTPC_LENGTH}]).

%% @doc Fetch the first N bytes of data and headers from an url, optionally save to the file device
-spec fetch_partial(Url, Options) -> fetch_result() when
    Url :: string() | binary(),
    Options :: options().
fetch_partial("data:" ++ _ = DataUrl, Options) ->
    fetch_data_url(DataUrl, Options);
fetch_partial(<<"data:", _/binary>> = DataUrl, Options) ->
    fetch_data_url(DataUrl, Options);
fetch_partial(Url, Options) ->
    fetch_partial(get, Url, <<>>, Options).

%% @doc Fetch the first N bytes of data and headers from an url, optionally save to the file device
-spec fetch_partial(Method, Url, Payload, Options) -> fetch_result() when
    Method :: get | post | delete | put | patch,
    Url :: string()|binary(),
    Payload :: binary(),
    Options :: options().
fetch_partial(Method, Url, Payload, Options) when is_binary(Payload), ?is_method(Method) ->
    OutDevice = proplists:get_value(device, Options),
    MaxLength = proplists:get_value(max_length, Options, ?HTTPC_MAX_LENGTH),
    maybe_handle_content_encoding(
        fetch_partial(Method, z_convert:to_list(Url), Payload, 0, MaxLength, OutDevice, Options),
        MaxLength).

-spec ensure_profiles() -> ok.
ensure_profiles() ->
    case inets:start(httpc, [{profile, z_url_fetch}]) of
        {ok, _} ->
            ok = httpc:set_options([
                {max_sessions, 10},
                {max_keep_alive_length, 10},
                {keep_alive_timeout, 20000},
                {cookies, enabled}
            ], z_url_fetch),
            periodic_cleanup(),
            ok;
        {error, {already_started, _}} -> ok
    end.

-spec periodic_cleanup() -> ok.
periodic_cleanup() ->
    httpc:reset_cookies(z_url_fetch),
    {ok, _} = timer:apply_after(3600*1000, ?MODULE, periodic_cleanup, []),
    ok.

-spec profile(string()|binary()) -> atom().
profile(_Url) ->
    ensure_profiles(),
    z_url_fetch.

%% -------------------------------------- Fetch first part of a HTTP location -----------------------------------------

fetch_data_url(DataUrl, Options) when is_list(DataUrl) ->
    fetch_data_url(iolist_to_binary(DataUrl), Options);
fetch_data_url(DataUrl, Options) when is_binary(DataUrl) ->
    case z_url:decode_data_url(DataUrl) of
        {ok, Mime, _Charset, Bytes} ->
            % TODO: charset
            Headers = [
                {"content-type", z_convert:to_list(Mime)},
                {"content-length", z_convert:to_list(size(Bytes))}
            ],
            case proplists:get_value(device, Options) of
                undefined ->
                    {ok, {200, Headers, size(Bytes), Bytes}};
                Dev ->
                    file:write(Dev, Bytes),
                    {ok, {200, Headers, size(Bytes), <<>>}}
            end;
        {error, _} = Error ->
            Error
    end.

fetch_partial(_Method, Url0, _Payload, RedirectCount, _Max, _OutDev, _Opts) when RedirectCount >= ?HTTPC_REDIRECT_COUNT ->
    error_logger:warning_msg("Error fetching url, too many redirects ~p", [Url0]),
    {error, too_many_redirects};
fetch_partial(Method, Url0, Payload, RedirectCount, Max, OutDev, Opts) when is_binary(Payload) ->
    httpc_flush(),
    case normalize_url(Url0) of
        {ok, {Host, UrlBin}} ->
            Url = to_list(UrlBin),
            Language = z_convert:to_list(proplists:get_value(language, Opts, en)),
            Accept = z_convert:to_list(proplists:get_value(accept, Opts, ?HTTP_ACCEPT)),
            UserAgent = z_convert:to_list(proplists:get_value(user_agent, Opts, httpc_ua(Url))),
            ContentType = case proplists:get_value(content_type, Opts) of
                undefined -> "application/octet-stream";
                CT -> to_list(CT)
            end,
            Headers = [
                {"Accept", Accept},
                {"Accept-Encoding", "identity"},
                {"Accept-Charset", "UTF-8;q=1.0, ISO-8859-1;q=0.5, *;q=0"},
                {"Accept-Language", Language ++ ",*;q=0"},
                {"User-Agent", UserAgent}
            ] ++ case Max of
                undefined -> [];
                _ -> [ {"Range", "bytes=0-"++integer_to_list(Max-1)} ]
            end ++ case proplists:get_value(authorization, Opts) of
                undefined -> [];
                Auth -> [ {"Authorization", to_list(Auth)} ]
            end,
            Request = case Method of
                get -> {Url, Headers};
                delete when Payload =:= <<>> -> {Url, Headers};
                delete -> {Url, Headers, ContentType, Payload};
                post -> {Url, Headers, ContentType, Payload};
                put -> {Url, Headers, ContentType, Payload};
                patch -> {Url, Headers, ContentType, Payload}
            end,
            case fetch_stream(start_stream(Host, Method, Url, Request, Opts), Max, OutDev, Opts) of
                {ok, Result} ->
                    maybe_redirect(Result, Method, Url, Payload, RedirectCount, Max, OutDev, Opts);
                {error, _} = Error ->
                    error_logger:warning_msg("Error fetching url ~p error: ~p", [Url, Error]),
                    Error
            end;
        {error, _} = Error ->
            Error
    end.

to_list(B) when is_binary(B) -> binary_to_list(B);
to_list(L) when is_list(L) -> L.

-spec normalize_url(string() | binary()) -> {ok, {binary(), binary()}} | {error, url}.
normalize_url(Url) ->
    case uri_string:parse(z_convert:to_binary(Url)) of
        #{
            host := Host,
            path := Path
        } = Parts ->
            Scheme = maps:get(scheme, Parts, <<"http">>),
            Port = case maps:get(port, Parts, undefined) of
                undefined -> <<>>;
                P -> <<$:,(integer_to_binary(P))/binary>>
            end,
            Query = case maps:get('query', Parts, <<>>) of
                <<>> -> <<>>;
                Q -> <<"?", Q/binary>>
            end,
            Url1 = iolist_to_binary([ Scheme, "://", Host, Port, Path, Query ]),
            {ok, {Host, Url1}};
        _ ->
            {error, url}
    end.

start_stream(Host, Method, Url, Request, Opts) ->
    SSLOptions = case proplists:get_value(insecure, Opts) of
        true ->
            [ {verify, verify_none} ];
        _ ->
            tls_certificate_check:options(Host)
    end,
    Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
    HttpOptions = [
        {autoredirect, false},
        {relaxed, true},
        {timeout, Timeout},
        {connect_timeout, ?HTTPC_TIMEOUT_CONNECT},
        {ssl, SSLOptions}
     ],
    try
        httpc:request(Method,
                      Request,
                      HttpOptions,
                      [ {sync, false}, {body_format, binary}, {stream, {self, once}} ],
                      profile(Url))
    catch
        error:E -> {error, E};
        throw:E -> {error, E}
    end.


fetch_stream({ok, ReqId}, Max, OutDev, Opts) ->
    Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
    receive
        {http, {ReqId, stream_end, Hs}} ->
            {ok, {200, Hs, 0, <<>>}};
        {http, {ReqId, stream_start, Hs, HandlerPid}} ->
            httpc:stream_next(HandlerPid),
            fetch_stream_data(ReqId, HandlerPid, Hs, <<>>, 0, Max, OutDev, Opts);
        {http, {ReqId, {error, _} = Error}} ->
            Error;
        {http, {_ReqId, {{_V, Code, _Msg}, Hs, Data}}} ->
            {ok, {Code, Hs, 0, Data}}
    after Timeout ->
        httpc:cancel_request(ReqId),
        {error, timeout}
    end;
fetch_stream({error, _} = Error, _Max, _OutDev, _Opts) ->
    Error.

fetch_stream_data(ReqId, HandlerPid, Hs, Data, N, Max, OutDev, Opts) when N =< Max ->
    Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
    receive
        {http, {ReqId, stream_end, EndHs}} ->
            {ok, {200, EndHs++Hs, N, Data}};
        {http, {ReqId, stream, Part}} ->
            case append_data(Data, Part, OutDev) of
                {ok, Data1} ->
                    N1 = N + size(Part),
                    case N1 =< Max of
                        true ->
                            httpc:stream_next(HandlerPid),
                            fetch_stream_data(ReqId, HandlerPid, Hs, Data1, N1, Max, OutDev, Opts);
                        false ->
                            httpc:cancel_request(ReqId),
                            {ok, {200, Hs, N1, Data1}}
                    end;
                {error, _} = Error ->
                    httpc:cancel_request(ReqId),
                    Error
            end;
        {http, {ReqId, {error, socket_closed_remotely}}} ->
            % Remote closed the connection, this can happen at the moment
            % we received all data, then this error is received instead of
            % the expected data.
            % Return the data we received till now and pretend nothing is wrong.
            {ok, {200, Hs, N, Data}};
        {http, {ReqId, {error, _} = Error}} ->
            Error
    after Timeout ->
        httpc:cancel_request(ReqId),
        {error, timeout}
    end;
fetch_stream_data(ReqId, _HandlerPid, Hs, Data, N, _Max, _OutFile, _Opts) ->
    receive
        {http, {ReqId, stream_end, EndHs}} ->
            {ok, {200, EndHs++Hs, N, Data}};
        {http, _} ->
            httpc:cancel_request(ReqId),
            {ok, {200, Hs, N, Data}}
    after 100 ->
        httpc:cancel_request(ReqId),
        {ok, {200, Hs, N, Data}}
    end.

maybe_redirect({Code, Hs, Size, Data}, _Method, Url, _Payload, _RedirectCount, _Max, _OutDev, _Opts)
    when Code >= 200, Code =< 299 ->
    {ok, {Url, Hs, Size, Data}};
maybe_redirect({416, _Hs, _Size, _Data}, Method, Url, Payload, RedirectCount, _Max, OutDev, Opts) ->
    fetch_partial(Method, Url, Payload, RedirectCount+1, undefined, OutDev, Opts);
maybe_redirect({Code, Hs, _Size, _Data}, Method, BaseUrl, Payload, RedirectCount, Max, OutDev, Opts)
    when Code =:= 301; Code =:= 302; Code =:= 303; Code =:= 307 ->
    case proplists:get_value("location", Hs) of
        undefined ->
            {error, no_location_header};
        Location ->
            NewUrl = z_convert:to_list(z_url:abs_link(Location, BaseUrl)),
            fetch_partial(Method, NewUrl, Payload, RedirectCount+1, Max, OutDev, Opts)
    end;
maybe_redirect({Code, Hs, Size, Data}, _Method, Url, _Payload, _RedirectCount, _Max, _OutDev, _Opts) ->
    {error, {Code, Url, Hs, Size, Data}}.

append_data(Data, Part, undefined) ->
    {ok, <<Data/binary, Part/binary>>};
append_data(Data, Part, OutDev) ->
    case file:write(OutDev, Part) of
        ok -> {ok, Data};
        {error, _} = Error -> Error
    end.

%% @doc Some servers (Spotify) deliver gzip encoded content, even when we ask for identity.
maybe_handle_content_encoding({ok, {_FinalUrl, Hs, Length, Data}} = Result, MaxLength)
    when Length > 0, is_binary(Data), Data =/= <<>> ->
    CE = proplists:get_value("content-encoding", Hs, "identity"),
    handle_ce(CE, Result, MaxLength);
maybe_handle_content_encoding(Result, _MaxLength) ->
    Result.

handle_ce("gzip", {ok, {FinalUrl, Hs, Length, Data}} = Result, MaxLength)
    when Length > 0, is_binary(Data), Data =/= <<>> ->
    % Decode partial gzip data
    case partial_unzip(Data, MaxLength) of
        {ok, Data1} ->
            {ok, {FinalUrl, Hs, size(Data1), Data1}};
        {error, _} ->
            Result
    end;
handle_ce(_ContentEncoding, Result, _MaxLength) ->
    Result.

partial_unzip(Compressed, MaxLength) ->
    Z = zlib:open(),
    zlib:inflateInit(Z, 16 + 15),
    try
        Uncompressed = unzip_loop(Z, <<>>, zlib:safeInflate(Z, Compressed), MaxLength),
        {ok, Uncompressed}
    catch
        _:_ ->
            {error, gunzip}
    after
        zlib:close(Z)
    end.

unzip_loop(_Z, Acc, _, MaxLength) when size(Acc) >= MaxLength ->
    Acc;
unzip_loop(Z, Acc, {continue, Output}, MaxLength) ->
    Out1 = iolist_to_binary(Output),
    Acc1 = <<Acc/binary, Out1/binary>>,
    Next = try
        zlib:safeInflate(Z, [])
    catch
        _:_ ->
            {finished, <<>>}
    end,
    unzip_loop(Z, Acc1, Next, MaxLength);
unzip_loop(_Z, Acc, {finished, Output}, _MaxLength) ->
    Out1 = iolist_to_binary(Output),
    <<Acc/binary, Out1/binary>>.

%% @doc Flush any late results from previous requests
httpc_flush() ->
    receive
        {http, _} -> httpc_flush()
    after 0 ->
        ok
    end.

%% @doc Some url shorteners return HTML+Javascript, except for simple text-only browsers
httpc_ua(Url) ->
    case is_url_shortener(Url) of
        true -> ?CURL_UA;
        false -> ?HTTPC_UA
    end.

is_url_shortener(Url) ->
    case string:tokens(Url, "://") of
        [_Proto, DomainPath | _] ->
            is_url_shortener_1(DomainPath);
        _ ->
            false
    end.

is_url_shortener_1("t.co/" ++ _) -> true;
is_url_shortener_1("bit.ly/" ++ _) -> true;
is_url_shortener_1("ow.ly/" ++ _) -> true;
is_url_shortener_1("goo.gl/" ++ _) -> true;
is_url_shortener_1("lnkd.in/" ++ _) -> true;
is_url_shortener_1("tinyurl.com/" ++ _) -> true;
is_url_shortener_1("j.mp/" ++ _) -> true;
is_url_shortener_1("fb.me/" ++ _) -> true;
is_url_shortener_1("wp.me/" ++ _) -> true;
is_url_shortener_1("gu.com/" ++ _) -> true;
is_url_shortener_1("nyti.ms/" ++ _) -> true;
is_url_shortener_1("s.vk.nl/" ++ _) -> true;
is_url_shortener_1(_) -> false.