% @author Marc Worrell
%% @copyright 2014-2025 Marc Worrell
%% @doc Fetch (part of) the data of an Url, including its headers. Also
%% support decoding 'data:' URLs and streaming fetched data to a device.
%% Uses the httpc OTP library for the actual fetching.
%% The fetch functions are able to unzip partially fetched data.
%% @end
%% Copyright 2014-2025 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
-module(z_url_fetch).
-author("Marc Worrell <marc@worrell.nl>").
%% Maximum nmber of bytes fetched for metadata extraction
-define(HTTPC_LENGTH, 64*1024).
-define(HTTPC_MAX_LENGTH, 1024*1024*100). % Max 100MB
%% Number of redirects followed before giving up
-define(HTTPC_REDIRECT_COUNT, 10).
%% Total request timeout
-define(HTTPC_TIMEOUT, 20000).
%% Connect timeout, server has to respond before this
-define(HTTPC_TIMEOUT_CONNECT, 10000).
%% Some url shorteners return HTML+Javascript, except for simple text-only browsers
-define(CURL_UA, "curl/7.21.4 (universal-apple-darwin11.0) libcurl/7.21.4 OpenSSL/0.9.8r zlib/1.2.5").
%% Use our own user agent string. Sites sometimes handle well known user agents like
%% twitterbot or apple messages badly. Using our own user agent string works better.
%% Picky websites to test this on: asos.com, hm.com.
-define(HTTPC_UA, "ZStdLib/1.0").
% Default Accept header
-define(HTTP_ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8").
-export([
fetch/2,
fetch/4,
fetch_partial/1,
fetch_partial/2,
fetch_partial/4,
profile/1,
ensure_profiles/0,
periodic_cleanup/0
]).
-type options() :: list(option()).
-type option() :: {device, pid()}
| {timeout, pos_integer()}
| {max_length, pos_integer()}
| {use_range, boolean()}
| {authorization, binary() | string()}
| {accept, binary() | string()}
| {user_agent, binary() | string()}
| {language, atom()}
| {content_type, binary() | string()}
| {headers, [ {binary()|string(), binary()|string()} ]}
| insecure.
-type fetch_result() :: {ok, {
FinalUrl :: string(),
RespHeaders :: list({string(), string()}),
ContentLength :: non_neg_integer(),
Content :: binary()
}}
| {error, Reason :: term()}.
-export_type([
options/0,
option/0,
fetch_result/0
]).
-define(is_method(M), (M =:= get orelse M =:= post orelse M =:= delete orelse M =:= put orelse M =:= patch)).
-include_lib("kernel/include/logger.hrl").
%% @doc Fetch the data and headers from an url
-spec fetch(Url, Options) -> fetch_result() when
Url :: string() | binary(),
Options :: options().
fetch(Url, Options) ->
fetch_partial(get, Url, <<>>, Options).
%% @doc Fetch the data and headers from an url
-spec fetch(Method, Url, Payload, Options) -> fetch_result() when
Method :: get | post | put | delete | patch,
Url :: string()|binary(),
Payload :: binary(),
Options :: options().
fetch(Method, Url, Payload, Options) when is_binary(Payload), ?is_method(Method) ->
fetch_partial(Method, Url, Payload, Options).
%% @doc Fetch the first 64 kilobytes of data and headers from an url
-spec fetch_partial(Url) -> fetch_result() when
Url :: string() | binary().
fetch_partial(Url) ->
fetch_partial(get, Url, <<>>, [{max_length, ?HTTPC_LENGTH}]).
%% @doc Fetch the first N bytes of data and headers from an url, optionally save to the file device
-spec fetch_partial(Url, Options) -> fetch_result() when
Url :: string() | binary(),
Options :: options().
fetch_partial("data:" ++ _ = DataUrl, Options) ->
fetch_data_url(DataUrl, Options);
fetch_partial(<<"data:", _/binary>> = DataUrl, Options) ->
fetch_data_url(DataUrl, Options);
fetch_partial(Url, Options) ->
fetch_partial(get, Url, <<>>, Options).
% Track the fetch state, some requests use redirects, some need
% multiple range fetches.
-record(fstate, {
method :: atom(),
url :: string(),
payload :: binary(),
code :: undefined | integer(),
headers :: list({string(), string()}),
redirects = 0 :: non_neg_integer(),
max = undefined :: non_neg_integer() | undefined,
length = 0 :: non_neg_integer(),
data = <<>> :: binary(),
device = undefined :: pid() | undefined,
options :: list()
}).
%% @doc Fetch the first N bytes of data and headers from an url, optionally save to the file device
-spec fetch_partial(Method, Url, Payload, Options) -> fetch_result() when
Method :: get | post | delete | put | patch,
Url :: string()|binary(),
Payload :: binary(),
Options :: options().
fetch_partial(Method, Url, Payload, Options) when is_binary(Payload), ?is_method(Method) ->
OutDevice = proplists:get_value(device, Options),
MaxLength = proplists:get_value(max_length, Options, ?HTTPC_MAX_LENGTH),
FState = #fstate{
code = undefined,
method = Method,
url = z_convert:to_list(Url),
headers = [],
redirects = 0,
payload = Payload,
length = 0,
max = MaxLength,
data = <<>>,
device = OutDevice,
options = Options
},
case fetch_partial_loop(FState) of
{ok, #fstate{ code = Code } = FState1} ->
FState2 = maybe_handle_content_encoding(FState1),
if
Code >= 200 andalso Code < 300 ->
{ok, {
FState2#fstate.url,
FState2#fstate.headers,
FState2#fstate.length,
FState2#fstate.data
}};
true ->
{error, {
Code,
FState2#fstate.url,
FState2#fstate.headers,
FState2#fstate.length,
FState2#fstate.data
}}
end;
{error, _} = Error ->
Error
end.
-spec ensure_profiles() -> ok.
ensure_profiles() ->
case inets:start(httpc, [{profile, z_url_fetch}]) of
{ok, _} ->
ok = httpc:set_options([
{max_sessions, 10},
{max_keep_alive_length, 10},
{keep_alive_timeout, 20000},
{cookies, enabled}
], z_url_fetch),
periodic_cleanup(),
ok;
{error, {already_started, _}} -> ok
end.
-spec periodic_cleanup() -> ok.
periodic_cleanup() ->
httpc:reset_cookies(z_url_fetch),
{ok, _} = timer:apply_after(3600*1000, ?MODULE, periodic_cleanup, []),
ok.
-spec profile(string()|binary()) -> atom().
profile(_Url) ->
ensure_profiles(),
z_url_fetch.
%% -------------------------------------- Fetch first part of a HTTP location -----------------------------------------
fetch_data_url(DataUrl, Options) when is_list(DataUrl) ->
fetch_data_url(iolist_to_binary(DataUrl), Options);
fetch_data_url(DataUrl, Options) when is_binary(DataUrl) ->
case z_url:decode_data_url(DataUrl) of
{ok, Mime, _Charset, Bytes} ->
% TODO: charset
Headers = [
{"content-type", z_convert:to_list(Mime)},
{"content-length", z_convert:to_list(size(Bytes))}
],
case proplists:get_value(device, Options) of
undefined ->
{ok, {"data://", Headers, size(Bytes), Bytes}};
Dev ->
file:write(Dev, Bytes),
{ok, {"data://", Headers, size(Bytes), <<>>}}
end;
{error, _} = Error ->
Error
end.
fetch_partial_loop(#fstate{ url = Url, redirects = Redirects }) when Redirects >= ?HTTPC_REDIRECT_COUNT ->
?LOG_ERROR(#{
in => zotonic_stdlib,
text => <<"Error fetching URL, too many redirects">>,
result => error,
reason => too_many_redirects,
redirects => Redirects,
url => unicode:characters_to_binary(Url, utf8)
}),
{error, too_many_redirects};
fetch_partial_loop(#fstate{ url = Url0, payload = Payload } = FState) when is_binary(Payload) ->
httpc_flush(),
case normalize_url(Url0) of
{ok, {Host, UrlBin}} ->
#fstate{
method = Method,
max = Max,
length = Length,
options = Opts
} = FState,
Url = to_list(UrlBin),
{Headers, ContentType} = headers_ct(Method, Url, Length, Max, Opts),
Request = case Method of
get -> {Url, Headers};
delete when Payload =:= <<>> -> {Url, Headers};
delete -> {Url, Headers, ContentType, Payload};
post -> {Url, Headers, ContentType, Payload};
put -> {Url, Headers, ContentType, Payload};
patch -> {Url, Headers, ContentType, Payload}
end,
FState1 = FState#fstate{
url = Url
},
case fetch_stream(start_stream(Host, Method, Url, Request, Opts), FState1) of
{ok, Result} ->
case maybe_redirect(Result) of
{redirect, NewMethod, NewUrl, NewOpts} ->
FState2 = FState1#fstate{
method = NewMethod,
url = NewUrl,
headers = [],
redirects = FState1#fstate.redirects + 1,
length = 0,
data = <<>>,
options = NewOpts
},
fetch_partial_loop(FState2);
{ok, #fstate{ method = ResMethod, code = Code, max = ResMax, length = ResLength } = FState2} when
Code >= 200, Code < 300,
is_integer(Max), ResLength < ResMax,
ResMethod =:= get ->
case is_fetch_complete(FState2#fstate.headers, ResMax) of
true ->
{ok, FState2};
false ->
fetch_partial_loop(FState2)
end;
{ok, FState2} ->
{ok, FState2};
{error, _} = Error ->
Error
end;
{error, Reason} = Error ->
?LOG_WARNING(#{
in => zotonic_stdlib,
text => <<"Error fetching URL">>,
result => error,
reason => Reason,
url => unicode:characters_to_binary(Url)
}),
Error
end;
{error, _} = Error ->
Error
end.
is_fetch_complete(_Hs, undefined) ->
true;
is_fetch_complete(Hs, Max) ->
% {"content-length","1000000"},
% {"content-range","bytes 0-999999/4645135"},
case proplists:get_value("content-range", Hs) of
undefined ->
true;
Range ->
case cow_http_hd:parse_content_range(list_to_binary(Range)) of
{bytes, '*', _} ->
true;
{_, _} ->
true;
{bytes, _First, Last, '*'} ->
Last >= Max + 1;
{bytes, _First, Last, Complete} when Last + 1 == Complete ->
true;
{bytes, _First, Last, _Complete} when Last >= Max + 1 ->
true;
{bytes, _First, _Last, _Complete} ->
false
end
end.
headers_ct(Method, Url, TotalSz, Max, Opts) ->
Language = z_convert:to_list(proplists:get_value(language, Opts, en)),
Accept = z_convert:to_list(proplists:get_value(accept, Opts, ?HTTP_ACCEPT)),
UserAgent = z_convert:to_list(proplists:get_value(user_agent, Opts, httpc_ua(Url))),
UseRange = z_convert:to_bool(proplists:get_value(use_range, Opts, true)),
ContentType = case proplists:get_value(content_type, Opts) of
undefined -> "application/octet-stream";
CT -> to_list(CT)
end,
Headers0 = [
{"Accept", Accept},
{"Accept-Encoding", "identity"},
{"Accept-Charset", "UTF-8;q=1.0, ISO-8859-1;q=0.5, *;q=0"},
{"Accept-Language", Language ++ ",*;q=0"},
{"User-Agent", UserAgent}
],
Headers1 = if
is_integer(Max), Method =:= get, UseRange ->
Range = "bytes="
++ integer_to_list(TotalSz)
++ "-"
++ integer_to_list(Max-1),
[ {"Range", Range} | Headers0 ];
true ->
Headers0
end,
Headers2 = case proplists:get_value(authorization, Opts) of
undefined ->
Headers1;
Auth ->
[ {"Authorization", to_list(Auth)} | Headers1 ]
end,
FinalHeaders = case proplists:get_value(headers, Opts) of
undefined ->
Headers2;
[] ->
Headers2;
Hs ->
Hs1 = lists:map(
fun({K,V}) ->
{z_convert:to_list(K), z_convert:to_list(V)}
end,
Hs),
Headers2 ++ Hs1
end,
{FinalHeaders, ContentType}.
to_list(B) when is_binary(B) -> binary_to_list(B);
to_list(L) when is_list(L) -> L.
-spec normalize_url(string() | binary()) -> {ok, {binary(), binary()}} | {error, url}.
normalize_url(Url) ->
case uri_string:parse(z_convert:to_binary(Url)) of
#{
host := Host,
path := Path
} = Parts ->
Scheme = maps:get(scheme, Parts, <<"http">>),
Port = case maps:get(port, Parts, undefined) of
undefined -> <<>>;
P -> <<$:,(integer_to_binary(P))/binary>>
end,
Query = case maps:get('query', Parts, <<>>) of
<<>> -> <<>>;
Q -> <<"?", Q/binary>>
end,
Url1 = iolist_to_binary([ Scheme, "://", Host, Port, Path, Query ]),
{ok, {Host, Url1}};
_ ->
{error, url}
end.
start_stream(Host, Method, Url, Request, Opts) ->
SSLOptions = case proplists:get_value(insecure, Opts) of
true ->
[ {verify, verify_none} ];
_ ->
tls_certificate_check:options(Host)
end,
Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
HttpOptions = [
{autoredirect, false},
{relaxed, true},
{timeout, Timeout},
{connect_timeout, ?HTTPC_TIMEOUT_CONNECT},
{ssl, SSLOptions}
],
try
httpc:request(Method,
Request,
HttpOptions,
[ {sync, false}, {body_format, binary}, {stream, {self, once}} ],
profile(Url))
catch
error:E -> {error, E};
throw:E -> {error, E}
end.
fetch_stream({ok, ReqId}, #fstate{ options = Opts } = FState) ->
Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
receive
{http, {ReqId, stream_end, Hs}} ->
FState1 = FState#fstate{
code = 200,
headers = Hs ++ FState#fstate.headers
},
{ok, FState1};
{http, {ReqId, stream_start, Hs, HandlerPid}} ->
httpc:stream_next(HandlerPid),
FState1 = FState#fstate{
headers = Hs
},
fetch_stream_data(ReqId, HandlerPid, FState1);
{http, {ReqId, {error, _} = Error}} ->
Error;
{http, {_ReqId, {{_V, Code, _Msg}, Hs, Data}}} when is_binary(Data) ->
case append_data(FState#fstate.data, Data, FState#fstate.device) of
{ok, Data1} ->
FState1 = FState#fstate{
code = Code,
headers = Hs ++ FState#fstate.headers,
data = Data1
},
{ok, FState1};
{error, _} = Error ->
Error
end;
{http, {_ReqId, {{_V, Code, _Msg}, Hs, _Data}}} ->
FState1 = FState#fstate{
code = Code,
headers = Hs ++ FState#fstate.headers
},
{ok, FState1}
after Timeout ->
httpc:cancel_request(ReqId),
{error, timeout}
end;
fetch_stream({error, _} = Error, _FState) ->
Error.
fetch_stream_data(ReqId, HandlerPid, #fstate{ length = Length, max = Max } = FState) when
Max =:= undefined; Length =< Max ->
#fstate{
options = Opts,
data = Data
} = FState,
Timeout = proplists:get_value(timeout, Opts, ?HTTPC_TIMEOUT),
receive
{http, {ReqId, stream_end, EndHs}} ->
FState1 = FState#fstate{
code = 200,
headers = EndHs ++ FState#fstate.headers
},
{ok, FState1};
{http, {ReqId, stream, Part}} ->
case append_data(Data, Part, FState#fstate.device) of
{ok, Data1} ->
Length1 = Length + size(Part),
FState1 = FState#fstate{
length = Length1,
data = Data1
},
case Length1 =< Max of
true ->
httpc:stream_next(HandlerPid),
fetch_stream_data(ReqId, HandlerPid, FState1);
false ->
httpc:cancel_request(ReqId),
FState2 = FState1#fstate{
code = 200
},
{ok, FState2}
end;
{error, _} = Error ->
httpc:cancel_request(ReqId),
Error
end;
{http, {ReqId, {error, socket_closed_remotely}}} ->
% Remote closed the connection, this can happen at the moment
% we received all data, then this error is received instead of
% the expected data.
% Return the data we received till now and pretend nothing is wrong.
FState1 = FState#fstate{
code = 200
},
{ok, FState1};
{http, {ReqId, {error, _} = Error}} ->
Error
after Timeout ->
httpc:cancel_request(ReqId),
{error, timeout}
end;
fetch_stream_data(ReqId, _HandlerPid, FState) ->
receive
{http, {ReqId, stream_end, EndHs}} ->
FState1 = FState#fstate{
code = 200,
headers = EndHs ++ FState#fstate.headers
},
{ok, FState1};
{http, _} ->
httpc:cancel_request(ReqId),
FState1 = FState#fstate{
code = 200
},
{ok, FState1}
after 100 ->
httpc:cancel_request(ReqId),
FState1 = FState#fstate{
code = 200
},
{ok, FState1}
end.
maybe_redirect(#fstate{ code = Code } = FState) when Code >= 200, Code =< 299 ->
{ok, FState};
maybe_redirect(#fstate{ code = 416, options = Opts, url = Url }) ->
% 416 Range Not Satisfiable - if this is the first request then it might
% be a picky server complaining that we requested beyond the size of the
% document.
Opts1 = proplists:delete(use_range, Opts),
{redirect, get, Url, [ {use_range, false} | Opts1 ]};
maybe_redirect(#fstate{ code = 303, headers = Hs, url = Url } = FState) ->
case proplists:get_value("location", Hs) of
undefined ->
{error, no_location_header};
Location ->
NewUrl = z_convert:to_list(z_url:abs_link(Location, Url)),
{redirect, get, NewUrl, FState#fstate.options}
end;
maybe_redirect(#fstate{ code = Code, headers = Hs, url = Url } = FState)
when Code =:= 301; Code =:= 302; Code =:= 303; Code =:= 307; Code =:= 308 ->
case proplists:get_value("location", Hs) of
undefined ->
{error, no_location_header};
Location ->
NewUrl = z_convert:to_list(z_url:abs_link(Location, Url)),
{redirect, FState#fstate.method, NewUrl, FState#fstate.options}
end;
maybe_redirect(FState) ->
{ok, FState}.
append_data(Data, <<>>, _Device) ->
{ok, Data};
append_data(Data, Part, undefined) ->
{ok, <<Data/binary, Part/binary>>};
append_data(Data, Part, OutDev) ->
case file:write(OutDev, Part) of
ok -> {ok, Data};
{error, _} = Error -> Error
end.
%% @doc Some servers (Spotify) deliver gzip encoded content, even when we ask for identity.
maybe_handle_content_encoding(#fstate{ length = Length, headers = Hs, data = Data } = FState)
when Length > 0, is_binary(Data), Data =/= <<>> ->
CE = proplists:get_value("content-encoding", Hs, "identity"),
handle_ce(CE, FState);
maybe_handle_content_encoding(FState) ->
FState.
handle_ce("gzip", #fstate{ length = Length, data = Data } = FState)
when Length > 0, is_binary(Data), Data =/= <<>> ->
% Decode partial gzip data
case partial_unzip(Data, FState#fstate.max) of
{ok, Data1} ->
FState1 = FState#fstate{
length = size(Data1),
data = Data1
},
FState1;
{error, _} ->
FState
end;
handle_ce(_ContentEncoding, FState) ->
FState.
partial_unzip(Compressed, MaxLength) ->
Z = zlib:open(),
zlib:inflateInit(Z, 16 + 15),
try
Uncompressed = unzip_loop(Z, <<>>, zlib:safeInflate(Z, Compressed), MaxLength),
{ok, Uncompressed}
catch
_:_ ->
{error, gunzip}
after
zlib:close(Z)
end.
unzip_loop(_Z, Acc, _, MaxLength) when size(Acc) >= MaxLength ->
Acc;
unzip_loop(Z, Acc, {continue, Output}, MaxLength) ->
Out1 = iolist_to_binary(Output),
Acc1 = <<Acc/binary, Out1/binary>>,
Next = try
zlib:safeInflate(Z, [])
catch
_:_ ->
{finished, <<>>}
end,
unzip_loop(Z, Acc1, Next, MaxLength);
unzip_loop(_Z, Acc, {finished, Output}, _MaxLength) ->
Out1 = iolist_to_binary(Output),
<<Acc/binary, Out1/binary>>.
%% @doc Flush any late results from previous requests
httpc_flush() ->
receive
{http, _} -> httpc_flush()
after 0 ->
ok
end.
%% @doc Some url shorteners return HTML+Javascript, except for simple text-only browsers
httpc_ua(Url) ->
case is_url_shortener(Url) of
true -> ?CURL_UA;
false -> ?HTTPC_UA
end.
is_url_shortener(Url) ->
case string:tokens(Url, "://") of
[_Proto, DomainPath | _] ->
is_url_shortener_1(DomainPath);
_ ->
false
end.
is_url_shortener_1("t.co/" ++ _) -> true;
is_url_shortener_1("bit.ly/" ++ _) -> true;
is_url_shortener_1("ow.ly/" ++ _) -> true;
is_url_shortener_1("goo.gl/" ++ _) -> true;
is_url_shortener_1("lnkd.in/" ++ _) -> true;
is_url_shortener_1("tinyurl.com/" ++ _) -> true;
is_url_shortener_1("j.mp/" ++ _) -> true;
is_url_shortener_1("fb.me/" ++ _) -> true;
is_url_shortener_1("wp.me/" ++ _) -> true;
is_url_shortener_1("gu.com/" ++ _) -> true;
is_url_shortener_1("nyti.ms/" ++ _) -> true;
is_url_shortener_1("s.vk.nl/" ++ _) -> true;
is_url_shortener_1(_) -> false.