%% @author Marc Worrell
%% @copyright 2014-2023 Marc Worrell
%% @doc Discover metadata about a URL.
%% Copyright 2014-2023 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
-module(z_url_metadata).
-author("Marc Worrell <marc@worrell.nl>").
% Public API: fetch metadata for a URL and read properties from the result.
-export([
fetch/1,
fetch/2,
html_meta/1,
p/2,
header/2,
filename/2
]).
% NOTE(review): presumably defines the #url_metadata{} record used throughout this module - confirm.
-include("../include/z_url_metadata.hrl").
% The metadata returned by fetch/1,2 and accepted by p/2 and header/2.
-type metadata() :: #url_metadata{}.
-export_type([ metadata/0 ]).
% Per default we fetch max 1MB of data to analyze
-define(FETCH_LENGTH, 1024*1024).
% Below this size an image is considered too small to be a representative image or icon
-define(IMG_SMALL_SIZE, 16).
%% @doc Fetch metadata information for the URL, without any extra fetch options.
-spec fetch(binary()|string()) -> {ok, metadata()} | {error, term()}.
fetch(Url) ->
    fetch(Url, []).

%% @doc Fetch metadata information for the URL. If the options do not define
%% max_length then a max_length of ?FETCH_LENGTH is added before fetching.
%% Errors from the partial fetch are returned unchanged.
-spec fetch(binary()|string(), z_url_fetch:options()) -> {ok, metadata()} | {error, term()}.
fetch(Url, Options) ->
    case z_url_fetch:fetch_partial(Url, with_max_length(Options)) of
        {error, _} = Error ->
            Error;
        {ok, {FinalUrl, Headers, _Size, Data}} ->
            {ok, partial_metadata(FinalUrl, Headers, Data)}
    end.

%% Ensure that the fetch options carry a max_length.
with_max_length(Options) ->
    case proplists:is_defined(max_length, Options) of
        false -> [ {max_length, ?FETCH_LENGTH} | Options ];
        true -> Options
    end.
%% @doc Fetch properties of the fetched metadata.
%%
%% The first argument is either one of the derived properties handled below
%% (mime, mime_options, final_url, url, content_length, headers, title,
%% summary, image, icon, tags, filename), a single raw metadata key, or a list
%% of raw metadata keys of which the first non-empty value is returned.
%%
%% Returns 'undefined' if nothing is found, except for 'tags' which returns
%% an empty list. The content_length property is an integer, headers and
%% mime_options are lists of tuples.
-spec p(atom() | binary() | list( atom() | binary() ), metadata()) ->
    list() | binary() | integer() | undefined.
p(mime, MD) ->
    MD#url_metadata.content_type;
p(mime_options, MD) ->
    MD#url_metadata.content_type_options;
p(final_url, MD) ->
    MD#url_metadata.final_url;
p(url, MD) ->
    % Prefer the URL the page announces for itself, fall back to the fetched URL
    case p1([<<"og:url">>, <<"twitter:url">>, canonical_url, short_url], MD) of
        undefined -> MD#url_metadata.final_url;
        PrefUrl -> z_url:abs_link(PrefUrl, MD#url_metadata.final_url)
    end;
p(content_length, MD) ->
    MD#url_metadata.content_length;
p(headers, MD) ->
    MD#url_metadata.headers;
p(title, MD) ->
    case p1([<<"og:title">>, <<"twitter:title">>, mtitle, h1, title], MD) of
        undefined -> p(filename, MD);
        Title -> Title
    end;
p(summary, MD) ->
    p1([<<"og:description">>, <<"twitter:description">>, description], MD);
p(image, MD) ->
    Meta = [<<"twitter:image:src">>, <<"twitter:image">>, <<"og:image">>],
    first_image_url(Meta ++ [image_nav, image], Meta ++ [image, image_nav], MD);
p(icon, MD) ->
    first_image_url(
        [image_nav, icon_touch, icon_shortcut, icon_fav],
        [icon_touch, image_nav, icon_shortcut, icon_fav],
        MD);
p(tags, MD) ->
    case p1([keywords], MD) of
        undefined ->
            % Check og tags? (youtube uses og:video:tag)
            [];
        KW ->
            Ks = [ z_string:trim(K) || K <- binary:split(KW, <<",">>, [global]) ],
            [ K || K <- Ks, K =/= <<>> ]
    end;
p(filename, MD) ->
    filename(MD#url_metadata.final_url, MD#url_metadata.headers);
p(Ks, MD) when is_list(Ks) ->
    p1(Ks, MD);
p(K, MD) ->
    p1([K], MD).

%% Find the image url for the image and icon properties. If the fetched
%% resource is itself an image then its own URL is returned. Otherwise the
%% first defined property is used, taken from IndexPs for index pages and from
%% PagePs for all other pages.
first_image_url(_IndexPs, _PagePs, #url_metadata{ content_type = <<"image/", _/binary>> } = MD) ->
    MD#url_metadata.final_url;
first_image_url(IndexPs, PagePs, MD) ->
    Ps = case MD#url_metadata.is_index_page of
        true -> IndexPs;
        false -> PagePs
    end,
    case p1(Ps, MD) of
        undefined -> undefined;
        ImgSrc -> z_url:abs_link(ImgSrc, MD#url_metadata.final_url)
    end.
%% @doc Look up a single header of the fetched resource. The name is converted
%% to a binary and matched exactly against the stored header names.
-spec header( binary() | string(), metadata() ) -> binary() | undefined.
header(H, #url_metadata{ headers = Hs }) ->
    Name = z_convert:to_binary(H),
    proplists:get_value(Name, Hs).
%% @doc Derive a filename for the resource. The filename option of the
%% content-disposition header is preferred, otherwise the last segment of the
%% URL path is used.
-spec filename(binary()|string(), list()) -> binary() | undefined.
filename(Url, Hs) ->
    Disposition = proplists:get_value(<<"content-disposition">>, Hs),
    case content_disp_filename(Disposition) of
        undefined -> basename(Url);
        FN -> z_convert:to_binary(FN)
    end.
%% ------------------------------------------------ Internal Functions ------------------------------------------------
%% Find the first property that is defined and not empty after trimming.
%% Values of link properties are made absolute and sanitized.
p1([], _MD) ->
    undefined;
p1([P|Ps], MD) ->
    p1_value(proplists:get_value(P, MD#url_metadata.metadata), P, Ps, MD).

%% Use the value of property P, or continue with the remaining properties
%% if the value is undefined or empty.
p1_value(undefined, _P, Ps, MD) ->
    p1(Ps, MD);
p1_value(Value, P, Ps, MD) ->
    case z_string:trim(Value) of
        <<>> ->
            p1(Ps, MD);
        Trimmed ->
            maybe_abs_link(is_link_property(P), Trimmed, MD#url_metadata.final_url)
    end.
%% Make the value an absolute, sanitized link if the first argument is true.
%% An empty link becomes 'undefined', non-link values are passed unchanged.
maybe_abs_link(true, Value, FinalUrl) when Value =/= <<>> ->
    z_html:sanitize_uri(z_url:abs_link(Value, FinalUrl));
maybe_abs_link(true, <<>>, _FinalUrl) ->
    undefined;
maybe_abs_link(false, Value, _FinalUrl) ->
    Value.
%% Check if the value of a metadata property is a link that must be made
%% absolute and sanitized.
%% NOTE(review): icon_nav is not produced by meta_link/4, whereas icon_touch
%% and icon_fav are produced but not listed here - confirm this is intended.
is_link_property(P) ->
    lists:member(P, [
        canonical_url,
        short_url,
        image_nav,
        image,
        icon_nav,
        icon_shortcut
    ]).
%% Extract the filename option from a content-disposition header value.
%% Returns 'undefined' if there is no header, no filename or an empty filename.
content_disp_filename(undefined) ->
    undefined;
content_disp_filename(Vs) ->
    {_Disp, Options} = parse_header(Vs),
    case proplists:get_value(<<"filename">>, Options) of
        <<>> -> undefined;
        FilenameOrUndefined -> FilenameOrUndefined
    end.
%% Return the percent-decoded last segment of the URL path.
%% Returns 'undefined' for data: URLs, for URLs that can not be parsed, and
%% for paths that are empty or end with a slash.
basename(<<"data:", _/binary>>) ->
    undefined;
basename("data:" ++ _) ->
    undefined;
basename(Url) ->
    case uri_string:parse( z_convert:to_binary(Url) ) of
        #{ path := Path } ->
            path_basename(Path);
        _ ->
            % uri_string:parse/1 returns an error tuple for invalid URIs
            undefined
    end.

%% Return the decoded last segment of a path, 'undefined' if there is none.
path_basename(<<>>) ->
    undefined;
path_basename(<<"/">>) ->
    undefined;
path_basename(Path) ->
    case lists:last( binary:split(Path, <<"/">>, [ global ]) ) of
        <<>> ->
            undefined;
        Basename ->
            % Perform percent-decode of the path, keep the raw
            % segment if it can not be decoded.
            try
                z_url:url_decode(Basename)
            catch
                _:_ ->
                    Basename
            end
    end.
%% ------------------------------------------------ From Mochiweb ------------------------------------------------
%% author Bob Ippolito <bob@mochimedia.com>
%% copyright 2007 Mochi Media, Inc.
%% @doc Parse a Content-Type like header, return the lowercased main value
%% and a property list of its options. Options without a name, without a
%% value or without a '=' are dropped.
-spec parse_header( binary() ) -> {binary(), [ {binary(), binary()} ]}.
parse_header(String) ->
    %% TODO: This is exactly as broken as Python's cgi module.
    %%       Should parse properly like mochiweb_cookies.
    [Type | Parts] = [z_string:trim(S) || S <- binary:split(String, <<";">>, [ global ])],
    Options = lists:filtermap(fun parse_header_option/1, Parts),
    {z_string:to_lower(Type), Options}.

%% Parse a single "name=value" option, for use with lists:filtermap/2.
parse_header_option(Part) ->
    case binary:split(Part, <<"=">>) of
        [<<>>, _] ->
            %% Skip anything with no name
            false;
        [_, <<>>] ->
            %% Skip anything with no value
            false;
        [_] ->
            false;
        [Name, Value] ->
            Key = z_string:to_lower(z_string:trim(Name)),
            {true, {Key, unquote_header(z_string:trim(Value))}}
    end.
%% Remove the surrounding double quotes from a header option value and resolve
%% backslash escapes. Values not starting with a double quote are unchanged.
unquote_header(<<$\", Quoted/binary>>) ->
    unquote_header(Quoted, <<>>);
unquote_header(Plain) ->
    Plain.

%% Copy the quoted value to the accumulator, dropping a closing quote at the
%% very end and taking any character after a backslash literally.
unquote_header(<<>>, Out) ->
    Out;
unquote_header(<<$\">>, Out) ->
    Out;
unquote_header(<<$\\, Escaped, Tail/binary>>, Out) ->
    unquote_header(Tail, <<Out/binary, Escaped>>);
unquote_header(<<Char, Tail/binary>>, Out) ->
    unquote_header(Tail, <<Out/binary, Char>>).
%% -------------------------------------- Analyze fetched data -----------------------------------------
% Parse state passed along while walking the parsed HTML. The in_nav flag is
% set to true by tag/3 for the children of nav, header, footer and aside
% elements, and of elements with a navigation-like class or id.
-record(ps, { in_nav = false }).
%% Build the #url_metadata{} record from the final URL, the response headers
%% and the (partial) response body. Header names and values are converted to
%% binaries. HTML properties are extracted from the body after the charset
%% conversion step; the unconverted body is kept in partial_data.
partial_metadata(Url, Hs, Data) ->
    BinHeaders = [
        {z_convert:to_binary(Name), z_convert:to_binary(Value)}
        || {Name, Value} <- Hs
    ],
    {Mime, MimeOptions} = content_type(BinHeaders),
    IsText = is_text(Mime, Data),
    IsHTML = IsText andalso is_html(Mime),
    Charset = proplists:get_value(<<"charset">>, MimeOptions),
    Utf8Data = maybe_convert_utf8(IsText, IsHTML, Charset, Data),
    #url_metadata{
        final_url = z_convert:to_binary(Url),
        content_type = Mime,
        content_type_options = MimeOptions,
        content_length = content_length(BinHeaders),
        metadata = html_meta(IsHTML, Utf8Data),
        is_index_page = is_index_page(Url),
        headers = BinHeaders,
        partial_data = Data
    }.
%% Check if the URL refers to the index page of a site: a URL without a query
%% part whose path is empty, "/", or an index/default document at the top level.
is_index_page(Url) ->
    case uri_string:parse( z_convert:to_binary(Url) ) of
        #{ query := _ } -> false;
        #{ path := Path } -> is_index_path(Path);
        _ -> false
    end.

%% Paths that are considered to be the index page.
is_index_path(<<>>) -> true;
is_index_path(<<"/">>) -> true;
is_index_path(<<"/index.", _/binary>>) -> true;
is_index_path(<<"/default.htm">>) -> true;
is_index_path(<<"/Default.htm">>) -> true;
is_index_path(<<"index.", _/binary>>) -> true;
is_index_path(<<"default.htm">>) -> true;
is_index_path(<<"Default.htm">>) -> true;
is_index_path(_) -> false.
%% @doc Extract the metadata properties from a (partial) HTML document.
%% Returns an empty list if the data could not be parsed.
html_meta(Data) ->
    html_meta(true, Data).

%% Only HTML data (first argument true) is parsed, anything else has no
%% metadata properties.
html_meta(false, _PartialData) ->
    [];
html_meta(true, PartialData) ->
    case parse(PartialData) of
        {error, _} ->
            [];
        {ok, Parsed} ->
            Found = html(Parsed, [], #ps{}),
            lists:reverse(Found)
    end.
%% Parse the partial HTML. The data is wrapped in a <partial> element before
%% it is handed to the HTML parser.
parse(PartialData) when is_binary(PartialData); is_list(PartialData) ->
    parse_html(iolist_to_binary([<<"<partial>">>, PartialData, <<"</partial>">>])).

%% Parse a HTML binary using z_html_parse.
parse_html(Html) ->
    z_html_parse:parse(Html).
%% Collect the metadata from a list of parsed elements or from a single
%% element. The found properties are prepended to the MD list.
html(Es, MD, P) when is_list(Es) ->
    {MD1, _} = lists:foldl(fun html_element/2, {MD, P}, Es),
    MD1;
html(Tag, MD, P) when is_tuple(Tag) ->
    {MD1, _} = tag(Tag, MD, P),
    MD1.

%% Handle a single element of an element list. Text, comments and
%% processing instructions carry no metadata and are skipped.
html_element(B, Acc) when is_binary(B) ->
    Acc;
html_element({comment, _}, Acc) ->
    % <!-- ... -->
    Acc;
html_element({pi, _Xml, _Attrs}, Acc) ->
    % <?xml version="1.0" encoding="UTF-8"?>
    Acc;
html_element(Tag, {MD, P}) ->
    tag(Tag, MD, P).
%% Extract the metadata from a single parsed element {Tag, Attrs, Children}.
%% Returns the extended metadata list and the parse state. Children of nav,
%% header, footer and aside elements, and of elements with a navigation-like
%% class or id, are handled with the in_nav flag set. Elements recognized as
%% advertisements are skipped completely.
tag({<<"html">>, As, Es}, MD, P) ->
    MD1 = case proplists:get_value(<<"lang">>, As) of
        undefined -> MD;
        Lang -> [{language, Lang} | MD]
    end,
    {html(Es, MD1, P), P};
tag({<<"meta">>, As, _}, MD, P) ->
    Name = z_string:to_lower(proplists:get_value(<<"name">>, As)),
    Property = proplists:get_value(<<"property">>, As),
    % The http-equiv keyword is case-insensitive and usually written as
    % "Content-Type", lowercase it so that it matches the names in meta_tag/3.
    HttpEquiv = z_string:to_lower(proplists:get_value(<<"http-equiv">>, As)),
    Value = proplists:get_value(<<"value">>, As),
    Content = proplists:get_value(<<"content">>, As, Value),
    case first([Name, Property, HttpEquiv]) of
        undefined ->
            % No name, check for <meta charset="...">
            case proplists:get_value(<<"charset">>, As) of
                undefined -> {MD, P};
                Charset -> {[{charset,Charset} | MD], P}
            end;
        Prop ->
            {meta_tag(Prop, Content, MD), P}
    end;
tag({<<"title">>, _As, Es}, MD, P) ->
    Text = z_string:trim(fetch_text(Es, <<>>)),
    {[{title, Text} | MD], P};
tag({<<"link">>, As, _}, MD, P) ->
    Name = z_string:to_lower(proplists:get_value(<<"rel">>, As)),
    Content = proplists:get_value(<<"href">>, As),
    {meta_link(Name, Content, As, MD), P};
tag({<<"img">>, As, _}, MD, P) ->
    case proplists:get_value(<<"src">>, As, <<>>) of
        <<>> ->
            {MD, P};
        Src ->
            case is_img_allowed(Src, As) of
                true ->
                    % Images inside navigation are kept apart from content images
                    case P#ps.in_nav of
                        true -> {[{image_nav, Src} | MD], P};
                        false -> {[{image, Src} | MD], P}
                    end;
                false ->
                    {MD, P}
            end
    end;
tag({<<"h1">>, _As, Es}, MD, #ps{in_nav=false} = P) ->
    % Only the first h1 outside the navigation is used
    case proplists:is_defined(h1, MD) of
        false ->
            Text = z_string:trim(fetch_text(Es, <<>>)),
            {[{h1, Text} | MD], P};
        true ->
            {MD, P}
    end;
tag({<<"h1">>, _As, _Es}, MD, P) ->
    {MD, P};
tag({Nav, _As, Es}, MD, P)
    when Nav =:= <<"nav">>;
         Nav =:= <<"header">>;
         Nav =:= <<"footer">>;
         Nav =:= <<"aside">> ->
    {html(Es, MD, P#ps{in_nav=true}), P};
tag({_Tag, As, Es}, MD, P) ->
    Cs = split_class(proplists:get_value(<<"class">>, As)),
    Id = proplists:get_value(<<"id">>, As),
    case is_ads(Id, Cs) of
        true ->
            {MD, P};
        false ->
            InNav = P#ps.in_nav orelse has_nav_class(Cs) orelse is_topbar_id(Id),
            {html(Es, MD, P#ps{in_nav = InNav}), P}
    end.
%% Add the content of a recognized meta tag to the metadata list.
%% Unknown meta tags are ignored.
meta_tag(Name, Content, MD) ->
    case meta_key(Name) of
        undefined -> MD;
        Key -> [{Key, Content} | MD]
    end.

%% Map the name of a meta tag to its metadata key. The og: and twitter:
%% names are used as-is, 'undefined' is returned for unknown names.
meta_key(<<"og:", _/binary>> = OG) -> OG;
meta_key(<<"twitter:", _/binary>> = Tw) -> Tw;
meta_key(<<"title">>) -> mtitle;
meta_key(<<"keywords">>) -> keywords;
meta_key(<<"description">>) -> description;
meta_key(<<"author">>) -> author;
meta_key(<<"thumbnail">>) -> thumbnail;
meta_key(<<"content-type">>) -> content_type;
meta_key(_Name) -> undefined.
%% Add the href of a recognized link tag to the metadata list.
%% Unknown link relations are ignored.
meta_link(Rel, Href, As, MD) ->
    case link_key(Rel, As) of
        undefined -> MD;
        Key -> [{Key, Href} | MD]
    end.

%% Map the rel of a link tag to its metadata key, 'undefined' if the link is
%% not used. Icon links with a mask attribute are not used.
link_key(<<"canonical">>, _As) -> canonical_url;
link_key(<<"shortlink">>, _As) -> short_url;
link_key(<<"shorturl">>, _As) -> short_url;
link_key(<<"icon">>, As) ->
    case proplists:is_defined(<<"mask">>, As) of
        true -> undefined;
        false -> icon_fav
    end;
link_key(<<"shortcut icon">>, _As) -> icon_shortcut;
link_key(<<"apple-touch-icon">>, _As) -> icon_touch;
link_key(_Rel, _As) -> undefined.
%% Split the value of a class attribute on spaces, an absent attribute has
%% no classes.
split_class(Class) ->
    case Class of
        undefined -> [];
        _ -> binary:split(Class, <<" ">>, [global])
    end.
%% Check if any of the classes marks an element as navigation.
has_nav_class(Cs) ->
    [ C || C <- Cs, is_nav_class(C) ] =/= [].

%% Classes starting with "nav" or "menu" are considered navigation.
is_nav_class(Class) ->
    case Class of
        <<"nav", _/binary>> -> true;
        <<"menu", _/binary>> -> true;
        _ -> false
    end.
%% Check if the element id is "top" or starts with "header".
is_topbar_id(Id) ->
    case Id of
        <<"top">> -> true;
        <<"header", _/binary>> -> true;
        _ -> false
    end.
%% Check if an element is an advertisement, judged by its id or by any of
%% its classes.
is_ads(Id, _Cs) when Id =:= <<"ad">>; Id =:= <<"ads">> ->
    true;
is_ads(_Id, Cs) ->
    lists:any(fun is_ad_class/1, Cs).

%% Classes that mark an element as an advertisement.
is_ad_class(Class) ->
    lists:member(Class, [<<"ads">>, <<"ad">>, <<"deckad">>]).
%% Append all text found in the parsed element(s) to the accumulator.
%% Comments are skipped, the text of nested elements is included.
fetch_text(Es, Acc) when is_list(Es) ->
    lists:foldl(fun fetch_text/2, Acc, Es);
fetch_text(Text, Acc) when is_binary(Text) ->
    <<Acc/binary, Text/binary>>;
fetch_text({comment, _}, Acc) ->
    Acc;
fetch_text({_Tag, _As, Es}, Acc) ->
    fetch_text(Es, Acc).
%% Return the first element that is neither 'undefined' nor an empty binary,
%% 'undefined' if there is no such element.
first([X|Rest]) when X =:= undefined; X =:= <<>> ->
    first(Rest);
first([X|_]) ->
    X;
first([]) ->
    undefined.
%% Check if the content type is a HTML or XHTML content type.
is_html(CT) ->
    case CT of
        <<"text/html">> -> true;
        <<"application/xhtml">> -> true;
        <<"application/xhtml+", _/binary>> -> true;
        _ -> false
    end.
%% Some servers send us 'gzip', even when we ask for 'identity'.
%% Gzip data starts with the magic bytes 31, 139 (16#1f, 16#8b) followed by
%% the compression method. Such data is never text, whatever the content type.
is_text(_CT, <<31, 139, Method, _/binary>>) when Method =< 8 ->
    false;
is_text(CT, _Data) ->
    is_text(CT).

%% Check if the content type is a textual content type.
is_text(<<"text/", _/binary>>) -> true;
is_text(<<"application/javascript">>) -> true;
is_text(<<"application/xhtml">>) -> true;
is_text(<<"application/xhtml+", _/binary>>) -> true;
is_text(_) -> false.
% Suppress tracking pixels and small images. An image is allowed if it has
% a source, is not small and its URL does not contain a blocklisted part.
is_img_allowed(<<>>, _As) ->
    false;
is_img_allowed(Url, As) ->
    case is_img_small(As) of
        true -> false;
        false -> binary:match(Url, img_blocklist()) =:= nomatch
    end.
% Images are considered small if their width or their height attribute is
% a small size according to is_small_size/1.
is_img_small(As) ->
    lists:any(
        fun(Attr) -> is_small_size(proplists:get_value(Attr, As)) end,
        [<<"width">>, <<"height">>]).
% Check if the value of a width or height attribute is smaller than
% ?IMG_SMALL_SIZE. Absent, empty or non-numeric sizes are not small.
is_small_size(undefined) -> false;
is_small_size(<<>>) -> false;
is_small_size(Size) ->
    try z_convert:to_integer(Size) of
        % Strictly smaller: an image of exactly ?IMG_SMALL_SIZE is still usable
        Sz -> Sz < ?IMG_SMALL_SIZE
    catch
        % Sizes like "100%" can not be converted to an integer
        _:_ -> false
    end.
% Parts of image URLs that mark the image as a tracker or advertisement.
% Images whose URL contains any of these parts are suppressed, add new
% parts to this list.
img_blocklist() ->
    [
        <<"//www.facebook.com/tr?">>,
        <<"//www.googleadservices.com/pagead/">>,
        <<"futuresimple.com/api/v1/">>,
        <<"tracking.cirrusinsight.com">>,
        <<"list-manage.com/track">>,
        <<"mjt.lu/oo">>,
        <<"/1x1/">>
    ].
% Convert text data to UTF-8 if it is in another character set. The character
% set is taken from the content-type header and, for HTML, from the meta tags.
% The data is returned unchanged if it is not text, is already UTF-8, or if
% the conversion fails or iconv is not available.
% Add nowarn because the iconv module is optional.
-dialyzer({[ nowarn_function ], maybe_convert_utf8/4}).
maybe_convert_utf8(true, IsHtml, Charset, Html) ->
    CS1 = z_convert:to_list(
                z_string:to_lower(
                    html_charset(IsHtml, Charset, Html))),
    case is_utf8(CS1) of
        true ->
            Html;
        false ->
            try
                % iconv:open/2 takes the target encoding first, then the
                % source encoding (the same order as POSIX iconv_open).
                case iconv:open("UTF-8", CS1) of
                    {ok, C} ->
                        try iconv:conv(C, Html) of
                            {ok, Html1} -> Html1;
                            {error, _} -> Html
                        after
                            % Always release the converter, also on errors
                            iconv:close(C)
                        end;
                    {error, _} ->
                        Html
                end
            catch
                % The iconv module is optional, fall back to the original data
                _:_ -> Html
            end
    end;
maybe_convert_utf8(false, _IsHtml, _Charset, Data) ->
    Data.
%% Check if the (lowercased, string) character set name is UTF-8.
is_utf8(Charset) ->
    lists:member(Charset, ["utf-8", "utf8"]).
%% Determine the character set of the text. Without a character set from the
%% headers iso-8859-1 is assumed. For HTML a charset found in a meta tag
%% overrules the given character set.
html_charset(IsHtml, HeaderCharset, Text) ->
    Charset = case HeaderCharset of
        undefined -> <<"iso-8859-1">>;
        _ -> HeaderCharset
    end,
    case IsHtml of
        true -> meta_charset(Charset, Text);
        false -> Charset
    end.
%% Find the charset mentioned in the first meta tag that has a charset.
%% Returns the given default if no such meta tag is found.
meta_charset(Ch, Html) ->
    Pattern = "<[mM][eE][tT][aA][^>]*[cC][hH][aA][rR][sS][eE][tT]\\s*=\\s*[\"']?([A-Za-z0-9_-]+)",
    Found = re:run(Html, Pattern, [{capture,all_but_first,binary}]),
    case Found of
        {match, [CharSet|_]} -> CharSet;
        _ -> Ch
    end.
%% Return the mime type and the options of the content-type header.
%% Without a content-type header the data is application/octet-stream.
content_type(Hs) ->
    parse_content_type(proplists:get_value(<<"content-type">>, Hs)).

%% Split the value of the content-type header into its mime type and options.
parse_content_type(undefined) ->
    {<<"application/octet-stream">>, []};
parse_content_type(CT) ->
    {Mime, Options} = parse_header(CT),
    {z_convert:to_binary(Mime), Options}.
%% Return the total length of the resource. The total mentioned in the
%% content-range header is preferred above the content-length header.
%% Returns 'undefined' if there is no length or if it can not be parsed.
content_length(Hs) ->
    try
        total_length(proplists:get_value(<<"content-range">>, Hs), Hs)
    catch
        _:_ -> undefined
    end.

%% Fetch the length from the content-range value, or from the content-length
%% header if there is no content-range. Crashes on unexpected values.
total_length(undefined, Hs) ->
    case proplists:get_value(<<"content-length">>, Hs) of
        undefined -> undefined;
        N -> binary_to_integer(N)
    end;
total_length(<<"bytes ", Range/binary>>, _Hs) ->
    % The total is the part after the last slash: "bytes 0-1023/146515"
    Total = lists:last(binary:split(Range, <<"/">>, [global])),
    binary_to_integer(Total).
%%
%% Tests
%%
-ifdef(TEST).

-include_lib("eunit/include/eunit.hrl").

%% A complete HTML document with a text/html content type gives the final
%% url, the content type and the title.
simple_partial_metadata_test() ->
    MD = partial_metadata(
        "http://example.org",
        [{"content-type", "text/html"}],
        <<"<html><head><title>Example</title><body></body></html>">>),
    ?assertEqual(<<"http://example.org">>, MD#url_metadata.final_url),
    ?assertEqual(<<"text/html">>, MD#url_metadata.content_type),
    ?assertEqual([{title, <<"Example">>}], MD#url_metadata.metadata),
    ok.

%% The title is extracted from a complete HTML document.
simple_html_meta_test() ->
    Html = <<"<html><head><title>Example</title><body></body></html>">>,
    ?assertEqual([{title, <<"Example">>}], html_meta(Html)),
    ok.

%% A head element that is never closed is still analyzed.
partial_unbalanced_tags_html_meta_test() ->
    Html = <<"<head><meta name=\"description\" content=\"Example Content\"><title>Example</title>">>,
    Expected = [
        {description, <<"Example Content">>},
        {title, <<"Example">>}
    ],
    ?assertEqual(Expected, html_meta(Html)),
    ok.

%% Tags without any surrounding html or head element are analyzed.
partial_no_surrounding_tags_html_meta_test() ->
    Html = <<"<meta name=\"description\" content=\"Example Content\"><title>Example</title>">>,
    Expected = [
        {description, <<"Example Content">>},
        {title, <<"Example">>}
    ],
    ?assertEqual(Expected, html_meta(Html)),
    ok.

%% Ampersands in attributes and in text are kept as-is.
partial_ampersant_in_html_meta_test() ->
    Html = <<"<meta name=\"description\" content=\"Example & Stuff\"><title>Foo & Co</title>">>,
    Expected = [
        {description, <<"Example & Stuff">>},
        {title, <<"Foo & Co">>}
    ],
    ?assertEqual(Expected, html_meta(Html)),
    ok.

-endif.