%% @author Marc Worrell <marc@worrell.nl>
%% @copyright 2009-2025 Marc Worrell
%% @doc Utility functions sanitizing, escaping and filtering HTML, and sanitize property lists/maps.
%%
%% Utiliy functions to:
%%
%% - Sanitize HTML texts
%% - Truncate HTML, whilst ensuring that opened tags are properly closed.
%% - Escape and unescape HTML texts
%% - Newline to br-tag mapping (and vice versa)
%% - Stripping HTML tags from a text
%% - Sanitize nested property lists/maps, using type hints from the property names.
%% - Make relative URLs absolute based on a base URL.
%% @end
%% Copyright 2009-2025 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
-module(z_html).
-author("Marc Worrell <marc@worrell.nl").
-export([
escape_props/1,
escape_props/2,
escape/1,
escape_props_check/1,
escape_props_check/2,
escape_check/1,
unescape/1,
strip/1,
strip/2,
truncate/2,
truncate/3,
sanitize/1,
sanitize/2,
sanitize/4,
noscript/1,
noscript/2,
sanitize_uri/1,
sanitize_uri/2,
escape_link/1,
nl2br/1,
br2nl/1,
scrape_link_elements/1,
ensure_escaped_amp/1,
abs_links/2
]).
-type text() :: iodata() | {trans, list( {atom(), binary()} )}.
-type unsafe_text() :: iodata()
| {trans, list( {atom(), iodata()} )}
| {trans, list( {binary(), iodata()} )}
| {trans, map()}.
-type maybe_text() :: undefined | text().
-type maybe_unsafe_text() :: undefined | unsafe_text().
-type maybe_binary() :: undefined | binary().
-type maybe_iodata() :: undefined | iodata().
-type sanitize_options() :: [ sanitize_option() ].
-type sanitize_option() :: {elt_extra, list( binary() )}
| {attr_extra, list( binary() )}
| {element, function()}.
-export_type([
text/0,
unsafe_text/0,
maybe_text/0,
maybe_unsafe_text/0,
maybe_binary/0,
maybe_iodata/0,
sanitize_options/0,
sanitize_option/0
]).
% Used by z_svg.erl
% @todo: move this to separate erlang module
-export([
flatten_attr/1,
sanitize_attr_value/2,
escape_html_text/2,
escape_html_comment/2
]).
%% @doc Escape all properties used for an update statement. Only leaves the body property intact.
-spec escape_props(list() | map()) -> list() | map().
escape_props(Props) ->
escape_props(Props, []).
-spec escape_props(list() | map(), Options::list()) -> list() | map().
escape_props(Props, Options) when is_list(Props) ->
lists:map(
fun({P, V}) ->
V1 = escape_props1(z_convert:to_binary(P), V, Options),
{P, V1}
end,
Props);
escape_props(Props, Options) when is_map(Props) ->
maps:fold(
fun(K, V, Acc) ->
K1 = z_convert:to_binary(K),
Acc#{ K1 => escape_props1(K1, V, Options)}
end,
#{},
Props).
escape_props1(_K, null, _Options) ->
null;
escape_props1(_K, undefined, _Options) ->
undefined;
escape_props1(_K, V, _Options) when is_number(V); is_boolean(V) ->
V;
escape_props1(K, V, Options) when is_atom(V) ->
V1 = atom_to_binary(V, utf8),
case escape_props1(K, V1, Options) of
V1 -> V;
_ -> undefined
end;
escape_props1(<<"body", _/binary>>, V, Options) ->
sanitize(V, Options);
escape_props1(<<"summary">>, Summary, _Options) ->
nl2br(escape_value(Summary));
escape_props1(<<"blocks">>, V, Options) when is_list(V) ->
[ escape_props(L, Options) || L <- V ];
escape_props1(<<"website">>, V, _Options) ->
escape_value(sanitize_uri(V));
escape_props1(<<"@id">>, V, _Options) ->
escape_value(sanitize_uri(V));
escape_props1(<<"is_a", _/binary>>, V, Options) ->
sanitize_list([], V, Options);
escape_props1(<<"is_", _/binary>>, V, _Options) ->
z_convert:to_bool(V);
escape_props1(K, V, Options) ->
[Type|Ks] = lists:reverse(binary:split(K, <<"_">>, [global])),
sanitize_type(Type, Ks, V, Options).
sanitize_type(<<"html">>, _Ks, V, Options) -> sanitize(V, Options);
sanitize_type(<<"uri">>, _Ks, V, _Options) -> escape_value(sanitize_uri(V));
sanitize_type(<<"url">>, _Ks, V, _Options) -> escape_value(sanitize_uri(V));
sanitize_type(<<"list">>, Ks, V, Options) -> sanitize_list(Ks, V, Options);
sanitize_type(<<"int">>, _Ks, V, _Options) -> sanitize_int(V);
sanitize_type(<<"id">>, _Ks, undefined, _Options) -> undefined;
sanitize_type(<<"id">>, _Ks, <<>>, _Options) -> undefined;
sanitize_type(<<"id">>, _Ks, V, _Options) when V =/= undefined ->
try
z_convert:to_integer(V)
catch
_:_ -> escape_value(V)
end;
sanitize_type(<<"unsafe">>, _Ks, V, _Options) -> V;
sanitize_type(_, _Ks, V, Options) when is_map(V) -> escape_props(V, Options);
sanitize_type(_, Ks, V, Options) when is_list(V) -> sanitize_list(Ks, V, Options);
sanitize_type(_, _Ks, V, _Options) -> escape_value(V).
sanitize_list(Ks, L, Options) when is_list(L) ->
lists:map(
fun
({trans, Tr} = V) when is_list(Tr) ->
escape(V);
({P, V}) ->
P1 = z_convert:to_binary(P),
V1 = escape_props1(P1, V, Options),
{P1, V1};
(V) when is_list(V), Ks =:= [] ->
escape_props(V, Options);
(V) when is_map(V) ->
escape_props(V, Options);
(V) when Ks =:= [] ->
escape_value(V);
(V) ->
[Type|Ks1] = Ks,
sanitize_type(Type, Ks1, V, Options)
end,
L);
sanitize_list(Ks, Map, Options) when is_map(Map) ->
sanitize_list(Ks, maps:to_list(Map), Options);
sanitize_list(_Ks, undefined, _Options) ->
undefined;
sanitize_list(Ks, V, Options) ->
sanitize_list(Ks, [V], Options).
sanitize_int(V) ->
try
z_convert:to_integer(V)
catch
_:_ -> undefined
end.
escape_value(undefined) -> undefined;
escape_value(null) -> null;
escape_value(V) when is_boolean(V) -> V;
escape_value(V) when is_number(V) -> V;
escape_value(V) when is_atom(V) ->
V1 = atom_to_binary(V, utf8),
case escape_value(V1) of
V1 -> V;
_ -> undefined
end;
escape_value({trans, _Ts} = Tr) ->
escape(Tr);
escape_value(V) when is_list(V) ->
try
escape_value(unicode:characters_to_binary(V))
catch _:_ ->
V
end;
escape_value(B) when is_binary(B) ->
escape(B);
escape_value(V) ->
V.
%% @doc Checks if all properties are properly escaped
-spec escape_props_check(list() | map()) -> list() | map().
escape_props_check(Props) ->
escape_props_check(Props, []).
-spec escape_props_check(list() | map(), Options::list()) -> list() | map().
escape_props_check(Props, Options) when is_list(Props) ->
lists:map(
fun
({P, V}) ->
V1 = escape_props_check1(z_convert:to_binary(P), V, Options),
{P, V1};
(V) when is_list(V); is_map(V)->
escape_props_check(V, Options);
(V) ->
escape_value_check(V)
end,
Props);
escape_props_check(Props, Options) when is_map(Props) ->
maps:map(
fun(P, V) ->
escape_props_check1(z_convert:to_binary(P), V, Options)
end,
Props).
escape_props_check1(_K, null, _Options) ->
null;
escape_props_check1(_K, undefined, _Options) ->
undefined;
escape_props_check1(_K, V, _Options) when is_number(V); is_boolean(V) ->
V;
escape_props_check1(K, V, Options) when is_atom(V) ->
V1 = atom_to_binary(V, utf8),
case escape_props_check1(K, V1, Options) of
V1 -> V;
_ -> undefined
end;
escape_props_check1(<<"body", _/binary>>, V, Options) ->
sanitize(V, Options);
escape_props_check1(<<"summary">>, Summary, _Options) ->
nl2br(escape_check(br2nl(Summary)));
escape_props_check1(<<"blocks">>, V, Options) when is_list(V) ->
[ escape_props_check(L, Options) || L <- V ];
escape_props_check1(<<"website">>, V, _Options) ->
escape_value(sanitize_uri(unescape(V)));
escape_props_check1(<<"@id">>, V, _Options) ->
escape_value(sanitize_uri(unescape(V)));
escape_props_check1(<<"is_a">>, L, Options) when is_list(L) ->
sanitize_list_check(L, Options);
escape_props_check1(<<"is_", _/binary>>, V, _Options) ->
z_convert:to_bool(V);
escape_props_check1(K, V, Options) ->
Type = lists:last(binary:split(K, <<"_">>, [global])),
sanitize_type_check(Type, V, Options).
sanitize_type_check(<<"html">>, V, Options) -> sanitize(V, Options);
sanitize_type_check(<<"uri">>, V, _Options) -> escape_value(sanitize_uri(unescape(V)));
sanitize_type_check(<<"url">>, V, _Options) -> escape_value(sanitize_uri(unescape(V)));
sanitize_type_check(<<"list">>, V, Options) -> sanitize_list_check(V, Options);
sanitize_type_check(<<"int">>, V, _Options) -> sanitize_int(V);
sanitize_type_check(<<"unsafe">>, V, _Options) -> V;
sanitize_type_check(_, V, Options) when is_map(V) -> escape_props_check(V, Options);
sanitize_type_check(_, V, Options) when is_list(V) -> escape_props_check(V, Options);
sanitize_type_check(_, V, _Options) -> escape_value_check(V).
sanitize_list_check(L, Options) when is_list(L) ->
lists:map(
fun
({trans, Tr} = V) when is_list(Tr) ->
escape_check(V);
({P, V}) ->
P1 = z_convert:to_binary(P),
V1 = escape_props_check1(P1, V, Options),
{P1, V1};
(V) when is_list(V); is_map(V)->
escape_props_check(V, Options);
(V) ->
escape_value_check(V)
end,
L);
sanitize_list_check(Map, Options) when is_map(Map) ->
sanitize_list_check(maps:to_list(Map), Options);
sanitize_list_check(undefined, _Options) ->
undefined;
sanitize_list_check(V, Options) ->
sanitize_list_check([V], Options).
escape_value_check(undefined) -> undefined;
escape_value_check(null) -> null;
escape_value_check(V) when is_boolean(V) -> V;
escape_value_check(V) when is_number(V) -> V;
escape_value_check(V) when is_atom(V) ->
V1 = atom_to_binary(V, utf8),
case escape_check(V1) of
V1 -> V;
_ -> undefined
end;
escape_value_check({trans, _Ts} = Tr) ->
escape_check(Tr);
escape_value_check(V) when is_list(V) ->
try
escape_check(unicode:characters_to_binary(V))
catch _:_ ->
V
end;
escape_value_check(B) when is_binary(B) ->
escape_check(B);
escape_value_check(V) ->
V.
%% @doc Escape a string so that it is valid within HTML/ XML.
-spec escape( maybe_unsafe_text() ) -> maybe_text().
escape({trans, Tr}) when is_list(Tr) ->
Tr1 = lists:filtermap(
fun
({Lang, V}) when is_atom(Lang) ->
V1 = z_convert:to_binary(V),
{true, {Lang, escape(V1)}};
({Lang, V}) when is_binary(Lang) ->
try
Lang1 = binary_to_existing_atom(Lang, utf8),
V1 = z_convert:to_binary(V),
{true, {Lang1, escape(V1)}}
catch _:_ ->
false
end;
(_) ->
false
end,
Tr),
{trans, Tr1};
escape({trans, Tr}) when is_map(Tr) ->
escape({trans, maps:to_list(Tr)});
escape({trans, _}) ->
<<>>;
escape(undefined) ->
undefined;
escape(<<>>) ->
<<>>;
escape([]) ->
<<>>;
escape(L) when is_list(L) ->
escape(list_to_binary(L));
escape(B) when is_binary(B) ->
escape1(B, <<>>).
escape1(<<>>, Acc) ->
Acc;
escape1(<<"€", T/binary>>, Acc) ->
escape1(T, <<Acc/binary, "€">>);
escape1(<<$&, T/binary>>, Acc) ->
escape1(T, <<Acc/binary, "&">>);
escape1(<<$<, T/binary>>, Acc) ->
escape1(T, <<Acc/binary, "<">>);
escape1(<<$>, T/binary>>, Acc) ->
escape1(T, <<Acc/binary, ">">>);
escape1(<<$", T/binary>>, Acc) ->
escape1(T, <<Acc/binary, """>>);
escape1(<<$', T/binary>>, Acc) ->
escape1(T, <<Acc/binary, "'">>);
escape1(<<C, T/binary>>, Acc) ->
escape1(T, <<Acc/binary, C>>).
%% @doc Ensure that a string is escaped so that it is valid within HTML/ XML.
-spec escape_check( maybe_unsafe_text() ) -> maybe_text().
escape_check({trans, Tr}) when is_list(Tr) ->
Tr1 = lists:filtermap(
fun
({Lang, V}) when is_atom(Lang) ->
V1 = z_convert:to_binary(V),
{true, {Lang, escape_check(V1)}};
({Lang, V}) when is_binary(Lang) ->
try
Lang1 = binary_to_existing_atom(Lang, utf8),
V1 = z_convert:to_binary(V),
{true, {Lang1, escape_check(V1)}}
catch _:_ ->
false
end;
(_) ->
false
end,
Tr),
{trans, Tr1};
escape_check({trans, Tr}) when is_map(Tr) ->
escape_check({trans, maps:to_list(Tr)});
escape_check({trans, _}) ->
<<>>;
escape_check(undefined) ->
undefined;
escape_check(<<>>) ->
<<>>;
escape_check([]) ->
<<>>;
escape_check(L) when is_list(L) ->
escape_check1(iolist_to_binary(L), <<>>);
escape_check(B) when is_binary(B) ->
escape_check1(B, <<>>);
escape_check(Other) ->
Other.
escape_check1(<<>>, Acc) ->
Acc;
escape_check1(<<"€", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "€">>);
escape_check1(<<"&", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "&">>);
escape_check1(<<"<", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "<">>);
escape_check1(<<">", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, ">">>);
escape_check1(<<""", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, """>>);
escape_check1(<<"'", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "'">>);
escape_check1(<<"'", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "'">>);
escape_check1(<<"/", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "/">>);
escape_check1(<<$&, Rest/binary>>, Acc) ->
case try_amp(Rest, in_amp, <<>>) of
{Amp,Rest1} -> escape_check1(Rest1, <<Acc/binary, $&, Amp/binary>>);
false -> escape_check1(Rest, <<Acc/binary, "&">>)
end;
escape_check1(<<$<, T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "<">>);
escape_check1(<<$>, T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, ">">>);
escape_check1(<<$", T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, """>>);
escape_check1(<<$', T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, "'">>);
escape_check1(<<C, T/binary>>, Acc) ->
escape_check1(T, <<Acc/binary, C>>).
%% @doc Unescape - reverses the effect of escape.
-spec unescape( maybe_text() ) -> maybe_text().
unescape({trans, Tr}) ->
{trans, [{Lang, unescape(V)} || {Lang,V} <- Tr]};
unescape(undefined) ->
undefined;
unescape(<<>>) ->
<<>>;
unescape([]) ->
<<>>;
unescape(L) when is_list(L) ->
unescape(list_to_binary(L));
unescape(B) when is_binary(B) ->
unescape(B, <<>>).
unescape(<<>>, Acc) ->
Acc;
unescape(<<"&", Rest/binary>>, Acc) ->
unescape_in_charref(Rest, <<>>, Acc);
unescape(<<C, T/binary>>, Acc) ->
unescape(T, <<Acc/binary, C>>).
unescape_in_charref(<<>>, CharAcc, ContAcc) ->
<<ContAcc/binary, $&, CharAcc/binary>>; %% premature end of string; keep.
unescape_in_charref(<<$;, Rest/binary>>, CharAcc, ContAcc) ->
case z_html_charref:charref(CharAcc) of
undefined ->
%% keep original code
unescape(Rest, <<ContAcc/binary, $&, CharAcc/binary, $;>>);
Ch ->
%% replace the real char
ChBin = unicode:characters_to_binary([Ch]),
unescape(Rest, <<ContAcc/binary, ChBin/binary>>)
end;
unescape_in_charref(<<Ch/integer, Rest/binary>>, CharAcc, ContAcc) ->
unescape_in_charref(Rest, <<CharAcc/binary, Ch>>, ContAcc).
%% @doc Escape a text. Expands any urls to links with a nofollow attribute.
-spec escape_link( maybe_iodata() ) -> maybe_binary().
escape_link(undefined) ->
undefined;
escape_link(<<>>) ->
<<>>;
escape_link("") ->
<<>>;
escape_link(Text) when is_binary(Text) ->
case re:run(Text,
"(("
"mailto:"
"[-a-zA-Z0-9_\\.\\(\\)\\+=%~]+"
"@"
"|"
"ftp://|http://|https://|www\\."
")"
"[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)+"
"(/[-/_a-zA-Z0-9\\.:\\+%;~]*)?"
"(\\?[-_a-zA-Z0-9\\.:\\+%=&;\\$/~]*)?"
"(#[-_a-zA-Z0-9\\.:\\+%=&;\\$/~]*)?)",
[{capture, first}, global])
of
{match, Matches} ->
Matches1 = [ hd(M) || M <- Matches ],
Parts = split_in_links(lists:reverse(Matches1), Text, []),
Linked = lists:map(fun make_link/1, Parts),
nl2br(iolist_to_binary(Linked));
nomatch ->
nl2br(escape(Text))
end;
escape_link(Text) ->
escape_link(iolist_to_binary(Text)).
make_link(B) when is_binary(B) ->
escape(B);
make_link({link, Link}) ->
NoScript = noscript(Link, false),
LinkText = escape(NoScript),
LinkUrl = escape(ensure_protocol(NoScript)),
<<
"<a href=\"",
LinkUrl/binary,
"\" rel=\"noopener nofollow noreferrer\">",
LinkText/binary,
"</a>"
>>.
split_in_links([], Text, Acc) ->
[ Text | Acc ];
split_in_links([ {Offset, Len}|Matches ], Text, Acc) ->
<<Before:Offset/binary, Link:Len/binary, Rest/binary>> = Text,
Acc1 = [ {link, Link}, Rest | Acc ],
split_in_links(Matches, Before, Acc1).
ensure_protocol(<<>>) -> <<>>;
ensure_protocol(<<"#", _/binary>> = Link) -> Link;
ensure_protocol(<<"?", _/binary>> = Link) -> Link;
ensure_protocol(<<"/", _/binary>> = Link) -> Link;
ensure_protocol(<<"://", _/binary>> = Link) -> <<"http", Link/binary>>;
ensure_protocol(<<"http:", _/binary>> = Link) -> Link;
ensure_protocol(<<"https:", _/binary>> = Link) -> Link;
ensure_protocol(<<"data:", _/binary>> = Link) -> Link;
ensure_protocol(<<"urn:", _/binary>> = Link) -> Link;
ensure_protocol(<<"ftp:", _/binary>> = Link) -> Link;
ensure_protocol(<<"mailto:", Rest/binary>>) -> <<"mailto:", (z_string:trim(Rest))/binary>>;
ensure_protocol(<<"www.", _/binary>> = Link) -> <<"https://", Link/binary>>;
ensure_protocol(Link) when is_binary(Link) ->
case binary:match(Link, <<":">>) of
nomatch ->
% If the first path element looks like a domain name then
% make it a https: link. Otherwise add a '/' in front.
[First|_] = binary:split(Link, <<"/">>),
case binary:match(First, <<".">>) of
nomatch -> <<$/, Link/binary>>;
_Match -> <<"https://", Link/binary>>
end;
_ ->
Link
end.
%% @doc Ensure that an uri is (quite) harmless by removing any script reference
-spec sanitize_uri( maybe_iodata() ) -> maybe_binary().
sanitize_uri(MaybeUrl) ->
sanitize_uri(MaybeUrl, true).
%% @doc Ensure that an uri is (quite) harmless by removing any script reference. Option
%% to allow 'data:' urls. Data urls are only passed if their mime type is denotes an image,
%% video, or plain-text. SVGs will be sanitized.
-spec sanitize_uri(MaybeUrl, IsAllowData) -> maybe_binary() when
MaybeUrl :: maybe_iodata(),
IsAllowData :: boolean().
sanitize_uri(undefined, _IsAllowData) ->
undefined;
sanitize_uri(<<>>, _IsAllowData) ->
<<>>;
sanitize_uri([], _IsAllowData) ->
<<>>;
sanitize_uri(Uri, IsAllowData) ->
case iolist_to_binary(ensure_protocol(noscript(z_string:trim(Uri), IsAllowData))) of
<<"data:", _/binary>> = Uri1 ->
cleanup_uri_chars(Uri1, data, <<>>);
Uri1 ->
cleanup_uri_chars(Uri1, url, <<>>)
end.
cleanup_uri_chars(<<>>, _Mode, Acc) ->
Acc;
cleanup_uri_chars(<<$%, A, B, C/binary>>, Mode, Acc)
when ((A >= $0 andalso A =< $9) orelse (A >= $A andalso A =< $Z))
andalso ((B >= $0 andalso B =< $9) orelse (B >= $A andalso B =< $Z)) ->
cleanup_uri_chars(C, Mode, <<Acc/binary, $%, A, B>>);
cleanup_uri_chars(<<C, B/binary>>, Mode, Acc)
when C =:= $.; C =:= $&; C =:= $:; C =:= $/;
C =:= $=; C =:= $?; C =:= $#; C =:= $+ ->
cleanup_uri_chars(B, Mode, <<Acc/binary, C>>);
cleanup_uri_chars(<<C, B/binary>>, data, Acc)
when C =:= $,; C =:= $; ->
cleanup_uri_chars(B, data, <<Acc/binary, C>>);
cleanup_uri_chars(<<C, B/binary>>, Mode, Acc) ->
case z_url:url_unreserved_char(C) of
false ->
C1 = iolist_to_binary(z_url:hex_encode([C])),
cleanup_uri_chars(B, Mode, <<Acc/binary, $%, C1/binary>>);
true ->
cleanup_uri_chars(B, Mode, <<Acc/binary, C>>)
end.
%% @doc Strip all html elements from the text. Simple parsing is applied to find the elements.
%% Does not escape the end result.
-spec strip( maybe_text() ) -> binary().
strip(Text) ->
strip(Text, nolimit).
%% @doc Strip all html elements from the text. Simple parsing is applied to find the elements.
%% Does not escape the end result. Limit the length of the result string to N characters.
-spec strip( maybe_text(), integer() | nolimit ) -> binary().
strip(undefined, _N) ->
<<>>;
strip(<<>>, _N) ->
<<>>;
strip("", _N) ->
<<>>;
strip({trans, Tr}, N) ->
{trans, [{Lang, strip(V, N)} || {Lang,V} <- Tr]};
strip(Html, N) when is_binary(Html) ->
strip(Html, <<>>, N);
strip(L, N) when is_list(L) ->
strip(iolist_to_binary(L), N);
strip(V, N) ->
strip(z_convert:to_binary(V), N).
strip(_, Acc, N) when is_integer(N), N =< 0 ->
Acc;
strip(<<>>, Acc, _N) ->
Acc;
strip(<<"<wbr>",T/binary>>, Acc, N) ->
strip(T, Acc, N);
strip(<<"</span>",T/binary>>, Acc, N) ->
strip(T, Acc, N);
strip(<<"</a>",T/binary>>, Acc, N) ->
strip(T, Acc, N);
strip(<<"<script", T/binary>>, Acc, N) ->
case binary:split(T, <<"</script">>) of
[_] -> Acc;
[_, Rest] -> strip_tag(Rest, Acc, N)
end;
strip(<<"<noscript", T/binary>>, Acc, N) ->
case binary:split(T, <<"</noscript">>) of
[_] -> Acc;
[_, Rest] -> strip_tag(Rest, Acc, N)
end;
strip(<<"<style", T/binary>>, Acc, N) ->
case binary:split(T, <<"</style">>) of
[_] -> Acc;
[_, Rest] -> strip_tag(Rest, Acc, N)
end;
strip(<<"<!--",T/binary>>, Acc, N) ->
case binary:split(T, <<"-->">>) of
[_] -> Acc;
[_, Rest] -> strip(Rest, Acc, N)
end;
strip(<<"<",T/binary>>, Acc, N) ->
strip_tag(T, Acc, N);
strip(<<H/utf8,T/binary>>, Acc, N) ->
strip(T, <<Acc/binary, H/utf8>>, sub1(N));
strip(<<_,T/binary>>, Acc, N) ->
% Drop non-utf8 data
strip(T, Acc, N).
strip_tag(<<>>, Acc, _N) ->
Acc;
strip_tag(<<">">>, Acc, _N) ->
Acc;
strip_tag(<<">", WS, T/binary>>, Acc, N) when WS =< 32 ->
strip(T, <<Acc/binary, WS>>, sub1(N));
strip_tag(<<">", T/binary>>, <<>>, N) ->
strip(T, <<>>, N);
strip_tag(<<">", T/binary>>, Acc, N) ->
case T of
<<"</", _/binary>> ->
strip(T, Acc, N);
_ ->
case binary:last(Acc) of
C when C =< 32 ->
strip(T, Acc, N);
_ ->
strip(T, <<Acc/binary, " ">>, sub1(N))
end
end;
strip_tag(<<$",T/binary>>, Acc, N) ->
strip_dstring(T, Acc, N);
strip_tag(<<$',T/binary>>, Acc, N) ->
strip_sstring(T, Acc, N);
strip_tag(<<_, T/binary>>, Acc, N) ->
strip_tag(T, Acc, N).
strip_dstring(<<>>, Acc, _) ->
Acc;
strip_dstring(<<$\\, _, T/binary>>, Acc, N) ->
strip_dstring(T, Acc, N);
strip_dstring(<<$",T/binary>>, Acc, N) ->
strip_tag(T, Acc, N);
strip_dstring(<<_,T/binary>>, Acc, N) ->
strip_dstring(T, Acc, N).
strip_sstring(<<>>, Acc, _) ->
Acc;
strip_sstring(<<$\\, _, T/binary>>, Acc, N) ->
strip_sstring(T, Acc, N);
strip_sstring(<<$',T/binary>>, Acc, N) ->
strip_tag(T, Acc, N);
strip_sstring(<<_,T/binary>>, Acc, N) ->
strip_sstring(T, Acc, N).
sub1(nolimit) -> nolimit;
sub1(N) -> N-1.
%% @doc Truncate a previously sanitized HTML string.
-spec truncate( maybe_text(), integer() ) -> maybe_text().
truncate(Html,Length) ->
truncate(Html, Length, <<>>).
-spec truncate( maybe_text(), integer(), iodata() ) -> maybe_text().
truncate(undefined, _Length, _Append) ->
undefined;
truncate(_, Length, _Append) when Length =< 0 ->
<<>>;
truncate(<<>>, _Length, _Append) ->
<<>>;
truncate("", _Length, _Append) ->
<<>>;
truncate({trans, Tr}, Length, Append) ->
{trans, [{Lang,truncate(V,Length, Append)} || {Lang,V} <- Tr]};
truncate(Html, Length, Append) when is_list(Html) ->
truncate(unicode:characters_to_binary(Html), Length, Append);
truncate(Html, Length, Append) when is_binary(Html) ->
case size(Html) of
N when N =< Length ->
Html;
_ ->
truncate(Html, in_text, [], <<>>, <<>>, Length, Append)
end.
truncate(<<>>, _State, _Stack, _TagAcc, Acc, _Length, _Append) ->
Acc;
truncate(<<"<!--", Rest/binary>>, in_text, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_comment, Stack, <<>>, <<Acc/binary,"<!--">>, Length, Append);
truncate(<<"-->", Rest/binary>>, in_comment, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary,"-->">>, Length, Append);
truncate(<<C/utf8, Rest/binary>>, in_comment, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_comment, Stack, <<>>, <<Acc/binary,C/utf8>>, Length, Append);
truncate(<<$<, $/, Rest/binary>>, in_text, Stack, <<>>, Acc, 0, Append) ->
truncate(Rest, in_tag, Stack, <<$<,$/>>, Acc, 0, Append);
truncate(_Rest, in_text, [], _TagAcc, Acc, 0, Append) ->
<<Acc/binary,Append/binary>>;
truncate(Rest, in_text, [Tag|Stack], _TagAcc, Acc, 0, Append) ->
CloseTag = make_closetag(Tag),
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary, Append/binary, CloseTag/binary>>, 0, <<>>);
truncate(<<$/,$>, Rest/binary>>, in_tag, Stack, Tag, Acc, Length, Append) ->
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary,Tag/binary, $/, $>>>, Length, Append);
truncate(<<$>, Rest/binary>>, in_tag, [_Tag|Stack], <<$<,$/,_/binary>> = CloseTag, Acc, Length, Append) ->
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary,CloseTag/binary,$>>>, Length, Append);
truncate(<<$>, Rest/binary>>, in_tag, Stack, Tag, Acc, Length, Append) ->
truncate(Rest, in_text, [Tag|Stack], <<>>, <<Acc/binary,Tag/binary,$>>>, Length, Append);
truncate(<<$<, Rest/binary>>, in_text, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_tag, Stack, <<$<>>, Acc, Length, Append);
truncate(<<C/utf8, Rest/binary>>, in_tag, Stack, Tag, Acc, Length, Append) ->
truncate(Rest, in_tag, Stack, <<Tag/binary,C/utf8>>, Acc, Length, Append);
truncate(<<$&, Rest/binary>>, in_text, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_element, Stack, <<>>, <<Acc/binary,$&>>, Length, Append);
truncate(<<$;, Rest/binary>>, in_element, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary,$;>>, Length-1, Append);
truncate(<<C, Rest/binary>>, in_element, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_element, Stack, <<>>, <<Acc/binary,C>>, Length, Append);
truncate(<<C/utf8, Rest/binary>>, in_text, Stack, <<>>, Acc, Length, Append) ->
truncate(Rest, in_text, Stack, <<>>, <<Acc/binary, C/utf8>>, Length-1, Append).
make_closetag(<<$<, Rest/binary>>) ->
case binary:split(Rest, <<" ">>) of
[Tag,_] ->
<< $<,$/,Tag/binary,$> >>;
_ ->
[Tag|_] = binary:split(Rest, <<">">>),
<< $<,$/,Tag/binary,$> >>
end.
%% @doc Sanitize a (X)HTML string. Remove elements and attributes that might be harmful.
-spec sanitize( maybe_text() ) -> maybe_text().
sanitize(Html) ->
sanitize(Html, []).
-spec sanitize( maybe_text(), sanitize_options() ) -> maybe_text().
sanitize(undefined, _Options) ->
undefined;
sanitize({trans, Tr}, Options) ->
{trans, [{Lang, sanitize(V, Options)} || {Lang,V} <- Tr]};
sanitize(<<>>, _Options) ->
<<>>;
sanitize("", _Options) ->
<<>>;
sanitize(Html, Options) when is_binary(Html) ->
sanitize_opts(<<"<sanitize>",Html/binary,"</sanitize>">>, Options);
sanitize(Html, Options) when is_list(Html) ->
sanitize_opts(iolist_to_binary(["<sanitize>", Html, "</sanitize>"]), Options).
sanitize_opts(Html, Options) ->
sanitize1(Html, proplists:get_value(elt_extra, Options, []),
proplists:get_value(attr_extra, Options, []), Options).
sanitize1(Html, ExtraElts, ExtraAttrs, Options) ->
case z_html_parse:parse(ensure_escaped_amp(Html)) of
{ok, Parsed} ->
Sanitized = sanitize(Parsed, ExtraElts, ExtraAttrs, Options),
flatten(Sanitized);
{error, _} ->
<<>>
end.
%% @doc Sanitize a mochiwebparse tree. Remove harmful elements and attributes.
-spec sanitize( z_html_parse:html_element(), binary() | list(), binary() | list(), any()) -> z_html_parse:html_element().
sanitize(ParseTree, ExtraElts, ExtraAttrs, Options) when is_binary(ExtraElts) ->
sanitize(ParseTree, binary:split(ExtraElts, <<",">>, [global]), ExtraAttrs, Options);
sanitize(ParseTree, ExtraElts, ExtraAttrs, Options) when is_binary(ExtraAttrs) ->
sanitize(ParseTree, ExtraElts, binary:split(ExtraAttrs, <<",">>, [global]), Options);
sanitize(ParseTree, ExtraElts, ExtraAttrs, Options) ->
sanitize(ParseTree, [], ExtraElts, ExtraAttrs, Options).
sanitize({<<"li">>, _, _} = Elt, [], ExtraElts, ExtraAttrs, Options) ->
sanitize({<<"ul">>, [], [ Elt ]}, [], ExtraElts, ExtraAttrs, Options);
sanitize({<<"li">>, _, _} = Elt, [ParentElt | _ ] = Stack, ExtraElts, ExtraAttrs, Options)
when ParentElt =/= <<"ul">>, ParentElt =/= <<"ol">> ->
sanitize({<<"ul">>, [], [ Elt ]}, Stack, ExtraElts, ExtraAttrs, Options);
sanitize(B, Stack, _ExtraElts, _ExtraAttrs, Options) when is_binary(B) ->
case sanitize_element({'TextNode', B}, Stack, Options) of
{'TextNode', B1} -> escape(iolist_to_binary(B1));
Other -> Other
end;
sanitize({comment, _Text} = Comment, Stack, _ExtraElts, _ExtraAttrs, Options) ->
sanitize_element(Comment, Stack, Options);
sanitize({pi, _Raw}, _Stack, _ExtraElts, _ExtraAttrs, _Options) ->
<<>>;
sanitize({pi, _Tag, _Attrs}, _Stack, _ExtraElts, _ExtraAttrs, _Options) ->
<<>>;
sanitize({<<"svg">>, _Attrs, _Enclosed} = Element, _Stack, ExtraElts, _ExtraAttrs, _Options) ->
case allow_elt(<<"svg">>, ExtraElts) of
true ->
z_svg:sanitize_element(Element);
false ->
{nop, []}
end;
sanitize({Elt,Attrs,Enclosed}, Stack, ExtraElts, ExtraAttrs, Options) ->
case allow_elt(Elt, ExtraElts) orelse (not lists:member(Elt, Stack) andalso allow_once(Elt)) of
true ->
Attrs1 = lists:filter(fun({A,_}) -> allow_attr(A, ExtraAttrs) end, Attrs),
Stack1 = [Elt|Stack],
Tag = { Elt,
Attrs1,
[ sanitize(Encl, Stack1, ExtraElts, ExtraAttrs, Options) || Encl <- Enclosed ]},
sanitize_element(Tag, Stack, Options);
false ->
case skip_contents(Elt) of
false ->
{nop, [ sanitize(Encl, Stack, ExtraElts, ExtraAttrs, Options) || Encl <- Enclosed ]};
true ->
{nop, []}
end
end.
sanitize_element(Element, Stack, Options) ->
Callback = proplists:get_value(element, Options, fun sanitize_element_int/1),
sanitize_element(Callback, Element, Stack, Options).
sanitize_element(F, Element, _Stack, _Options) when is_function(F, 1) ->
F(Element);
sanitize_element(F, Element, _Stack, Options) when is_function(F, 2) ->
F(Element, Options);
sanitize_element(F, Element, Stack, Options) when is_function(F, 3) ->
F(Element, Stack, Options);
sanitize_element({M, F, A}, Element, Stack, _Options) ->
erlang:apply(M, F, [Element, Stack|A]).
sanitize_element_int({comment, Comment} = E) ->
% Remove comments that might contain injections for e.g. html editors.
% For example: <!--data-mce-selected="x"->"><img src onerror=import('//attacker.com')>-->
case binary:match(Comment, [ <<"<">>, <<">">> ]) of
nomatch -> E;
_ -> <<>>
end;
sanitize_element_int(E) ->
E.
%% @doc Flatten the sanitized html tree to a binary - the attributes are already filtered
%% using the allow_attr/1 whitelist.
-spec flatten( z_html_parse:html_element() ) -> binary().
flatten(B) when is_binary(B) ->
escape_html_text(B, <<>>);
flatten({nop, Enclosed}) ->
flatten(Enclosed);
flatten({comment, Text}) ->
Comment = binary:replace(Text, <<"-->">>, <<"-- >">>, [global]),
<<"<!--", Comment/binary, "-->">>;
flatten({sanitized_html, Html}) ->
Html;
flatten({Elt, Attrs, Enclosed}) ->
EncBin = flatten(Enclosed),
Attrs1 = sanitize_attrs(Attrs),
Attrs2 = [flatten_attr(Attr) || Attr <- Attrs1 ],
Attrs3 = iolist_to_binary(prefix(32, Attrs2)),
case is_selfclosing(Elt) andalso EncBin == <<>> of
true -> <<$<, Elt/binary, Attrs3/binary, 32, $/, $>>>;
false -> <<$<, Elt/binary, Attrs3/binary, $>, EncBin/binary, $<, $/, Elt/binary, $>>>
end;
flatten(L) when is_list(L) ->
iolist_to_binary([ flatten(A) || A <- L ]).
prefix(Sep, List) -> prefix(Sep,List,[]).
prefix(_Sep, [], Acc) -> lists:reverse(Acc);
prefix(Sep, [H|T], Acc) -> prefix(Sep, T, [H,Sep|Acc]).
sanitize_attrs(Attrs) ->
Attrs1 = lists:map(
fun({Attr, Value}) ->
{Attr, sanitize_attr_value(Attr, Value)}
end,
Attrs),
case lists:keymember(<<"target">>, 1, Attrs1) of
true ->
% Add 'rel="nofollow noopener noreferrer"' to all
% elements with a 'target' attribute, where the href
% is not a local link.
case proplists:get_value(<<"href">>, Attrs1) of
<<"#", _/binary>> -> Attrs1;
<<"/", _/binary>> -> Attrs1;
_ ->
Rel = proplists:get_value(<<"rel">>, Attrs1, <<>>),
Rels = re:split(Rel, <<"\\s">>),
Rels1 = [ R || R <- Rels, R =/= <<>> ],
Rels2 = Rels1 -- [
<<"follow">>,
<<"opener">>,
<<"referrer">>,
<<"nofollow">>,
<<"noopener">>,
<<"noreferrer">>
],
Rels3 = [ <<"nofollow">>, <<"noopener">>, <<"noreferrer">> | Rels2 ],
Rels4 = iolist_to_binary( lists:join(32, Rels3) ),
[ {<<"rel">>, Rels4} | proplists:delete(<<"rel">>, Attrs1) ]
end;
false ->
Attrs1
end.
sanitize_attr_value(<<"style">>, V) ->
filter_css(V);
sanitize_attr_value(<<"class">>, V) ->
% Remove all do_xxxx widget manager classes
filter_widget_class(V);
sanitize_attr_value(<<"href">>, V) ->
noscript(V, false);
sanitize_attr_value(Attr, V) ->
case is_url_attr(Attr) of
true -> noscript(V, true);
false -> V
end.
%% @doc Flatten an attribute, attributes have been whitelisted and
%% the values have been sanitized.
flatten_attr({Attr,Value}) ->
Value1 = escape(Value),
<<Attr/binary, $=, $", Value1/binary, $">>.
%% @doc Escape smaller-than, greater-than, single and double quotes in texts
%% (& is already removed or escaped).
escape_html_text(<<>>, Acc) ->
Acc;
escape_html_text(<<$<, T/binary>>, Acc) ->
escape_html_text(T, <<Acc/binary, "<">>);
escape_html_text(<<$>, T/binary>>, Acc) ->
escape_html_text(T, <<Acc/binary, ">">>);
escape_html_text(<<$", T/binary>>, Acc) ->
escape_html_text(T, <<Acc/binary, """>>);
escape_html_text(<<$', T/binary>>, Acc) ->
escape_html_text(T, <<Acc/binary, "'">>);
escape_html_text(<<C, T/binary>>, Acc) ->
escape_html_text(T, <<Acc/binary, C>>).
%% @doc Escape smaller-than, greater-than (for in comments)
escape_html_comment(<<>>, Acc) ->
Acc;
escape_html_comment(<<$<, T/binary>>, Acc) ->
escape_html_comment(T, <<Acc/binary, "<">>);
escape_html_comment(<<$>, T/binary>>, Acc) ->
escape_html_comment(T, <<Acc/binary, ">">>);
escape_html_comment(<<C, T/binary>>, Acc) ->
escape_html_comment(T, <<Acc/binary, C>>).
%% @doc Elements that can only occur once in a nesting.
%% Used for cleaning up code from html editors.
allow_once(<<"a">>) -> true;
allow_once(<<"abbr">>) -> true;
allow_once(<<"area">>) -> true;
allow_once(<<"article">>) -> true;
allow_once(<<"aside">>) -> true;
allow_once(<<"b">>) -> true;
allow_once(<<"bdo">>) -> true;
allow_once(<<"big">>) -> true;
allow_once(<<"br">>) -> true;
allow_once(<<"cite">>) -> true;
allow_once(<<"code">>) -> true;
allow_once(<<"del">>) -> true;
allow_once(<<"dfn">>) -> true;
allow_once(<<"em">>) -> true;
allow_once(<<"hr">>) -> true;
allow_once(<<"i">>) -> true;
allow_once(<<"ins">>) -> true;
allow_once(<<"kbd">>) -> true;
allow_once(<<"nav">>) -> true;
allow_once(<<"p">>) -> true;
allow_once(<<"pre">>) -> true;
allow_once(<<"q">>) -> true;
allow_once(<<"s">>) -> true;
allow_once(<<"samp">>) -> true;
allow_once(<<"small">>) -> true;
allow_once(<<"sub">>) -> true;
allow_once(<<"sup">>) -> true;
allow_once(<<"strong">>) -> true;
allow_once(<<"strike">>) -> true;
allow_once(<<"tt">>) -> true;
allow_once(<<"u">>) -> true;
allow_once(<<"var">>) -> true;
allow_once(_) -> false.
%% @doc Allowed elements (see also allow_once/1 above)
allow_elt(Elt, Extra) ->
allow_elt(Elt) orelse lists:member(Elt, Extra).
allow_elt(<<"audio">>) -> true;
allow_elt(<<"address">>) -> true;
allow_elt(<<"bdo">>) -> true;
allow_elt(<<"blockquote">>) -> true;
allow_elt(<<"caption">>) -> true;
allow_elt(<<"col">>) -> true;
allow_elt(<<"colgroup">>) -> true;
allow_elt(<<"dd">>) -> true;
allow_elt(<<"dl">>) -> true;
allow_elt(<<"dt">>) -> true;
allow_elt(<<"div">>) -> true;
allow_elt(<<"figcaption">>) -> true;
allow_elt(<<"figure">>) -> true;
allow_elt(<<"h1">>) -> true;
allow_elt(<<"h2">>) -> true;
allow_elt(<<"h3">>) -> true;
allow_elt(<<"h4">>) -> true;
allow_elt(<<"h5">>) -> true;
allow_elt(<<"h6">>) -> true;
allow_elt(<<"header">>) -> true;
allow_elt(<<"img">>) -> true;
allow_elt(<<"legend">>) -> true;
allow_elt(<<"li">>) -> true;
allow_elt(<<"map">>) -> true;
allow_elt(<<"ol">>) -> true;
allow_elt(<<"picture">>) -> true;
allow_elt(<<"samp">>) -> true;
allow_elt(<<"section">>) -> true;
allow_elt(<<"source">>) -> true;
allow_elt(<<"span">>) -> true;
allow_elt(<<"time">>) -> true;
allow_elt(<<"table">>) -> true;
allow_elt(<<"tbody">>) -> true;
allow_elt(<<"tfoot">>) -> true;
allow_elt(<<"thead">>) -> true;
allow_elt(<<"td">>) -> true;
allow_elt(<<"th">>) -> true;
allow_elt(<<"tr">>) -> true;
allow_elt(<<"ul">>) -> true;
allow_elt(<<"video">>) -> true;
allow_elt(<<"wbr">>) -> true;
allow_elt(_) -> false.
%% @doc Allowed attributes
allow_attr(Attr, Extra) ->
allow_attr(Attr) orelse lists:member(Attr, Extra).
allow_attr(<<"align">>) -> true;
allow_attr(<<"alt">>) -> true;
allow_attr(<<"autoplay">>) -> true;
allow_attr(<<"border">>) -> true;
allow_attr(<<"borderspacing">>) -> true;
allow_attr(<<"cellpadding">>) -> true;
allow_attr(<<"cellspacing">>) -> true;
allow_attr(<<"class">>) -> true;
allow_attr(<<"colspan">>) -> true;
allow_attr(<<"controls">>) -> true;
allow_attr(<<"coords">>) -> true;
allow_attr(<<"dir">>) -> true;
allow_attr(<<"height">>) -> true;
allow_attr(<<"href">>) -> true;
%allow_attr(<<"id">>) -> true;
allow_attr(<<"loop">>) -> true;
allow_attr(<<"name">>) -> true;
allow_attr(<<"poster">>) -> true;
allow_attr(<<"preload">>) -> true;
allow_attr(<<"rel">>) -> true;
allow_attr(<<"rowspan">>) -> true;
allow_attr(<<"shape">>) -> true;
allow_attr(<<"src">>) -> true;
allow_attr(<<"style">>) -> true;
allow_attr(<<"target">>) -> true;
allow_attr(<<"title">>) -> true;
allow_attr(<<"usemap">>) -> true;
allow_attr(<<"valign">>) -> true;
allow_attr(<<"width">>) -> true;
allow_attr(_) -> false.
%% @doc Check if the attribute might contain an url
is_url_attr(<<"src">>) -> true;
is_url_attr(<<"href">>) -> true;
is_url_attr(<<"poster">>) -> true;
is_url_attr(_) -> false.
%% @doc Elements that shouldn't use an open and close tag.
is_selfclosing(<<"br">>) -> true;
is_selfclosing(<<"hr">>) -> true;
is_selfclosing(<<"img">>) -> true;
is_selfclosing(<<"wbr">>) -> true;
is_selfclosing(_) -> false.
%% @doc Disallowed elements whose contents should be skipped
skip_contents(<<"style">>) -> true;
skip_contents(<<"script">>) -> true;
skip_contents(<<"deleteme">>) -> true;
skip_contents(<<"head">>) -> true;
skip_contents(_) -> false.
%% @doc Run the CSS sanitizer over 'style' attributes. This is a strict sanitizer, all
%% non-conforming css is rejected.
-spec filter_css( maybe_iodata() ) -> binary().
filter_css(undefined) -> <<>>;
filter_css(<<>>) -> <<>>;
filter_css("") -> <<>>;
filter_css(L) when is_list(L) ->
filter_css(iolist_to_binary(L));
filter_css(Css) when is_binary(Css) ->
case z_css:sanitize_style(Css) of
{ok, Css1} ->
Css1;
{error, _Error} ->
<<>>
end.
%% @doc Remove all do_xxxx classes to prevent widget manager invocations
filter_widget_class(Class) ->
z_convert:to_binary(re:replace(Class, <<"do_[0-9a-zA-Z_]+">>, <<>>, [global])).
%% @doc Filter a url, remove any "javascript:" and "data:" (as data can be text/html).
-spec noscript(Url) -> SafeUrl when
Url :: string() | binary(),
SafeUrl :: binary().
noscript(Url) ->
noscript(Url, false).
%% @doc Filter an url, if strict then also remove "data:" (as data can be text/html).
-spec noscript(Url, IsAllowData) -> SafeUrl when
Url :: string() | binary(),
IsAllowData :: boolean(),
SafeUrl :: binary().
noscript(Url0, IsAllowData) ->
Url = z_string:trim(z_string:sanitize_utf8(z_convert:to_binary(Url0))),
case nows_protocol_split(Url, <<>>) of
{<<"javascript">>, _} -> <<"#script-removed">>;
{<<"script">>, _} -> <<"#script-removed">>;
{<<"vbscript">>, _} -> <<"#script-removed">>;
{<<"data">>, Data} when IsAllowData ->
case noscript_data(Data) of
<<>> -> <<>>;
Data1 -> <<"data:", Data1/binary>>
end;
{<<"data">>, _} -> <<>>;
{<<"mailto">>, Rest} -> <<"mailto:", (z_string:trim(Rest))/binary>>;
{Protocol, Rest} when is_binary(Protocol) -> <<Protocol/binary, $:, Rest/binary>>;
{undefined, <<>>} -> <<>>;
{undefined, _} -> Url
end.
%% @doc Remove whitespace and make lowercase till we find a colon, slash or pound-sign. Also
%% deletes all invalid utf8 characters.
-spec nows_protocol_split( binary(), binary() ) -> {binary()|undefined, binary()}.
nows_protocol_split(<<>>, Acc) -> {undefined, Acc};
nows_protocol_split(<<$:, Rest/binary>>, Acc) -> {Acc, Rest};
nows_protocol_split(<<$/, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $/, Rest/binary>>};
nows_protocol_split(<<$#, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $#, Rest/binary>>};
nows_protocol_split(<<$\\, Rest/binary>>, Acc) -> nows_protocol_split(Rest, Acc);
nows_protocol_split(<<$%, A, B, Rest/binary>>, Acc) ->
case catch erlang:binary_to_integer(<<A, B>>, 16) of
V when is_integer(V) -> nows_protocol_split(<<V, Rest/binary>>, Acc);
_ -> {undefined, <<>>}
end;
nows_protocol_split(<<$%, _/binary>>, _Acc) ->
% Illegal: not enough characters left for escape sequence
{undefined, <<>>};
nows_protocol_split(<<C, Rest/binary>>, Acc) when C =< 32 ->
% Discard control characters
nows_protocol_split(Rest, Acc);
nows_protocol_split(<<C, Rest/binary>>, Acc) when C >= $A, C =< $Z ->
% Ensure lowercase a-z
nows_protocol_split(Rest, <<Acc/binary, (C+32)>>);
nows_protocol_split(<<C/utf8, Rest/binary>>, Acc) ->
nows_protocol_split(Rest, <<Acc/binary, C/utf8>>);
nows_protocol_split(<<_, Rest/binary>>, Acc) ->
% Discard non utf8 characters
nows_protocol_split(Rest, Acc).
%% @doc Sanitize the data link, drop anything suspected to be a script, or that could contain a script.
noscript_data(<<"image/svg", _/binary>> = Url) ->
Url1 = <<"data:", Url/binary>>,
case z_url:decode_data_url(Url1) of
{ok, Mime, _Charset, Decoded} ->
Sanitized = z_svg:sanitize(Decoded),
<<Mime/binary, ";base64,", (base64:encode(Sanitized))/binary>>;
{error, _} ->
<<>>
end;
noscript_data(<<"image/", _/binary>> = Data) -> Data;
noscript_data(<<"audio/", _/binary>> = Data) -> Data;
noscript_data(<<"video/", _/binary>> = Data) -> Data;
noscript_data(<<"text/plain;", _/binary>> = Data) -> Data;
noscript_data(_) -> <<>>.
%% @doc Translate any html br entities to newlines.
-spec br2nl( maybe_text() ) -> maybe_text().
br2nl(undefined) ->
undefined;
br2nl({trans, Ts}) ->
{trans, [ {Iso,br2nl(T)} || {Iso,T} <- Ts ]};
br2nl(B) when is_binary(B) ->
br2nl_bin(B, <<>>);
br2nl(L) ->
br2nl(L, []).
br2nl([], Acc) ->
lists:reverse(Acc);
br2nl("<br/>" ++ Rest, Acc) ->
br2nl(Rest, [$\n|Acc]);
br2nl("<br />" ++ Rest, Acc) ->
br2nl(Rest, [$\n|Acc]);
br2nl([C | Rest], Acc) ->
br2nl(Rest, [C | Acc]).
br2nl_bin(<<>>, Acc) ->
Acc;
br2nl_bin(<<"<br/>", Post/binary>>, Acc) ->
br2nl_bin(Post, <<Acc/binary, $\n>>);
br2nl_bin(<<"<br />", Post/binary>>, Acc) ->
br2nl_bin(Post, <<Acc/binary, $\n>>);
br2nl_bin(<<C, Post/binary>>, Acc) ->
br2nl_bin(Post, <<Acc/binary, C>>).
%% @doc Translate any newlines to html br entities.
-spec nl2br( maybe_text() ) -> maybe_text().
nl2br(undefined) ->
undefined;
nl2br({trans, Ts}) ->
{trans, [ {Iso,nl2br(T)} || {Iso,T} <- Ts ]};
nl2br(B) when is_binary(B) ->
nl2br_bin(B, <<>>);
nl2br(L) ->
nl2br(L, []).
nl2br([], Acc) ->
lists:reverse(Acc);
nl2br("\r\n" ++ Rest, Acc) ->
nl2br(Rest, lists:reverse("<br />", Acc));
nl2br("\n" ++ Rest, Acc) ->
nl2br(Rest, lists:reverse("<br />", Acc));
nl2br([C | Rest], Acc) ->
nl2br(Rest, [C | Acc]).
nl2br_bin(<<>>, Acc) ->
Acc;
nl2br_bin(<<$\r, $\n, Post/binary>>, Acc) ->
nl2br_bin(Post, <<Acc/binary, "<br />">>);
nl2br_bin(<<$\r, Post/binary>>, Acc) ->
nl2br_bin(Post, <<Acc/binary, "<br />">>);
nl2br_bin(<<$\n, Post/binary>>, Acc) ->
nl2br_bin(Post, <<Acc/binary, "<br />">>);
nl2br_bin(<<C, Post/binary>>, Acc) ->
nl2br_bin(Post, <<Acc/binary, C>>).
%% @doc Given a HTML list, scrape all `<link>' elements and return their attributes. Attribute names are lowercased.
-spec scrape_link_elements( iodata() ) -> list( [ z_html_parse:html_attr() ] ).
scrape_link_elements(Html) ->
case re:run(Html, "<link[^>]+>", [global, caseless, {capture,all,binary}]) of
{match, Elements} ->
F = fun(El) ->
H = iolist_to_binary(["<p>", El, "</p>"]),
case z_html_parse:parse(H) of
{ok, {<<"p">>, [], [{_, Attrs, []}]}} ->
[ {z_string:to_lower(K),V} || {K,V} <- lists:flatten(Attrs) ];
{error, _} ->
[]
end
end,
[ F(El) || [El] <- Elements ];
nomatch ->
[]
end.
%% @doc Ensure that `&'-characters are properly escaped inside a html string.
-spec ensure_escaped_amp( maybe_binary() ) -> binary().
ensure_escaped_amp(undefined) ->
<<>>;
ensure_escaped_amp(B) ->
ensure_escaped_amp(B, <<>>).
ensure_escaped_amp(<<>>, Acc) ->
Acc;
ensure_escaped_amp(<<"<!--", Rest/binary>>, Acc) ->
case try_comment(Rest, <<Acc/binary, "<!--">>) of
false -> Acc;
{Rest1, Acc1} -> ensure_escaped_amp(Rest1, Acc1)
end;
ensure_escaped_amp(<<$&, Rest/binary>>, Acc) ->
case try_amp(Rest, in_amp, <<>>) of
{Amp,Rest1} -> ensure_escaped_amp(Rest1, <<Acc/binary, $&, Amp/binary>>);
false -> ensure_escaped_amp(Rest, <<Acc/binary, "&">>)
end;
ensure_escaped_amp(<<C, Rest/binary>>, Acc) ->
ensure_escaped_amp(Rest, <<Acc/binary, C>>).
try_amp(<<$;,Rest/binary>>, in_ent_name, Acc) ->
{<<Acc/binary,$;>>, Rest};
try_amp(<<$;,Rest/binary>>, in_ent_val, Acc) ->
{<<Acc/binary,$;>>, Rest};
try_amp(<<$#,Rest/binary>>, in_amp, <<>>) ->
try_amp(Rest, in_ent_val, <<$#>>);
try_amp(<<C,Rest/binary>>, in_ent_val, Acc) ->
case is_valid_ent_val(C) of
true -> try_amp(Rest, in_ent_val, <<Acc/binary,C>>);
false -> false
end;
try_amp(<<C,Rest/binary>>, in_amp, <<>>) ->
case is_valid_ent_char(C) of
true -> try_amp(Rest, in_ent_name, <<C>>);
false -> false
end;
try_amp(<<C,Rest/binary>>, in_ent_name, Acc) ->
case is_valid_ent_char(C) of
true -> try_amp(Rest, in_ent_name, <<Acc/binary, C>>);
false -> false
end;
try_amp(_B, _, _Acc) ->
false.
try_comment(<<"-->", Rest/binary>>, Acc) ->
{Rest, <<Acc/binary, "-->">>};
try_comment(<<C/utf8, Rest/binary>>, Acc) ->
try_comment(Rest, <<Acc/binary, C/utf8>>);
try_comment(_B, _Acc) ->
false.
is_valid_ent_char(C) ->
(C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z).
is_valid_ent_val(C) ->
(C >= $a andalso C =< $f) orelse (C >= $A andalso C =< $F)
orelse (C >= $0 andalso C =< $9).
%% @doc Make all links (href/src) in the html absolute to the base URL
%% This takes a shortcut by checking all ' (src|href)=".."'
-spec abs_links( maybe_iodata(), binary() ) -> iodata().
abs_links(undefined, _Base) ->
<<>>;
abs_links(Html, Base) ->
case re:run(Html,
<<"(src|href)=\"([^\"]*)\"">>,
[global, notempty, {capture, all, binary}])
of
{match, Matches} -> replace_matched_links(Html, Matches, Base);
nomatch -> Html
end.
replace_matched_links(Html, [], _Base) ->
Html;
replace_matched_links(Html, [[Found, Attr, Link]|Matches], Base) ->
Html1 = case z_url:abs_link(Link, Base) of
Link ->
Html;
AbsLink ->
New = iolist_to_binary([Attr, $=, $", AbsLink, $"]),
binary:replace(Html, Found, New)
end,
replace_matched_links(Html1, Matches, Base).