src/support/z_sanitize.erl

%% @author Marc Worrell <marc@worrell.nl>
%% @copyright 2014 Marc Worrell
%% @doc Interface to z_html sanitizers, sets options and adds embed sanitization.

%% Copyright 2014 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

-module(z_sanitize).

-author("Marc Worrell <marc@worrell.nl>").

-export([
    uri/1,
    default_sandbox_attr/1,
    ensure_safe_js_callback/1,
    escape_props/1,
    escape_props/2,
    escape_props_check/1,
    escape_props_check/2,
    escape_link/1,
    escape_link/2,
    html/1,
    html/2
    ]).

-include_lib("zotonic.hrl").


% Built-in value for the iframe "sandbox" attribute, returned by
% default_sandbox_attr/1 when the site configuration does not supply one.
% YouTube needs at least: allow-popups allow-same-origin allow-scripts
% See: https://csplite.com/csp/test186/
-define(IFRAME_SANDBOX, <<"allow-popups allow-scripts allow-same-origin">>).


%% @doc Sanitize an URI. Thin wrapper that delegates to z_html:sanitize_uri/1.
uri(Uri) ->
    Sanitized = z_html:sanitize_uri(Uri),
    Sanitized.


%% @doc Return the value to use for the "sandbox" attribute of iframes.
%% The site config key html_iframe_sandbox is used when it holds a value; when it
%% is undefined or an empty binary the built-in ?IFRAME_SANDBOX value is returned.
default_sandbox_attr(Context) ->
    Configured = m_config:get_value(site, html_iframe_sandbox, Context),
    if
        Configured =:= undefined; Configured =:= <<>> ->
            ?IFRAME_SANDBOX;
        true ->
            Configured
    end.

%% @doc Check a Javascript callback function name and return it as a binary.
%% Only letters, digits, underscores and dots are accepted; any other character
%% makes this function crash with a badmatch.
-spec ensure_safe_js_callback( string() | binary() ) -> binary().
ensure_safe_js_callback(Callback) ->
    % Search for the first character outside the accepted set.
    UnsafeChar = re:run(Callback, "[^a-zA-Z0-9_\\.]"),
    nomatch = UnsafeChar,
    iolist_to_binary(Callback).

%% @doc Escape properties without a site context: calls z_html:escape_props/2
%% with the options from default_options/0.
escape_props(Props) ->
    Options = default_options(),
    z_html:escape_props(Props, Options).

%% @doc Escape properties for a site: calls z_html:escape_props/2 with the
%% options derived from the context by context_options/1.
escape_props(Props, Context) ->
    Options = context_options(Context),
    z_html:escape_props(Props, Options).

%% @doc Context-free variant: calls z_html:escape_props_check/2 with the options
%% from default_options/0.
escape_props_check(Props) ->
    Options = default_options(),
    z_html:escape_props_check(Props, Options).

%% @doc Site-aware variant: calls z_html:escape_props_check/2 with the options
%% derived from the context by context_options/1.
escape_props_check(Props, Context) ->
    Options = context_options(Context),
    z_html:escape_props_check(Props, Options).

%% @doc Apply z_html:escape_link/1 to a text. For a translation record every
%% language version is processed separately and the result is again a #trans{}
%% record, so that the shape of a translated value is preserved for the caller.
%% (Previously a bare list of {Language, Text} tuples was returned, which is not
%% a valid translation value.)
escape_link(#trans{ tr = Tr }) ->
    #trans{ tr = [ {Lang, escape_link(Text)} || {Lang, Text} <- Tr ] };
escape_link(V) ->
    z_html:escape_link(V).

%% @doc Select the text for the context using z_trans:lookup_fallback/2 and pass
%% the result to z_html:escape_link/1.
escape_link(V, Context) ->
    Text = z_trans:lookup_fallback(V, Context),
    z_html:escape_link(Text).

%% @doc Sanitize HTML without a site context: calls z_html:sanitize/2 with the
%% options from default_options/0 (no extra elements or attributes).
html(Html) ->
    Options = default_options(),
    z_html:sanitize(Html, Options).

%% @doc Sanitize HTML for a site: calls z_html:sanitize/2 with the options
%% derived from the context by context_options/1.
html(Html, Context) ->
    Options = context_options(Context),
    z_html:sanitize(Html, Options).


%% @doc Build the sanitizer options for a site. The extra elements and attributes
%% are read from the site config keys html_elt_extra and html_attr_extra, with the
%% defaults below. The element callback routes every element through
%% sanitize_element/4 with this context.
context_options(Context) ->
    ExtraElements = m_config:get_value(
        site, html_elt_extra,
        <<"embed,iframe,object,script">>,
        Context),
    ExtraAttrs = m_config:get_value(
        site, html_attr_extra,
        <<"data,allowfullscreen,frameborder,scrolling,async,defer,allow">>,
        Context),
    ElementFun = fun(Element, Stack, Opts) ->
        sanitize_element(Element, Stack, Opts, Context)
    end,
    [
        {elt_extra, ExtraElements},
        {attr_extra, ExtraAttrs},
        {element, ElementFun}
    ].

%% @doc Sanitizer options used when no site context is available: no extra
%% elements, no extra attributes, and sanitize_element_opts/3 as element callback.
default_options() ->
    NoExtras = <<>>,
    [
        {elt_extra, NoExtras},
        {attr_extra, NoExtras},
        {element, fun sanitize_element_opts/3}
    ].


%% @doc Sanitize a single element. The element is first folded over the
%% #sanitize_element{} notification; if an observer changed it, the changed
%% element is returned as-is. Otherwise the built-in rules of
%% sanitize_element_1/4 are applied.
sanitize_element(Element, Stack, Opts, Context) ->
    Notification = #sanitize_element{element=Element, stack=Stack},
    Folded = z_notifier:foldl(Notification, Element, Context),
    if
        Folded =:= Element ->
            sanitize_element_1(Element, Stack, Opts, Context);
        true ->
            Folded
    end.

%% @doc Dispatch on the tag of an element. The embedding elements (iframe, embed,
%% object, script) have their own sanitizers; an object element with inner
%% content is replaced by that content. All other elements, and terms that are
%% not a {Tag, Props, Inner} tuple, are handled by sanitize_element_opts/3.
sanitize_element_1({Tag, Props, Inner} = Element, Stack, Opts, Context) ->
    case Tag of
        <<"iframe">> ->
            sanitize_iframe(Props, Context);
        <<"embed">> ->
            sanitize_embed(Props, Context);
        <<"object">> when Inner =:= [] ->
            sanitize_object(Props, Context);
        <<"object">> ->
            Inner;
        <<"script">> ->
            sanitize_script(Props, Context);
        _Other ->
            sanitize_element_opts(Element, Stack, Opts)
    end;
sanitize_element_1(Element, Stack, Opts, _Context) ->
    sanitize_element_opts(Element, Stack, Opts).


%% @doc Context-free element sanitizer, used as the element callback of the
%% default options and as fallback for elements without a dedicated sanitizer.
%% Handles links with a target, Microsoft Word comments, z-media comments and the
%% attribute cleanup of all other elements. Anything else is returned unchanged.
sanitize_element_opts({<<"a">>, Attrs, Inner} = Element, _Stack, _Opts) ->
    % A link with a target always gets rel="noopener noreferrer"; a rel
    % attribute present in the input is removed first.
    case proplists:is_defined(<<"target">>, Attrs) of
        false ->
            Element;
        true ->
            WithoutRel = lists:filter(
                fun
                    ({AttrName, _}) -> AttrName =/= <<"rel">>;
                    (_) -> false
                end,
                Attrs),
            {<<"a">>, [ {<<"rel">>, <<"noopener noreferrer">>} | WithoutRel ], Inner}
    end;
sanitize_element_opts({comment, <<" [", _/binary>> = Text} = Element, _Stack, _Opts) ->
    % Conditionals by Microsoft Word: <!-- [if (..)] (..) [endif]-->
    LastChar = binary:last(Text),
    if
        LastChar =:= $] -> <<>>;
        true -> Element
    end;
sanitize_element_opts({comment, Marker}, _Stack, _Opts)
    when Marker =:= <<"StartFragment">>; Marker =:= <<"EndFragment">> ->
    % Inserted by Microsoft Word: <!--StartFragment--> and <!--EndFragment-->
    <<>>;
sanitize_element_opts({comment, <<" z-media ", ZMedia/binary>>}, _Stack, _Opts) ->
    % The z-media tag is very strict with spaces: the id and the JSON arguments
    % must be separated by exactly " {". Any failure while parsing or sanitizing
    % replaces the tag with an empty comment.
    try
        [RawId, RawArgs] = binary:split(ZMedia, <<" {">>),
        Args = sanitize_z_media(<<"{", RawArgs/binary>>),
        MediaId = z_string:to_name(z_string:trim(RawId)),
        {comment, <<" z-media ", MediaId/binary, " ", Args/binary, " ">>}
    catch
        _:_ ->
            ?LOG_NOTICE(#{
                text => <<"Dropping illegal z-media tag">>,
                in => zotonic_core,
                zmedia => ZMedia
            }),
            {comment, <<" ">>}
    end;
sanitize_element_opts({Tag, Attrs, Inner}, _Stack, _Opts) ->
    {Tag, cleanup_element_attrs(Attrs), Inner};
sanitize_element_opts(Element, _Stack, _Opts) ->
    Element.

%% @doc Run cleanup_element_attr/1 over all attributes of an element; attributes
%% are kept, replaced or dropped according to the result for each attribute.
cleanup_element_attrs(Attrs) ->
    lists:filtermap(fun(Attr) -> cleanup_element_attr(Attr) end, Attrs).

%% @doc Clean up a single element attribute. This is the callback for
%% lists:filtermap/2, so the result is one of:
%% <ul>
%%   <li>`true' - keep the attribute unchanged;</li>
%%   <li>`false' - drop the attribute;</li>
%%   <li>`{true, NewAttr}' - replace the attribute.</li>
%% </ul>
cleanup_element_attr({<<"class">>, Classes}) ->
    % Keep only the acceptable class names, drop the attribute if none is left.
    Classes1 = binary:split(Classes, <<" ">>, [global]),
    case lists:filter(fun is_acceptable_classname/1, Classes1) of
        [] -> false;
        Cs -> {true, {<<"class">>, iolist_to_binary(lists:join(32, Cs))}}
    end;
cleanup_element_attr({<<"style">>, <<"mso-", _/binary>>}) ->
    % This might need some extra parsing of the css.
    % For now we just drop styles starting with a "mso-" selector.
    false;
cleanup_element_attr({<<"allow">>, Allow}) ->
    % Split on every ";" (the option list must be [global], not the subject),
    % normalize each policy and keep only the allowed ones. The attribute is
    % dropped when no allowed policy remains, similar to the class attribute.
    Policies = [
        z_string:to_lower( z_string:trim(P) )
        || P <- binary:split(Allow, <<";">>, [global])
    ],
    case lists:filter(fun sanitize_attr_allow/1, Policies) of
        [] ->
            false;
        Allowed ->
            {true, {<<"allow">>, iolist_to_binary(lists:join(<<"; ">>, Allowed))}}
    end;
cleanup_element_attr(_Attr) ->
    true.

%% @doc Check if a class name may be kept. Empty names and names starting with
%% "Mso" (presumably Microsoft Office styles) are rejected, all others accepted.
is_acceptable_classname(ClassName) ->
    case ClassName of
        <<>> -> false;
        <<"Mso", _/binary>> -> false;
        _ -> true
    end.

% Allowed feature policies for the iframe 'allow' attribute.
% A policy is accepted when it starts with one of the feature names below,
% anything else (including non binary terms) is rejected.
sanitize_attr_allow(Policy) ->
    Features = [
        <<"camera">>,
        <<"microphone">>,
        <<"midi">>,
        <<"encrypted-media">>,
        <<"autoplay">>,
        <<"fullscreen">>,
        <<"picture-in-picture">>,
        <<"geolocation">>,
        <<"gyroscope">>,
        <<"accelerometer">>,
        <<"ambient-light-sensor">>,
        <<"magnetometer">>
    ],
    lists:any(
        fun(Feature) ->
            Len = byte_size(Feature),
            case Policy of
                <<Feature:Len/binary, _/binary>> -> true;
                _ -> false
            end
        end,
        Features).

%% @doc Sanitize the JSON arguments of a z-media tag. The JSON is decoded, every
%% key/value pair is passed through sanitize_z_media_arg/2, and the merged result
%% is encoded as JSON again. Crashes on arguments that cannot be sanitized.
sanitize_z_media(Data) ->
    Args = z_json:decode(Data),
    MergeArg = fun(Key, Value, Acc) ->
        maps:merge(Acc, sanitize_z_media_arg(Key, Value))
    end,
    z_json:encode(maps:fold(MergeArg, #{}, Args)).

%% @doc Sanitize a single z-media argument, returns a map with the sanitized
%% key and value. Known keys are restricted to their accepted values; for unknown
%% keys the key is converted with z_string:to_name/1 and only binary, integer and
%% boolean values are accepted. Any "-->" in texts is replaced with an arrow so
%% that it cannot terminate the HTML comment holding the tag. There is no clause
%% for other values, they crash with a function_clause error.
sanitize_z_media_arg(<<"id">> = K, Id) when is_binary(Id) ->
    #{K => z_string:to_name(Id)};
sanitize_z_media_arg(<<"id">> = K, Id) when is_integer(Id) ->
    #{K => Id};
sanitize_z_media_arg(<<"size">> = K, Size)
    when Size =:= <<"large">>; Size =:= <<"small">>; Size =:= <<"middle">> ->
    #{K => Size};
sanitize_z_media_arg(<<"size">> = K, _OtherSize) ->
    #{K => <<"medium">>};
sanitize_z_media_arg(<<"align">> = K, Align)
    when Align =:= <<"left">>; Align =:= <<"right">> ->
    #{K => Align};
sanitize_z_media_arg(<<"align">> = K, _OtherAlign) ->
    #{K => <<"block">>};
sanitize_z_media_arg(<<"crop">> = K, Crop) ->
    #{K => z_convert:to_bool(Crop)};
sanitize_z_media_arg(<<"link">> = K, Link) ->
    #{K => z_convert:to_bool(Link)};
sanitize_z_media_arg(<<"link_url">> = K, LinkUrl) ->
    Trimmed = z_string:trim(LinkUrl),
    #{K => z_html:sanitize_uri(Trimmed)};
sanitize_z_media_arg(<<"caption">> = K, Caption) ->
    #{K => binary:replace(Caption, <<"-->">>, <<"→"/utf8>>, [global])};
sanitize_z_media_arg(Key, Value) when is_binary(Value) ->
    Name = z_string:to_name(Key),
    #{Name => binary:replace(Value, <<"-->">>, <<"→"/utf8>>, [global])};
sanitize_z_media_arg(Key, Value) when is_integer(Value); is_boolean(Value) ->
    Name = z_string:to_name(Key),
    #{Name => Value}.

%% @doc Sanitize a script element. The element is only kept if its src passes
%% to_allowed/2; the src is then replaced with the returned url and the inner
%% content is emptied. Otherwise the element is dropped and a notice is logged.
sanitize_script(Props, Context) ->
    Src = proplists:get_value(<<"src">>, Props),
    case to_allowed(Src, Context) of
        false ->
            ?LOG_NOTICE(#{
                text => <<"Dropped script with url">>,
                in => zotonic_core,
                url => Src
            }),
            <<>>;
        {ok, Url} ->
            OtherProps = proplists:delete(<<"src">>, Props),
            {<<"script">>, [ {<<"src">>, Url} | OtherProps ], []}
    end.

%% @doc Sanitize an iframe element. The element is only kept if its src passes
%% to_allowed/2. The src is replaced with the returned url and the sandbox
%% attribute is always overwritten with the value of default_sandbox_attr/1.
%% Otherwise the element is dropped and a notice is logged.
sanitize_iframe(Props, Context) ->
    Src = proplists:get_value(<<"src">>, Props),
    case to_allowed(Src, Context) of
        false ->
            ?LOG_NOTICE(#{
                text => <<"Dropped iframe url">>,
                in => zotonic_core,
                url => Src
            }),
            <<>>;
        {ok, Url} ->
            Sandbox = default_sandbox_attr(Context),
            WithoutSandbox = proplists:delete(<<"sandbox">>, Props),
            OtherProps = proplists:delete(<<"src">>, WithoutSandbox),
            Attrs = [
                {<<"src">>, Url},
                {<<"sandbox">>, Sandbox}
                | OtherProps
            ],
            {<<"iframe">>, Attrs, []}
    end.

%% @doc Sanitize an object element (without inner content). A YouTube url in the
%% data attribute is converted to an iframe. Any other url must pass
%% to_allowed/2, the object is then rewritten to an embed element with the
%% returned url as src. Otherwise the element is dropped and a notice is logged.
sanitize_object(Props, Context) ->
    Data = proplists:get_value(<<"data">>, Props),
    case maybe_youtube(Data, Props, Context) of
        false ->
            case to_allowed(Data, Context) of
                false ->
                    ?LOG_NOTICE(#{
                        text => <<"Dropped object url">>,
                        in => zotonic_core,
                        url => Data
                    }),
                    <<>>;
                {ok, Url} ->
                    OtherProps = proplists:delete(<<"data">>, Props),
                    {<<"embed">>, [ {<<"src">>, Url} | OtherProps ], []}
            end;
        {ok, YoutubeIframe} ->
            YoutubeIframe
    end.

%% @doc Sanitize an embed element. A YouTube url in the src attribute is
%% converted to an iframe. Any other url must pass to_allowed/2, the src is then
%% replaced with the returned url. Otherwise the element is dropped and a notice
%% is logged.
sanitize_embed(Props, Context) ->
    Src = proplists:get_value(<<"src">>, Props),
    case maybe_youtube(Src, Props, Context) of
        false ->
            case to_allowed(Src, Context) of
                false ->
                    ?LOG_NOTICE(#{
                        text => <<"Dropped embed url">>,
                        in => zotonic_core,
                        url => Src
                    }),
                    <<>>;
                {ok, Url} ->
                    OtherProps = proplists:delete(<<"src">>, Props),
                    {<<"embed">>, [ {<<"src">>, Url} | OtherProps ], []}
            end;
        {ok, YoutubeIframe} ->
            YoutubeIframe
    end.

%% @doc Check if an url refers to a YouTube video. Returns {ok, Iframe} with a
%% replacement iframe element, or false for other urls and for a missing url.
%% Only the part after the first "//" is inspected, the protocol is ignored.
maybe_youtube(undefined, _Props, _Context) ->
    false;
maybe_youtube(Url, Props, Context) ->
    case binary:split(Url, <<"//">>) of
        [_Proto, HostPath] -> maybe_youtube_1(HostPath, Props, Context);
        _NoHostPath -> false
    end.

%% @doc Recognize YouTube host/path combinations and build the iframe for them.
%% For "www.youtube.com/v/" urls the video code is everything up to the first
%% "?" or "&", it is appended to the https embed url. The
%% "www.youtube.com/embed/" urls are used as-is, with https as protocol.
maybe_youtube_1(<<"www.youtube.com/v/", Rest/binary>>, Props, Context) ->
    [BeforeQuery | _] = binary:split(Rest, <<"?">>),
    [VideoCode | _] = binary:split(BeforeQuery, <<"&">>),
    EmbedUrl = <<"https://www.youtube.com/embed/", VideoCode/binary>>,
    make_iframe(EmbedUrl, Props, Context);
maybe_youtube_1(<<"www.youtube.com/embed/", _Rest/binary>> = HostPath, Props, Context) ->
    make_iframe(<<"https://", HostPath/binary>>, Props, Context);
maybe_youtube_1(_HostPath, _Props, _Context) ->
    false.

%% @doc Build an {ok, Iframe} result for the given url. The width, height and
%% allowfullscreen attributes are copied from the original properties, with
%% defaults of 480, 360 and 1. The sandbox attribute comes from
%% default_sandbox_attr/1.
make_iframe(Url, Props, Context) ->
    Width = proplists:get_value(<<"width">>, Props, <<"480">>),
    Height = proplists:get_value(<<"height">>, Props, <<"360">>),
    AllowFullscreen = proplists:get_value(<<"allowfullscreen">>, Props, <<"1">>),
    Sandbox = default_sandbox_attr(Context),
    Attrs = [
        {<<"width">>, Width},
        {<<"height">>, Height},
        {<<"allowfullscreen">>, AllowFullscreen},
        {<<"frameborder">>, <<"0">>},
        {<<"sandbox">>, Sandbox},
        {<<"src">>, Url}
    ],
    {ok, {<<"iframe">>, Attrs, []}}.

%% @doc Check if an url may be embedded. Returns {ok, Url} with the url to use,
%% or false if the url is missing or not allowed. The url is split on the first
%% "//" into a protocol and a location, which are checked by to_allowlist_1/2.
to_allowed(undefined, _Context) ->
    false;
to_allowed(Url, Context) ->
    ProtoAndLoc = binary:split(Url, <<"//">>),
    to_allowlist_1(ProtoAndLoc, Context).

%% @doc Check a split url against the allowlist. Only protocol-relative, http and
%% https urls are considered. An allowed location is always returned as a https
%% url; everything else gives false.
to_allowlist_1([Proto, Loc], Context)
    when Proto =:= <<>> orelse Proto =:= <<"http:">> orelse Proto =:= <<"https:">> ->
    case allowlist(Loc, Context) of
        false ->
            false;
        {ok, Loc1} ->
            % Always use https - http is now defunct
            {ok, <<"https://", Loc1/binary>>}
    end;
to_allowlist_1(_Other, _Context) ->
    false.


%% @doc Check a host/path against the allowlist of the site. The first observer
%% of the #sanitize_embed_url{} notification decides: false rejects the url, a
%% binary is used as the (replacement) host/path. Without an answer the built-in
%% allowlist/1 is used.
allowlist(HostPath, Context) ->
    Notification = #sanitize_embed_url{hostpath=HostPath},
    case z_notifier:first(Notification, Context) of
        false ->
            false;
        undefined ->
            allowlist(HostPath);
        Replacement when is_binary(Replacement) ->
            {ok, Replacement}
    end.


%% @doc Built-in allowlist of host/path combinations that may be embedded.
%% Returns {ok, HostPath} for an allowed host/path and false otherwise. The short
%% YouTube hosts are rewritten to www.youtube.com. All other entries are either an
%% exact host/path, a host/path prefix, or one of the regular expressions of
%% allowlist_res/0.
allowlist(<<"youtu.be/", Rest/binary>>) ->
    {ok, <<"www.youtube.com/", Rest/binary>>};
allowlist(<<"youtube.com/", Rest/binary>>) ->
    {ok, <<"www.youtube.com/", Rest/binary>>};
allowlist(Url) ->
    % Entries that must match the complete host/path.
    Exact = [
        <<"instagram.com/embed.js">>,
        <<"www.instagram.com/embed.js">>
    ],
    % Entries that must match the start of the host/path.
    Prefixes = [
        <<"www.youtube.com/">>,
        <<"player.vimeo.com/">>,
        <<"vimeo.com/">>,
        <<"www.slideshare.net/">>,
        <<"embed.spotify.com/">>,
        <<"api.soundcloud.com/">>,
        <<"w.soundcloud.com/">>,
        <<"cdn.knightlab.com/">>,
        <<"maps.google.com/">>,
        <<"www.google.com/maps/">>,
        <<"video.google.com/">>,
        <<"spreadsheets.google.com/">>,
        <<"docs.google.com/viewer?">>,
        <<"vine.co/">>,
        <<"platform.instagram.com/">>,
        <<"www.hulu.com/">>,
        <<"www.metacafe.com/fplayer/">>,
        <<"www.flickr.com/">>,
        <<"flickrit.com/slideshowholder.php?">>,
        <<"flv.video.yandex.ru/">>,
        <<"www.tumblr.com/">>,
        <<"assets.tumblr.com/">>,
        <<"static.issuu.com/">>,
        <<"e.issuu.com/">>,
        <<"cdn.embedly.com/">>,
        <<"vk.com/video_ext">>,
        <<"platform.twitter.com/">>,
        <<"prezi.com/v/">>,
        <<"prezi.com/embed/">>
    ],
    StartsWith = fun(Prefix) ->
        Size = byte_size(Prefix),
        case Url of
            <<Prefix:Size/binary, _/binary>> -> true;
            _ -> false
        end
    end,
    MatchesRe = fun(Re) -> re:run(Url, Re) =/= nomatch end,
    IsAllowed = lists:member(Url, Exact)
        orelse lists:any(StartsWith, Prefixes)
        orelse lists:any(MatchesRe, allowlist_res()),
    case IsAllowed of
        true -> {ok, Url};
        false -> false
    end.

%% @doc Regular expressions for allowed host/path combinations that cannot be
%% expressed as a fixed prefix. Every expression is anchored at the start of the
%% host/path: an unanchored expression also matches when the pattern appears
%% somewhere in the path or query string of an arbitrary host, which would allow
%% embedding from any domain. Literal dots are escaped so that they only match
%% a dot.
allowlist_res() ->
    [
        <<"^[a-z0-9\\-]+\\.tumblr\\.com/post/[0-9]+/audio_player_iframe/.*">>,
        <<"^cdn\\.embedly\\.com/widgets/media\\.html\\?src=http%3A%2F%2F[a-z0-9-]+\\.ak\\.instagram\\.com%2F">>
    ].