defmodule StripJs do
@moduledoc ~S"""
StripJs is an Elixir module for stripping executable JavaScript from
blocks of HTML and CSS, based on the <a target=_blank
href="https://github.com/philss/floki">Floki</a> parsing library.
It handles:
* `<script>...</script>` and `<script src="..."></script>` tags
* Event handler attributes such as `onclick="..."`
* `javascript:...` URLs in HTML and CSS
* CSS `expression(...)` directives
* HTML entity attacks (like `<script>`)
StripJs is production ready, and has sanitized over 1.5 billion payloads
at Appcues.
## Installation
Add `strip_js` to your application's `mix.exs`:
def application do
[applications: [:strip_js]]
end
def deps do
[{:strip_js, "~> #{StripJs.Mixfile.project()[:version]}"}]
end
## Usage
`clean_html/2` removes all JS vectors from an HTML string:
iex> html = "<button onclick=\"alert('pwnt')\">Hi!</button>"
iex> StripJs.clean_html(html)
"<button>Hi!</button>"
`clean_css/2` removes all JS vectors from a CSS string:
iex> css = "body { background-image: url('javascript:alert()'); }"
iex> StripJs.clean_css(css)
"body { background-image: url('removed_by_strip_js:alert()'); }"
StripJs relies on the [Floki](https://github.com/philss/floki)
HTML parser library, which is built using
[Mochiweb](https://github.com/mochi/mochiweb) by default.
StripJs provides a `clean_html_tree/1` function to strip JS from
`Floki.parse_fragment/1`- and `:mochiweb_html.parse/1`- style HTML parse trees.
## Security
StripJs blocks every JS injection vector known to the authors. It has
survived four years in production, multiple professional penetration
tests, and over a billion invocations with no known security issues.
If you believe there are JS injection methods not covered by this library,
please submit an issue with a test case!
## Bugs and Limitations
The brokenness of invalid HTML may be amplified by `clean_html/2`.
In uncommon cases, innocent CSS which very closely resembles
JS-injection techniques may be mangled by `clean_css/2`.
## Authorship and License
Copyright 2017-2021, Appcues, Inc.
Project homepage:
[StripJs](https://github.com/appcues/strip_js)
StripJs is released under the
[MIT License](https://opensource.org/licenses/MIT).
"""
require Logger
# reserved for future use
@type opts :: Keyword.t()
@type html_tag :: String.t()
@type html_attr :: {String.t(), String.t()}
@type html_node :: String.t() | {html_tag, [html_attr], [html_node]}
@type html_tree :: html_node | [html_node]
@doc ~S"""
Removes JS vectors from the given HTML string.
All non-tag text and tag attribute values will be HTML-escaped, except
for the contents of `<style>` tags, which are passed through `clean_css/2`.
Even if the input HTML contained no JS, the output of `clean_html/2`
is not guaranteed to match its input byte-for-byte.
Examples:
iex> StripJs.clean_html("<button onclick=\"alert('phear');\">Click here</button>")
"<button>Click here</button>"
iex> StripJs.clean_html("<script> console.log('oh heck'); </script>")
""
iex> StripJs.clean_html("<script> console.log('oh heck'); </script>")
"<script> console.log('oh heck'); </script>" ## HTML entity attack didn't work
"""
@spec clean_html(String.t(), opts) :: String.t()
def clean_html(html, opts \\ []) when is_binary(html) do
html
|> parse_html(opts)
|> clean_html_tree(opts)
|> to_html
end
@doc false
def strip_js(html, opts \\ []) do
IO.warn("StripJs.strip_js is deprecated; use StripJs.clean_html instead")
clean_html(html, opts)
end
@doc ~S"""
Removes JS vectors from the given
[Floki](https://github.com/philss/floki)/
[Mochiweb](https://github.com/mochi/mochiweb)-style HTML tree
(`t:html_tree/0`).
All attribute values and tag bodies except embedded stylesheets
will be HTML-escaped.
"""
@spec clean_html_tree(html_tree, opts) :: html_tree
def clean_html_tree(trees, opts \\ [])
def clean_html_tree(trees, opts) when is_list(trees) do
Enum.map(trees, &clean_html_tree(&1, opts))
end
def clean_html_tree({:comment, comment}, _opts) do
{:comment, comment}
end
def clean_html_tree({tag, attrs, children}, _opts) do
case String.downcase(tag) do
"script" ->
# remove scripts entirely
""
"style" ->
# don't HTML-escape!
cleaned_css = children |> to_html |> clean_css
{tag, clean_attrs(attrs), [cleaned_css]}
_ ->
cleaned_children = Enum.map(children, &clean_html_tree(&1))
{tag, clean_attrs(attrs), cleaned_children}
end
end
def clean_html_tree(string, _opts) when is_binary(string) do
string |> html_escape
end
@doc false
@spec strip_js_from_tree(html_tree, opts) :: html_tree
def strip_js_from_tree(tree, opts \\ []) do
IO.warn(
"StripJs.strip_js_from_tree is deprecated; use StripJs.clean_html_tree instead"
)
clean_html_tree(tree, opts)
end
@doc ~S"""
Removes JS vectors from the given CSS string; i.e., the contents of a
stylesheet or `<style>` tag.
Does not HTML-escape its output. Care is taken to maintain valid CSS
syntax.
Example:
iex> css = "tt { background-color: expression('alert()'); }"
iex> StripJs.clean_css(css)
"tt { background-color: removed_by_strip_js('alert()'); }"
Warning: this step is performed using regexes, not a parser, so it is
possible for innocent CSS containing either of the strings `javascript:`
or `expression(` to be mangled.
"""
@spec clean_css(String.t(), opts) :: String.t()
def clean_css(css, _opts \\ []) when is_binary(css) do
css
|> String.replace(~r/javascript \s* :/xi, "removed_by_strip_js:")
|> String.replace(~r/expression \s* \(/xi, "removed_by_strip_js(")
end
## Removes JS vectors from the given HTML attributes.
@spec clean_attrs([{String.t(), String.t()}]) :: [{String.t(), String.t()}]
defp clean_attrs(attrs) do
attrs
|> Enum.reduce([], &clean_attr/2)
|> Enum.reverse()
end
@attrs_with_urls ["href", "src", "background", "dynsrc", "lowsrc"]
@spec clean_attr({String.t(), String.t()}, [{String.t(), String.t()}]) :: [
{String.t(), String.t()}
]
defp clean_attr({attr, value}, acc) do
attr = String.downcase(attr)
cond do
attr in @attrs_with_urls &&
String.match?(value, ~r/^ \s* javascript \s* :/xi) ->
# retain the attribute so we emit valid HTML
[{attr, "#"} | acc]
String.starts_with?(attr, "on") ->
# remove on* handlers entirely
acc
:else ->
[{attr, value} | acc]
end
end
## Performs good-enough HTML escaping to prevent HTML entity attacks.
@spec html_escape(String.t()) :: String.t()
defp html_escape(html) do
html
|> String.replace("&", "&")
|> String.replace("<", "<")
|> String.replace(">", ">")
end
## Parses the given HTML into an `t:html_tree/0` structure.
@spec parse_html(String.t(), opts) :: html_tree
defp parse_html(html, _opts), do: Floki.parse_fragment!(html)
## Converts HTML tree to string.
@spec to_html(html_tree) :: String.t()
defp to_html(tree) when is_binary(tree), do: tree
defp to_html(tree), do: tree |> Floki.raw_html(encode: false)
end