lib/readability/sanitizer.ex

Select File:
lib/readability/sanitizer.ex

defmodule Readability.Sanitizer do
  @moduledoc """
  Clean an element of all tags of type "tag" if they look fishy.
  "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
  """

  alias Readability.Helper
  alias Readability.Candidate
  alias Readability.Candidate.Scoring

  @type html_tree :: tuple | list

  @doc """
  Sanitizes article html tree
  """
  @spec sanitize(html_tree, [Candidate.t()], list) :: html_tree
  def sanitize(html_tree, candidates, opts \\ []) do
    html_tree =
      html_tree
      |> Helper.remove_tag(&clean_headline_tag?(&1))
      |> Helper.remove_tag(&clean_unlikely_tag?(&1))
      |> Helper.remove_tag(&clean_empty_p?(&1))

    if opts[:clean_conditionally] do
      html_tree |> Helper.remove_tag(conditionally_cleaing_fn(candidates))
    else
      html_tree
    end
  end

  defp conditionally_cleaing_fn(candidates) do
    fn {tag, attrs, _} = tree ->
      if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
        weight = Scoring.class_weight(attrs)

        same_tree =
          candidates
          |> Enum.find(%Candidate{}, &(&1.html_tree == tree))

        list? = tag == "ul"

        cond do
          weight + same_tree.score < 0 ->
            true

          length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
            # If there are not very many commas, and the number of
            # non-paragraph elements is more than paragraphs or other
            # ominous signs, remove the element.
            p_len = tree |> Floki.find("p") |> length
            img_len = tree |> Floki.find("img") |> length
            li_len = tree |> Floki.find("li") |> length
            input_len = tree |> Floki.find("input") |> length

            embed_len =
              tree
              |> Floki.find("embed")
              |> Enum.reject(&(&1 =~ Readability.regexes(:video)))
              |> length

            link_density = Scoring.calc_link_density(tree)
            conent_len = Helper.text_length(tree)

            # too many image
            # more <li>s than <p>s
            # less than 3x <p>s than <input>s
            # too short a content length without a single image
            # too many links for its weight (#{weight})
            # too many links for its weight (#{weight})
            # <embed>s with too short a content length, or too many <embed>s
            img_len > p_len || (!list? && li_len > p_len) || input_len > p_len / 3 ||
              (!list? && conent_len < Readability.regexes(:min_text_length) && img_len != 1) ||
              (weight < 25 && link_density > 0.2) || (weight >= 25 && link_density > 0.5) ||
              ((embed_len == 1 && conent_len < 75) || embed_len > 1)

          true ->
            false
        end
      end
    end
  end

  defp clean_headline_tag?({tag, attrs, _} = html_tree) do
    tag =~ ~r/^h\d{1}$/ &&
      (Scoring.class_weight(attrs) < 0 || Scoring.calc_link_density(html_tree) > 0.33)
  end

  defp clean_unlikely_tag?({tag, attrs, _}) do
    attrs_str = attrs |> Enum.map(&elem(&1, 1)) |> Enum.join("")
    tag =~ ~r/form|object|iframe|embed/ && !(attrs_str =~ Readability.regexes(:video))
  end

  defp clean_empty_p?({tag, _, _} = html_tree) do
    tag == "p" && Helper.text_length(html_tree) == 0
  end
end