lib/readability/candidate/cleaner.ex

defmodule Readability.Candidate.Cleaner do
  @moduledoc """
  Cleans an HTML tree before candidates are selected.

  It rewrites misused `<div>` tags as `<p>` and removes subtrees that are
  unlikely to be article content.
  """

  alias Readability.Helper

  @type html_tree :: binary | tuple | list

  # Attribute names whose values take part in the "unlikely candidate" check.
  @idclass_attrs ["id", "class"]

  @doc """
  Turns every `<div>` that contains no block-level element into a `<p>`.

  The tree is walked recursively:

    * a binary (text node) is returned unchanged;
    * a list is transformed element by element;
    * a `{tag, attrs, children}` element keeps its attributes, has its
      children transformed, and has its tag replaced by `"p"` when it is a
      `"div"` whose rendered children do not match
      `Readability.regexes(:div_to_p_elements)`;
    * any other tuple (for example a comment or doctype node) is returned
      unchanged instead of raising a `FunctionClauseError`.
  """
  @spec transform_misused_div_to_p(html_tree) :: html_tree
  def transform_misused_div_to_p(content) when is_binary(content), do: content
  def transform_misused_div_to_p([]), do: []

  def transform_misused_div_to_p([h | t]) do
    [transform_misused_div_to_p(h) | transform_misused_div_to_p(t)]
  end

  # Element nodes always carry a binary tag name; the guard keeps other
  # 3-tuples (e.g. processing instructions) from being mistaken for elements.
  def transform_misused_div_to_p({tag, attrs, inner_tree}) when is_binary(tag) do
    tag = if misused_divs?(tag, inner_tree), do: "p", else: tag
    {tag, attrs, transform_misused_div_to_p(inner_tree)}
  end

  # Non-element nodes have nothing to transform, so pass them through as-is.
  def transform_misused_div_to_p(node) when is_tuple(node), do: node

  @doc """
  Removes the subtrees that are unlikely to hold article content.

  Delegates to `Readability.Helper.remove_tag/2`, passing a predicate that
  flags an element when its tag name plus `id`/`class` values match
  `Readability.regexes(:unlikely_candidate)` and do not match
  `Readability.regexes(:ok_maybe_its_a_candidate)`. The `<html>` element is
  never flagged.
  """
  @spec remove_unlikely_tree(html_tree) :: html_tree
  def remove_unlikely_tree(html_tree) do
    Helper.remove_tag(html_tree, &unlikely_tree?/1)
  end

  # A div is "misused" when the raw HTML of its children contains none of the
  # elements described by the :div_to_p_elements regex.
  defp misused_divs?("div", inner_tree) do
    !(Floki.raw_html(inner_tree) =~ Readability.regexes(:div_to_p_elements))
  end

  defp misused_divs?(_, _), do: false

  # Decides whether an element looks like boilerplate, judging by its tag name
  # concatenated with the values of its id and class attributes.
  defp unlikely_tree?({tag, attrs, _inner_tree}) do
    idclass_str =
      attrs
      |> Enum.filter(&idclass_attr?/1)
      |> Enum.map_join("", &elem(&1, 1))

    str = tag <> idclass_str

    # The cheap tag comparison runs first so <html> never pays for the regexes.
    tag != "html" and
      str =~ Readability.regexes(:unlikely_candidate) and
      not (str =~ Readability.regexes(:ok_maybe_its_a_candidate))
  end

  # Exact, case-insensitive name match: a substring test would also pick up
  # unrelated attributes such as "width" or "hidden", which contain "id".
  defp idclass_attr?(attr), do: String.downcase(elem(attr, 0)) in @idclass_attrs
end