# lib/html_date/json_ld.ex

defmodule HTMLDate.JSONLD do
  @moduledoc """
  Parses publication dates from HTML `<script type="application/ld+json">` tags.

  Every decoded JSON-LD object is searched for the date attributes listed in
  `@parsable_attributes`, both at the top level of the object and inside the
  entries of its `"@graph"` list whose `"@type"` is one of `@allowed_types`.

  Results are `{attribute_path, date_string}` tuples. The date strings are
  returned exactly as they appear in the JSON; no date parsing happens here.
  """

  require Logger

  # An `"@type"` value must be equal to one of these entries. The nested list
  # only matches a type list with exactly these two elements in this order.
  @allowed_types ["WebPage", "Article", "NewsArticle", ["Article", "NewsArticle"]]

  # Key paths, relative to a JSON-LD object, that may hold a date string.
  @parsable_attributes [
    ["datePublished"],
    ["dateModified"],
    ["dateCreated"],
    ["mainEntity", "datePublished"],
    ["mainEntity", "dateModified"],
    ["mainEntity", "dateCreated"]
  ]

  @doc """
  Collects every date string found in the JSON-LD scripts of `html_tree`.

  Returns a flat list of `{attribute_path, date_string}` tuples. Paths found
  at the top level of a JSON-LD object look like `"datePublished"` or
  `"mainEntity.dateModified"`; paths found inside a `"@graph"` entry are
  prefixed with `"@graph.<@type>."`.
  """
  @spec parse(Floki.HTMLTree.t()) :: [{String.t(), String.t()}]
  def parse(html_tree) do
    html_tree
    |> parse_all_json_ld()
    |> Enum.reduce([], fn map, acc ->
      date_strings_from_graph =
        for article <- articles_from_graph(map) do
          parse_attributes(article, [], prefix: "@graph.#{article["@type"]}.")
        end

      # Nested lists are fine here: everything is flattened once at the end.
      [parse_attributes(map), date_strings_from_graph | acc]
    end)
    |> List.flatten()
  end

  @doc """
  Looks up each parsable attribute path in `map` and prepends a
  `{path, date_string}` tuple to `acc` for every path that holds a binary.

  Non-binary values (missing keys, numbers, nested objects, ...) are ignored.

  ## Options

    * `:prefix` - string prepended to every returned path (defaults to `""`).
  """
  @spec parse_attributes(term(), [{String.t(), String.t()}], keyword()) ::
          [{String.t(), String.t()}]
  def parse_attributes(map, acc \\ [], options \\ []) do
    prefix = options[:prefix] || ""

    Enum.reduce(@parsable_attributes, acc, fn attribute, found ->
      case try_get_in(map, attribute) do
        date_string when is_binary(date_string) ->
          [{prefix <> Enum.join(attribute, "."), date_string} | found]

        _not_a_string ->
          found
      end
    end)
  end

  @doc """
  Same as `get_in/2`, but returns `nil` instead of raising when the path runs
  into a value that cannot be traversed with the given keys.
  """
  @spec try_get_in(term(), [term()]) :: term()
  def try_get_in(map, keys) do
    get_in(map, keys)
  rescue
    # ArgumentError: the path reached a list and the next key is not an atom
    # (e.g. "mainEntity" holds a JSON array).
    # FunctionClauseError: the path reached a scalar such as a string, number
    # or boolean (e.g. "mainEntity" holds a URL), which `Access.get/3` has no
    # clause for.
    _error in [ArgumentError, FunctionClauseError] -> nil
  end

  @doc """
  Decodes every `<script type="application/ld+json">` element in `html_tree`.

  Returns the decoded JSON objects as maps. A script whose top-level JSON
  value is an array contributes each of its object elements. Scripts that are
  empty, fail to decode, or decode to any other JSON value are skipped.

  The result is in reverse document order, which `parse/1` relies on to end
  up with its results in document order.
  """
  @spec parse_all_json_ld(Floki.HTMLTree.t()) :: [map]
  def parse_all_json_ld(html_tree) do
    html_tree
    |> Floki.find("script[type=\"application/ld+json\"]")
    |> Enum.reduce([], fn script, acc ->
      case script |> script_text() |> Jason.decode() do
        {:ok, map} when is_map(map) ->
          [map | acc]

        {:ok, list} when is_list(list) ->
          # Prepend the array's objects one by one so the accumulator stays in
          # strict reverse document order.
          list |> Enum.filter(&is_map/1) |> Enum.reverse(acc)

        _scalar_or_error ->
          acc
      end
    end)
  end

  @doc """
  Returns the entries of the `"@graph"` list in `map` whose `"@type"` is one
  of the allowed article/page types.

  Returns `[]` when `map` has no `"@graph"` list. Graph entries that are not
  JSON objects are ignored.
  """
  @spec articles_from_graph(term()) :: [map()]
  def articles_from_graph(%{"@graph" => list}) when is_list(list) do
    Enum.filter(list, fn
      %{"@type" => type} -> type in @allowed_types
      # Entries without "@type", and non-object entries (strings, numbers,
      # nested arrays), can never be articles.
      _other -> false
    end)
  end

  def articles_from_graph(_), do: []

  # Concatenates the text children of a parsed HTML node. Returns "" for an
  # empty element or an unexpected node shape, which then fails JSON decoding
  # and is skipped by the caller.
  defp script_text({_tag, _attrs, children}) when is_list(children) do
    children
    |> Enum.filter(&is_binary/1)
    |> Enum.join()
  end

  defp script_text(_node), do: ""
end