lib/rdf/canonicalization/canonicalization.ex

defmodule RDF.Canonicalization do
  @moduledoc """
  An implementation of the standard RDF Dataset Canonicalization Algorithm.

  See <https://www.w3.org/TR/rdf-canon/>.
  """

  use RDF
  alias RDF.Canonicalization.{IdentifierIssuer, State}
  alias RDF.{BlankNode, Dataset, Quad, Statement, Utils}

  @doc """
  Canonicalizes the blank nodes of a graph or dataset according to the RDF Dataset Canonicalization spec.

  This function always returns an `RDF.Dataset`. If you want to canonicalize an
  `RDF.Graph` and get an `RDF.Graph` back, use `RDF.Graph.canonicalize/1`.

  ## Example

      iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), ~B<foo>}])
      ...> |> RDF.Canonicalization.canonicalize()
      RDF.Dataset.new([{~B<c14n0>, EX.p(), ~B<c14n1>}, {~B<c14n1>, EX.p(), ~B<c14n0>}])

  """
  @spec canonicalize(RDF.Graph.t() | RDF.Dataset.t()) :: RDF.Dataset.t()
  def canonicalize(input) do
    urdna2015(input)
  end

  @doc """
  Checks whether two graphs or datasets are equal, regardless of the concrete names of the blank nodes they contain.

  ## Examples

      iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
      ...> |> RDF.Canonicalization.isomorphic?(
      ...>      RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b2>, EX.p(), 42}]))
      true

      iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
      ...> |> RDF.Canonicalization.isomorphic?(
      ...>      RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
      false
  """
  @spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t()) :: boolean
  def isomorphic?(a, b) do
    a |> canonicalize() |> Dataset.equal?(canonicalize(b))
  end

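  # Implements the URDNA2015 canonicalization in three phases: hash every blank node
  # over its first-degree quads, disambiguate any remaining shared hashes via the
  # n-degree hash, and finally rewrite the input data with the canonical identifiers.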
  defp urdna2015(input) do
    input
    |> State.new()
    |> create_canonical_identifiers_for_single_node_hashes()
    |> create_canonical_identifiers_for_multiple_node_hashes()
    |> apply_canonicalization(input)
  end

  defp create_canonical_identifiers_for_single_node_hashes(state) do
    # 3) Calculate the first-degree hash for every blank node
    state =
      Enum.reduce(state.bnode_to_quads, state, fn {n, _}, state ->
        State.add_bnode_hash(state, n, hash_first_degree_quads(state, n))
      end)

    # 4) Create canonical replacements for hashes mapping to a single node
    state.hash_to_bnodes
    # TODO: "Sort in Unicode code point order"
    |> Enum.sort()
    |> Enum.reduce(state, fn {hash, identifier_list}, state ->
      case MapSet.to_list(identifier_list) do
        [identifier] ->
          state
          |> State.issue_canonical_identifier(identifier)
          |> State.delete_bnode_hash(hash)

        [] ->
          raise "unexpected empty identifier list"

        _ ->
          state
      end
    end)
  end

  # 5) Create canonical identifiers for blank nodes with a shared first-degree hash
  defp create_canonical_identifiers_for_multiple_node_hashes(state) do
    state.hash_to_bnodes
    |> Enum.sort()
    |> Enum.reduce(state, fn {_hash, identifier_list}, state ->
      # 5.1-2) Build a hash path list for all bnodes without a canonical identifier yet,
      # issuing each a temporary identifier from which the canonical replacements are created
      identifier_list
      |> Enum.reduce([], fn identifier, hash_path_list ->
        if IdentifierIssuer.issued?(state.canonical_issuer, identifier) do
          hash_path_list
        else
          {_issued_identifier, temporary_issuer} =
            "b"
            |> IdentifierIssuer.new()
            |> IdentifierIssuer.issue_identifier(identifier)

          [
            hash_n_degree_quads(state, identifier, temporary_issuer)
            | hash_path_list
          ]
        end
      end)
      |> Enum.sort()
      # 5.3) Create canonical replacements for nodes
      |> Enum.reduce(state, fn {_hash, issuer}, state ->
        issuer
        |> IdentifierIssuer.issued_identifiers()
        |> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))
      end)
    end)
  end

  # 6) Replace all blank nodes in the input data with their canonical identifiers
  defp apply_canonicalization(state, data) do
    Enum.reduce(data, Dataset.new(), fn statement, canonicalized_data ->
      Dataset.add(
        canonicalized_data,
        if Statement.has_bnode?(statement) do
          Statement.map(statement, fn
            {_, %BlankNode{} = bnode} ->
              state.canonical_issuer
              |> IdentifierIssuer.identifier(bnode)
              |> BlankNode.new()

            {_, node} ->
              node
          end)
        else
          statement
        end
      )
    end)
  end

  # see https://www.w3.org/TR/rdf-canon/#hash-1d-quads
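  # Serializes every quad mentioning `ref_bnode_id` to canonical N-Quads, replacing the
  # reference node with `_:a` and every other blank node with `_:z`, then hashes the
  # sorted, concatenated result. Illustrative sketch (the predicate IRI is made up):
  # hashing ~B<foo> in {~B<foo>, EX.p(), ~B<bar>} hashes the string
  # "_:a <http://example.com/p> _:z .\n", while hashing ~B<bar> in the same quad
  # hashes "_:z <http://example.com/p> _:a .\n".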
  defp hash_first_degree_quads(state, ref_bnode_id) do
    state.bnode_to_quads
    |> Map.get(ref_bnode_id, [])
    |> Enum.map(fn quad ->
      quad
      |> Quad.new()
      |> Statement.map(fn
        {_, ^ref_bnode_id} -> ~B<a>
        {_, %BlankNode{}} -> ~B<z>
        {_, node} -> node
      end)
      |> RDF.dataset()
      |> NQuads.write_string!()
    end)
    # TODO: "Sort nquads in Unicode code point order"
    |> Enum.sort()
    |> Enum.join()
    |> hash()

    # |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
  end

  # see https://www.w3.org/TR/rdf-canon/#hash-related-blank-node
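  # Builds the hash input as the position code ("s", "o" or "g"), followed by the
  # predicate IRI in angle brackets (omitted for the graph position), followed by either
  # "_:" plus an already issued identifier of the related bnode or, failing that, its
  # first-degree hash. Illustrative input (IRI and label made up): "s<http://example.com/p>_:c14n0".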
  defp hash_related_bnode(state, related, quad, issuer, position) do
    input = to_string(position)

    input =
      if position != :g do
        "#{input}<#{Statement.predicate(quad)}>"
      else
        input
      end <>
        if identifier =
             IdentifierIssuer.identifier(state.canonical_issuer, related) ||
               IdentifierIssuer.identifier(issuer, related) do
          "_:" <> identifier
        else
          hash_first_degree_quads(state, related)
        end

    hash(input)
    # |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
  end

  # see https://www.w3.org/TR/rdf-canon/#hash-nd-quads
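  # Takes a blank node `identifier` and an issuer of temporary identifiers and returns a
  # `{hash, issuer}` tuple. It explores every permutation of the related blank nodes to
  # find the lexicographically smallest path, recursing into related bnodes that have no
  # issued identifier yet.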
  def hash_n_degree_quads(state, identifier, issuer) do
    # IO.inspect(identifier, label: "ndeg: identifier")

    # 1-3)
    # hash_to_related_bnodes is now called Hn in the current version of the spec
    hash_to_related_bnodes =
      Enum.reduce(state.bnode_to_quads[identifier], %{}, fn quad, map ->
        Map.merge(
          map,
          hash_related_quad(state, identifier, quad, issuer),
          fn _, terms, new -> terms ++ new end
        )
      end)

    # |> IO.inspect(label: "ndeg: hash_to_related_bnodes")

    {data_to_hash, issuer} =
      hash_to_related_bnodes
      # TODO: "Sort in Unicode code point order"
      |> Enum.sort()
      |> Enum.reduce({"", issuer}, fn
        {related_hash, bnode_list}, {data_to_hash, issuer} ->
          # 5.1)
          data_to_hash = data_to_hash <> related_hash
          chosen_path = ""
          chosen_issuer = nil

          # 5.2-4)
          {chosen_path, chosen_issuer} =
            bnode_list
            |> Utils.permutations()
            |> Enum.reduce({chosen_path, chosen_issuer}, fn
              permutation, {chosen_path, chosen_issuer} ->
                # IO.inspect(permutation, label: "ndeg: perm")

                issuer_copy = IdentifierIssuer.copy(issuer)
                chosen_path_length = String.length(chosen_path)

                # 5.4.4)
                {path, recursion_list, issuer_copy} =
                  Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
                    related, {path, recursion_list, issuer_copy} ->
                      {path, recursion_list, issuer_copy} =
                        if issued_identifier =
                             IdentifierIssuer.identifier(state.canonical_issuer, related) do
                          {path <> "_:" <> issued_identifier, recursion_list, issuer_copy}
                        else
                          if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do
                            {path <> "_:" <> issued_identifier, recursion_list, issuer_copy}
                          else
                            {issued_identifier, issuer_copy} =
                              IdentifierIssuer.issue_identifier(issuer_copy, related)

                            {
                              path <> "_:" <> issued_identifier,
                              [related | recursion_list],
                              issuer_copy
                            }
                          end
                        end

                      # TODO: this comparison should use Unicode code point order
                      if chosen_path_length != 0 and
                           String.length(path) >= chosen_path_length and
                           path > chosen_path do
                        {:halt, {path, recursion_list, issuer_copy}}
                      else
                        {:cont, {path, recursion_list, issuer_copy}}
                      end
                  end)

                # IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")

                # 5.4.5)
                {issuer_copy, path} =
                  recursion_list
                  |> Enum.reverse()
                  |> Enum.reduce_while({issuer_copy, path}, fn related, {issuer_copy, path} ->
                    # Note: The following steps seem to be the only steps in the whole
                    # algorithm that actually rely on global state.

                    # 5.4.5.1)
                    {result_hash, result_issuer} =
                      hash_n_degree_quads(state, related, issuer_copy)

                    # This step was added to circumvent the need for global state.
                    # It's unclear whether it is actually required, since all tests
                    # of the test suite pass without it.
                    # see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
                    result_issuer =
                      if result_issuer.id == issuer_copy.id do
                        {_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)
                        issuer
                      else
                        result_issuer
                      end

                    # 5.4.5.2)
                    {issued_identifier, _issuer_copy} =
                      IdentifierIssuer.issue_identifier(issuer_copy, related)

                    path = path <> "_:" <> issued_identifier <> "<#{result_hash}>"

                    # TODO: this comparison should use Unicode code point order
                    if chosen_path_length != 0 and
                         String.length(path) >= chosen_path_length and
                         path > chosen_path do
                      {:halt, {result_issuer, path}}
                    else
                      {:cont, {result_issuer, path}}
                    end
                  end)

                # TODO: this comparison should use Unicode code point order
                if chosen_path_length == 0 or path < chosen_path do
                  {path, issuer_copy}
                else
                  {chosen_path, chosen_issuer}
                end
            end)

          # 5.5)
          {data_to_hash <> chosen_path, chosen_issuer}
      end)

    # IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

    {hash(data_to_hash), issuer}
  end

  # 4.8.2.3.1) Group the blank nodes related to `identifier` via this quad by their related-bnode hash
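  # Returns a map from related-bnode hash to the list of related blank nodes,
  # e.g. (illustrative shape) %{"<hash>" => [~B<bar>]} for a quad relating `identifier` to ~B<bar>.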
  defp hash_related_quad(state, identifier, quad, issuer) do
    [
      s: Statement.subject(quad),
      o: Statement.object(quad),
      g: Statement.graph_name(quad)
    ]
    |> Enum.reduce(%{}, fn
      {_, ^identifier}, map ->
        map

      {pos, %BlankNode{} = term}, map ->
        hash = hash_related_bnode(state, term, quad, issuer, pos)

        Map.update(map, hash, [term], fn terms ->
          if term in terms, do: terms, else: [term | terms]
        end)

      _, map ->
        map
    end)
  end

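  # Lower-case hex-encoded SHA-256; e.g. hash("abc") returns
  # "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad".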
  defp hash(data) do
    :crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
  end
end