defmodule RDF.Canonicalization do
@moduledoc """
An implementation of the standard RDF Dataset Canonicalization Algorithm.
See <https://www.w3.org/TR/rdf-canon/>.
## Options
All functions in this module support the following options:
- `:hash_algorithm`: Sets the hash algorithm to be used. Any of the `:crypto.hash_algorithm()`
values of Erlang's `:crypto` module are allowed.
Defaults to the runtime configured `:canon_hash_algorithm` of the `:rdf` application
or `:sha256` if not configured otherwise.
config :rdf,
canon_hash_algorithm: :sha512
- `:hndq_call_limit`: This algorithm has to go through complex cycles that may in extreme situations
result in an unreasonably long canonicalization process. Although this never occurs in practice,
attackers may use some "poison graphs" to create such situations
(see the [security consideration section](https://www.w3.org/TR/rdf-canon/#security-considerations) in the specification).
This implementation sets a maximum call limit for the Hash N-Degree Quads algorithm
which can be configured with this value. Note that the actual limit is the product
of the given value and the number of blank nodes in the input graph.
Defaults to the runtime configured `:hndq_call_limit` of the `:rdf` application
or `50` if not configured otherwise.
config :rdf,
hndq_call_limit: 10
"""
alias RDF.Canonicalization.{IdentifierIssuer, State}
alias RDF.{BlankNode, Dataset, Quad, Statement, NQuads, Utils}
import RDF.Sigils
@doc """
Canonicalizes the blank nodes of a graph or dataset according to the RDF Dataset Canonicalization spec.

This function always returns a `RDF.Dataset` and wraps it in a tuple with the
resulting internal state from which the blank node mapping can be retrieved.

If you want to get just a `RDF.Dataset` back, use `RDF.Dataset.canonicalize/2`.
If you want to canonicalize just a `RDF.Graph` and get a `RDF.Graph` back,
use `RDF.Graph.canonicalize/2`.

See module documentation on the available options.
"""
@spec canonicalize(RDF.Graph.t() | RDF.Dataset.t(), keyword) :: {RDF.Dataset.t(), State.t()}
def canonicalize(input, opts \\ []), do: rdfc10(input, opts)
@doc """
Checks whether two graphs or datasets are equal, regardless of the concrete names of the blank nodes they contain.

Both inputs are canonicalized and the resulting datasets are compared for equality.

See module documentation on the available options.

## Examples

    iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
    ...> |> RDF.Canonicalization.isomorphic?(
    ...>      RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b2>, EX.p(), 42}]))
    true

    iex> RDF.Graph.new([{~B<foo>, EX.p(), ~B<bar>}, {~B<bar>, EX.p(), 42}])
    ...> |> RDF.Canonicalization.isomorphic?(
    ...>      RDF.Graph.new([{~B<b1>, EX.p(), ~B<b2>}, {~B<b3>, EX.p(), 42}]))
    false
"""
@spec isomorphic?(RDF.Graph.t() | RDF.Dataset.t(), RDF.Graph.t() | RDF.Dataset.t(), keyword) ::
        boolean
def isomorphic?(a, b, opts \\ []) do
  {canonical_a, _state_a} = canonicalize(a, opts)
  {canonical_b, _state_b} = canonicalize(b, opts)

  Dataset.equal?(canonical_a, canonical_b)
end
# Entry point of the RDFC-1.0 canonicalization algorithm
# (https://www.w3.org/TR/rdf-canon/#canon-algorithm): initialize the state,
# assign canonical identifiers and rebuild the input with them.
defp rdfc10(input, opts) do
  state = State.new(input, opts)

  state
  |> create_canonical_identifiers_for_single_node_hashes()
  |> create_canonical_identifiers_for_multiple_node_hashes()
  |> apply_canonicalization(input)
end
# Steps 3) and 4) of the canonicalization algorithm: compute first degree
# hashes for all blank nodes and issue canonical identifiers for every hash
# that maps to exactly one blank node.
defp create_canonical_identifiers_for_single_node_hashes(state) do
  # 3) Calculate hashes for first degree nodes
  hashed_state =
    Enum.reduce(state.bnode_to_quads, state, fn {bnode, _quads}, acc ->
      State.add_bnode_hash(acc, bnode, hash_first_degree_quads(acc, bnode))
    end)

  # 4) Create canonical replacements for hashes mapping to a single node
  hashed_state.hash_to_bnodes
  # TODO: "Sort in Unicode code point order"
  |> Enum.sort()
  |> Enum.reduce(hashed_state, fn {hash, bnodes}, acc ->
    case MapSet.to_list(bnodes) do
      [unique_bnode] ->
        # unique hash: this bnode gets a canonical identifier right away
        acc
        |> State.issue_canonical_identifier(unique_bnode)
        |> State.delete_bnode_hash(hash)

      [] ->
        raise "unexpected empty identifier list"

      _multiple ->
        # shared hash: deferred to the N-Degree Quads step
        acc
    end
  end)
end
# 5) Assign canonical identifiers to the remaining blank nodes (those sharing
# a first degree hash with others) via the Hash N-Degree Quads algorithm.
defp create_canonical_identifiers_for_multiple_node_hashes(state) do
  state.hash_to_bnodes
  |> Enum.sort()
  |> Enum.reduce(state, fn {_hash, identifier_list}, state ->
    # 5.1-2) Compute a hash path for every bnode without a canonical
    # identifier yet, each starting from a fresh temporary issuer seeded
    # with that bnode.
    hash_path_list =
      for identifier <- identifier_list,
          not IdentifierIssuer.issued?(state.canonical_issuer, identifier) do
        {_issued_identifier, temporary_issuer} =
          "b"
          |> IdentifierIssuer.new()
          |> IdentifierIssuer.issue_identifier(identifier)

        hash_n_degree_quads(state, identifier, temporary_issuer)
      end

    # 5.3) Create canonical replacements for nodes, in hash order
    hash_path_list
    |> Enum.sort()
    |> Enum.reduce(state, fn {_hash, issuer}, state ->
      issuer
      |> IdentifierIssuer.issued_identifiers()
      |> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))
    end)
  end)
end
# 6) Rebuild the input data with all blank nodes replaced by their canonical
# identifiers, returning the result as a `RDF.Dataset` together with the
# final state (from which the bnode mapping can be retrieved).
defp apply_canonicalization(state, data) do
  canonicalized =
    Enum.reduce(data, Dataset.new(), fn statement, dataset ->
      Dataset.add(dataset, canonicalize_statement(statement, state))
    end)

  {canonicalized, state}
end

# Maps all blank nodes of a statement to their canonical identifiers;
# statements without blank nodes are passed through unchanged.
defp canonicalize_statement(statement, state) do
  if Statement.has_bnode?(statement) do
    Statement.map(statement, fn
      {_position, %BlankNode{} = bnode} ->
        state.canonical_issuer
        |> IdentifierIssuer.identifier(bnode)
        |> BlankNode.new()

      {_position, node} ->
        node
    end)
  else
    statement
  end
end
# Hash First Degree Quads algorithm
# see https://www.w3.org/TR/rdf-canon/#hash-1d-quads
#
# Hashes the N-Quads serializations of all quads mentioning `ref_bnode_id`,
# with the reference bnode normalized to _:a and all other bnodes to _:z.
defp hash_first_degree_quads(state, ref_bnode_id) do
  serialize = fn quad ->
    quad
    |> Quad.new()
    |> Statement.map(fn
      {_, ^ref_bnode_id} -> ~B<a>
      {_, %BlankNode{}} -> ~B<z>
      {_, node} -> node
    end)
    |> Dataset.new()
    |> NQuads.write_string!()
  end

  state.bnode_to_quads
  |> Map.get(ref_bnode_id, [])
  |> Enum.map(serialize)
  # TODO: "Sort nquads in Unicode code point order"
  |> Enum.sort()
  |> Enum.join()
  |> hash(state)
end
# Hash Related Blank Node algorithm
# see https://www.w3.org/TR/rdf-canon/#hash-related-blank-node
#
# Hashes the `related` bnode from the perspective of `quad`, where `position`
# (:s, :o or :g) denotes where `related` occurs in the quad.
defp hash_related_bnode(state, related, quad, issuer, position) do
  # for non-graph positions the predicate is part of the hashed input
  position_part =
    case position do
      :g -> to_string(position)
      _ -> to_string(position) <> "<#{Statement.predicate(quad)}>"
    end

  # prefer an already issued identifier (canonical first, then the given
  # issuer); fall back to the first degree hash of the related bnode
  issued =
    IdentifierIssuer.identifier(state.canonical_issuer, related) ||
      IdentifierIssuer.identifier(issuer, related)

  related_part =
    if issued do
      "_:" <> issued
    else
      hash_first_degree_quads(state, related)
    end

  hash(position_part <> related_part, state)
end
# Hash N-Degree Quads algorithm
# see https://www.w3.org/TR/rdf-canon/#hash-nd-quads
defp hash_n_degree_quads(state, identifier, issuer),
  do: hash_n_degree_quads(state, identifier, issuer, 0, State.max_calls(state))

# Call-limit guard against "poison graphs" crafted to cause pathological
# runtimes (see the :hndq_call_limit option in the module documentation).
defp hash_n_degree_quads(_, _, _, max_calls, max_calls) do
  raise "Exceeded maximum number of calls (#{max_calls}) allowed to hash_n_degree_quads"
end
# Main clause of the Hash N-Degree Quads algorithm
# (https://www.w3.org/TR/rdf-canon/#hash-nd-quads).
#
# Computes a hash for `identifier` from the hashes of its related blank nodes,
# recursing through bnodes without issued identifiers. Returns `{hash, issuer}`.
# `call_count`/`max_calls` thread the call-limit guard through the recursion
# (see the :hndq_call_limit option in the module documentation).
defp hash_n_degree_quads(state, identifier, issuer, call_count, max_calls) do
  # IO.inspect(identifier, label: "ndeg: identifier")

  # 1-3) Group the bnodes adjacent to `identifier` by their related-bnode hash.
  # hash_to_related_bnodes is called now H_n in the new spec
  hash_to_related_bnodes =
    Enum.reduce(state.bnode_to_quads[identifier], %{}, fn quad, map ->
      Map.merge(
        map,
        hash_related_quad(state, identifier, quad, issuer),
        fn _, terms, new -> terms ++ new end
      )
    end)

  # |> IO.inspect(label: "ndeg: hash_to_related_bnodes")

  # 4-5) Process each related hash group in sorted order, appending the
  # lexicographically smallest path over all permutations of its bnodes.
  {data_to_hash, issuer, _} =
    hash_to_related_bnodes
    # TODO: "Sort in Unicode code point order"
    |> Enum.sort()
    |> Enum.reduce({"", issuer, call_count}, fn
      {related_hash, bnode_list}, {data_to_hash, issuer, call_count} ->
        # 5.1)
        data_to_hash = data_to_hash <> related_hash
        chosen_path = ""
        chosen_issuer = nil

        # 5.2-4) Try every permutation of the group's bnodes; keep the smallest
        # path together with the issuer state that produced it.
        {chosen_path, chosen_issuer, call_count} =
          bnode_list
          |> Utils.permutations()
          |> Enum.reduce({chosen_path, chosen_issuer, call_count}, fn
            permutation, {chosen_path, chosen_issuer, call_count} ->
              # IO.inspect(permutation, label: "ndeg: perm")
              # work on a copy so a rejected permutation leaves `issuer` untouched
              issuer_copy = IdentifierIssuer.copy(issuer)
              chosen_path_length = String.length(chosen_path)

              # 5.4.4) Build the path for this permutation; bnodes without an
              # issued identifier get a temporary one and are collected in
              # recursion_list for the recursive hashing step below.
              {path, recursion_list, issuer_copy} =
                Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
                  related, {path, recursion_list, issuer_copy} ->
                    {path, recursion_list, issuer_copy} =
                      if issued_identifier =
                           IdentifierIssuer.identifier(state.canonical_issuer, related) do
                        {path <> "_:" <> issued_identifier, recursion_list, issuer_copy}
                      else
                        if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do
                          {path <> "_:" <> issued_identifier, recursion_list, issuer_copy}
                        else
                          {issued_identifier, issuer_copy} =
                            IdentifierIssuer.issue_identifier(issuer_copy, related)

                          {
                            path <> "_:" <> issued_identifier,
                            [related | recursion_list],
                            issuer_copy
                          }
                        end
                      end

                    # bail out as soon as this path can no longer beat the chosen one
                    # TODO: considering code point order
                    if chosen_path_length != 0 and
                         String.length(path) >= chosen_path_length and
                         path > chosen_path do
                      {:halt, {path, recursion_list, issuer_copy}}
                    else
                      {:cont, {path, recursion_list, issuer_copy}}
                    end
                end)

              # IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")

              # 5.4.5) Recursively hash the bnodes that received temporary
              # identifiers, extending the path with their results.
              {issuer_copy, path, call_count} =
                recursion_list
                |> Enum.reverse()
                |> Enum.reduce_while({issuer_copy, path, call_count}, fn
                  related, {issuer_copy, path, call_count} ->
                    # Note: The following steps seem to be the only steps in the whole algorithm
                    # which really rely on global state.
                    call_count = call_count + 1

                    # 5.4.5.1)
                    {result_hash, result_issuer} =
                      hash_n_degree_quads(state, related, issuer_copy, call_count, max_calls)

                    # This step was added to circumvent the need for global state.
                    # It's unclear whether it is actually required, since all tests
                    # of the test suite pass without it.
                    # see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
                    result_issuer =
                      if result_issuer.id == issuer_copy.id do
                        {_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)
                        issuer
                      else
                        result_issuer
                      end

                    # 5.4.5.2)
                    {issued_identifier, _issuer_copy} =
                      IdentifierIssuer.issue_identifier(issuer_copy, related)

                    path = path <> "_:" <> issued_identifier <> "<#{result_hash}>"

                    # same early-exit condition as in 5.4.4 above
                    # TODO: considering code point order
                    if chosen_path_length != 0 and
                         String.length(path) >= chosen_path_length and
                         path > chosen_path do
                      {:halt, {result_issuer, path, call_count}}
                    else
                      {:cont, {result_issuer, path, call_count}}
                    end
                end)

              # keep this permutation's result if it is the first or the smallest so far
              # TODO: considering code point order
              if chosen_path_length == 0 or path < chosen_path do
                {path, issuer_copy, call_count}
              else
                {chosen_path, chosen_issuer, call_count}
              end
          end)

        # 5.5) The winning path and its issuer become the accumulator for the
        # next hash group.
        {data_to_hash <> chosen_path, chosen_issuer, call_count}
    end)

  # IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

  {hash(data_to_hash, state), issuer}
end
# 4.8.2.3.1) Group adjacent bnodes by hash
#
# Builds a map from related-bnode hash to the (deduplicated) list of blank
# nodes that occur alongside `identifier` in `quad` at the subject, object or
# graph-name position. `identifier` itself and non-bnode terms are skipped.
defp hash_related_quad(state, identifier, quad, issuer) do
  [
    s: Statement.subject(quad),
    o: Statement.object(quad),
    g: Statement.graph_name(quad)
  ]
  |> Enum.reduce(%{}, fn {position, term}, map ->
    case term do
      # the reference bnode itself is not a "related" node
      ^identifier ->
        map

      %BlankNode{} ->
        hash = hash_related_bnode(state, term, quad, issuer, position)

        Map.update(map, hash, [term], fn terms ->
          if term in terms, do: terms, else: [term | terms]
        end)

      # IRIs, literals and nil graph names don't contribute
      _ ->
        map
    end
  end)
end
# Hashes `data` with the hash algorithm configured in `state` and returns the
# digest as a lowercase hex string.
defp hash(data, state) do
  state.hash_algorithm
  |> :crypto.hash(data)
  |> Base.encode16(case: :lower)
end
end