defmodule Unicode.String.Segment do
@moduledoc """
Implements the compilation of the Unicode
segment rules.
"""
import SweetXml
require Unicode.Set
@root_locale "root"
@suppressions_variable "$Suppressions"
# This is the formal definition but it takes a while to compile
# and all of the known variable names are in the Latin-1 set
# defguard is_id_start(char) when Unicode.Set.match?(char, "\\p{ID_start}")
# defguard is_id_continue(char) when Unicode.Set.match?(char, "\\p{ID_continue}")
@doc "Identifies if a codepoint is a valid start of an identifier"
defguard is_id_start(char)
when char in ?A..?Z
@doc "Identifies if a codepoint is a valid identifier character"
defguard is_id_continue(char)
when char in ?a..?z or char in ?A..?Z or char in ?0..?9 or char == ?_
@doc """
Return the rules as defined by CLDR for a given
locale and break type.
"""
def rules(locale, segment_type, additional_variables \\ []) do
with {:ok, segment} <- segments(locale, segment_type) do
variables = Map.fetch!(segment, :variables) |> expand_variables(additional_variables)
rules = Map.fetch!(segment, :rules)
rules
|> compile_rules(variables, [])
|> wrap(:ok)
end
end
@doc """
Return the rules as defined by CLDR for a given
locale and break type and raises on error.
"""
def rules!(locale, segment_type, additional_variables \\ []) do
case rules(locale, segment_type, additional_variables) do
{:ok, rules} -> rules
{:error, reason} -> raise ArgumentError, reason
end
end
def compile_rules(rules, variables, regex_options) when is_list(rules) do
rules
|> expand_rules(variables)
|> compile_rules(regex_options)
end
# These options set unicode mode. Interpret certain
# codes like \B and \w in the unicode space, ignore
# unescaped whitespace in regexs
@regex_options [:unicode, :extended, :ucp, :dollar_endonly, :dotall, :bsr_unicode]
@rule_splitter "[×÷]"
defp compile_rules(rules, regex_options) do
Enum.map(rules, fn {sequence, rule} ->
[left, operator, right] = Regex.split(~r/#{@rule_splitter}/u, rule, include_captures: true)
operator = if operator == "×", do: :no_break, else: :break
left = if left != "", do: left <> "$", else: left
right = if right != "", do: "^" <> right, else: right
{sequence,
{operator, expand_regex(left, regex_options), expand_regex(right, regex_options)}}
end)
end
@doc """
Compiles a segment rule in the context of a list
of variables.
The compile rule can then be inserted into a
rule set.
"""
def compile_rule(rule, variables, regex_options \\ []) when is_map(rule) do
compile_rules([rule], variables, regex_options)
|> hd
end
@doc false
def suppressions_variable(locale, segment_type) do
variable =
locale
|> suppressions!(segment_type)
|> suppressions_regex()
if variable do
%{name: @suppressions_variable, value: variable}
else
nil
end
end
defp suppressions_regex([]) do
nil
end
defp suppressions_regex(suppressions) do
suppression_regex = Enum.map_join(suppressions, "|", &String.replace(&1, ".", "\\."))
"(" <> suppression_regex <> ")"
end
@doc """
Returns a list of the suppressions for a given
locale and segment type.
"""
def suppressions(locale, segment_type) do
with {:ok, segment} <- segments(locale, segment_type) do
{:ok, Map.get(segment, :suppressions, [])}
end
end
@doc """
Returns a list of the suppressions for a given
locale and segment type and raises on error.
"""
def suppressions!(locale, segment_type) do
case suppressions(locale, segment_type) do
{:ok, suppressions} -> suppressions
{:error, reason} -> raise ArgumentError, reason
end
end
defp expand_regex("", _regex_options) do
:any
end
# Delete spaces because PCRE doesn't ignore them
defp expand_regex(string, regex_options) do
string
|> String.trim()
|> String.replace(" ", "")
|> Unicode.Regex.expand_regex(@regex_options ++ regex_options)
end
@doc """
Evaluates a list of rules against a given
string.
"""
def evaluate_rules(string, rules) when is_binary(string) do
evaluate_rules({"", string}, rules)
end
def evaluate_rules({string_before, string_after}, rules) do
Enum.reduce_while(rules, [], fn rule, _acc ->
{_rule_number, {operator, _fore, _aft}} = rule
case evaluate_rule({string_before, string_after}, rule) do
{:pass, result} ->
{:halt, {:pass, operator, result}}
{:fail, string} ->
{:cont, {:fail, string}}
end
end)
|> return_break_or_no_break
end
# The final implicit rule is to to break. ie: :any ÷ :any
defp return_break_or_no_break({:fail, {before_string, ""}}) do
{:break, {before_string, {"", ""}}}
end
defp return_break_or_no_break({:fail, {before_string, after_string}}) do
<<char::utf8, rest::binary>> = after_string
{:break, {before_string, {<<char::utf8>>, rest}}}
end
defp return_break_or_no_break({:pass, operator, result}) do
{operator, result}
end
@split_options [parts: 2, include_captures: true, trim: true]
# Process an `:any op regex` rule at end of string
defp evaluate_rule({string_before, <<_::utf8>> = string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do
aft = Regex.compile!(aft, regex_options)
if Regex.match?(aft, string_after) do
{:pass, {string_before, {string_after, ""}}}
else
{:fail, {string_before, string_after}}
end
end
defp evaluate_rule({string_before, string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do
aft = Regex.compile!(aft, regex_options)
case Regex.split(aft, string_after, @split_options) do
[match, rest] -> {:pass, {string_before, {match, rest}}}
_other -> {:fail, {string_before, string_after}}
end
end
# Ignore suppressions at end of the string
defp evaluate_rule({string_before, string_after}, {10.5, {_operator, {fore, regex_options}, :any}}) do
fore = Regex.compile!(fore, regex_options)
if Regex.match?(fore, string_before) do
# IO.inspect {string_before, string_after}, label: "Matched Rule 10.5"
case Regex.split(fore, string_before, @split_options) do
[match] ->
# IO.inspect {operator, match}, label: "Matched One"
{:pass, {string_before, {match, ""}}}
[match, rest] ->
# IO.inspect {operator, match, rest}, label: "Matched"
{:pass, {string_before, {match, rest}}}
end
else
# IO.inspect {string_before, string_after}, label: "Did not match Rule 10.5"
{:fail, {string_before, string_after}}
end
end
# :any matches end of string
defp evaluate_rule({string_before, "" = string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do
fore = Regex.compile!(fore, regex_options)
if Regex.match?(fore, string_before) do
{:pass, {string_before, {"", ""}}}
else
{:fail, {string_before, string_after}}
end
end
defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do
fore = Regex.compile!(fore, regex_options)
if Regex.match?(fore, string_before) do
<<char::utf8, rest::binary>> = string_after
{:pass, {string_before, {<<char::utf8>>, rest}}}
else
{:fail, {string_before, string_after}}
end
end
defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, fore_regex_options}, {aft, aft_regex_options}}}) do
fore = Regex.compile!(fore, fore_regex_options)
aft = Regex.compile!(aft, aft_regex_options)
if Regex.match?(fore, string_before) && Regex.match?(aft, string_after) do
case Regex.split(aft, string_after, @split_options) do
[match, rest] -> {:pass, {string_before, {match, rest}}}
[match] -> {:pass, {string_before, {match, ""}}}
end
else
{:fail, {string_before, string_after}}
end
end
defp expand_rules(rules, variables) do
Enum.reduce(rules, [], fn %{id: sequence, value: rule}, acc ->
rule =
rule
|> String.trim()
|> substitute_variables(variables)
[{sequence, rule} | acc]
end)
|> Enum.sort()
end
def expand_variables(variables, additional_variables)
when is_list(variables) and is_list(additional_variables) do
Enum.reduce(variables ++ additional_variables, %{}, fn
%{name: <<"$", name::binary>>, value: value}, variables ->
new_value = substitute_variables(value, variables)
Map.put(variables, name, new_value)
end)
end
defp substitute_variables("", _variables) do
""
end
defp substitute_variables(<<"$", char::utf8, rest::binary>>, variables)
when is_id_start(char) do
{name, rest} = extract_variable_name(<<char::utf8>> <> rest)
Map.fetch!(variables, name) <> substitute_variables(rest, variables)
end
defp substitute_variables(<<char::binary-1, rest::binary>>, variables) do
char <> substitute_variables(rest, variables)
end
defp extract_variable_name("" = string) do
{string, ""}
end
defp extract_variable_name(<<char::utf8, rest::binary>>)
when is_id_continue(char) do
{string, rest} = extract_variable_name(rest)
{<<char::utf8>> <> string, rest}
end
defp extract_variable_name(rest) do
{"", rest}
end
@app_name Mix.Project.config()[:app]
@doctype "<!DOCTYPE ldml SYSTEM \"../../common/dtd/ldml.dtd\">"
@doc false
def segments_dir do
Path.join(:code.priv_dir(@app_name), "/segments")
end
@doc false
def locale_map do
segments_dir()
|> File.ls!()
|> Enum.map(fn locale_file ->
locale =
locale_file
|> String.split(".xml")
|> hd
|> String.replace("_", "-")
{locale, locale_file}
end)
|> Map.new()
end
@doc """
Returns a list of the known locales that have
segmentation data.
"""
def known_segmentation_locales do
locale_map()
|> Map.keys()
|> Enum.map(&String.to_atom/1)
end
@doc """
Returns a list of the ancestor locales
of the a given locale.
The list includes the given locale.
"""
def ancestors(locale_name) do
if Map.get(locale_map(), locale_name) do
case String.split(locale_name, "-") do
[locale] -> [locale, @root_locale]
[locale, _territory] -> [locale_name, locale, @root_locale]
[locale, script, _territory] -> [locale_name, "#{locale}-#{script}", locale, @root_locale]
end
|> wrap(:ok)
else
{:error, unknown_locale_error(locale_name)}
end
end
@doc false
def merge_ancestors(@root_locale) do
raw_segments!(@root_locale)
|> wrap(:ok)
end
def merge_ancestors(locale) when is_binary(locale) do
with {:ok, ancestors} <- ancestors(locale) do
merge_ancestors(ancestors)
|> wrap(:ok)
end
end
@doc false
def merge_ancestors([locale, root]) do
merge_ancestor(locale, raw_segments!(root))
end
def merge_ancestors([locale | rest]) do
merge_ancestor(locale, merge_ancestors(rest))
end
# For each segment type, add the variables, rules and
# suppressions from locale to other
defp merge_ancestor(locale, other) do
locale_segments = raw_segments!(locale)
Enum.map(other, fn {segment_type, content} ->
variables =
Map.fetch!(content, :variables) ++
(get_in(locale_segments, [segment_type, :variables]) || [])
rules =
Map.fetch!(content, :rules) ++
(get_in(locale_segments, [segment_type, :rules]) || [])
suppressions =
Map.fetch!(content, :suppressions) ++
(get_in(locale_segments, [segment_type, :suppressions]) || [])
{segment_type, %{content | variables: variables, rules: rules, suppressions: suppressions}}
end)
|> Map.new()
end
defp raw_segments(locale) do
if file = Map.get(locale_map(), locale) do
content =
segments_dir()
|> Path.join(file)
|> File.read!()
|> String.replace(@doctype, "")
|> xpath(~x"//segmentation"l,
type: ~x"./@type"s,
variables: [
~x".//variable"l,
name: ~x"./@id"s,
value: ~x"./text()"s
],
rules: [
~x".//rule"l,
id: ~x"./@id"f,
value: ~x"./text()"s
],
suppressions: ~x".//suppression/text()"ls
)
Enum.map(content, fn c ->
type =
c.type
|> Macro.underscore()
|> String.replace("__", "_")
|> String.to_atom()
{type, %{rules: c.rules, variables: c.variables, suppressions: c.suppressions}}
end)
|> Map.new()
|> wrap(:ok)
else
{:error, unknown_locale_error(locale)}
end
end
defp raw_segments!(locale) do
case raw_segments(locale) do
{:ok, segments} -> segments
{:error, reason} -> raise ArgumentError, reason
end
end
@doc false
def segments(locale) when is_binary(locale) do
merge_ancestors(locale)
end
def segments(locale) when is_atom(locale) do
locale
|> Atom.to_string()
|> segments()
end
@doc false
def segments(locale, segment_type) do
with {:ok, segments} <- segments(to_string(locale)) do
if segment = Map.get(segments, segment_type) do
{:ok, segment}
else
{:error, unknown_segment_type_error(segment_type)}
end
end
end
defp wrap(term, atom) do
{atom, term}
end
@doc false
def unknown_locale_error(locale) do
"Unknown locale #{inspect(locale)}"
end
@doc false
def unknown_segment_type_error(segment_type) do
"Unknown segment type #{inspect(segment_type)}"
end
end