defmodule Unicode.String.Break do
@moduledoc """
Implements the Unicode break algorithm for grapheme clusters,
words, sentences and lines.
"""
alias Unicode.String.Segment
# Maps every accepted break name (singular, plural and alias forms)
# to the segment type used to look up the break rules.
@break_map %{
  # Grapheme clusters
  grapheme: :grapheme_cluster_break,
  graphemes: :grapheme_cluster_break,
  grapheme_cluster: :grapheme_cluster_break,
  # Words
  word: :word_break,
  words: :word_break,
  # Sentences
  sentence: :sentence_break,
  sentences: :sentence_break,
  # Lines
  line: :line_break,
  lines: :line_break
}

# The break names accepted by the guards on `break/4`, `split/4` and `next/4`.
@break_keys Map.keys(@break_map)
# Resolves the `break` name to its segment type and evaluates the break
# rules once for `string`. `string` may be a binary or a
# `{string_before, string_after}` tuple; the result is whatever
# `Segment.evaluate_rules/2` returns for that position.
@doc false
def break(string, locale, break, options) when break in @break_keys do
  segment_type = Map.fetch!(@break_map, break)
  break_at(string, locale, segment_type, options)
end
# A plain binary is treated as having no preceding text.
defp break_at(string, locale, segment_type, options) when is_binary(string) do
  break_at({"", string}, locale, segment_type, options)
end

# Evaluates the rules for `locale` and `segment_type` at the position
# between `string_before` and `string_after`. The `:suppressions`
# option defaults to `true`.
defp break_at({string_before, string_after}, locale, segment_type, options) do
  suppress? = Keyword.get(options, :suppressions, true)
  {:ok, rules} = rules(locale, segment_type, suppress?)

  Segment.evaluate_rules({string_before, string_after}, rules)
end
# Resolves the `break` name to its segment type and splits `string`
# into a list of segments at every break position.
@doc false
def split(string, locale, break, options) when break in @break_keys do
  segment_type = Map.fetch!(@break_map, break)
  split_at(string, locale, segment_type, options)
end
# A plain binary is treated as having no preceding text.
defp split_at(string, locale, segment_type, options) when is_binary(string) do
  split_at({"", string}, locale, segment_type, options)
end

# Fetches the rules (honouring the `:suppressions` option, default
# `true`), evaluates them at the first position and then walks the
# rest of the string with `split/3`. The accumulator is seeded with
# a single empty segment.
defp split_at({string_before, string_after}, locale, segment_type, options) do
  suppress? = Keyword.get(options, :suppressions, true)
  {:ok, rules} = rules(locale, segment_type, suppress?)

  first_result = Segment.evaluate_rules({string_before, string_after}, rules)
  split(first_result, rules, [""])
end
# Walks the string one rule evaluation at a time. Segments are
# accumulated in reverse order (current segment first) and reversed
# once the input is exhausted.

# Break with nothing left: the accumulated segments are the result.
defp split({:break, {_before, {"", ""}}}, _rules, [_ | _] = segments) do
  Enum.reverse(segments)
end

# Break with only `last` left: it becomes the final segment.
defp split({:break, {_before, {last, ""}}}, _rules, [_ | _] = segments) do
  Enum.reverse([last | segments])
end

# Break while the current segment is empty (as seeded by `split_at/4`):
# the empty segment is replaced by `fore` rather than kept.
defp split({:break, {_before, {fore, aft}}}, rules, ["" | done]) do
  next_result = Segment.evaluate_rules({fore, aft}, rules)
  split(next_result, rules, [fore | done])
end

# Break after a non-empty segment: `fore` starts a new segment. The
# previous segment is included in the look-behind text passed to the
# next evaluation.
defp split({:break, {_before, {fore, aft}}}, rules, [current | _] = segments) do
  next_result = Segment.evaluate_rules({current <> fore, aft}, rules)
  split(next_result, rules, [fore | segments])
end

# No break: `fore` is appended to the current segment.
defp split({:no_break, {_before, {fore, aft}}}, rules, [current | done]) do
  extended = current <> fore
  next_result = Segment.evaluate_rules({extended, aft}, rules)
  split(next_result, rules, [extended | done])
end
# Returns `{segment, rest}` for the first segment of `string`, or `nil`
# when `string` is empty. When the `:trim` option is `true`, a segment
# that `Unicode.Property.white_space?/1` reports as white space is
# skipped and the following segment is returned instead.
@doc false
def next("", _locale, _break, _options), do: nil

def next(string, locale, break, options) when break in @break_keys and is_binary(string) do
  <<char::utf8, rest::binary>> = string
  first = <<char::utf8>>
  segment_type = Map.fetch!(@break_map, break)

  # `next_at/4` returns the remainder either as a `{match, rest}` tuple
  # or as a single binary; both are normalised to one binary here.
  segment =
    case next_at({first, rest}, locale, segment_type, options) do
      {fore, {match, remainder}} -> {first <> fore, match <> remainder}
      {fore, remainder} -> {first <> fore, remainder}
    end

  repeat_if_trimming_required(segment, locale, break, options, options[:trim])
end
# Trimming requested: when the segment just found is reported as white
# space, discard it and look for the next segment in `rest`.
defp repeat_if_trimming_required({match, rest} = segment, locale, break, options, true) do
  cond do
    Unicode.Property.white_space?(match) -> next(rest, locale, break, options)
    true -> segment
  end
end

# Trimming not requested: the segment is returned unchanged.
defp repeat_if_trimming_required({_match, _rest} = segment, _locale, _break, _options, _trim) do
  segment
end
# Finds the end of the segment that starts with `string_before`.
#
# Returns `{fore, rest}` where `fore` is the text that follows
# `string_before` within the same segment (the caller prepends
# `string_before` itself) and `rest` is the unconsumed remainder,
# either a binary or a `{match, rest}` tuple as produced by `do_next/3`.
#
# The `:suppressions` option defaults to `true`.
defp next_at({string_before, string_after}, locale, segment_type, options) do
  suppress? = Keyword.get(options, :suppressions, true)
  {:ok, rules} = rules(locale, segment_type, suppress?)

  # Seed the accumulator with `string_before` so that every later rule
  # evaluation sees the whole segment so far as its look-behind text,
  # as `split/3` does. Seeding with "" dropped the first code point
  # from that context after the first `:no_break`.
  {segment, rest} =
    {string_before, string_after}
    |> Segment.evaluate_rules(rules)
    |> do_next(rules, string_before)

  # The accumulator only ever grows by appending, so `segment` starts
  # with `string_before`; strip it to keep the original return contract.
  prefix_size = byte_size(string_before)
  fore = binary_part(segment, prefix_size, byte_size(segment) - prefix_size)

  {fore, rest}
end
# Advances through the string until a break is found, collecting the
# text of the current segment in `acc`. Returns `{acc, remainder}`.

# Break with nothing after the matched text: the matched text itself
# (possibly empty) is the remainder.
defp do_next({:break, {_before, {remaining, ""}}}, _rules, acc), do: {acc, remaining}

# Break with more input left: the `{fore, aft}` pair is returned as is.
defp do_next({:break, {_before, pending}}, _rules, acc), do: {acc, pending}

# No break: `fore` joins the current segment and evaluation continues.
defp do_next({:no_break, {_before, {fore, aft}}}, rules, acc) do
  grown = acc <> fore
  next_result = Segment.evaluate_rules({grown, aft}, rules)
  do_next(next_result, rules, grown)
end
# Register every segment file as an external resource so that
# this module is recompiled whenever one of them changes.
for {_locale, file} <- Segment.locale_map() do
  segment_file = Path.join(Segment.segments_dir(), file)
  @external_resource segment_file
end
# Rule templates, keyed by segment type, that are compiled into a
# suppressions rule for each locale that has a suppressions variable.
# Only sentence breaks have one. Each template carries a rule `:id`
# and the rule text in `:value`.
@suppression_rules %{
sentence_break: %{id: 10.5, value: "$Suppressions $Close* $Sp* $ParaSep? ×"}
}
# Returns the rules applicable for
# a given locale and segment type.
defp rules(locale, segment_type)
# Returns the variable definitions for
# a given locale and segment type.
#
# Example (illustrative; this function is private)
#
# => Unicode.String.Break.variables "en", :sentence_break
# [
# %{name: "$CR", value: "\\p{Sentence_Break=CR}"},
# %{name: "$LF", value: "\\p{Sentence_Break=LF}"},
# %{name: "$Extend", value: "\\p{Sentence_Break=Extend}"},
# %{name: "$Format", value: "\\p{Sentence_Break=Format}"},
# %{name: "$Sep", value: "\\p{Sentence_Break=Sep}"},
# %{name: "$Sp", value: "\\p{Sentence_Break=Sp}"},
# %{name: "$Lower", value: "\\p{Sentence_Break=Lower}"},
# ...
# ]
defp variables(locale, segment_type)
# Returns a list of suppressions
# (abbreviations) that can be used
# to suppress an otherwise acceptable
# break point.
defp suppressions(locale, segment_type)
# Returns the compiled suppressions rule for
# a given locale and segment type, or `nil`
# when none was generated.
defp suppressions_rule(locale, segment_type)
# Generates, at compile time, one clause of `rules/2`, `variables/2`
# and `suppressions/2` for every known segmentation locale and every
# segment type defined for that locale. The data is computed while the
# module compiles and embedded in the clause body as a literal.
for locale <- Segment.known_segmentation_locales() do
{:ok, segments} = Segment.segments(locale)
for segment_type <- Map.keys(segments) do
defp rules(unquote(locale), unquote(segment_type)) do
unquote(Macro.escape(Segment.rules(locale, segment_type)))
end
defp variables(unquote(locale), unquote(segment_type)) do
unquote(Macro.escape(get_in(segments, [segment_type, :variables])))
end
defp suppressions(unquote(locale), unquote(segment_type)) do
unquote(Macro.escape(Segment.suppressions!(locale, segment_type)))
end
# A `suppressions_rule/2` clause is generated only when the segment
# type has a template in @suppression_rules and the locale provides
# a suppressions variable for that segment type.
suppressions_rule = Map.get(@suppression_rules, segment_type)
suppressions_variable = Segment.suppressions_variable(locale, segment_type)
if suppressions_rule && suppressions_variable do
# Expand the segment's variables together with the suppressions
# variable, then compile the template with the `:caseless` option.
variables =
get_in(segments, [segment_type, :variables])
|> Segment.expand_variables([suppressions_variable])
rule = Segment.compile_rule(suppressions_rule, variables, [:caseless])
defp suppressions_rule(unquote(locale), unquote(segment_type)) do
unquote(Macro.escape(rule))
end
end
end
end
# Locale whose rules are used when no clause was generated for the
# requested locale and segment type.
@default_locale :root

# Fallback clause: asks `Segment.rules/2` for the default locale's
# rules at call time.
defp rules(_other, segment_type), do: Segment.rules(@default_locale, segment_type)

# Fallback clause: no suppressions rule was generated for this
# locale and segment type.
defp suppressions_rule(_locale, _segment_type), do: nil
# Returns `{:ok, rules}` for `locale` and `break_type`.
#
# When the third argument is `true` and a suppressions rule exists for
# the locale and break type, that rule is placed at the head of the
# rule list; otherwise the result of `rules/2` is returned unchanged.
#
# No `@doc` attribute is set here because this function is private.
defp rules(locale, break_type, true) do
  rule = suppressions_rule(locale, break_type)

  if rule do
    {:ok, rules} = rules(locale, break_type)
    {:ok, [rule | rules]}
  else
    rules(locale, break_type)
  end
end

# Suppressions not requested: use the plain rules.
defp rules(locale, break_type, _suppress?) do
  rules(locale, break_type)
end
end