-module(textmetrics@similarity).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src/textmetrics/similarity.gleam").
-export([default_jaro_winkler_config/0, jaro_winkler_config/2, prefix_scale/1, prefix_max/1, jaro/2, jaro_winkler_with/3, jaro_winkler/2, sorensen_dice/3]).
-export_type([sorensen_dice_error/0, jaro_winkler_config_error/0, jaro_winkler_config/0]).
-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.
?MODULEDOC(
" Similarity scores in the closed interval `[0.0, 1.0]`.\n"
"\n"
" `1.0` means \"identical\", `0.0` means \"no similarity by this\n"
" metric\". No function in this module returns `NaN` or a negative\n"
" `Float`.\n"
"\n"
" All string-typed functions operate on extended grapheme clusters\n"
" and do not normalize their inputs — callers wanting NFC\n"
" equivalence must normalize first.\n"
).
-type sorensen_dice_error() :: {ngram_size_invalid, integer()}.
-type jaro_winkler_config_error() :: {prefix_scale_out_of_range, float()} |
{prefix_max_negative, integer()}.
-opaque jaro_winkler_config() :: {jaro_winkler_config, float(), integer()}.
-file("src/textmetrics/similarity.gleam", 42).
?DOC(" Winkler-1990 defaults: `prefix_scale = 0.1`, `prefix_max = 4`.\n").
-spec default_jaro_winkler_config() -> jaro_winkler_config().
default_jaro_winkler_config() ->
{jaro_winkler_config, 0.1, 4}.
-file("src/textmetrics/similarity.gleam", 52).
?DOC(
" Construct a [`JaroWinklerConfig`](#JaroWinklerConfig).\n"
"\n"
" Invariants:\n"
" - `prefix_scale` must be in `[0.0, 0.25]` (Winkler's upper bound\n"
" that keeps the score in `[0, 1]`).\n"
" - `prefix_max` must be `>= 0`.\n"
).
-spec jaro_winkler_config(float(), integer()) -> {ok, jaro_winkler_config()} |
{error, jaro_winkler_config_error()}.
jaro_winkler_config(Prefix_scale, Prefix_max) ->
case (Prefix_scale < +0.0) orelse (Prefix_scale > 0.25) of
true ->
{error, {prefix_scale_out_of_range, Prefix_scale}};
false ->
case Prefix_max < 0 of
true ->
{error, {prefix_max_negative, Prefix_max}};
false ->
{ok, {jaro_winkler_config, Prefix_scale, Prefix_max}}
end
end.
-file("src/textmetrics/similarity.gleam", 71).
?DOC(" Read the prefix-scale parameter of a config.\n").
-spec prefix_scale(jaro_winkler_config()) -> float().
prefix_scale(Config) ->
erlang:element(2, Config).
-file("src/textmetrics/similarity.gleam", 76).
?DOC(" Read the prefix-cap parameter of a config.\n").
-spec prefix_max(jaro_winkler_config()) -> integer().
prefix_max(Config) ->
erlang:element(3, Config).
-file("src/textmetrics/similarity.gleam", 242).
-spec find_match_in_range(
binary(),
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
gleam@dict:dict(integer(), boolean())
) -> {ok, integer()} | {error, nil}.
find_match_in_range(Ai, B_arr, J, Hi, B_matched) ->
case J > Hi of
true ->
{error, nil};
false ->
Already = case gleam_stdlib:map_get(B_matched, J) of
{ok, true} ->
true;
_ ->
false
end,
case Already of
true ->
find_match_in_range(Ai, B_arr, J + 1, Hi, B_matched);
false ->
Bj = gleam@result:unwrap(
gleam_stdlib:map_get(B_arr, J),
<<""/utf8>>
),
case Ai =:= Bj of
true ->
{ok, J};
false ->
find_match_in_range(Ai, B_arr, J + 1, Hi, B_matched)
end
end
end.
-file("src/textmetrics/similarity.gleam", 195).
-spec find_matches(
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
integer(),
integer(),
integer(),
gleam@dict:dict(integer(), boolean()),
gleam@dict:dict(integer(), boolean())
) -> {integer(),
gleam@dict:dict(integer(), boolean()),
gleam@dict:dict(integer(), boolean())}.
find_matches(A_arr, B_arr, La, Lb, Window, I, Matches, A_matched, B_matched) ->
case I >= La of
true ->
{Matches, A_matched, B_matched};
false ->
Ai = gleam@result:unwrap(
gleam_stdlib:map_get(A_arr, I),
<<""/utf8>>
),
Lo = gleam@int:max(0, I - Window),
Hi = gleam@int:min(Lb - 1, I + Window),
case find_match_in_range(Ai, B_arr, Lo, Hi, B_matched) of
{ok, J} ->
find_matches(
A_arr,
B_arr,
La,
Lb,
Window,
I + 1,
Matches + 1,
gleam@dict:insert(A_matched, I, true),
gleam@dict:insert(B_matched, J, true)
);
{error, _} ->
find_matches(
A_arr,
B_arr,
La,
Lb,
Window,
I + 1,
Matches,
A_matched,
B_matched
)
end
end.
-file("src/textmetrics/similarity.gleam", 319).
-spec skip_unmatched_b(gleam@dict:dict(integer(), boolean()), integer()) -> integer().
skip_unmatched_b(B_matched, K) ->
case gleam_stdlib:map_get(B_matched, K) of
{ok, true} ->
K;
_ ->
skip_unmatched_b(B_matched, K + 1)
end.
-file("src/textmetrics/similarity.gleam", 270).
-spec count_half_transpositions(
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), boolean()),
gleam@dict:dict(integer(), boolean()),
integer(),
integer(),
integer(),
integer()
) -> integer().
count_half_transpositions(A_arr, B_arr, A_matched, B_matched, La, I, K, Trans) ->
case I >= La of
true ->
Trans;
false ->
case gleam_stdlib:map_get(A_matched, I) of
{ok, true} ->
New_k = skip_unmatched_b(B_matched, K),
Ai = gleam@result:unwrap(
gleam_stdlib:map_get(A_arr, I),
<<""/utf8>>
),
Bk = gleam@result:unwrap(
gleam_stdlib:map_get(B_arr, New_k),
<<""/utf8>>
),
New_trans = case Ai =:= Bk of
true ->
Trans;
false ->
Trans + 1
end,
count_half_transpositions(
A_arr,
B_arr,
A_matched,
B_matched,
La,
I + 1,
New_k + 1,
New_trans
);
_ ->
count_half_transpositions(
A_arr,
B_arr,
A_matched,
B_matched,
La,
I + 1,
K,
Trans
)
end
end.
-file("src/textmetrics/similarity.gleam", 326).
-spec common_prefix_length(list(binary()), list(binary()), integer(), integer()) -> integer().
common_prefix_length(A, B, Count, Max) ->
case Count >= Max of
true ->
Count;
false ->
case {A, B} of
{[X | Xs], [Y | Ys]} ->
case X =:= Y of
true ->
common_prefix_length(Xs, Ys, Count + 1, Max);
false ->
Count
end;
{_, _} ->
Count
end
end.
-file("src/textmetrics/similarity.gleam", 377).
-spec build_ngram_at(
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
list(binary())
) -> binary().
build_ngram_at(Arr, Start, Count, Acc) ->
case Count of
0 ->
_pipe = lists:reverse(Acc),
erlang:list_to_binary(_pipe);
_ ->
G = gleam@result:unwrap(
gleam_stdlib:map_get(Arr, Start),
<<""/utf8>>
),
build_ngram_at(Arr, Start + 1, Count - 1, [G | Acc])
end.
-file("src/textmetrics/similarity.gleam", 361).
-spec build_ngrams_loop(
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
integer(),
list(binary())
) -> list(binary()).
build_ngrams_loop(Arr, N, Len, I, Acc) ->
case (I + N) > Len of
true ->
lists:reverse(Acc);
false ->
Ngram = build_ngram_at(Arr, I, N, []),
build_ngrams_loop(Arr, N, Len, I + 1, [Ngram | Acc])
end.
-file("src/textmetrics/similarity.gleam", 392).
-spec increment_count(gleam@dict:dict(binary(), integer()), binary()) -> gleam@dict:dict(binary(), integer()).
increment_count(Acc, Key) ->
gleam@dict:upsert(Acc, Key, fun(Opt) -> case Opt of
{some, C} ->
C + 1;
none ->
1
end end).
-file("src/textmetrics/similarity.gleam", 401).
-spec intersection_size(
gleam@dict:dict(binary(), integer()),
gleam@dict:dict(binary(), integer())
) -> integer().
intersection_size(A, B) ->
gleam@dict:fold(
A,
0,
fun(Acc, Key, Ca) -> case gleam_stdlib:map_get(B, Key) of
{ok, Cb} ->
Acc + gleam@int:min(Ca, Cb);
{error, _} ->
Acc
end end
).
-file("src/textmetrics/similarity.gleam", 418).
-spec list_to_dict_loop(list(EZE), integer(), gleam@dict:dict(integer(), EZE)) -> gleam@dict:dict(integer(), EZE).
list_to_dict_loop(Items, I, Acc) ->
case Items of
[] ->
Acc;
[X | Rest] ->
list_to_dict_loop(Rest, I + 1, gleam@dict:insert(Acc, I, X))
end.
-file("src/textmetrics/similarity.gleam", 414).
-spec list_to_dict(list(EZA)) -> gleam@dict:dict(integer(), EZA).
list_to_dict(Items) ->
list_to_dict_loop(Items, 0, maps:new()).
-file("src/textmetrics/similarity.gleam", 155).
-spec jaro_graphemes(list(binary()), list(binary())) -> float().
jaro_graphemes(Ga, Gb) ->
La = erlang:length(Ga),
Lb = erlang:length(Gb),
case {La, Lb} of
{0, 0} ->
1.0;
{0, _} ->
+0.0;
{_, 0} ->
+0.0;
{_, _} ->
Max_len = gleam@int:max(La, Lb),
Window = gleam@int:max(0, (Max_len div 2) - 1),
A_arr = list_to_dict(Ga),
B_arr = list_to_dict(Gb),
{Matches, A_matched, B_matched} = find_matches(
A_arr,
B_arr,
La,
Lb,
Window,
0,
0,
maps:new(),
maps:new()
),
case Matches of
0 ->
+0.0;
_ ->
Half_t = count_half_transpositions(
A_arr,
B_arr,
A_matched,
B_matched,
La,
0,
0,
0
),
T = Half_t div 2,
M_f = erlang:float(Matches),
La_f = erlang:float(La),
Lb_f = erlang:float(Lb),
T_f = erlang:float(T),
(((case La_f of
+0.0 -> +0.0;
-0.0 -> -0.0;
Gleam@denominator -> M_f / Gleam@denominator
end) + (case Lb_f of
+0.0 -> +0.0;
-0.0 -> -0.0;
Gleam@denominator@1 -> M_f / Gleam@denominator@1
end)) + (case M_f of
+0.0 -> +0.0;
-0.0 -> -0.0;
Gleam@denominator@2 -> (M_f - T_f) / Gleam@denominator@2
end)) / 3.0
end
end.
-file("src/textmetrics/similarity.gleam", 88).
?DOC(
" Jaro similarity at the grapheme level.\n"
"\n"
" Edge cases (defined by convention):\n"
" - `jaro(\"\", \"\")` = `1.0`\n"
" - `jaro(\"\", b)` = `0.0` for non-empty `b`\n"
" - `jaro(a, \"\")` = `0.0` for non-empty `a`\n"
"\n"
" Time `O(m·n)`, space `O(m + n)`.\n"
).
-spec jaro(binary(), binary()) -> float().
jaro(A, B) ->
jaro_graphemes(gleam@string:to_graphemes(A), gleam@string:to_graphemes(B)).
-file("src/textmetrics/similarity.gleam", 99).
?DOC(" Jaro-Winkler similarity with caller-supplied parameters.\n").
-spec jaro_winkler_with(binary(), binary(), jaro_winkler_config()) -> float().
jaro_winkler_with(A, B, Config) ->
Ga = gleam@string:to_graphemes(A),
Gb = gleam@string:to_graphemes(B),
J = jaro_graphemes(Ga, Gb),
L = begin
_pipe = common_prefix_length(Ga, Gb, 0, erlang:element(3, Config)),
erlang:float(_pipe)
end,
J + ((L * erlang:element(2, Config)) * (1.0 - J)).
-file("src/textmetrics/similarity.gleam", 94).
?DOC(
" Jaro-Winkler similarity using Winkler-1990 defaults\n"
" (`prefix_scale = 0.1`, `prefix_max = 4`).\n"
).
-spec jaro_winkler(binary(), binary()) -> float().
jaro_winkler(A, B) ->
jaro_winkler_with(A, B, default_jaro_winkler_config()).
-file("src/textmetrics/similarity.gleam", 350).
-spec build_ngrams(list(binary()), integer()) -> list(binary()).
build_ngrams(Gs, N) ->
Len = erlang:length(Gs),
case Len < N of
true ->
[];
false ->
Arr = list_to_dict(Gs),
build_ngrams_loop(Arr, N, Len, 0, [])
end.
-file("src/textmetrics/similarity.gleam", 120).
?DOC(
" Sørensen-Dice coefficient over grapheme n-grams of size `n`.\n"
"\n"
" Edge cases (per spec §7.5):\n"
" - When both n-gram multisets are empty, the result is `Ok(1.0)` if\n"
" the inputs are equal (including both empty) and `Ok(0.0)`\n"
" otherwise.\n"
" - When exactly one input has no n-grams the score is `Ok(0.0)`\n"
" (no overlap is possible).\n"
" - `n < 1` returns `Error(NgramSizeInvalid(n))`.\n"
).
-spec sorensen_dice(binary(), binary(), integer()) -> {ok, float()} |
{error, sorensen_dice_error()}.
sorensen_dice(A, B, N) ->
case N < 1 of
true ->
{error, {ngram_size_invalid, N}};
false ->
Ga = gleam@string:to_graphemes(A),
Gb = gleam@string:to_graphemes(B),
Ngs_a = build_ngrams(Ga, N),
Ngs_b = build_ngrams(Gb, N),
Total_a = erlang:length(Ngs_a),
Total_b = erlang:length(Ngs_b),
case Total_a + Total_b of
0 ->
case A =:= B of
true ->
{ok, 1.0};
false ->
{ok, +0.0}
end;
Denom ->
Multiset_a = gleam@list:fold(
Ngs_a,
maps:new(),
fun increment_count/2
),
Multiset_b = gleam@list:fold(
Ngs_b,
maps:new(),
fun increment_count/2
),
Inter = intersection_size(Multiset_a, Multiset_b),
{ok, case erlang:float(Denom) of
+0.0 -> +0.0;
-0.0 -> -0.0;
Gleam@denominator -> erlang:float(2 * Inter) / Gleam@denominator
end}
end
end.