-module(textmetrics@distance).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src/textmetrics/distance.gleam").
-export([hamming/2, levenshtein_list/2, levenshtein/2, normalized_levenshtein/2, osa/2, damerau_levenshtein/2]).
-export_type([hamming_error/0]).
-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.
?MODULEDOC(
" Edit-distance functions.\n"
"\n"
" All string-typed functions operate on **extended grapheme\n"
" clusters** as defined by Unicode UAX #29; user-visible characters\n"
" (CJK ideographs, emoji ZWJ sequences, Hangul jamo) count as one\n"
" unit. Inputs are not normalized — callers wanting NFC equivalence\n"
" must normalize strings before invoking these functions.\n"
"\n"
" `levenshtein_list/2` exposes the same algorithm over arbitrary\n"
" equality-comparable lists for callers diffing tokens, AST nodes,\n"
" or any non-string sequence.\n"
).
-type hamming_error() :: {length_mismatch, integer(), integer()}.
-file("src/textmetrics/distance.gleam", 171).
-spec init_dl_rows(
gleam@dict:dict({integer(), integer()}, integer()),
integer(),
integer(),
integer()
) -> gleam@dict:dict({integer(), integer()}, integer()).
init_dl_rows(D, M, Inf, I) ->
case I > M of
true ->
D;
false ->
_pipe = D,
_pipe@1 = gleam@dict:insert(_pipe, {I + 1, 0}, Inf),
_pipe@2 = gleam@dict:insert(_pipe@1, {I + 1, 1}, I),
init_dl_rows(_pipe@2, M, Inf, I + 1)
end.
-file("src/textmetrics/distance.gleam", 188).
-spec init_dl_cols(
gleam@dict:dict({integer(), integer()}, integer()),
integer(),
integer(),
integer()
) -> gleam@dict:dict({integer(), integer()}, integer()).
init_dl_cols(D, N, Inf, J) ->
case J > N of
true ->
D;
false ->
_pipe = D,
_pipe@1 = gleam@dict:insert(_pipe, {0, J + 1}, Inf),
_pipe@2 = gleam@dict:insert(_pipe@1, {1, J + 1}, J),
init_dl_cols(_pipe@2, N, Inf, J + 1)
end.
-file("src/textmetrics/distance.gleam", 164).
-spec init_dl_matrix(integer(), integer(), integer()) -> gleam@dict:dict({integer(),
integer()}, integer()).
init_dl_matrix(M, N, Inf) ->
_pipe = maps:new(),
_pipe@1 = gleam@dict:insert(_pipe, {0, 0}, Inf),
_pipe@2 = init_dl_rows(_pipe@1, M, Inf, 0),
init_dl_cols(_pipe@2, N, Inf, 0).
-file("src/textmetrics/distance.gleam", 376).
-spec count_diffs(list(binary()), list(binary()), integer()) -> integer().
count_diffs(A, B, Acc) ->
case {A, B} of
{[], []} ->
Acc;
{[X | Xs], [Y | Ys]} ->
case X =:= Y of
true ->
count_diffs(Xs, Ys, Acc);
false ->
count_diffs(Xs, Ys, Acc + 1)
end;
{_, _} ->
Acc
end.
-file("src/textmetrics/distance.gleam", 365).
?DOC(
" Hamming distance: the number of grapheme positions at which the\n"
" inputs differ. Returns `Error(LengthMismatch(...))` when the inputs\n"
" have different grapheme counts.\n"
"\n"
" Edge cases:\n"
" - `hamming(\"\", \"\")` = `Ok(0)`\n"
" - `hamming(\"a\", \"\")` = `Error(LengthMismatch(1, 0))`\n"
"\n"
" Time `O(n)`, space `O(1)`.\n"
).
-spec hamming(binary(), binary()) -> {ok, integer()} | {error, hamming_error()}.
hamming(A, B) ->
Ga = gleam@string:to_graphemes(A),
Gb = gleam@string:to_graphemes(B),
La = erlang:length(Ga),
Lb = erlang:length(Gb),
case La =:= Lb of
false ->
{error, {length_mismatch, La, Lb}};
true ->
{ok, count_diffs(Ga, Gb, 0)}
end.
-file("src/textmetrics/distance.gleam", 396).
-spec list_to_dict_loop(list(EJY), integer(), gleam@dict:dict(integer(), EJY)) -> gleam@dict:dict(integer(), EJY).
list_to_dict_loop(Items, I, Acc) ->
case Items of
[] ->
Acc;
[X | Rest] ->
list_to_dict_loop(Rest, I + 1, gleam@dict:insert(Acc, I, X))
end.
-file("src/textmetrics/distance.gleam", 392).
-spec list_to_dict(list(EJU)) -> gleam@dict:dict(integer(), EJU).
list_to_dict(Items) ->
list_to_dict_loop(Items, 0, maps:new()).
-file("src/textmetrics/distance.gleam", 413).
-spec int_range_inclusive_loop(integer(), integer(), list(integer())) -> list(integer()).
int_range_inclusive_loop(Current, End, Acc) ->
case Current > End of
true ->
lists:reverse(Acc);
false ->
int_range_inclusive_loop(Current + 1, End, [Current | Acc])
end.
-file("src/textmetrics/distance.gleam", 409).
-spec int_range_inclusive(integer(), integer()) -> list(integer()).
int_range_inclusive(Start, End) ->
int_range_inclusive_loop(Start, End, []).
-file("src/textmetrics/distance.gleam", 403).
-spec make_init_row(integer()) -> gleam@dict:dict(integer(), integer()).
make_init_row(N) ->
_pipe = int_range_inclusive(0, N),
_pipe@1 = gleam@list:index_map(_pipe, fun(V, I) -> {I, V} end),
maps:from_list(_pipe@1).
-file("src/textmetrics/distance.gleam", 420).
-spec min3(integer(), integer(), integer()) -> integer().
min3(A, B, C) ->
gleam@int:min(A, gleam@int:min(B, C)).
-file("src/textmetrics/distance.gleam", 111).
-spec levenshtein_inner(
EHJ,
list(EHJ),
integer(),
list(integer()),
integer(),
list(integer())
) -> list(integer()).
levenshtein_inner(Ai, Bs, Prev_diag, Prev_rest, Curr_left, Acc) ->
case {Bs, Prev_rest} of
{[], _} ->
lists:reverse(Acc);
{_, []} ->
lists:reverse(Acc);
{[Bj | Bs_rest], [Prev_above | Prev_more]} ->
Sub_cost = case Ai =:= Bj of
true ->
0;
false ->
1
end,
Curr_j = min3(Prev_above + 1, Curr_left + 1, Prev_diag + Sub_cost),
levenshtein_inner(
Ai,
Bs_rest,
Prev_above,
Prev_more,
Curr_j,
[Curr_j | Acc]
)
end.
-file("src/textmetrics/distance.gleam", 89).
-spec levenshtein_outer(list(EHF), list(EHF), list(integer()), integer()) -> integer().
levenshtein_outer(Remaining_a, Bs, Prev, I) ->
case Remaining_a of
[] ->
case gleam@list:last(Prev) of
{ok, V} ->
V;
{error, _} ->
0
end;
[Ai | Rest] ->
Curr = case Prev of
[P0 | P_rest] ->
levenshtein_inner(Ai, Bs, P0, P_rest, I, [I]);
[] ->
[I]
end,
levenshtein_outer(Rest, Bs, Curr, I + 1)
end.
-file("src/textmetrics/distance.gleam", 74).
?DOC(
" Generic Levenshtein distance over any equality-comparable list\n"
" type. Used by `diff` internally and exposed for callers diffing\n"
" token streams or AST nodes.\n"
).
-spec levenshtein_list(list(EHC), list(EHC)) -> integer().
levenshtein_list(A, B) ->
N = erlang:length(B),
case A of
[] ->
N;
_ ->
case B of
[] ->
erlang:length(A);
_ ->
Initial_prev = int_range_inclusive(0, N),
levenshtein_outer(A, B, Initial_prev, 1)
end
end.
-file("src/textmetrics/distance.gleam", 35).
?DOC(
" Minimum number of single-grapheme insert / delete / substitute\n"
" operations needed to transform `a` into `b`.\n"
"\n"
" Edge cases:\n"
" - `levenshtein(\"\", \"\")` = `0`\n"
" - `levenshtein(\"\", b)` = grapheme count of `b`\n"
" - `levenshtein(a, \"\")` = grapheme count of `a`\n"
" - `levenshtein(a, a)` = `0`\n"
"\n"
" Time `O(m·n)`, space `O(min(m, n))`.\n"
).
-spec levenshtein(binary(), binary()) -> integer().
levenshtein(A, B) ->
levenshtein_list(gleam@string:to_graphemes(A), gleam@string:to_graphemes(B)).
-file("src/textmetrics/distance.gleam", 53).
?DOC(
" Levenshtein-based similarity in `[0.0, 1.0]`.\n"
"\n"
" Defined as `1.0 - levenshtein(a, b) / max(|a|, |b|)` where lengths\n"
" are grapheme counts. `1.0` means identical, `0.0` means every\n"
" grapheme position differs at the longer length.\n"
"\n"
" Edge cases:\n"
" - `normalized_levenshtein(\"\", \"\")` = `1.0` (convention).\n"
" - `normalized_levenshtein(a, a)` = `1.0`.\n"
" - `normalized_levenshtein(\"\", non_empty)` = `0.0`.\n"
"\n"
" Use this when ranking by Levenshtein-style edit distance is\n"
" preferred over Jaro / Jaro-Winkler. Time `O(m·n)`, space\n"
" `O(min(m, n))` — same as [`levenshtein`](#levenshtein).\n"
).
-spec normalized_levenshtein(binary(), binary()) -> float().
normalized_levenshtein(A, B) ->
Ga = gleam@string:to_graphemes(A),
Gb = gleam@string:to_graphemes(B),
La = erlang:length(Ga),
Lb = erlang:length(Gb),
Max_len = case La >= Lb of
true ->
La;
false ->
Lb
end,
case Max_len of
0 ->
1.0;
_ ->
D = levenshtein_list(Ga, Gb),
1.0 - (case erlang:float(Max_len) of
+0.0 -> +0.0;
-0.0 -> -0.0;
Gleam@denominator -> erlang:float(D) / Gleam@denominator
end)
end.
-file("src/textmetrics/distance.gleam", 315).
-spec osa_inner(
binary(),
{ok, binary()} | {error, nil},
gleam@dict:dict(integer(), binary()),
integer(),
gleam@dict:dict(integer(), integer()),
gleam@dict:dict(integer(), integer()),
integer(),
gleam@dict:dict(integer(), integer())
) -> gleam@dict:dict(integer(), integer()).
osa_inner(Ai, Prev_a, B_dict, N, Prev, Prev_prev, J, Curr) ->
case J > N of
true ->
Curr;
false ->
Bj = gleam@result:unwrap(
gleam_stdlib:map_get(B_dict, J - 1),
<<""/utf8>>
),
Prev_above = gleam@result:unwrap(gleam_stdlib:map_get(Prev, J), 0),
Prev_diag = gleam@result:unwrap(
gleam_stdlib:map_get(Prev, J - 1),
0
),
Curr_left = gleam@result:unwrap(
gleam_stdlib:map_get(Curr, J - 1),
0
),
Sub_cost = case Ai =:= Bj of
true ->
0;
false ->
1
end,
V0 = min3(Prev_above + 1, Curr_left + 1, Prev_diag + Sub_cost),
V1 = case {J >= 2, Prev_a} of
{true, {ok, Pa}} ->
Prev_b = gleam@result:unwrap(
gleam_stdlib:map_get(B_dict, J - 2),
Bj
),
case (Ai =:= Prev_b) andalso (Pa =:= Bj) of
true ->
Pp = gleam@result:unwrap(
gleam_stdlib:map_get(Prev_prev, J - 2),
0
),
gleam@int:min(V0, Pp + 1);
false ->
V0
end;
{_, _} ->
V0
end,
New_curr = gleam@dict:insert(Curr, J, V1),
osa_inner(Ai, Prev_a, B_dict, N, Prev, Prev_prev, J + 1, New_curr)
end.
-file("src/textmetrics/distance.gleam", 291).
-spec osa_outer(
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
integer(),
gleam@dict:dict(integer(), integer()),
gleam@dict:dict(integer(), integer())
) -> integer().
osa_outer(A_dict, B_dict, M, N, I, Prev, Prev_prev) ->
case I > M of
true ->
gleam@result:unwrap(gleam_stdlib:map_get(Prev, N), 0);
false ->
Ai = gleam@result:unwrap(
gleam_stdlib:map_get(A_dict, I - 1),
<<""/utf8>>
),
Prev_a = case gleam_stdlib:map_get(A_dict, I - 2) of
{ok, V} ->
{ok, V};
{error, _} ->
{error, nil}
end,
Curr0 = gleam@dict:insert(maps:new(), 0, I),
Curr = osa_inner(Ai, Prev_a, B_dict, N, Prev, Prev_prev, 1, Curr0),
osa_outer(A_dict, B_dict, M, N, I + 1, Curr, Prev)
end.
-file("src/textmetrics/distance.gleam", 275).
-spec osa_graphemes(list(binary()), list(binary())) -> integer().
osa_graphemes(Ga, Gb) ->
M = erlang:length(Ga),
N = erlang:length(Gb),
case {M, N} of
{0, _} ->
N;
{_, 0} ->
M;
{_, _} ->
A_dict = list_to_dict(Ga),
B_dict = list_to_dict(Gb),
Prev = make_init_row(N),
Prev_prev = maps:new(),
osa_outer(A_dict, B_dict, M, N, 1, Prev, Prev_prev)
end.
-file("src/textmetrics/distance.gleam", 271).
?DOC(
" Optimal String Alignment distance. Same operations as\n"
" Damerau-Levenshtein but each substring is edited at most once,\n"
" which is what most \"Damerau distance\" implementations actually\n"
" compute.\n"
"\n"
" OSA does **not** satisfy the triangle inequality. Use\n"
" [`damerau_levenshtein`](#damerau_levenshtein) when a true metric is\n"
" required.\n"
"\n"
" Time `O(m·n)`, space `O(min(m, n))` (three rows).\n"
).
-spec osa(binary(), binary()) -> integer().
osa(A, B) ->
osa_graphemes(gleam@string:to_graphemes(A), gleam@string:to_graphemes(B)).
-file("src/textmetrics/distance.gleam", 424).
-spec min4(integer(), integer(), integer(), integer()) -> integer().
min4(A, B, C, D) ->
gleam@int:min(gleam@int:min(A, B), gleam@int:min(C, D)).
-file("src/textmetrics/distance.gleam", 225).
-spec dl_inner(
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
integer(),
integer(),
gleam@dict:dict(binary(), integer()),
gleam@dict:dict({integer(), integer()}, integer())
) -> gleam@dict:dict({integer(), integer()}, integer()).
dl_inner(A_dict, B_dict, N, I, J, Db, Da, D) ->
case J > N of
true ->
D;
false ->
Ai = gleam@result:unwrap(
gleam_stdlib:map_get(A_dict, I - 1),
<<""/utf8>>
),
Bj = gleam@result:unwrap(
gleam_stdlib:map_get(B_dict, J - 1),
<<""/utf8>>
),
K = gleam@result:unwrap(gleam_stdlib:map_get(Da, Bj), 0),
L = Db,
{Cost, New_db} = case Ai =:= Bj of
true ->
{0, J};
false ->
{1, Db}
end,
Sub = gleam@result:unwrap(gleam_stdlib:map_get(D, {I, J}), 0) + Cost,
Ins = gleam@result:unwrap(gleam_stdlib:map_get(D, {I + 1, J}), 0) + 1,
Del = gleam@result:unwrap(gleam_stdlib:map_get(D, {I, J + 1}), 0) + 1,
Trans = ((gleam@result:unwrap(gleam_stdlib:map_get(D, {K, L}), 0) + ((I
- K)
- 1))
+ 1)
+ ((J - L) - 1),
Val = min4(Sub, Ins, Del, Trans),
New_d = gleam@dict:insert(D, {I + 1, J + 1}, Val),
dl_inner(A_dict, B_dict, N, I, J + 1, New_db, Da, New_d)
end.
-file("src/textmetrics/distance.gleam", 205).
-spec dl_outer(
gleam@dict:dict(integer(), binary()),
gleam@dict:dict(integer(), binary()),
integer(),
integer(),
integer(),
gleam@dict:dict(binary(), integer()),
gleam@dict:dict({integer(), integer()}, integer())
) -> gleam@dict:dict({integer(), integer()}, integer()).
dl_outer(A_dict, B_dict, M, N, I, Da, D) ->
case I > M of
true ->
D;
false ->
New_d = dl_inner(A_dict, B_dict, N, I, 1, 0, Da, D),
Ai = gleam@result:unwrap(
gleam_stdlib:map_get(A_dict, I - 1),
<<""/utf8>>
),
New_da = gleam@dict:insert(Da, Ai, I),
dl_outer(A_dict, B_dict, M, N, I + 1, New_da, New_d)
end.
-file("src/textmetrics/distance.gleam", 147).
-spec damerau_levenshtein_graphemes(list(binary()), list(binary())) -> integer().
damerau_levenshtein_graphemes(Ga, Gb) ->
M = erlang:length(Ga),
N = erlang:length(Gb),
case {M, N} of
{0, _} ->
N;
{_, 0} ->
M;
{_, _} ->
A_dict = list_to_dict(Ga),
B_dict = list_to_dict(Gb),
Inf = M + N,
Initial = init_dl_matrix(M, N, Inf),
Final_d = dl_outer(A_dict, B_dict, M, N, 1, maps:new(), Initial),
gleam@result:unwrap(
gleam_stdlib:map_get(Final_d, {M + 1, N + 1}),
0
)
end.
-file("src/textmetrics/distance.gleam", 143).
?DOC(
" True Damerau-Levenshtein distance: insert, delete, substitute, and\n"
" transposition of two adjacent graphemes are each counted as one\n"
" operation. Unlike OSA, the same substring may participate in\n"
" multiple edits (e.g. `\"ca\"` → `\"abc\"` is `2`).\n"
"\n"
" Time `O(m·n)`, space `O(m·n)` (full matrix is required for the\n"
" transposition recurrence).\n"
).
-spec damerau_levenshtein(binary(), binary()) -> integer().
damerau_levenshtein(A, B) ->
damerau_levenshtein_graphemes(
gleam@string:to_graphemes(A),
gleam@string:to_graphemes(B)
).