-module(babble).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src/babble.gleam").
-export([new/2, config/1, is_empty/1, message_count/1, tokenize/2, sentences/1, train/2, train_many/2, generate/3, generate_paragraph/4, generate_starting_with/4, weighted/1, most_likely/1]).
-export_type([token/0, tokenization/0, config/0, generate_error/0, step/0, model/0]).
%% Documentation macros used throughout this module.
%% On OTP 27 and later they expand to the `-moduledoc` / `-doc` attributes.
%% On older releases they expand to an empty `-compile([])` attribute, so the
%% documentation string is dropped from the expansion.
-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
%% `Str` is not used in these fallback expansions.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.
?MODULEDOC(
" A small Markov chain text generator: train it on example text, then generate\n"
" new sentences that sound _almost_ like the source. It offers incremental\n"
" training, sentence-aware generation that stops at a natural full stop, and a\n"
" pluggable sampler that puts you in control of _how_ each next word is chosen.\n"
"\n"
" ```gleam\n"
" let model =\n"
" babble.new(order: 2, tokenization: babble.Words)\n"
" |> babble.train(\"the cat sat on the mat.\")\n"
" |> babble.train(\"the dog sat on the log.\")\n"
"\n"
" let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
" ```\n"
"\n"
" Generation is driven by a [`Sampler`](#Sampler) — see that type along with\n"
" [`weighted`](#weighted) and [`most_likely`](#most_likely).\n"
).
%% A chain token: `start` and 'end' are the sentence-boundary markers added by
%% pad/2, and `{word, Text}` wraps one base token produced by tokenize/2.
-type token() :: start | 'end' | {word, binary()}.
%% How text is split into base tokens (see tokenize/2): `words` splits on
%% spaces after mapping "\n", "\t" and "\r" to spaces; `characters` splits
%% into graphemes.
-type tokenization() :: words | characters.
%% `{config, Order, Tokenization}`; new/2 stores the order clamped to >= 1.
-type config() :: {config, integer(), tokenization()}.
%% The error returned by generate/3, generate_paragraph/4 and
%% generate_starting_with/4 when there is nothing to generate from.
-type generate_error() :: empty_model.
%% A sampler's decision: `{continue, Word}` emits `Word` and keeps walking,
%% `stop` ends the sentence (see gen_loop/6).
-type step() :: {continue, binary()} | stop.
%% `{model, Config, Transitions, MessageCount}`.
%% `Transitions` maps a context (list of tokens) to a table of successor
%% tokens and how many times each was counted; `MessageCount` is the value
%% reported by message_count/1.
-opaque model() :: {model,
config(),
gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
integer()}.
-file("src/babble.gleam", 122).
?DOC(
" A new empty model, ready to [`train`](#train).\n"
"\n"
" Both settings are fixed at construction (changing them would invalidate the\n"
" learned counts, so there are no setters):\n"
" - `order` — the n-gram context length: how many previous tokens to condition\n"
" on when picking the next. Clamped to >= 1. Higher = more coherent but more\n"
" verbatim; 2 is a good default.\n"
" - `tokenization` — `Words` or `Characters`.\n"
"\n"
" The generation length cap is passed to [`generate`](#generate), not set here.\n"
"\n"
" ## Examples\n"
"\n"
" ```gleam\n"
" let model = babble.new(order: 2, tokenization: babble.Words)\n"
" assert babble.is_empty(model)\n"
" ```\n"
).
-spec new(integer(), tokenization()) -> model().
new(Order, Tokenization) ->
    %% The order is clamped so a context always holds at least one token.
    Clamped = gleam@int:max(1, Order),
    Config = {config, Clamped, Tokenization},
    %% Fresh model: no transitions learned, no messages counted.
    {model, Config, maps:new(), 0}.
-file("src/babble.gleam", 131).
?DOC(" The (clamped) configuration this model was built with.\n").
-spec config(model()) -> config().
config({model, Config, _Transitions, _MessageCount}) ->
    %% The configuration is the second field of the model tuple.
    Config.
-file("src/babble.gleam", 136).
?DOC(" True when the model has learned no transitions yet.\n").
-spec is_empty(model()) -> boolean().
is_empty({model, _Config, Transitions, _MessageCount}) ->
    %% Emptiness is decided purely by the transition table.
    gleam@dict:is_empty(Transitions).
-file("src/babble.gleam", 141).
?DOC(" How many non-empty messages have been folded into the model.\n").
-spec message_count(model()) -> integer().
message_count({model, _Config, _Transitions, MessageCount}) ->
    %% The counter is the fourth field of the model tuple.
    MessageCount.
-file("src/babble.gleam", 209).
?DOC(
" Increment the count of `next` following `context`, creating the successor\n"
" table if absent.\n"
).
-spec bump(
    gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
    list(token()),
    token()
) -> gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())).
bump(Transitions, Context, Next) ->
    %% A missing count starts at 0 before being incremented.
    Increment = fun(Count) -> gleam@option:unwrap(Count, 0) + 1 end,
    %% A missing successor table starts out empty.
    UpdateSuccessors = fun(Existing) ->
        Successors = gleam@option:unwrap(Existing, maps:new()),
        gleam@dict:upsert(Successors, Next, Increment)
    end,
    gleam@dict:upsert(Transitions, Context, UpdateSuccessors).
-file("src/babble.gleam", 195).
?DOC(
" Slide an `order`-length window across `padded`, counting each\n"
" context -> next transition. Stops when no token follows the context.\n"
).
-spec count_window(
    gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
    list(token()),
    integer()
) -> gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())).
count_window(Transitions, Padded, Order) ->
    %% Each window is `Order` context tokens followed by the token to count.
    Windows = gleam@list:window(Padded, Order + 1),
    CountOne = fun(Acc, Window) ->
        case gleam@list:last(Window) of
            {error, nil} ->
                Acc;
            {ok, Next} ->
                Context = gleam@list:take(Window, Order),
                bump(Acc, Context, Next)
        end
    end,
    gleam@list:fold(Windows, Transitions, CountOne).
-file("src/babble.gleam", 189).
?DOC(" Pad a sentence's base tokens with `order` `Start`s and a trailing `End`.\n").
-spec pad(list(binary()), integer()) -> list(token()).
pad(Base, Order) ->
    Starts = gleam@list:repeat(start, Order),
    Words = [{word, Text} || Text <- Base],
    %% Layout: leading start markers, the wrapped words, one closing 'end'.
    Starts ++ (Words ++ ['end']).
-file("src/babble.gleam", 475).
?DOC(false).
-spec tokenize(binary(), tokenization()) -> list(binary()).
tokenize(Sentence, words) ->
    %% Turn newline, tab and carriage return into spaces (in that order), so
    %% that a single split on " " separates every word.
    Spaced = lists:foldl(
        fun(Whitespace, Text) ->
            gleam@string:replace(Text, Whitespace, <<" "/utf8>>)
        end,
        Sentence,
        [<<"\n"/utf8>>, <<"\t"/utf8>>, <<"\r"/utf8>>]
    ),
    Pieces = gleam@string:split(Spaced, <<" "/utf8>>),
    %% Consecutive separators leave empty pieces behind; drop them.
    gleam@list:filter(Pieces, fun(Piece) -> Piece /= <<""/utf8>> end);
tokenize(Sentence, characters) ->
    gleam@string:to_graphemes(Sentence).
-file("src/babble.gleam", 488).
%% Split a grapheme list into sentence chunks.
%%
%% `Buffer` is the chunk being built and `Acc` holds the finished chunks,
%% newest first. A chunk is closed after ".", "!" or "?" when that grapheme is
%% the last one or is followed by a space, newline, tab or carriage return.
%% The final buffer is always pushed, even when it is empty.
-spec segment(list(binary()), binary(), list(binary())) -> list(binary()).
segment([], Buffer, Acc) ->
    [Buffer | Acc];
segment([Grapheme | Rest], Buffer, Acc) ->
    Extended = <<Buffer/binary, Grapheme/binary>>,
    case segment_terminal(Grapheme) andalso segment_boundary(Rest) of
        true ->
            segment(Rest, <<""/utf8>>, [Extended | Acc]);
        false ->
            segment(Rest, Extended, Acc)
    end.

%% True when the grapheme is sentence-ending punctuation.
-spec segment_terminal(binary()) -> boolean().
segment_terminal(Grapheme) ->
    lists:member(Grapheme, [<<"."/utf8>>, <<"!"/utf8>>, <<"?"/utf8>>]).

%% True when the remaining input is empty or starts with whitespace.
-spec segment_boundary(list(binary())) -> boolean().
segment_boundary([]) ->
    true;
segment_boundary([Next | _]) ->
    lists:member(
        Next,
        [<<" "/utf8>>, <<"\n"/utf8>>, <<"\t"/utf8>>, <<"\r"/utf8>>]
    ).
-file("src/babble.gleam", 464).
?DOC(false).
-spec sentences(binary()) -> list(binary()).
sentences(Message) ->
    Graphemes = gleam@string:to_graphemes(Message),
    %% segment/3 returns the chunks newest-first; restore source order.
    Segments = lists:reverse(segment(Graphemes, <<""/utf8>>, [])),
    %% Trim each chunk and keep only those that still contain text.
    [
        Sentence
     || Raw <- Segments,
        Sentence <- [gleam@string:trim(Raw)],
        Sentence /= <<""/utf8>>
    ].
-file("src/babble.gleam", 161).
?DOC(
" Fold a single message into the model, returning a new model.\n"
"\n"
" Each sentence is tokenised, padded with `order` `Start` markers and a\n"
" trailing `End`, and every `order`-length context -> next transition is\n"
" counted. The message counter bumps once if the message held a non-empty\n"
" sentence. It is cheap and never rebuilds, so you can keep folding in new text.\n"
"\n"
" ## Examples\n"
"\n"
" ```gleam\n"
" let model =\n"
" babble.new(order: 2, tokenization: babble.Words)\n"
" |> babble.train(\"the cat sat.\")\n"
"\n"
" assert babble.message_count(model) == 1\n"
" ```\n"
).
-spec train(model(), binary()) -> model().
train(Model, Message) ->
    {model, Config, Transitions, Seen} = Model,
    {config, Order, Tokenization} = Config,
    %% One padded token list per sentence; sentences that tokenise to nothing
    %% are skipped.
    PaddedSentences = gleam@list:filter_map(
        sentences(Message),
        fun(Sentence) ->
            case tokenize(Sentence, Tokenization) of
                [] ->
                    {error, nil};
                Base ->
                    {ok, pad(Base, Order)}
            end
        end
    ),
    Learned = gleam@list:fold(
        PaddedSentences,
        Transitions,
        fun(Acc, Padded) -> count_window(Acc, Padded, Order) end
    ),
    %% The message counter only moves when the message contributed a sentence.
    Messages =
        case PaddedSentences of
            [] -> Seen;
            _ -> Seen + 1
        end,
    {model, Config, Learned, Messages}.
-file("src/babble.gleam", 184).
?DOC(" Fold many messages into the model, in order.\n").
-spec train_many(model(), list(binary())) -> model().
train_many(Model, Messages) ->
    %% Left fold: messages are trained in list order, threading the model.
    lists:foldl(fun(Message, Acc) -> train(Acc, Message) end, Model, Messages).
-file("src/babble.gleam", 435).
?DOC(" The all-`Start` context generation begins from.\n").
-spec start_context(model()) -> list(token()).
start_context({model, {config, Order, _Tokenization}, _Transitions, _MessageCount}) ->
    %% As many `start` markers as the model's order.
    gleam@list:repeat(start, Order).
-file("src/babble.gleam", 451).
?DOC(
" Join base tokens (already in final order) into a string under the given\n"
" tokenization.\n"
).
-spec join(tokenization(), list(binary())) -> binary().
%% Words are separated by a single space.
join(words, Tokens) ->
    gleam@string:join(Tokens, <<" "/utf8>>);
%% Characters are concatenated with no separator.
join(characters, Tokens) ->
    erlang:list_to_binary(Tokens).
-file("src/babble.gleam", 420).
?DOC(
" The successor table as sampler-facing weighted candidates. `End` is the only\n"
" non-`Word` token reachable as a successor, so it becomes `Stop`.\n"
).
-spec candidates(gleam@dict:dict(token(), integer())) -> list({step(),
    integer()}).
candidates(Counts) ->
    %% Keep each count as the weight and translate its token into a step.
    [{candidate_step(Token), Count} || {Token, Count} <- maps:to_list(Counts)].

%% Map a successor token to the step a sampler sees: words continue, both
%% boundary markers stop.
-spec candidate_step(token()) -> step().
candidate_step({word, Word}) ->
    {continue, Word};
candidate_step(start) ->
    stop;
candidate_step('end') ->
    stop.
-file("src/babble.gleam", 391).
%% Walk the chain from `Context`, pushing each chosen word onto `Emitted`
%% (newest first). `Count` is the number of words emitted by this walk.
%% The walk ends when `Count` reaches `Max_tokens`, when `Context` has no
%% recorded successors, or when the sampler answers `stop`.
-spec gen_loop(
    model(),
    list(token()),
    list(binary()),
    integer(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> list(binary()).
gen_loop(_Model, _Context, Emitted, Count, _Sampler, Max_tokens) when
    Count >= Max_tokens
->
    Emitted;
gen_loop(Model, Context, Emitted, Count, Sampler, Max_tokens) ->
    case gleam_stdlib:map_get(erlang:element(3, Model), Context) of
        {ok, Counts} ->
            case Sampler(candidates(Counts)) of
                stop ->
                    Emitted;
                {continue, Word} ->
                    %% Slide the context: drop the oldest token, append the
                    %% word that was just chosen.
                    Shifted = lists:append(
                        gleam@list:drop(Context, 1),
                        [{word, Word}]
                    ),
                    gen_loop(
                        Model,
                        Shifted,
                        [Word | Emitted],
                        Count + 1,
                        Sampler,
                        Max_tokens
                    )
            end;
        _ ->
            Emitted
    end.
-file("src/babble.gleam", 379).
?DOC(
" Walk from `context` to a sentence end with `sampler`, emitting at most\n"
" `max_tokens` words, then join. `acc` holds already-emitted prefix words,\n"
" newest-first.\n"
).
-spec generate_sentence(
    model(),
    list(token()),
    list(binary()),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> binary().
generate_sentence(Model, Context, Acc, Sampler, Max_tokens) ->
    %% Always allow at least one word, whatever cap was requested.
    Cap = gleam@int:max(1, Max_tokens),
    NewestFirst = gen_loop(Model, Context, Acc, 0, Sampler, Cap),
    {model, {config, _Order, Tokenization}, _Transitions, _MessageCount} = Model,
    %% The walk accumulates newest-first; flip it before joining.
    join(Tokenization, lists:reverse(NewestFirst)).
-file("src/babble.gleam", 430).
?DOC(" Whether generation can begin: the all-`Start` context has transitions.\n").
-spec startable(model()) -> boolean().
startable({model, _Config, Transitions, _MessageCount} = Model) ->
    %% Generation can start only if the all-`start` context was learned.
    gleam@dict:has_key(Transitions, start_context(Model)).
-file("src/babble.gleam", 242).
?DOC(
" Generate one sentence, choosing each next word with `sampler` and emitting at\n"
" most `max_tokens` of them.\n"
"\n"
" Walks the chain from the start of a sentence, asking `sampler` for the next\n"
" step at each point, until it stops at a learned sentence end or reaches\n"
" `max_tokens` (clamped to >= 1). Returns `Error(EmptyModel)` if the model has\n"
" never been trained.\n"
"\n"
" Pass [`weighted`](#weighted) for varied, corpus-like output or\n"
" [`most_likely`](#most_likely) for deterministic output. See [`Sampler`](#Sampler)\n"
" to write your own.\n"
"\n"
" ## Examples\n"
"\n"
" ```gleam\n"
" // Varied output — a different sentence each call:\n"
" let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
"\n"
" // No data yet:\n"
" let empty = babble.new(order: 2, tokenization: babble.Words)\n"
" assert babble.generate(empty, babble.weighted, max_tokens: 50) == Error(babble.EmptyModel)\n"
" ```\n"
).
-spec generate(model(), fun((list({step(), integer()})) -> step()), integer()) -> {ok,
        binary()} |
    {error, generate_error()}.
generate(Model, Sampler, Max_tokens) ->
    case startable(Model) of
        true ->
            %% Begin at the sentence start with nothing emitted yet.
            Context = start_context(Model),
            {ok, generate_sentence(Model, Context, [], Sampler, Max_tokens)};
        false ->
            {error, empty_model}
    end.
-file("src/babble.gleam", 256).
?DOC(
" Generate `sentences` sentences (at least 1) with `sampler`, each capped at\n"
" `max_tokens`, joined by spaces.\n"
).
-spec generate_paragraph(
    model(),
    integer(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> {ok, binary()} | {error, generate_error()}.
generate_paragraph(Model, Sentences, Sampler, Max_tokens) ->
    case startable(Model) of
        true ->
            Context = start_context(Model),
            %% One placeholder per requested sentence, never fewer than one.
            Slots = gleam@list:repeat(nil, gleam@int:max(1, Sentences)),
            Generated = [
                generate_sentence(Model, Context, [], Sampler, Max_tokens)
             || _ <- Slots
            ],
            {ok, gleam@string:join(Generated, <<" "/utf8>>)};
        false ->
            {error, empty_model}
    end.
-file("src/babble.gleam", 441).
?DOC(
" Build a seed context from prefix words: the last `order` words, left-padded\n"
" with `Start` when there are fewer than `order` of them.\n"
).
-spec seed_context(list(token()), integer()) -> list(token()).
seed_context(Words, Order) ->
    Count = erlang:length(Words),
    if
        %% Enough words: keep only the trailing `Order` of them.
        Count >= Order ->
            gleam@list:drop(Words, Count - Order);
        %% Too few words: fill the front with `start` markers.
        true ->
            gleam@list:repeat(start, Order - Count) ++ Words
    end.
-file("src/babble.gleam", 280).
?DOC(
" Generate a sentence that begins with `prefix`, choosing with `sampler` and\n"
" emitting at most `max_tokens` words beyond the prefix.\n"
"\n"
" The continuation seeds from the last `order` prefix words (left-padded with\n"
" `Start`); an unknown prefix falls back to the start context, but the prefix\n"
" words are always kept at the front. Empty models return `Error(EmptyModel)`.\n"
).
-spec generate_starting_with(
    model(),
    binary(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> {ok, binary()} | {error, generate_error()}.
generate_starting_with(Model, Prefix, Sampler, Max_tokens) ->
    case is_empty(Model) of
        false ->
            {model, {config, Order, Tokenization}, Transitions, _MessageCount} = Model,
            Base = tokenize(Prefix, Tokenization),
            Seed = seed_context([{word, Text} || Text <- Base], Order),
            %% A seed the model never saw cannot be continued; start from the
            %% sentence beginning instead.
            Start =
                case gleam@dict:has_key(Transitions, Seed) of
                    true -> Seed;
                    false -> start_context(Model)
                end,
            %% The prefix words are handed over newest-first so they end up at
            %% the front of the finished sentence.
            Sentence = generate_sentence(
                Model,
                Start,
                lists:reverse(Base),
                Sampler,
                Max_tokens
            ),
            {ok, Sentence};
        true ->
            {error, empty_model}
    end.
-file("src/babble.gleam", 323).
%% Choose the candidate whose weight interval contains `R`.
%%
%% Walks the list, subtracting each weight from `R` until `R` falls below the
%% current weight. An empty list yields `stop`; the last candidate is returned
%% regardless of `R`.
-spec pick(list({step(), integer()}), integer()) -> step().
pick([], _R) ->
    stop;
pick([{Step, _Weight}], _R) ->
    Step;
pick([{Step, Weight} | _Rest], R) when R < Weight ->
    Step;
pick([{_Step, Weight} | Rest], R) ->
    pick(Rest, R - Weight).
-file("src/babble.gleam", 318).
?DOC(
" A [`Sampler`](#Sampler) that picks a successor at random, with probability\n"
" proportional to how often it followed the context in training.\n"
"\n"
" This is the natural \"talk like the corpus\" behaviour and the one you'll want\n"
" most of the time. It uses the platform RNG, so output varies between calls —\n"
" pass it straight to [`generate`](#generate); you rarely call it yourself.\n"
"\n"
" ## Examples\n"
"\n"
" ```gleam\n"
" let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
" ```\n"
).
-spec weighted(list({step(), integer()})) -> step().
weighted(Candidates) ->
    Total = lists:foldl(
        fun(Candidate, Sum) -> Sum + erlang:element(2, Candidate) end,
        0,
        Candidates
    ),
    %% The bound passed to the RNG is kept at 1 or more, even for no candidates.
    Bound = gleam@int:max(1, Total),
    pick(Candidates, gleam@int:random(Bound)).
-file("src/babble.gleam", 369).
?DOC(
" A deterministic sort key for tie-breaking: `Stop` before any `Continue`,\n"
" words alphabetically.\n"
).
-spec step_key(step()) -> binary().
%% "0" sorts ahead of every "1"-prefixed key, so `stop` wins ties.
step_key(stop) ->
    <<"0"/utf8>>;
step_key({continue, Word}) ->
    <<"1"/utf8, Word/binary>>.
-file("src/babble.gleam", 353).
?DOC(
" A [`Sampler`](#Sampler) that always picks the most frequent successor, with\n"
" ties broken deterministically so the result never depends on internal map\n"
" ordering.\n"
"\n"
" Generation with this sampler is fully reproducible: a given model always\n"
" produces the same sentence. That makes it ideal for tests and snapshots, or a\n"
" fixed \"house style\" output. Because it always takes the single most-travelled\n"
" path, its output tends to reproduce whole training sentences verbatim.\n"
"\n"
" ## Examples\n"
"\n"
" ```gleam\n"
" let model =\n"
" babble.new(order: 2, tokenization: babble.Words)\n"
" |> babble.train(\"the cat sat.\")\n"
"\n"
" assert babble.generate(model, babble.most_likely, max_tokens: 50) == Ok(\"the cat sat.\")\n"
" ```\n"
).
-spec most_likely(list({step(), integer()})) -> step().
most_likely(Candidates) ->
    %% Higher counts sort first (note the swapped arguments); equal counts
    %% fall back to the deterministic step key.
    ByCountThenKey = fun({StepA, CountA}, {StepB, CountB}) ->
        case gleam@int:compare(CountB, CountA) of
            eq ->
                gleam@string:compare(step_key(StepA), step_key(StepB));
            Decided ->
                Decided
        end
    end,
    case gleam@list:sort(Candidates, ByCountThenKey) of
        [] ->
            stop;
        [{Best, _Count} | _Others] ->
            Best
    end.