%% src/babble.erl

%% NOTE(review): this looks like compiler output for the Gleam source named in
%% ?FILEPATH (the -file attributes below point at the same path) -- confirm
%% before hand-editing.
-module(babble).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src/babble.gleam").
-export([new/2, config/1, is_empty/1, message_count/1, tokenize/2, sentences/1, train/2, train_many/2, generate/3, generate_paragraph/4, generate_starting_with/4, weighted/1, most_likely/1]).
-export_type([token/0, tokenization/0, config/0, generate_error/0, step/0, model/0]).

%% Documentation macros: on OTP 27 and later they expand to the -moduledoc /
%% -doc attributes; on older releases they expand to an empty -compile
%% attribute so the ?MODULEDOC(...) / ?DOC(...) call sites still parse.
-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.

?MODULEDOC(
    " A small Markov chain text generator: train it on example text, then generate\n"
    " new sentences that sound _almost_ like the source. It offers incremental\n"
    " training, sentence-aware generation that stops at a natural full stop, and a\n"
    " pluggable sampler that puts you in control of _how_ each next word is chosen.\n"
    "\n"
    " ```gleam\n"
    " let model =\n"
    "   babble.new(order: 2, tokenization: babble.Words)\n"
    "   |> babble.train(\"the cat sat on the mat.\")\n"
    "   |> babble.train(\"the dog sat on the log.\")\n"
    "\n"
    " let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
    " ```\n"
    "\n"
    " Generation is driven by a [`Sampler`](#Sampler) — see that type along with\n"
    " [`weighted`](#weighted) and [`most_likely`](#most_likely).\n"
).

%% One element of a padded sentence: the `start` / 'end' sentinels added by
%% pad/2, or a `{word, Text}` wrapping one base token produced by tokenize/2.
-type token() :: start | 'end' | {word, binary()}.

%% How tokenize/2 splits a sentence: whitespace-separated words or graphemes.
-type tokenization() :: words | characters.

%% `{config, Order, Tokenization}` as built by new/2 (Order is clamped to >= 1
%% there).
-type config() :: {config, integer(), tokenization()}.

%% The only failure the generate* functions return.
-type generate_error() :: empty_model.

%% What a sampler returns: emit another word, or end the sentence.
-type step() :: {continue, binary()} | stop.

%% `{model, Config, Transitions, MessageCount}` where Transitions maps a
%% context (list of tokens) to a table of successor token -> count.
-opaque model() :: {model,
        config(),
        gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
        integer()}.

-file("src/babble.gleam", 122).
?DOC(
    " A new empty model, ready to [`train`](#train).\n"
    "\n"
    " Both settings are fixed at construction (changing them would invalidate the\n"
    " learned counts, so there are no setters):\n"
    " - `order` — the n-gram context length: how many previous tokens to condition\n"
    "   on when picking the next. Clamped to >= 1. Higher = more coherent but more\n"
    "   verbatim; 2 is a good default.\n"
    " - `tokenization` — `Words` or `Characters`.\n"
    "\n"
    " The generation length cap is passed to [`generate`](#generate), not set here.\n"
    "\n"
    " ## Examples\n"
    "\n"
    " ```gleam\n"
    " let model = babble.new(order: 2, tokenization: babble.Words)\n"
    " assert babble.is_empty(model)\n"
    " ```\n"
).
-spec new(integer(), tokenization()) -> model().
new(Order, Tokenization) ->
    %% Clamp the order to at least 1 before storing it in the config.
    ClampedOrder = gleam@int:max(1, Order),
    Config = {config, ClampedOrder, Tokenization},
    {model, Config, maps:new(), 0}.

-file("src/babble.gleam", 131).
?DOC(" The (clamped) configuration this model was built with.\n").
-spec config(model()) -> config().
config({model, Config, _Transitions, _MessageCount}) ->
    %% The configuration is the second field of the model tuple.
    Config.

-file("src/babble.gleam", 136).
?DOC(" True when the model has learned no transitions yet.\n").
-spec is_empty(model()) -> boolean().
is_empty({model, _Config, Transitions, _MessageCount}) ->
    %% Empty means the transition table holds no contexts at all.
    gleam@dict:is_empty(Transitions).

-file("src/babble.gleam", 141).
?DOC(" How many non-empty messages have been folded into the model.\n").
-spec message_count(model()) -> integer().
message_count({model, _Config, _Transitions, MessageCount}) ->
    %% The counter is the fourth field of the model tuple.
    MessageCount.

-file("src/babble.gleam", 209).
?DOC(
    " Increment the count of `next` following `context`, creating the successor\n"
    " table if absent.\n"
).
-spec bump(
    gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
    list(token()),
    token()
) -> gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())).
bump(Transitions, Context, Next) ->
    %% A successor never seen before starts from 0, so its first sighting is 1.
    Increment = fun(Seen) -> gleam@option:unwrap(Seen, 0) + 1 end,
    %% A context never seen before starts from an empty successor table.
    AddSuccessor = fun(Known) ->
        Successors = gleam@option:unwrap(Known, maps:new()),
        gleam@dict:upsert(Successors, Next, Increment)
    end,
    gleam@dict:upsert(Transitions, Context, AddSuccessor).

-file("src/babble.gleam", 195).
?DOC(
    " Slide an `order`-length window across `padded`, counting each\n"
    " context -> next transition. Stops when no token follows the context.\n"
).
-spec count_window(
    gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())),
    list(token()),
    integer()
) -> gleam@dict:dict(list(token()), gleam@dict:dict(token(), integer())).
count_window(Transitions, Padded, Order) ->
    %% Each window is `Order` context tokens followed by the token that came next.
    Windows = gleam@list:window(Padded, Order + 1),
    Record = fun(Counts, Window) ->
        case gleam@list:last(Window) of
            {error, nil} ->
                Counts;

            {ok, Successor} ->
                Context = gleam@list:take(Window, Order),
                bump(Counts, Context, Successor)
        end
    end,
    gleam@list:fold(Windows, Transitions, Record).

-file("src/babble.gleam", 189).
?DOC(" Pad a sentence's base tokens with `order` `Start`s and a trailing `End`.\n").
-spec pad(list(binary()), integer()) -> list(token()).
pad(Base, Order) ->
    Starts = gleam@list:repeat(start, Order),
    Words = [{word, Text} || Text <- Base],
    %% Layout: Order start markers, the wrapped words, then one 'end' marker.
    lists:append([Starts, Words, ['end']]).

-file("src/babble.gleam", 475).
?DOC(false).
-spec tokenize(binary(), tokenization()) -> list(binary()).
tokenize(Sentence, words) ->
    %% Turn newline, tab and carriage return into spaces (in that order), then
    %% split on spaces and drop the empty pieces left by repeated whitespace.
    Spaced = lists:foldl(
        fun(Whitespace, Text) ->
            gleam@string:replace(Text, Whitespace, <<" "/utf8>>)
        end,
        Sentence,
        [<<"\n"/utf8>>, <<"\t"/utf8>>, <<"\r"/utf8>>]
    ),
    Pieces = gleam@string:split(Spaced, <<" "/utf8>>),
    gleam@list:filter(Pieces, fun(Piece) -> Piece /= <<""/utf8>> end);
tokenize(Sentence, characters) ->
    gleam@string:to_graphemes(Sentence).

-file("src/babble.gleam", 488).
%% Split a grapheme list into sentence chunks, newest chunk first.
%%
%% `Buffer` is the chunk being built and `Acc` the chunks already closed. A
%% chunk closes after ".", "!" or "?" when that grapheme is the last one or is
%% followed by a space, newline, tab or carriage return. The final buffer is
%% always pushed, even when it is empty.
-spec segment(list(binary()), binary(), list(binary())) -> list(binary()).
segment([], Buffer, Acc) ->
    [Buffer | Acc];
segment([Grapheme | Rest], Buffer, Acc) ->
    Extended = <<Buffer/binary, Grapheme/binary>>,
    Terminal = lists:member(
        Grapheme,
        [<<"."/utf8>>, <<"!"/utf8>>, <<"?"/utf8>>]
    ),
    Boundary = case Rest of
        [] ->
            true;

        [Following | _] ->
            lists:member(
                Following,
                [<<" "/utf8>>, <<"\n"/utf8>>, <<"\t"/utf8>>, <<"\r"/utf8>>]
            )
    end,
    case Terminal andalso Boundary of
        true ->
            segment(Rest, <<""/utf8>>, [Extended | Acc]);

        false ->
            segment(Rest, Extended, Acc)
    end.

-file("src/babble.gleam", 464).
?DOC(false).
-spec sentences(binary()) -> list(binary()).
sentences(Message) ->
    Graphemes = gleam@string:to_graphemes(Message),
    %% segment/3 returns chunks newest-first; restore reading order.
    Chunks = lists:reverse(segment(Graphemes, <<""/utf8>>, [])),
    %% Trim every chunk and keep only the ones that still have content.
    [
        Trimmed
     || Chunk <- Chunks,
        Trimmed <- [gleam@string:trim(Chunk)],
        Trimmed =/= <<""/utf8>>
    ].

-file("src/babble.gleam", 161).
?DOC(
    " Fold a single message into the model, returning a new model.\n"
    "\n"
    " Each sentence is tokenised, padded with `order` `Start` markers and a\n"
    " trailing `End`, and every `order`-length context -> next transition is\n"
    " counted. The message counter bumps once if the message held a non-empty\n"
    " sentence. It is cheap and never rebuilds, so you can keep folding in new text.\n"
    "\n"
    " ## Examples\n"
    "\n"
    " ```gleam\n"
    " let model =\n"
    "   babble.new(order: 2, tokenization: babble.Words)\n"
    "   |> babble.train(\"the cat sat.\")\n"
    "\n"
    " assert babble.message_count(model) == 1\n"
    " ```\n"
).
-spec train(model(), binary()) -> model().
train(Model, Message) ->
    {model, Config, Transitions, MessageCount} = Model,
    {config, Order, Tokenization} = Config,
    %% Sentences that tokenise to nothing are dropped rather than padded.
    PaddedSentences = gleam@list:filter_map(
        sentences(Message),
        fun(Sentence) ->
            case tokenize(Sentence, Tokenization) of
                [] ->
                    {error, nil};

                Base ->
                    {ok, pad(Base, Order)}
            end
        end
    ),
    Learned = gleam@list:fold(
        PaddedSentences,
        Transitions,
        fun(Acc, Padded) -> count_window(Acc, Padded, Order) end
    ),
    %% The message counter moves once per message that contributed anything.
    NewCount = case PaddedSentences of
        [] ->
            MessageCount;

        _ ->
            MessageCount + 1
    end,
    {model, Config, Learned, NewCount}.

-file("src/babble.gleam", 184).
?DOC(" Fold many messages into the model, in order.\n").
-spec train_many(model(), list(binary())) -> model().
train_many(Model, Messages) ->
    %% Left fold: each message is trained into the model produced by the previous one.
    lists:foldl(
        fun(Message, Trained) -> train(Trained, Message) end,
        Model,
        Messages
    ).

-file("src/babble.gleam", 435).
?DOC(" The all-`Start` context generation begins from.\n").
-spec start_context(model()) -> list(token()).
start_context({model, {config, Order, _Tokenization}, _Transitions, _MessageCount}) ->
    %% One `start` marker per position of context.
    gleam@list:repeat(start, Order).

-file("src/babble.gleam", 451).
?DOC(
    " Join base tokens (already in final order) into a string under the given\n"
    " tokenization.\n"
).
-spec join(tokenization(), list(binary())) -> binary().
join(words, Tokens) ->
    %% Words are separated by a single space.
    gleam@string:join(Tokens, <<" "/utf8>>);
join(characters, Tokens) ->
    %% Graphemes are concatenated with no separator.
    erlang:list_to_binary(Tokens).

-file("src/babble.gleam", 420).
?DOC(
    " The successor table as sampler-facing weighted candidates. `End` is the only\n"
    " non-`Word` token reachable as a successor, so it becomes `Stop`.\n"
).
-spec candidates(gleam@dict:dict(token(), integer())) -> list({step(),
    integer()}).
candidates(Counts) ->
    %% Words continue the sentence; both sentinels map to `stop`.
    ToStep = fun
        ({word, Word}) -> {continue, Word};
        (start) -> stop;
        ('end') -> stop
    end,
    lists:map(
        fun({Token, Count}) -> {ToStep(Token), Count} end,
        maps:to_list(Counts)
    ).

-file("src/babble.gleam", 391).
%% Core generation loop: returns the emitted words newest-first.
%%
%% Stops when `Count` reaches `Max_tokens`, when `Context` has no learned
%% successors, or when the sampler answers `stop`. Otherwise the chosen word is
%% pushed onto `Emitted` and the context slides forward by one token.
-spec gen_loop(
    model(),
    list(token()),
    list(binary()),
    integer(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> list(binary()).
gen_loop(Model, Context, Emitted, Count, Sampler, Max_tokens) ->
    Transitions = erlang:element(3, Model),
    case gleam_stdlib:map_get(Transitions, Context) of
        {ok, Counts} when Count < Max_tokens ->
            case Sampler(candidates(Counts)) of
                stop ->
                    Emitted;

                {continue, Word} ->
                    %% Drop the oldest context token and append the new word.
                    Shifted = lists:append(
                        gleam@list:drop(Context, 1),
                        [{word, Word}]
                    ),
                    gen_loop(
                        Model,
                        Shifted,
                        [Word | Emitted],
                        Count + 1,
                        Sampler,
                        Max_tokens
                    )
            end;

        _ ->
            Emitted
    end.

-file("src/babble.gleam", 379).
?DOC(
    " Walk from `context` to a sentence end with `sampler`, emitting at most\n"
    " `max_tokens` words, then join. `acc` holds already-emitted prefix words,\n"
    " newest-first.\n"
).
-spec generate_sentence(
    model(),
    list(token()),
    list(binary()),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> binary().
generate_sentence(Model, Context, Acc, Sampler, Max_tokens) ->
    %% The cap is clamped to at least one token.
    Budget = gleam@int:max(1, Max_tokens),
    NewestFirst = gen_loop(Model, Context, Acc, 0, Sampler, Budget),
    Tokenization = erlang:element(3, erlang:element(2, Model)),
    %% gen_loop/6 accumulates newest-first, so reverse before joining.
    join(Tokenization, lists:reverse(NewestFirst)).

-file("src/babble.gleam", 430).
?DOC(" Whether generation can begin: the all-`Start` context has transitions.\n").
-spec startable(model()) -> boolean().
startable(Model) ->
    Transitions = erlang:element(3, Model),
    Opening = start_context(Model),
    gleam@dict:has_key(Transitions, Opening).

-file("src/babble.gleam", 242).
?DOC(
    " Generate one sentence, choosing each next word with `sampler` and emitting at\n"
    " most `max_tokens` of them.\n"
    "\n"
    " Walks the chain from the start of a sentence, asking `sampler` for the next\n"
    " step at each point, until it stops at a learned sentence end or reaches\n"
    " `max_tokens` (clamped to >= 1). Returns `Error(EmptyModel)` if the model has\n"
    " never been trained.\n"
    "\n"
    " Pass [`weighted`](#weighted) for varied, corpus-like output or\n"
    " [`most_likely`](#most_likely) for deterministic output. See [`Sampler`](#Sampler)\n"
    " to write your own.\n"
    "\n"
    " ## Examples\n"
    "\n"
    " ```gleam\n"
    " // Varied output — a different sentence each call:\n"
    " let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
    "\n"
    " // No data yet:\n"
    " let empty = babble.new(order: 2, tokenization: babble.Words)\n"
    " assert babble.generate(empty, babble.weighted, max_tokens: 50) == Error(babble.EmptyModel)\n"
    " ```\n"
).
-spec generate(model(), fun((list({step(), integer()})) -> step()), integer()) -> {ok,
        binary()} |
    {error, generate_error()}.
generate(Model, Sampler, Max_tokens) ->
    case startable(Model) of
        true ->
            %% Begin from the all-start context with nothing emitted yet.
            Opening = start_context(Model),
            Sentence = generate_sentence(
                Model,
                Opening,
                [],
                Sampler,
                Max_tokens
            ),
            {ok, Sentence};

        false ->
            {error, empty_model}
    end.

-file("src/babble.gleam", 256).
?DOC(
    " Generate `sentences` sentences (at least 1) with `sampler`, each capped at\n"
    " `max_tokens`, joined by spaces.\n"
).
-spec generate_paragraph(
    model(),
    integer(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> {ok, binary()} | {error, generate_error()}.
generate_paragraph(Model, Sentences, Sampler, Max_tokens) ->
    case startable(Model) of
        false ->
            {error, empty_model};

        true ->
            %% One placeholder per requested sentence, at least one.
            Slots = gleam@list:repeat(nil, gleam@int:max(1, Sentences)),
            Generated = lists:map(
                fun(_Slot) ->
                    generate_sentence(
                        Model,
                        start_context(Model),
                        [],
                        Sampler,
                        Max_tokens
                    )
                end,
                Slots
            ),
            {ok, gleam@string:join(Generated, <<" "/utf8>>)}
    end.

-file("src/babble.gleam", 441).
?DOC(
    " Build a seed context from prefix words: the last `order` words, left-padded\n"
    " with `Start` when there are fewer than `order` of them.\n"
).
-spec seed_context(list(token()), integer()) -> list(token()).
seed_context(Words, Order) ->
    %% Positive when the prefix is shorter than the context length.
    Missing = Order - erlang:length(Words),
    case Missing > 0 of
        true ->
            lists:append(gleam@list:repeat(start, Missing), Words);

        false ->
            %% Keep only the trailing `Order` words.
            gleam@list:drop(Words, -Missing)
    end.

-file("src/babble.gleam", 280).
?DOC(
    " Generate a sentence that begins with `prefix`, choosing with `sampler` and\n"
    " emitting at most `max_tokens` words beyond the prefix.\n"
    "\n"
    " The continuation seeds from the last `order` prefix words (left-padded with\n"
    " `Start`); an unknown prefix falls back to the start context, but the prefix\n"
    " words are always kept at the front. Empty models return `Error(EmptyModel)`.\n"
).
-spec generate_starting_with(
    model(),
    binary(),
    fun((list({step(), integer()})) -> step()),
    integer()
) -> {ok, binary()} | {error, generate_error()}.
generate_starting_with(Model, Prefix, Sampler, Max_tokens) ->
    case is_empty(Model) of
        true ->
            {error, empty_model};

        false ->
            Config = erlang:element(2, Model),
            Base = tokenize(Prefix, erlang:element(3, Config)),
            Seed = seed_context(
                [{word, Text} || Text <- Base],
                erlang:element(2, Config)
            ),
            %% A seed the model never saw falls back to the sentence start.
            Known = gleam@dict:has_key(erlang:element(3, Model), Seed),
            Start = case Known of
                true ->
                    Seed;

                false ->
                    start_context(Model)
            end,
            %% The prefix words go in newest-first so they end up at the front.
            Sentence = generate_sentence(
                Model,
                Start,
                lists:reverse(Base),
                Sampler,
                Max_tokens
            ),
            {ok, Sentence}
    end.

-file("src/babble.gleam", 323).
%% Walk the weighted candidates, spending `R` against each weight in turn.
%%
%% Returns the first step whose weight exceeds what is left of `R`. The last
%% candidate is returned whatever `R` is, and an empty list yields `stop`.
-spec pick(list({step(), integer()}), integer()) -> step().
pick([], _R) ->
    stop;
pick([{Only, _Weight}], _R) ->
    Only;
pick([{Chosen, Weight} | _Rest], R) when R < Weight ->
    Chosen;
pick([{_Skipped, Weight} | Rest], R) ->
    pick(Rest, R - Weight).

-file("src/babble.gleam", 318).
?DOC(
    " A [`Sampler`](#Sampler) that picks a successor at random, with probability\n"
    " proportional to how often it followed the context in training.\n"
    "\n"
    " This is the natural \"talk like the corpus\" behaviour and the one you'll want\n"
    " most of the time. It uses the platform RNG, so output varies between calls —\n"
    " pass it straight to [`generate`](#generate); you rarely call it yourself.\n"
    "\n"
    " ## Examples\n"
    "\n"
    " ```gleam\n"
    " let assert Ok(sentence) = babble.generate(model, babble.weighted, max_tokens: 200)\n"
    " ```\n"
).
-spec weighted(list({step(), integer()})) -> step().
weighted(Candidates) ->
    Total = lists:foldl(
        fun(Candidate, Sum) -> Sum + erlang:element(2, Candidate) end,
        0,
        Candidates
    ),
    %% The bound handed to the RNG is clamped to at least 1.
    Roll = gleam@int:random(gleam@int:max(1, Total)),
    pick(Candidates, Roll).

-file("src/babble.gleam", 369).
?DOC(
    " A deterministic sort key for tie-breaking: `Stop` before any `Continue`,\n"
    " words alphabetically.\n"
).
-spec step_key(step()) -> binary().
%% The "0" / "1" prefix is what puts `stop` ahead of every word.
step_key(stop) ->
    <<"0"/utf8>>;
step_key({continue, Word}) ->
    <<"1"/utf8, Word/binary>>.

-file("src/babble.gleam", 353).
?DOC(
    " A [`Sampler`](#Sampler) that always picks the most frequent successor, with\n"
    " ties broken deterministically so the result never depends on internal map\n"
    " ordering.\n"
    "\n"
    " Generation with this sampler is fully reproducible: a given model always\n"
    " produces the same sentence. That makes it ideal for tests and snapshots, or a\n"
    " fixed \"house style\" output. Because it always takes the single most-travelled\n"
    " path, its output tends to reproduce whole training sentences verbatim.\n"
    "\n"
    " ## Examples\n"
    "\n"
    " ```gleam\n"
    " let model =\n"
    "   babble.new(order: 2, tokenization: babble.Words)\n"
    "   |> babble.train(\"the cat sat.\")\n"
    "\n"
    " assert babble.generate(model, babble.most_likely, max_tokens: 50) == Ok(\"the cat sat.\")\n"
    " ```\n"
).
-spec most_likely(list({step(), integer()})) -> step().
most_likely(Candidates) ->
    %% Highest count first; equal counts fall back to the step_key/1 ordering.
    ByRank = fun(A, B) ->
        CountA = erlang:element(2, A),
        CountB = erlang:element(2, B),
        case gleam@int:compare(CountB, CountA) of
            eq ->
                gleam@string:compare(
                    step_key(erlang:element(1, A)),
                    step_key(erlang:element(1, B))
                );

            Decided ->
                Decided
        end
    end,
    case gleam@list:sort(Candidates, ByRank) of
        [] ->
            stop;

        [{Best, _Count} | _Others] ->
            Best
    end.