%% src/rebar3_erli18n_prv_merge.erl

-module(rebar3_erli18n_prv_merge).

-moduledoc """
`rebar3 erli18n merge` — msgmerge-style sync of a `.po` against the fresh
`.pot`.

For each `{Domain, Locale}` it parses the existing `.po` (via
`erli18n_po:parse`) for the translations and serializes the merged result
through `rebar3_erli18n_po_meta`, applying the lifecycle the plain
parse/dump round-trip cannot:

- a `.pot` msgid present in the old `.po` keeps its translation and gains
  the fresh `#:` references;
- a `.pot` msgid ABSENT from the old `.po` is added as an untranslated
  entry, fuzzy-matched (`rebar3_erli18n_jaro`) against the removed msgids so
  a renamed string carries its old translation as `#, fuzzy` with a `#|`
  previous-msgid hint;
- an old msgid no longer in the `.pot` (and not consumed as a fuzzy source)
  is demoted to a `#~` obsolete entry rather than deleted;
- msgid equality is wrapping-insensitive (`--no-wrap` / line-wrapped msgids
  compare equal), because both sides are decoded binaries.
""".

%% This module implements the rebar3 `provider` contract (`init/1`, `do/1`,
%% `format_error/1`); it is registered via `providers:create([{module, ?MODULE}, ...])`
%% in `init/1`. The `-behaviour(provider)` attribute is intentionally omitted:
%% the `provider` behaviour ships inside the rebar3 escript (stripped of
%% debug_info and not on Hex), so neither dialyzer nor eqwalizer can load its
%% callback info standalone — the attribute would only yield false
%% "behaviour/callback not available" diagnostics for a contract the exports
%% already satisfy.

-export([init/1, do/1, format_error/1]).

%% `previous_of/1` is exported for white-box testing only (build-tool
%% internal, not a published Hex API). See its `-doc` for the full rationale.
-export([previous_of/1]).

-define(PROVIDER, merge).
-define(NAMESPACE, erli18n).
-define(DEPS, [{default, compile}]).

-doc "Register the `merge` provider under the `erli18n` namespace.".
-spec init(rebar3_erli18n_host:state()) -> {ok, rebar3_erli18n_host:state()}.
init(State) ->
    Provider = rebar3_erli18n_host:create_provider([
        {name, ?PROVIDER},
        {namespace, ?NAMESPACE},
        {module, ?MODULE},
        {bare, true},
        {deps, ?DEPS},
        {example, "rebar3 erli18n merge --locale pt_BR"},
        {opts, rebar3_erli18n_common:common_opts()},
        {short_desc, "Sync .po catalogs against the freshly extracted .pot (msgmerge-style)."},
        {desc,
            "Re-extract the .pot, then for each target .po keep existing translations for "
            "still-present msgids, add new msgids (fuzzy-matched against removed ones), demote "
            "removed msgids to #~ obsolete, and preserve #: references. Reuses erli18n_po:parse "
            "for the body and rebar3_erli18n_po_meta for the metadata."}
    ]),
    {ok, rebar3_erli18n_host:add_provider(State, Provider)}.

-doc "Run the merge for the selected `{Domain, Locale}` catalogs.".
-spec do(rebar3_erli18n_host:state()) -> {ok, rebar3_erli18n_host:state()} | {error, string()}.
do(State) ->
    case rebar3_erli18n_common:extract_project(State) of
        {ok, ByDomain} ->
            case targets(State) of
                {ok, Locale} ->
                    Result = merge_all(State, ByDomain, Locale),
                    handle(State, Result);
                {error, Reason} ->
                    {error, format_error(Reason)}
            end;
        {error, Reason} ->
            {error, format_error(Reason)}
    end.

-doc "Render a provider error to a human string.".
-spec format_error(term()) -> string().
format_error(Reason) ->
    rebar3_erli18n_common:format_error(Reason).

%% =========================
%% Target selection
%% =========================

-spec targets(rebar3_erli18n_host:state()) -> {ok, string()} | {error, term()}.
targets(State) ->
    Args = rebar3_erli18n_host:parsed_args(State),
    case proplists:get_value(locale, Args) of
        undefined -> {error, locale_required};
        Locale -> {ok, Locale}
    end.

-spec handle(rebar3_erli18n_host:state(), ok | {error, term()}) ->
    {ok, rebar3_erli18n_host:state()} | {error, string()}.
handle(State, ok) -> {ok, State};
handle(_State, {error, Reason}) -> {error, format_error(Reason)}.

%% =========================
%% Merge
%% =========================

-spec merge_all(rebar3_erli18n_host:state(), #{atom() => [Entry]}, string()) ->
    ok | {error, term()}
when
    Entry :: rebar3_erli18n_common:dedup_entry().
merge_all(State, ByDomain, Locale) ->
    maps:fold(
        fun
            (Domain, Entries, ok) -> merge_one(State, Domain, Entries, Locale);
            (_Domain, _Entries, {error, _} = Err) -> Err
        end,
        ok,
        ByDomain
    ).

-spec merge_one(rebar3_erli18n_host:state(), atom(), [Entry], string()) ->
    ok | {error, term()}
when
    Entry :: rebar3_erli18n_common:dedup_entry().
merge_one(State, Domain, PotEntries, Locale) ->
    Path = rebar3_erli18n_common:po_path(State, Domain, Locale),
    case read_old(Path) of
        {ok, OldHeader, OldEntries} ->
            Merged = merge_entries(PotEntries, OldEntries),
            Catalog = #{header => OldHeader, entries => Merged},
            write_po(Path, rebar3_erli18n_po_meta:dump(Catalog));
        {error, Reason} ->
            {error, {po_parse_failed, Path, Reason}}
    end.

%% Write the merged `.po`, returning a structured `{error, {write_failed, ...}}`
%% (not a `badmatch` crash) when the destination dir cannot be created or the
%% file cannot be written — so `do/1` reports a clean provider error.
-spec write_po(file:filename(), binary()) ->
    ok | {error, {write_failed, file:filename(), term()}}.
write_po(Path, Bytes) ->
    case filelib:ensure_dir(Path) of
        ok ->
            case file:write_file(Path, Bytes) of
                ok ->
                    rebar3_erli18n_host:info("erli18n: merged ~ts", [Path]),
                    ok;
                {error, Reason} ->
                    {error, {write_failed, Path, Reason}}
            end;
        {error, Reason} ->
            {error, {write_failed, Path, Reason}}
    end.

%% Read an existing `.po`. A missing file means a brand-new locale: treat it
%% as an empty catalog with a UTF-8 header so the merge produces the initial
%% translated template.
-spec read_old(file:filename()) ->
    {ok, binary(), [erli18n_po:entry()]} | {error, term()}.
read_old(Path) ->
    case file:read_file(Path) of
        {ok, Bin} ->
            case erli18n_po:parse(Bin) of
                %% `erli18n_po:parse/1` always populates the header `raw`
                %% field (a synthetic header is supplied when the `.po` has
                %% none), so we match it directly — no missing-key fallback.
                {ok, #{header := #{raw := Raw}, entries := Entries}} ->
                    {ok, Raw, Entries};
                {error, Reason} ->
                    {error, Reason}
            end;
        {error, enoent} ->
            {ok, default_header(), []};
        {error, Reason} ->
            {error, Reason}
    end.

-spec default_header() -> binary().
default_header() ->
    ~"Content-Type: text/plain; charset=UTF-8\n".

%% Core merge, in three passes:
%%   1. Exact carry-over: a fresh `.pot` msgid present in the old catalog
%%      keeps its translation and refreshes its `#:` references.
%%   2. Fuzzy carry-over: a fresh msgid ABSENT from the old catalog is
%%      paired (jaro >= 0.8) with a removed old msgid; the new entry takes
%%      the old translation, is flagged `#, fuzzy`, and records the old
%%      msgid as a `#|` previous-msgid hint. The matched old key is consumed.
%%   3. Obsolete: any old entry whose key was neither carried over nor
%%      consumed as a fuzzy source is demoted to a `#~` obsolete entry.
-spec merge_entries([Entry], [erli18n_po:entry()]) ->
    [rebar3_erli18n_po_meta:meta_entry()]
when
    Entry :: rebar3_erli18n_common:dedup_entry().
merge_entries(PotEntries, OldEntries) ->
    OldIndex = index_old(OldEntries),
    %% Split fresh entries into exact-matched and new (no exact old entry).
    {Exact, New} = lists:partition(
        fun(#{context := Ctx, msgid := Msgid}) -> maps:is_key({Ctx, Msgid}, OldIndex) end,
        PotEntries
    ),
    ExactKeys = #{{Ctx, Msgid} => true || #{context := Ctx, msgid := Msgid} <- Exact},
    %% Removed = old keys not exactly carried over; candidates for fuzzy.
    Removed = [E || E <- OldEntries, not maps:is_key(old_key(E), ExactKeys)],
    {NewMeta, FuzzyUsed} = fuzzy_merge(New, Removed),
    ExactMeta = [exact_meta(PotE, OldIndex) || PotE <- Exact],
    Consumed = maps:merge(ExactKeys, FuzzyUsed),
    Obsolete = obsolete_entries(OldEntries, Consumed),
    ExactMeta ++ NewMeta ++ Obsolete.

%% Index old entries by {Context, Msgid} for O(1) carry-over lookup.
-spec index_old([erli18n_po:entry()]) -> #{{undefined | binary(), binary()} => erli18n_po:entry()}.
index_old(Entries) ->
    lists:foldl(
        fun(E, Acc) -> Acc#{old_key(E) => E} end,
        #{},
        Entries
    ).

-spec old_key(erli18n_po:entry()) -> {undefined | binary(), binary()}.
old_key({singular, Ctx, Msgid, _}) -> {Ctx, Msgid};
old_key({plural, Ctx, Msgid, _, _}) -> {Ctx, Msgid}.

%% Build the meta-entry for an exact-matched fresh entry: old translation,
%% fresh references, no fuzzy flag.
-spec exact_meta(Entry, OldIndex) -> rebar3_erli18n_po_meta:meta_entry() when
    Entry :: rebar3_erli18n_common:dedup_entry(),
    OldIndex :: #{{undefined | binary(), binary()} => erli18n_po:entry()}.
exact_meta(#{context := Ctx, msgid := Msgid, references := Refs} = PotE, OldIndex) ->
    OldEntry = maps:get({Ctx, Msgid}, OldIndex),
    #{body => body_with_translation(PotE, OldEntry), references => Refs}.

%% Pair each new fresh msgid with the best removed old msgid (jaro >= 0.8).
%% A successful pairing consumes the old key (so it is not also obsoleted)
%% and yields a `#, fuzzy` entry carrying the old translation plus a `#|`
%% previous-msgid hint. New msgids with no fuzzy source stay untranslated.
-spec fuzzy_merge([Entry], [erli18n_po:entry()]) ->
    {[rebar3_erli18n_po_meta:meta_entry()], #{{undefined | binary(), binary()} => true}}
when
    Entry :: rebar3_erli18n_common:dedup_entry().
fuzzy_merge(New, Removed) ->
    {MetaRev, Used, _Left} = lists:foldl(
        fun(NewE, {AccMeta, AccUsed, Candidates}) ->
            {MetaE, Consumed, Rest} = fuzzy_one(NewE, Candidates),
            {[MetaE | AccMeta], add_used(Consumed, AccUsed), Rest}
        end,
        {[], #{}, Removed},
        New
    ),
    {lists:reverse(MetaRev), Used}.

%% Fuzzy-match one new entry against the remaining removed candidates.
-spec fuzzy_one(Entry, [erli18n_po:entry()]) ->
    {rebar3_erli18n_po_meta:meta_entry(), none | {undefined | binary(), binary()}, [
        erli18n_po:entry()
    ]}
when
    Entry :: rebar3_erli18n_common:dedup_entry().
fuzzy_one(#{msgid := Msgid, references := Refs} = NewE, Candidates) ->
    CandMsgids = [old_msgid(C) || C <- Candidates],
    case rebar3_erli18n_jaro:best_match(Msgid, CandMsgids) of
        nomatch ->
            {#{body => empty_body(NewE), references => Refs}, none, Candidates};
        {ok, MatchMsgid, _Score} ->
            {Match, Rest} = take_by_msgid(MatchMsgid, Candidates),
            Meta = fuzzy_meta(NewE, Match, Refs),
            {Meta, old_key(Match), Rest}
    end.

-spec old_msgid(erli18n_po:entry()) -> binary().
old_msgid({singular, _, Msgid, _}) -> Msgid;
old_msgid({plural, _, Msgid, _, _}) -> Msgid.

%% Remove the first candidate whose msgid matches, returning it and the rest.
-spec take_by_msgid(binary(), [erli18n_po:entry()]) ->
    {erli18n_po:entry(), [erli18n_po:entry()]}.
take_by_msgid(Msgid, Candidates) ->
    take_by_msgid(Msgid, Candidates, []).

-spec take_by_msgid(binary(), [erli18n_po:entry()], [erli18n_po:entry()]) ->
    {erli18n_po:entry(), [erli18n_po:entry()]}.
take_by_msgid(Msgid, [C | Rest], Acc) ->
    case old_msgid(C) =:= Msgid of
        true -> {C, lists:reverse(Acc, Rest)};
        false -> take_by_msgid(Msgid, Rest, [C | Acc])
    end.

%% Build the `#, fuzzy` meta-entry: fresh shape, old translation, prev-msgid.
-spec fuzzy_meta(Entry, erli18n_po:entry(), [{file:filename(), pos_integer()}]) ->
    rebar3_erli18n_po_meta:meta_entry()
when
    Entry :: rebar3_erli18n_common:dedup_entry().
fuzzy_meta(NewE, Match, Refs) ->
    Body = transplant_translation(NewE, Match),
    #{
        body => Body,
        references => Refs,
        flags => [fuzzy],
        previous => previous_of(Match)
    }.

%% Carry the old translation onto the NEW entry's shape for a fuzzy pairing.
%% Unlike an exact carry-over, a singular/plural mismatch does not discard
%% the translation: the result is flagged `#, fuzzy` by the caller, so the
%% translator reviews it anyway and the old bytes are the useful hint.
%%   - same shape: the old msgstr / plural forms are reused verbatim;
%%   - old plural -> new singular: the old form-0 string becomes the msgstr
%%     (empty when the old entry has no form 0);
%%   - old singular -> new plural: the old msgstr fills form 0 and form 1 is
%%     left empty, i.e. the same two-slot layout an untranslated plural gets.
%% The msgid, context and msgid_plural always come from the new entry.
-spec transplant_translation(Entry, erli18n_po:entry()) -> rebar3_erli18n_po_meta:body() when
    Entry :: rebar3_erli18n_common:dedup_entry().
transplant_translation(
    #{kind := singular, context := Ctx, msgid := Msgid},
    {singular, _, _, Tr}
) ->
    {singular, Ctx, Msgid, Tr};
transplant_translation(
    #{kind := plural, context := Ctx, msgid := Msgid, plural := Plural},
    {plural, _, _, _, Forms}
) ->
    {plural, Ctx, Msgid, Plural, Forms};
transplant_translation(
    #{kind := singular, context := Ctx, msgid := Msgid},
    {plural, _, _, _, Forms}
) ->
    %% Cross-shape: only form 0 can stand in for a singular msgstr.
    Tr =
        case lists:keyfind(0, 1, Forms) of
            {0, Form0} -> Form0;
            false -> <<>>
        end,
    {singular, Ctx, Msgid, Tr};
transplant_translation(
    #{kind := plural, context := Ctx, msgid := Msgid, plural := Plural},
    {singular, _, _, Tr}
) ->
    %% Cross-shape: the old singular string seeds the singular slot only.
    {plural, Ctx, Msgid, Plural, [{0, Tr}, {1, <<>>}]}.

-doc """
Build the `#|` previous-msgid hint for a fuzzy match: the old
context+msgid, plus the old msgid_plural when the matched entry carried one.

This is a build-tool internal, exported only so the CT suite can white-box
every clause; it is not part of any published (Hex) API surface.

`erli18n_po:entry()` types a plural's `msgid_plural` as `undefined |
binary()`, so the clause head below must cover `undefined` for the match to
be total over the imported type (dialyzer/eqwalizer exhaustiveness). In
practice `erli18n_po:parse/1` never yields a plural with an undefined
msgid_plural — a degenerate `msgstr[N]`-without-`msgid_plural` block is
parsed as a SINGULAR entry — so the `undefined` clause is type-mandated, not
behaviourally reachable through the merge's parse-driven inputs.
""".
-spec previous_of(erli18n_po:entry()) ->
    {undefined | binary(), binary()} | {undefined | binary(), binary(), binary()}.
previous_of({singular, Ctx, Msgid, _}) -> {Ctx, Msgid};
previous_of({plural, Ctx, Msgid, undefined, _}) -> {Ctx, Msgid};
previous_of({plural, Ctx, Msgid, MsgidPlural, _}) -> {Ctx, Msgid, MsgidPlural}.

-spec add_used(none | {undefined | binary(), binary()}, Used) -> Used when
    Used :: #{{undefined | binary(), binary()} => true}.
add_used(none, Used) -> Used;
add_used(Key, Used) -> Used#{Key => true}.

%% Build a translated body by transplanting the old translation onto the
%% fresh `.pot` shape (which carries the authoritative msgid/plural).
-spec body_with_translation(Entry, erli18n_po:entry()) -> rebar3_erli18n_po_meta:body() when
    Entry :: rebar3_erli18n_common:dedup_entry().
body_with_translation(#{kind := singular, context := Ctx, msgid := Msgid}, {singular, _, _, Tr}) ->
    {singular, Ctx, Msgid, Tr};
body_with_translation(
    #{kind := plural, context := Ctx, msgid := Msgid, plural := Plural},
    {plural, _, _, _, Forms}
) ->
    {plural, Ctx, Msgid, Plural, Forms};
body_with_translation(PotE, _Mismatched) ->
    %% Shape changed singular<->plural between old and new: drop the stale
    %% translation, keep the fresh shape untranslated.
    empty_body(PotE).

-spec empty_body(Entry) -> rebar3_erli18n_po_meta:body() when
    Entry :: rebar3_erli18n_common:dedup_entry().
empty_body(#{kind := singular, context := Ctx, msgid := Msgid}) ->
    {singular, Ctx, Msgid, <<>>};
empty_body(#{kind := plural, context := Ctx, msgid := Msgid, plural := Plural}) ->
    {plural, Ctx, Msgid, Plural, [{0, <<>>}, {1, <<>>}]}.

%% Demote every old entry whose key was not carried over into a `#~`
%% obsolete meta-entry, preserving its translation bytes.
-spec obsolete_entries([erli18n_po:entry()], Used) -> [rebar3_erli18n_po_meta:meta_entry()] when
    Used :: #{{undefined | binary(), binary()} => true}.
obsolete_entries(OldEntries, Used) ->
    [
        #{body => E, obsolete => true}
     || E <- OldEntries, not maps:is_key(old_key(E), Used)
    ].