%% src/tflite_beam/tflite_beam_private_utils_unicode_data.erl

%% Lazily started, locally registered gen_server that parses a
%% semicolon-separated Unicode data file once and caches the resulting list
%% of punctuation code points for later requests.
-module(tflite_beam_private_utils_unicode_data).
%% Public API.
-export([
    get_puncuation_list_from_unicode_data/1,
    release_memory/0
]).
-behaviour(gen_server).
%% gen_server callbacks.
-export([init/1, handle_call/3, handle_cast/2]).
%% Server state. `puncuation_list' holds the cached code points; the empty
%% list means the data file has not been parsed yet.
-record(state, {
    puncuation_list = []
}).

%% @doc Returns the list of punctuation code points (integers) extracted from
%% `UnicodeDataFile'.
%%
%% The caching server is started on demand, then queried with a synchronous
%% call. Once the server holds a non-empty list, later calls are answered
%% from that cache without touching the file again.
get_puncuation_list_from_unicode_data(UnicodeDataFile) ->
    Request = {get_puncuation_list, UnicodeDataFile},
    gen_server:call(get_running_instance(true), Request).

%% @doc Drops the cached punctuation list by stopping the caching server.
%%
%% Returns `ok' when no server is registered; otherwise returns whatever
%% `gen_server:stop/1' returns. A server is never started by this function.
release_memory() ->
    MaybeServer = get_running_instance(false),
    case MaybeServer of
        Pid when is_pid(Pid) ->
            gen_server:stop(Pid);
        undefined ->
            ok
    end.

%% Looks up the server registered under the module name.
%%
%% Returns its pid when one is registered. When none is registered, a new
%% server is started if `CreateIfNotRunning' is `true'; otherwise the atom
%% `undefined' is returned.
get_running_instance(CreateIfNotRunning) ->
    case erlang:whereis(?MODULE) of
        undefined when CreateIfNotRunning ->
            start_instance();
        undefined ->
            undefined;
        Pid ->
            Pid
    end.

%% Starts and locally registers the server, returning its pid.
%%
%% Another process may register the server between the `whereis' lookup and
%% this call; in that case the already running instance is reused instead of
%% crashing on a failed `{ok, Pid}' match.
start_instance() ->
    case gen_server:start_link({local, ?MODULE}, ?MODULE, [], []) of
        {ok, Pid} ->
            Pid;
        {error, {already_started, Pid}} ->
            Pid
    end.

%% gen_server callback: the start arguments are ignored and the server begins
%% with a default state record, i.e. an empty punctuation cache.
init(_Args) ->
    InitialState = #state{},
    {ok, InitialState}.

%% gen_server callback serving `{get_puncuation_list, UnicodeDataFile}'.
%%
%% First clause: the cache is empty, so the file is opened, parsed, and the
%% result is stored in the state before replying. The server crashes with a
%% badmatch if the file cannot be opened.
%% Second clause: a list is already cached and is returned as is; the file
%% argument is not consulted.
handle_call(
    {get_puncuation_list, UnicodeDataFile},
    _From,
    #state{puncuation_list = []} = State
) ->
    %% `read_ahead' buffers reads; line-oriented reading of a raw file is
    %% inefficient without it.
    {ok, FileDescriptor} = file:open(UnicodeDataFile, [read, raw, read_ahead]),
    PuncuationList =
        try
            read_from_unicode_data(FileDescriptor, #{})
        after
            %% Always release the descriptor, even when parsing fails.
            file:close(FileDescriptor)
        end,
    {reply, PuncuationList, State#state{puncuation_list = PuncuationList}};
handle_call(
    {get_puncuation_list, _UnicodeDataFile},
    _From,
    #state{puncuation_list = PuncuationList} = State
) when is_list(PuncuationList) ->
    {reply, PuncuationList, State}.

%% gen_server callback: no cast messages are defined for this server, so any
%% cast is dropped and the state is handed back untouched.
handle_cast(_Ignored, CurrentState) ->
    {noreply, CurrentState}.

%% Reads the open file line by line, folding every line into `TypeAcc'
%% (a map from type binary to a list of code points).
%%
%% At end of file the per-type lists are flattened into a single list of
%% integers. A read error raises `{unicode_data_read_failed, Reason}' instead
%% of being treated as end of file, so a partially read file never yields a
%% silently truncated result.
read_from_unicode_data(FileDescriptor, TypeAcc) ->
    case file:read_line(FileDescriptor) of
        {ok, Line} ->
            read_from_unicode_data(FileDescriptor, process_unicode_data_line(Line, TypeAcc));
        eof ->
            lists:flatten(maps:values(TypeAcc));
        {error, Reason} ->
            erlang:error({unicode_data_read_failed, Reason})
    end.

%% Converts one line of character data to a binary and hands it to the field
%% splitter with an empty field accumulator. Returns the updated `TypeAcc'.
process_unicode_data_line(Line, TypeAcc) ->
    process_unicode_data_line_impl(unicode:characters_to_binary(Line), [], TypeAcc).

%% Splits `BinaryLine' on every `;' and passes the resulting fields, preceded
%% by any fields already collected in `Acc', to `accumlate_type/2'.
%%
%% A single global split replaces repeated first-match splitting with list
%% appends; the field list is the same, including a trailing empty binary
%% when the line ends with `;'.
process_unicode_data_line_impl(BinaryLine, Acc, TypeAcc) ->
    Fields = binary:split(BinaryLine, <<";">>, [global]),
    accumlate_type(Acc ++ Fields, TypeAcc).

%% Records the code point of one parsed line under its type, if that type is
%% a two-byte binary starting with "P".
%%
%% `LineValues' must hold at least three fields: the code point as a
%% hexadecimal binary, a name (ignored), and the type. The code point is
%% appended to the list stored under the type in `TypeAcc', creating the
%% entry when absent. Lines of any other type leave `TypeAcc' unchanged.
accumlate_type(LineValues, TypeAcc) ->
    [HexCodePoint, _Name, Category | _Ignored] = LineValues,
    case Category of
        <<"P", _>> ->
            CodePoint = erlang:list_to_integer(unicode:characters_to_list(HexCodePoint), 16),
            %% Appending keeps code points in the order they were read.
            Collected = maps:get(Category, TypeAcc, []) ++ [CodePoint],
            maps:put(Category, Collected, TypeAcc);
        _ ->
            TypeAcc
    end.