src/ys_utils.erl

-module(ys_utils).

-export([default_file_continuation/1, default_file_continuation/2]).
-export([trancoding_file_continuation/1, trancoding_file_continuation/2]).
-export([external_file_reader/2]).

default_file_continuation(Filename) -> default_file_continuation(Filename, 16384).

default_file_continuation(Filename, Size) ->
    case file:open(Filename, [raw, {read_ahead, Size}, read, binary]) of
        {ok, FD} ->
            {
                fun
                    (close) ->
                        file:close(FD);
                    (_) ->
                        case file:read(FD, Size) of
                            eof ->
                                eof;
                            {ok, Bin} ->
                                {Bin, nil};
                            {error, _} = Err ->
                                Err
                        end
                end,
                <<>>
            };
        {error, _} = OpenErr ->
            OpenErr
    end.

% F.1 Detection Without External Encoding Information
% Because each XML entity not accompanied by external encoding information and not in UTF-8 or
% UTF-16 encoding must begin with an XML encoding declaration, in which the first characters must
% be '<?xml', any conforming processor can detect, after two to four octets of input, which of the
% following cases apply. In reading this list, it may help to know that in UCS-4, '<' is
% " #x0000003C " and '?' is " #x0000003F ", and the Byte Order Mark required of UTF-16 data
% streams is " #xFEFF ". The notation ## is used to denote any byte value except that two
% consecutive ##s cannot be both 00.
trancoding_file_continuation(Filename) -> trancoding_file_continuation(Filename, 16384).

trancoding_file_continuation(Filename, Size) ->
    case file:open(Filename, [raw, {read_ahead, Size}, read, binary]) of
        {ok, FD} -> tfc_open(FD, Size);
        {error, _} = OpenErr -> OpenErr
    end.

tfc_bom(Bin) ->
    case unicode:bom_to_encoding(Bin) of
        {latin1, 0} ->
            case Bin of
                <<0, 0, 0, 16#3C, _/bitstring>> -> {{utf32, big}, 0};
                <<16#3C, 0, 0, 0, _/bitstring>> -> {{utf32, little}, 0};
                <<0, 16#3C, 0, 16#3F, _/bitstring>> -> {{utf16, big}, 0};
                <<16#3C, 0, 16#3F, 0, _/bitstring>> -> {{utf16, little}, 0};
                _ -> {utf8, 0}
            end;
        Other ->
            Other
    end.

tfc_enc(utf8, FD, Size, Stream) ->
    Cont = fun
        (close) ->
            file:close(FD);
        (_) ->
            case file:read(FD, Size) of
                eof ->
                    eof;
                {ok, ReadBin} ->
                    {ReadBin, none};
                {error, _} = Err ->
                    Err
            end
    end,
    {Cont, Stream};
tfc_enc(Enc, FD, 1, Stream) ->
    tfc_enc(Enc, FD, 2, Stream);
tfc_enc(Enc, FD, Size, Stream) ->
    EncInit = unicode:characters_to_binary(Stream, Enc),
    Cont = fun
        (close) ->
            file:close(FD);
        (ContState) ->
            case file:read(FD, Size) of
                eof ->
                    eof;
                {ok, ReadBin} when is_binary(ContState) ->
                    Raw = <<ContState/bitstring, ReadBin/bitstring>>,
                    case unicode:characters_to_binary(Raw, Enc) of
                        Encoded when is_binary(Encoded) ->
                            {Encoded, none};
                        {incomplete, Encoded, Leftovers} ->
                            {Encoded, Leftovers};
                        Error ->
                            Error
                    end;
                {ok, ReadBin} ->
                    case unicode:characters_to_binary(ReadBin, Enc) of
                        Encoded when is_binary(Encoded) ->
                            {Encoded, none};
                        {incomplete, Encoded, Leftovers} ->
                            {Encoded, Leftovers};
                        Error ->
                            Error
                    end;
                {error, _} = Err ->
                    Err
            end
    end,
    {Cont, EncInit}.

tfc_open(FD, Size) when Size < 4 ->
    case file:read(FD, 4) of
        eof ->
            file:close(FD),
            {fun(_) -> eof end, none};
        {ok, Bin} ->
            {Enc, BomLen} = tfc_bom(Bin),
            <<_:BomLen/binary, InitialStream/bitstring>> = Bin,
            tfc_enc(Enc, FD, Size, InitialStream);
        {error, _} = Err ->
            Err
    end;
tfc_open(FD, Size) ->
    case file:read(FD, Size) of
        eof ->
            file:close(FD),
            {fun(_) -> eof end, none};
        {ok, Bin} ->
            {Enc, BomLen} = tfc_bom(Bin),
            <<_:BomLen/binary, InitialStream/bitstring>> = Bin,
            tfc_enc(Enc, FD, Size, InitialStream);
        {error, _} = Err ->
            Err
    end.

external_file_reader({_Pub, Sys}, Base) ->
    Filename = filename:join([Base, Sys]),
    NewBase = filename:dirname(Filename),
    io:format("~p~n", [{NewBase, Filename}]),
    case file:read_file(Filename) of
        {ok, Bin} ->
            {Enc, BomLen} = tfc_bom(Bin),
            <<_:BomLen/binary, InitialStream/bitstring>> = Bin,
            {NewBase, unicode:characters_to_binary(InitialStream, Enc)};
        {error, Reason} ->
            io:format("~p~n", [{Reason, Base, Sys}]),
            {NewBase, <<>>}
    end.