%% src/packkit@detect.erl

-module(packkit@detect).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src/packkit/detect.gleam").
-export([codec/1, archive/1, recipe/1, extension/1, from_bytes/1, from_filename/1, from_path_or_bytes/2]).
-export_type([detected/0]).

-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.

-opaque detected() :: {detected,
        gleam@option:option(packkit@codec:codec()),
        gleam@option:option(packkit@archive:archive_format()),
        gleam@option:option(packkit@recipe:recipe()),
        gleam@option:option(binary())}.

-file("src/packkit/detect.gleam", 216).
-spec looks_like_zlib(bitstring()) -> boolean().
%% Heuristic test for a zlib header in the first two bytes of `Bytes`.
%%
%% The first byte is split into its low nibble (compression method) and its
%% high nibble (window-size exponent). The header is accepted only when the
%% method is 8, the exponent is at most 7, and the two bytes read as one
%% big-endian 16-bit integer are a multiple of 31. Anything that does not
%% start with two whole bytes followed by a byte-aligned tail is rejected.
looks_like_zlib(<<Cmf, Flg, _/binary>>) ->
    Method = Cmf band 16#0F,
    WindowExponent = Cmf bsr 4,
    %% Flg is a single byte, so shifting and or-ing equals Cmf * 256 + Flg.
    HeaderWord = (Cmf bsl 8) bor Flg,
    (Method =:= 8) andalso (WindowExponent =< 7) andalso ((HeaderWord rem 31) =:= 0);
looks_like_zlib(_) ->
    false.

-file("src/packkit/detect.gleam", 227).
-spec has_ustar_magic(bitstring()) -> boolean().
%% Reports whether the slice of `Bytes` at position 257, length 5, is exactly
%% the text "ustar". Any other slice outcome (different content, or whatever
%% `gleam_stdlib:bit_array_slice/3` returns for an out-of-range request)
%% yields `false`.
has_ustar_magic(Bytes) ->
    gleam_stdlib:bit_array_slice(Bytes, 257, 5) =:= {ok, <<"ustar"/utf8>>}.

-file("src/packkit/detect.gleam", 235).
?DOC(" Read the detected codec if one was found.\n").
-spec codec(detected()) -> gleam@option:option(packkit@codec:codec()).
%% Accessor: returns the second slot of the `detected` tuple unchanged.
codec({detected, Codec, _Archive, _Recipe, _Extension}) ->
    Codec.

-file("src/packkit/detect.gleam", 240).
?DOC(" Read the detected archive family if one was found.\n").
-spec archive(detected()) -> gleam@option:option(packkit@archive:archive_format()).
%% Accessor: returns the third slot of the `detected` tuple unchanged.
archive({detected, _Codec, Archive, _Recipe, _Extension}) ->
    Archive.

-file("src/packkit/detect.gleam", 245).
?DOC(" Read the detected recipe if one was found.\n").
-spec recipe(detected()) -> gleam@option:option(packkit@recipe:recipe()).
%% Accessor: returns the fourth slot of the `detected` tuple unchanged.
recipe({detected, _Codec, _Archive, Recipe, _Extension}) ->
    Recipe.

-file("src/packkit/detect.gleam", 250).
?DOC(" Read the matched extension label, if any.\n").
-spec extension(detected()) -> gleam@option:option(binary()).
%% Accessor: returns the fifth slot of the `detected` tuple unchanged.
extension({detected, _Codec, _Archive, _Recipe, Extension}) ->
    Extension.

-file("src/packkit/detect.gleam", 254).
-spec detected_recipe(packkit@recipe:recipe(), binary()) -> detected().
%% Builds a detection result with every slot derived from a recipe.
%%
%% The codec slot is stored exactly as `packkit@recipe:outermost_codec/1`
%% returns it (it is not wrapped again); the archive format, the recipe
%% itself and the extension label are each wrapped in `{some, _}`.
detected_recipe(Value, Extension) ->
    OuterCodec = packkit@recipe:outermost_codec(Value),
    ArchiveFormat = packkit@recipe:archive_format(Value),
    {detected, OuterCodec, {some, ArchiveFormat}, {some, Value}, {some, Extension}}.

-file("src/packkit/detect.gleam", 266).
-spec detected_archive(packkit@archive:archive_format(), binary()) -> detected().
%% Builds a detection result for a bare archive format: only the archive and
%% extension slots are populated; codec and recipe are `none`.
detected_archive(Value, Extension) ->
    ArchiveSlot = {some, Value},
    LabelSlot = {some, Extension},
    {detected, none, ArchiveSlot, none, LabelSlot}.

-file("src/packkit/detect.gleam", 278).
-spec detected_codec(packkit@codec:codec(), binary()) -> detected().
%% Builds a detection result for a bare codec: only the codec and extension
%% slots are populated; archive and recipe are `none`.
detected_codec(Value, Extension) ->
    CodecSlot = {some, Value},
    LabelSlot = {some, Extension},
    {detected, CodecSlot, none, none, LabelSlot}.

-file("src/packkit/detect.gleam", 60).
?DOC(
    " Filename rules, ordered most-specific first so the first match wins.\n"
    " Compound extensions (`.tar.gz`, `.tar.bz2`, …) must precede the\n"
    " single extensions (`.gz`, `.bz2`, …) for the obvious reason.\n"
).
-spec filename_rules() -> list({list(binary()), fun(() -> detected())}).
filename_rules() ->
    %% Each rule builder pairs a suffix list with a zero-arity closure. The
    %% format constructor `Make` is only invoked when that closure is called,
    %% so building the table itself constructs no formats.
    Recipe = fun(Suffixes, Make, Label) ->
        {Suffixes, fun() -> detected_recipe(Make(), Label) end}
    end,
    Archive = fun(Suffixes, Make, Label) ->
        {Suffixes, fun() -> detected_archive(Make(), Label) end}
    end,
    Codec = fun(Suffixes, Make, Label) ->
        {Suffixes, fun() -> detected_codec(Make(), Label) end}
    end,
    [
        %% Compound archive + codec suffixes: must stay ahead of the single
        %% suffixes below because the first matching rule wins.
        Recipe(
            [<<".tar.gz"/utf8>>, <<".tgz"/utf8>>],
            fun packkit@recipe:tar_gzip/0,
            <<"tar.gz"/utf8>>
        ),
        Recipe([<<".tar.zlib"/utf8>>], fun packkit@recipe:tar_zlib/0, <<"tar.zlib"/utf8>>),
        Recipe([<<".tar.lz4"/utf8>>], fun packkit@recipe:tar_lz4/0, <<"tar.lz4"/utf8>>),
        Recipe(
            [<<".tar.sz"/utf8>>, <<".tar.snappy"/utf8>>],
            fun packkit@recipe:tar_snappy/0,
            <<"tar.snappy"/utf8>>
        ),
        Recipe([<<".tar.bz2"/utf8>>], fun packkit@recipe:tar_bzip2/0, <<"tar.bz2"/utf8>>),
        Recipe([<<".tar.xz"/utf8>>], fun packkit@recipe:tar_xz/0, <<"tar.xz"/utf8>>),
        Recipe([<<".tar.zst"/utf8>>], fun packkit@recipe:tar_zstd/0, <<"tar.zst"/utf8>>),
        Recipe([<<".tar.br"/utf8>>], fun packkit@recipe:tar_brotli/0, <<"tar.br"/utf8>>),
        %% Suffixes are compared against a lowercased path by the caller in
        %% this module, hence ".tar.z" here while the label keeps the capital Z.
        Recipe(
            [<<".tar.z"/utf8>>, <<".taz"/utf8>>],
            fun packkit@recipe:tar_lzw/0,
            <<"tar.Z"/utf8>>
        ),
        Recipe([<<".cpio.gz"/utf8>>], fun packkit@recipe:cpio_gzip/0, <<"cpio.gz"/utf8>>),
        Recipe([<<".cpio.bz2"/utf8>>], fun packkit@recipe:cpio_bzip2/0, <<"cpio.bz2"/utf8>>),
        Recipe([<<".cpio.xz"/utf8>>], fun packkit@recipe:cpio_xz/0, <<"cpio.xz"/utf8>>),
        Recipe([<<".cpio.zst"/utf8>>], fun packkit@recipe:cpio_zstd/0, <<"cpio.zst"/utf8>>),

        %% Bare archive formats. ".tar" must precede ".ar" since every path
        %% ending in ".tar" also ends in ".ar".
        Archive([<<".tar"/utf8>>], fun packkit@archive:tar/0, <<"tar"/utf8>>),
        Archive([<<".zip"/utf8>>], fun packkit@archive:zip/0, <<"zip"/utf8>>),
        Archive([<<".7z"/utf8>>], fun packkit@archive:seven_z/0, <<"7z"/utf8>>),
        Archive([<<".cpio"/utf8>>], fun packkit@archive:cpio_newc/0, <<"cpio"/utf8>>),
        Archive([<<".ar"/utf8>>, <<".a"/utf8>>], fun packkit@archive:ar/0, <<"ar"/utf8>>),

        %% Bare codecs.
        Codec([<<".gz"/utf8>>], fun packkit@codec:gzip/0, <<"gz"/utf8>>),
        Codec([<<".zlib"/utf8>>], fun packkit@codec:zlib/0, <<"zlib"/utf8>>),
        Codec(
            [<<".deflate"/utf8>>, <<".dfl"/utf8>>],
            fun packkit@codec:deflate/0,
            <<"deflate"/utf8>>
        ),
        Codec([<<".lz4"/utf8>>], fun packkit@codec:lz4/0, <<"lz4"/utf8>>),
        Codec(
            [<<".sz"/utf8>>, <<".snappy"/utf8>>],
            fun packkit@codec:snappy/0,
            <<"snappy"/utf8>>
        ),
        Codec([<<".bz2"/utf8>>], fun packkit@codec:bzip2/0, <<"bz2"/utf8>>),
        Codec([<<".xz"/utf8>>], fun packkit@codec:xz/0, <<"xz"/utf8>>),
        Codec([<<".br"/utf8>>], fun packkit@codec:brotli/0, <<"br"/utf8>>),
        Codec([<<".zst"/utf8>>], fun packkit@codec:zstd/0, <<"zst"/utf8>>),
        Codec([<<".z"/utf8>>], fun packkit@codec:lzw/0, <<"Z"/utf8>>)
    ].

-file("src/packkit/detect.gleam", 164).
?DOC(
    " Detect a format from the leading bytes of an input stream.\n"
    "\n"
    " Signatures are matched as strictly as practical:\n"
    "\n"
    " * gzip (`1F 8B`) also requires the compression-method byte to be\n"
    "   `08` (DEFLATE), since RFC 1952 reserves the other values and\n"
    "   no production gzip stream uses them.\n"
    " * zlib (`78 _`) requires CMF.CM == 8 (DEFLATE), CMF.CINFO ≤ 7\n"
    "   (15-bit window), and `(CMF*256 + FLG) % 31 == 0` per RFC 1950.\n"
    " * bzip2 (`BZh`) additionally requires the block-size byte to be\n"
    "   an ASCII digit `1`..`9`.\n"
    " * lz4 (`04 22 4D 18`) and `.Z` (`1F 9D`) keep their fixed magic.\n"
    " * zstd skippable frames (magic `184D2A50`..`184D2A5F`) are\n"
    "   recognised as zstd so wrappers that embed user metadata in\n"
    "   skippable frames at the start of the stream do not fail to\n"
    "   detect.\n"
    " * snappy framed format starts with a stream identifier chunk\n"
    "   (`FF 06 00 00 sNaPpY`); the raw snappy block format has no\n"
    "   magic so it can only be detected from filename.\n"
    "\n"
    " Looser signatures like a bare `0x78 _` would false-positive on\n"
    " any byte stream whose first byte happens to be `0x78`.\n"
).
-spec from_bytes(bitstring()) -> {ok, detected()} |
    {error, packkit@error:detect_error()}.
%% Fixed leading-magic formats are tried first, in clause order. Input that
%% matches none of them is handed to from_fallback_signature/1, which covers
%% the two formats that have no fixed leading magic (tar and zlib).
%%
%% Returns `{ok, Detected}` on a match, otherwise
%% `{error, {detect_unknown_format, <<"byte-signature scan">>}}`.
from_bytes(Bytes) ->
    case Bytes of
        %% gzip: magic plus compression-method byte 08.
        <<16#1F, 16#8B, Cm, _/binary>> when Cm =:= 16#08 ->
            {ok, detected_codec(packkit@codec:gzip(), <<"gz"/utf8>>)};

        %% zip: "PK" followed by 03 04 or 05 06.
        <<16#50, 16#4B, 16#03, 16#04, _/binary>> ->
            {ok, detected_archive(packkit@archive:zip(), <<"zip"/utf8>>)};

        <<16#50, 16#4B, 16#05, 16#06, _/binary>> ->
            {ok, detected_archive(packkit@archive:zip(), <<"zip"/utf8>>)};

        <<16#37, 16#7A, 16#BC, 16#AF, 16#27, 16#1C, _/binary>> ->
            {ok, detected_archive(packkit@archive:seven_z(), <<"7z"/utf8>>)};

        <<16#FD, 16#37, 16#7A, 16#58, 16#5A, 16#00, _/binary>> ->
            {ok, detected_codec(packkit@codec:xz(), <<"xz"/utf8>>)};

        <<16#28, 16#B5, 16#2F, 16#FD, _/binary>> ->
            {ok, detected_codec(packkit@codec:zstd(), <<"zst"/utf8>>)};

        %% zstd skippable frame: first byte 50..5F, then 2A 4D 18.
        <<Low, 16#2A, 16#4D, 16#18, _/binary>> when (Low >= 16#50) andalso (Low =< 16#5F) ->
            {ok, detected_codec(packkit@codec:zstd(), <<"zst"/utf8>>)};

        <<16#04, 16#22, 16#4D, 16#18, _/binary>> ->
            {ok, detected_codec(packkit@codec:lz4(), <<"lz4"/utf8>>)};

        %% Second byte sequence that is also reported as lz4.
        <<16#02, 16#21, 16#4C, 16#18, _/binary>> ->
            {ok, detected_codec(packkit@codec:lz4(), <<"lz4"/utf8>>)};

        <<16#FF, 16#06, 16#00, 16#00, "sNaPpY"/utf8, _/binary>> ->
            {ok, detected_codec(packkit@codec:snappy(), <<"snappy"/utf8>>)};

        %% bzip2: "BZh" plus an ASCII digit '1'..'9'.
        <<16#42, 16#5A, 16#68, Lvl, _/binary>> when (Lvl >= 16#31) andalso (Lvl =< 16#39) ->
            {ok, detected_codec(packkit@codec:bzip2(), <<"bz2"/utf8>>)};

        <<16#1F, 16#9D, _/binary>> ->
            {ok, detected_codec(packkit@codec:lzw(), <<"Z"/utf8>>)};

        <<"!<arch>\n"/utf8, _/binary>> ->
            {ok, detected_archive(packkit@archive:ar(), <<"ar"/utf8>>)};

        <<"070701"/utf8, _/binary>> ->
            {ok, detected_archive(packkit@archive:cpio_newc(), <<"cpio"/utf8>>)};

        _ ->
            from_fallback_signature(Bytes)
    end.

%% Classifies input that carries no fixed leading magic.
%%
%% The "ustar" marker at offset 257 is tested BEFORE the zlib heuristic. A
%% tar archive begins with its first member's name, and many ordinary
%% two-character prefixes satisfy the 2-byte zlib check (for example "hb" is
%% 16#6862 = 31 * 862 and "80" is 16#3830 = 31 * 464, both with a low nibble
%% of 8 and a high nibble of at most 7). Testing zlib first would report
%% such archives as zlib streams.
-spec from_fallback_signature(bitstring()) -> {ok, detected()} |
    {error, packkit@error:detect_error()}.
from_fallback_signature(Bytes) ->
    case {has_ustar_magic(Bytes), looks_like_zlib(Bytes)} of
        {true, _} ->
            {ok, detected_archive(packkit@archive:tar(), <<"tar"/utf8>>)};

        {false, true} ->
            {ok, detected_codec(packkit@codec:zlib(), <<"zlib"/utf8>>)};

        {false, false} ->
            {error, {detect_unknown_format, <<"byte-signature scan"/utf8>>}}
    end.

-file("src/packkit/detect.gleam", 287).
-spec matches_any(binary(), list(binary())) -> boolean().
%% True when `Path` ends with at least one entry of `Suffixes`; an empty
%% suffix list yields `false`. Suffixes are tried left to right and the scan
%% stops at the first one that matches.
matches_any(Path, Suffixes) ->
    lists:any(
        fun(Suffix) -> gleam_stdlib:string_ends_with(Path, Suffix) end,
        Suffixes
    ).

-file("src/packkit/detect.gleam", 128).
-spec find_filename_match(
    binary(),
    list({list(binary()), fun(() -> detected())})
) -> gleam@option:option(detected()).
%% Walks `Rules` in order and returns `{some, Build()}` for the first rule
%% whose suffix list matches `Path`, or `none` when the rules are exhausted.
%% Only the winning rule's builder is ever invoked.
find_filename_match(_Path, []) ->
    none;
find_filename_match(Path, [{Suffixes, Build} | Remaining]) ->
    case matches_any(Path, Suffixes) of
        true ->
            {some, Build()};

        false ->
            find_filename_match(Path, Remaining)
    end.

-file("src/packkit/detect.gleam", 23).
?DOC(" Detect a format from a filename or path suffix.\n").
-spec from_filename(binary()) -> {ok, detected()} |
    {error, packkit@error:detect_error()}.
%% Matching is done on a lowercased copy of `Path`, so suffix comparison is
%% case-insensitive. The error value carries the path exactly as the caller
%% supplied it, not the lowercased copy.
from_filename(Path) ->
    Normalised = string:lowercase(Path),
    Rules = filename_rules(),
    case find_filename_match(Normalised, Rules) of
        none ->
            {error, {detect_unknown_format, Path}};

        {some, Detected} ->
            {ok, Detected}
    end.

-file("src/packkit/detect.gleam", 42).
?DOC(
    " Try filename detection first, then fall back to magic-byte\n"
    " detection on the supplied content.  Mirrors the resolution order\n"
    " most CLI tools use: a meaningful extension is a strong signal, but\n"
    " when the path is uninformative (`-`, `/dev/stdin`, an arbitrary\n"
    " upload, etc.) the file's first bytes still pin the format.\n"
    "\n"
    " On total failure (neither the filename nor the magic bytes\n"
    " classified the input) the returned `DetectUnknownFormat` carries\n"
    " the original caller-supplied `path` in its `input` field — never\n"
    " the internal `\"byte-signature scan\"` sentinel — so the message\n"
    " stays specific to the user's input.\n"
).
-spec from_path_or_bytes(binary(), bitstring()) -> {ok, detected()} |
    {error, packkit@error:detect_error()}.
%% The byte scan is only performed when the filename did not classify the
%% input. An eager `or` combinator would evaluate `from_bytes(Bytes)` on every
%% call, including those where its result is discarded.
from_path_or_bytes(Path, Bytes) ->
    case from_filename(Path) of
        {ok, _} = ByName ->
            ByName;

        {error, _} ->
            case from_bytes(Bytes) of
                {ok, _} = ByMagic ->
                    ByMagic;

                {error, _} ->
                    %% Report the caller's path, not the byte-scan sentinel.
                    {error, {detect_unknown_format, Path}}
            end
    end.