Skip to main content

src/roadrunner_multipart.erl

-module(roadrunner_multipart).
-moduledoc """
Parser for `multipart/form-data` request bodies (RFC 7578).

Three entry points:

- `boundary/1` — pull the `boundary=…` parameter out of a
  `Content-Type` header value, handling unquoted, quoted, and
  parameter-mixed forms.
- `parse/2` — split a buffered body into a list of `part()` maps,
  each with its own `headers` and decoded `body`.
- `params/1` — parse the `key=value` parameters of any structured
  header value (e.g. `Content-Type`, `Content-Disposition`) into
  a lowercase-keyed map.

Typical handler shape:

```erlang
handle(Req) ->
    {ok, Body, Req2} = roadrunner_req:read_body(Req),
    {ok, Boundary} = roadrunner_multipart:boundary(
        roadrunner_req:header(~"content-type", Req2)
    ),
    {ok, Parts} = roadrunner_multipart:parse(Body, Boundary),
    %% Parts is a list of #{headers := [...], body := <<...>>}.
    ...
```

This is a buffered parser — the entire body must be in memory first.
For very large file uploads where you can't afford to buffer, a
streaming variant is a future feature; today, cap them at the
listener's `max_content_length` (default 10 MB).

## What gets parsed

- The preamble (bytes before the first boundary) is discarded per
  RFC 7578 §4.1.
- Each part's headers are returned as a list of `{Name, Value}`
  binaries, with the name lowercased (matching the convention in
  `roadrunner_req:headers/1`). Values are LWS-trimmed.
- Each part's body is the bytes between `\r\n\r\n` (end-of-headers)
  and the next `\r\n--<boundary>` (start of next boundary or
  terminator).
- The terminating boundary is `--<boundary>--`. Anything after it
  (the epilogue) is ignored.

## What does NOT get parsed

- `Content-Disposition` parameters (`name`, `filename`, etc.) —
  callers parse them out of the raw header value if they need to.
  Adding a `disposition/1` helper is a straightforward follow-up.
- Per-part transfer encodings (`Content-Transfer-Encoding`) — bodies
  are returned as-is. Modern browsers send raw bytes for
  `multipart/form-data`, so this is rarely needed.
""".

-on_load(init_patterns/0).

-define(SEMI_CP_KEY, {?MODULE, semi_cp}).
-define(EQ_CP_KEY, {?MODULE, eq_cp}).
-define(QUOTE_CP_KEY, {?MODULE, quote_cp}).
-define(CRLF_CP_KEY, {?MODULE, crlf_cp}).
-define(DBL_CRLF_CP_KEY, {?MODULE, dbl_crlf_cp}).
-define(COLON_CP_KEY, {?MODULE, colon_cp}).

-export([parse/2, boundary/1, params/1]).
-export_type([part/0]).

-doc """
One section of a parsed `multipart/form-data` body. `headers` is
the part's headers (names lowercased, values OWS-trimmed) as a list
of `{Name, Value}` binaries; `body` is the raw bytes between
end-of-headers and the next boundary.
""".
-type part() :: #{
    headers := [{binary(), binary()}],
    body := binary()
}.

-doc """
Extract the `boundary=…` parameter from a `Content-Type` header value.
Handles unquoted (`boundary=abc`), quoted (`boundary="a b c"`), and
mixed-with-other-parameters forms.

Returns `{error, no_boundary}` when the parameter isn't present.
""".
-spec boundary(binary()) -> {ok, binary()} | {error, no_boundary}.
boundary(ContentType) when is_binary(ContentType) ->
    case params(ContentType) of
        #{~"boundary" := Boundary} -> {ok, Boundary};
        #{} -> {error, no_boundary}
    end.

-doc """
Parse the parameters of a structured header value (e.g. `Content-Type`,
`Content-Disposition`) into a map. The "type" prefix before the first
`;` is discarded — only the `key=value` pairs after it are returned.

Param names are lowercased per RFC 9110 §8.3.1 (media-type parameter
names are case-insensitive); values are returned as-is, with
surrounding quotes stripped.

Examples:
- `~"text/html; charset=utf-8"` → `#{~"charset" => ~"utf-8"}`
- `~"form-data; name=\"a\"; filename=\"f.txt\""` →
  `#{~"name" => ~"a", ~"filename" => ~"f.txt"}`
- `~"text/html"` → `#{}`

Malformed pairs (no `=`) are silently skipped.
""".
-spec params(binary()) -> #{binary() => binary()}.
params(Value) when is_binary(Value) ->
    SemiCp = persistent_term:get(?SEMI_CP_KEY),
    EqCp = persistent_term:get(?EQ_CP_KEY),
    QuoteCp = persistent_term:get(?QUOTE_CP_KEY),
    Tail =
        case binary:split(Value, SemiCp) of
            [_Type] -> <<>>;
            [_Type, Rest] -> Rest
        end,
    parse_pairs(binary:split(Tail, SemiCp, [global]), EqCp, QuoteCp, #{}).

-spec parse_pairs([binary()], binary:cp(), binary:cp(), #{binary() => binary()}) ->
    #{binary() => binary()}.
parse_pairs([], _EqCp, _QuoteCp, Acc) ->
    Acc;
parse_pairs([Pair | Rest], EqCp, QuoteCp, Acc) ->
    case binary:split(roadrunner_bin:trim_ows(Pair), EqCp) of
        [Key, Val] ->
            %% Unquote first so internal whitespace inside quoted strings
            %% is preserved; trim afterwards catches trailing whitespace
            %% on bare (unquoted) values.
            parse_pairs(
                Rest,
                EqCp,
                QuoteCp,
                Acc#{
                    roadrunner_bin:ascii_lowercase(Key) => roadrunner_bin:trim_ows(
                        unquote(Val, QuoteCp)
                    )
                }
            );
        _ ->
            parse_pairs(Rest, EqCp, QuoteCp, Acc)
    end.

-spec unquote(binary(), binary:cp()) -> binary().
unquote(<<$", Rest/binary>>, QuoteCp) ->
    case binary:match(Rest, QuoteCp) of
        {End, _} -> binary:part(Rest, 0, End);
        nomatch -> Rest
    end;
unquote(Bin, _QuoteCp) ->
    Bin.

-doc """
Split `Body` into a list of multipart parts using `Boundary` as the
delimiter. The boundary must NOT include the leading `--` — that is
the multipart wire prefix and is added internally.

Returns `{error, no_initial_boundary}` if the body doesn't start
with (or contain) the opening boundary, `{error, bad_header}` on
a malformed part header, or other `{error, _}` shapes when the
multipart structure is otherwise broken.
""".
-spec parse(binary(), binary()) -> {ok, [part()]} | {error, term()}.
parse(Body, Boundary) when is_binary(Body), is_binary(Boundary) ->
    Sep = <<"--", Boundary/binary>>,
    case binary:split(Body, Sep) of
        [_Preamble, Rest] ->
            %% Fetch the compiled patterns ONCE on the success branch
            %% and thread them through `parse_parts/5` → `parse_one_part/5`
            %% → header-line parsing. Avoids three
            %% `persistent_term:get/1` calls per part on a multi-part
            %% body, and avoids fetching them at all when the body
            %% lacks the opening boundary. `BodyEnd` is dynamic (depends
            %% on the multipart boundary string) so it can't live in
            %% persistent_term, but compiling it once here amortizes
            %% across every part's `extract_body/3` split.
            DblCrlfCp = persistent_term:get(?DBL_CRLF_CP_KEY),
            CrlfCp = persistent_term:get(?CRLF_CP_KEY),
            ColonCp = persistent_term:get(?COLON_CP_KEY),
            BodyEndCp = binary:compile_pattern(<<"\r\n", Sep/binary>>),
            parse_parts(Rest, BodyEndCp, DblCrlfCp, CrlfCp, ColonCp);
        _ ->
            {error, no_initial_boundary}
    end.

%% After the first boundary marker, what follows is either:
%% - "--\r\n" or "--" → terminating boundary, end of multipart.
%% - "\r\n" + part bytes → another part to parse.
-spec parse_parts(binary(), binary:cp(), binary:cp(), binary:cp(), binary:cp()) ->
    {ok, [part()]} | {error, term()}.
parse_parts(<<"--\r\n", _Epilogue/binary>>, _BodyEndCp, _DblCrlfCp, _CrlfCp, _ColonCp) ->
    {ok, []};
parse_parts(<<"--">>, _BodyEndCp, _DblCrlfCp, _CrlfCp, _ColonCp) ->
    {ok, []};
parse_parts(<<"\r\n", PartAndRest/binary>>, BodyEndCp, DblCrlfCp, CrlfCp, ColonCp) ->
    maybe
        {ok, Part, Rest} ?= parse_one_part(PartAndRest, BodyEndCp, DblCrlfCp, CrlfCp, ColonCp),
        {ok, More} ?= parse_parts(Rest, BodyEndCp, DblCrlfCp, CrlfCp, ColonCp),
        {ok, [Part | More]}
    end;
parse_parts(_, _, _, _, _) ->
    {error, malformed}.

-spec parse_one_part(binary(), binary:cp(), binary:cp(), binary:cp(), binary:cp()) ->
    {ok, part(), binary()} | {error, term()}.
%% Empty header block — the part starts with the empty-line terminator
%% immediately. RFC 5322 §2.2.3 (referenced by RFC 7578) allows this:
%% headers are followed by a blank line, and the header list itself
%% may be empty. Without this clause we'd reject a perfectly valid
%% file-upload that omits Content-Disposition / Content-Type.
parse_one_part(<<"\r\n", BodyAndRest/binary>>, BodyEndCp, _DblCrlfCp, _CrlfCp, _ColonCp) ->
    extract_body(BodyAndRest, [], BodyEndCp);
parse_one_part(Bytes, BodyEndCp, DblCrlfCp, CrlfCp, ColonCp) ->
    maybe
        [HeaderBlock, BodyAndRest] ?= binary:split(Bytes, DblCrlfCp),
        {ok, Headers} ?=
            parse_header_lines(binary:split(HeaderBlock, CrlfCp, [global]), ColonCp),
        extract_body(BodyAndRest, Headers, BodyEndCp)
    else
        [_] -> {error, no_header_terminator};
        Other -> Other
    end.

-spec extract_body(binary(), [{binary(), binary()}], binary:cp()) ->
    {ok, part(), binary()} | {error, no_part_terminator}.
extract_body(BodyAndRest, Headers, BodyEndCp) ->
    case binary:split(BodyAndRest, BodyEndCp) of
        [Body, Rest] -> {ok, #{headers => Headers, body => Body}, Rest};
        _ -> {error, no_part_terminator}
    end.

-spec parse_header_lines([binary()], binary:cp()) ->
    {ok, [{binary(), binary()}]} | {error, bad_header}.
parse_header_lines(Lines, ColonCp) ->
    parse_header_lines_loop(Lines, ColonCp).

-spec parse_header_lines_loop([binary()], binary:cp()) ->
    {ok, [{binary(), binary()}]} | {error, bad_header}.
parse_header_lines_loop([], _ColonCp) ->
    {ok, []};
parse_header_lines_loop([Line | Rest], ColonCp) ->
    maybe
        [Name, Value] ?= binary:split(Line, ColonCp),
        {ok, More} ?= parse_header_lines_loop(Rest, ColonCp),
        {ok, [{roadrunner_bin:ascii_lowercase(Name), roadrunner_bin:trim_ows(Value)} | More]}
    else
        [_] -> {error, bad_header};
        Other -> Other
    end.

%% `-on_load` callback. See `feedback_compile_pattern_convention`.
-spec init_patterns() -> ok.
init_patterns() ->
    persistent_term:put(?SEMI_CP_KEY, binary:compile_pattern(~";")),
    persistent_term:put(?EQ_CP_KEY, binary:compile_pattern(~"=")),
    persistent_term:put(?QUOTE_CP_KEY, binary:compile_pattern(~"\"")),
    persistent_term:put(?CRLF_CP_KEY, binary:compile_pattern(~"\r\n")),
    persistent_term:put(?DBL_CRLF_CP_KEY, binary:compile_pattern(~"\r\n\r\n")),
    persistent_term:put(?COLON_CP_KEY, binary:compile_pattern(~":")),
    ok.