-module(aws_ffi).
-include_lib("xmerl/include/xmerl.hrl").
-export([sha256/1, sha1/1, md5/1, crc32c/1, hmac_sha256/2, hex_encode/1,
ecdsa_p256_sign/2, ecdsa_p256_verify/3, ecdsa_p256_public_key/1, get_env/1,
read_file/1, unix_seconds/0, parse_iso8601/1, run_process/2,
sha1_hex/1, aws_timestamp/0, random_float/0,
encode_dynamic_to_json/1, float_nan/0, float_infinity/0,
float_neg_infinity/0, float_is_nan/1, float_is_infinite/1,
json_canonicalize/1, xml_parse/1, float_short/1,
format_iso8601/1, parse_http_date/1, format_http_date/1,
idempotency_token/0, set_env/2, rescue_call/1, plain_args/0]).
%% Shortest round-tripping float string, e.g. `1.1` not `1.10000000…e+00`.
%% Used by query / header / URI-label / XML formatters; matches AWS's
%% canonical wire form for the SimpleScalarProperties protocol-test
%% suite. Requires OTP 25+.
float_short(F) when is_float(F) -> float_to_binary(F, [short]).
%% RFC 7231 / RFC 1123 HTTP-date: "Sun, 06 Nov 1994 08:49:37 GMT".
%% Used by Smithy `@timestampFormat("http-date")` on response headers
%% / bodies. Returns epoch seconds or `{error, nil}` on malformed input.
parse_http_date(Bin) when is_binary(Bin) ->
%% `calendar:datetime_to_gregorian_seconds/1` raises `function_clause`
%% on out-of-range fields (e.g. day 39, hour 25) that pass the regex
%% shape check; this is server-controlled input, so catch and report
%% it as `{error, nil}` rather than crash the consumer's process.
try
Months = #{<<"Jan">>=>1, <<"Feb">>=>2, <<"Mar">>=>3, <<"Apr">>=>4,
<<"May">>=>5, <<"Jun">>=>6, <<"Jul">>=>7, <<"Aug">>=>8,
<<"Sep">>=>9, <<"Oct">>=>10, <<"Nov">>=>11, <<"Dec">>=>12},
Re = "^[A-Za-z]+, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2}) GMT$",
case re:run(Bin, Re, [{capture, all_but_first, binary}]) of
{match, [D, MoBin, Y, H, Mi, S]} ->
case maps:find(MoBin, Months) of
{ok, Mo} ->
DateTime = {
{binary_to_integer(Y), Mo, binary_to_integer(D)},
{binary_to_integer(H), binary_to_integer(Mi), binary_to_integer(S)}
},
Epoch = 62167219200,
{ok, calendar:datetime_to_gregorian_seconds(DateTime) - Epoch};
error -> {error, nil}
end;
_ -> {error, nil}
end
catch
_:_ -> {error, nil}
end.
%% Inverse of `parse_iso8601`: epoch seconds (Int) → `"YYYY-MM-DDTHH:MM:SSZ"`
%% UTC string. The default `@timestampFormat` for `date-time` in
%% restJson1 / restXml is RFC 3339 with second precision; we never emit
%% fractional seconds. Negative epoch seconds (pre-1970) round-trip via
%% `calendar:gregorian_seconds_to_datetime`.
format_iso8601(Seconds) when is_integer(Seconds) ->
Epoch = 62167219200,
{{Y, Mo, D}, {H, Mi, S}} =
calendar:gregorian_seconds_to_datetime(Seconds + Epoch),
iolist_to_binary(io_lib:format(
"~4..0w-~2..0w-~2..0wT~2..0w:~2..0w:~2..0wZ",
[Y, Mo, D, H, Mi, S]
)).
%% Auto-fill value for `@idempotencyToken` members when the caller
%% leaves the field unset. Returns the nil-v4 UUID
%% `00000000-0000-4000-8000-000000000000` by default — the Smithy
%% protocol-test fixtures expect this exact string. Production
%% callers can override via the application env
%% `aws.idempotency_token` (any string).
idempotency_token() ->
case application:get_env(aws, idempotency_token) of
{ok, V} when is_binary(V) -> V;
_ -> <<"00000000-0000-4000-8000-000000000000">>
end.
%% RFC 7231 / IMF-fixdate, e.g. `Tue, 29 Apr 2014 18:30:38 GMT`. Used
%% by `@timestampFormat("http-date")` body fields and the
%% `If-Modified-Since`-style header members.
format_http_date(Seconds) when is_integer(Seconds) ->
Epoch = 62167219200,
{{Y, Mo, D}, {H, Mi, S}} =
calendar:gregorian_seconds_to_datetime(Seconds + Epoch),
DayOfWeek = calendar:day_of_the_week({Y, Mo, D}),
DayName = element(DayOfWeek,
{<<"Mon">>, <<"Tue">>, <<"Wed">>, <<"Thu">>,
<<"Fri">>, <<"Sat">>, <<"Sun">>}),
MonthName = element(Mo,
{<<"Jan">>, <<"Feb">>, <<"Mar">>, <<"Apr">>, <<"May">>, <<"Jun">>,
<<"Jul">>, <<"Aug">>, <<"Sep">>, <<"Oct">>, <<"Nov">>, <<"Dec">>}),
iolist_to_binary(io_lib:format(
"~s, ~2..0w ~s ~4..0w ~2..0w:~2..0w:~2..0w GMT",
[DayName, D, MonthName, Y, H, Mi, S]
)).
sha256(Data) ->
crypto:hash(sha256, Data).
%% Raw SHA-1 digest as a 20-byte binary. Exposed for the AWS
%% multi-algorithm checksum feature (`sha1` variant) — base64-
%% encoded into the `x-amz-checksum-sha1` header. Distinct from
%% `sha1_hex/1`, which returns the lowercase hex form used by the
%% SSO cache-file naming.
sha1(Data) ->
crypto:hash(sha, Data).
%% Raw MD5 digest, used by the `@httpChecksumRequired` body-checksum
%% helper. MD5 is a degraded primitive for security work but the
%% AWS wire spec for this trait requires it (Content-MD5 is the
%% S3-control / restJson1 wire shape); SigV4 covers the actual auth.
md5(Data) ->
crypto:hash(md5, Data).
hmac_sha256(Key, Data) ->
crypto:mac(hmac, sha256, Key, Data).
%% Sign `Data` with an ECDSA P-256 (secp256r1) private key, using
%% SHA-256 as the message digest. `PrivateKey` is the 32-byte
%% scalar value as a binary. Returns the DER-encoded ASN.1
%% signature blob.
%%
%% Used by SigV4a (`AWS4-ECDSA-P256-SHA256`). Erlang's `crypto`
%% module generates a fresh random nonce per call rather than the
%% RFC 6979 deterministic nonce AWS reference vectors use, so
%% signatures verify correctly against the AWS public-key check
%% but won't match the aws-c-auth v4a fixture's literal bytes.
ecdsa_p256_sign(PrivateKey, Data) when is_binary(PrivateKey), is_binary(Data) ->
crypto:sign(ecdsa, sha256, Data, [PrivateKey, secp256r1]).
%% Verify a SigV4a signature. `PublicKey` is the uncompressed 65-byte
%% SEC1 form (`04 || X || Y`); `Signature` is the DER-encoded blob
%% returned by `ecdsa_p256_sign`. Returns a boolean.
ecdsa_p256_verify(PublicKey, Data, Signature)
when is_binary(PublicKey), is_binary(Data), is_binary(Signature) ->
crypto:verify(ecdsa, sha256, Data, Signature, [PublicKey, secp256r1]).
%% Derive the uncompressed SEC1 public key (`<<4, X:32/binary, Y:32/binary>>`)
%% for a given P-256 private scalar. Used by the SigV4a key-derivation
%% tests and by anyone who wants to surface the public counterpart of a
%% derived signing key. `crypto:generate_key/3` with an explicit private
%% scalar leaves the scalar untouched and only fills in the public side.
ecdsa_p256_public_key(PrivateKey) when is_binary(PrivateKey) ->
{PubKey, PrivateKey} = crypto:generate_key(ecdh, secp256r1, PrivateKey),
PubKey.
%% CRC-32C (Castagnoli polynomial 0x1EDC6F41, reflected 0x82F63B78).
%% Used by AWS multi-algorithm checksum (`crc32c` variant) — base64
%% of the BE 4-byte form goes into `x-amz-checksum-crc32c`. Not
%% available in OTP's stdlib so we compute it byte-by-byte; the
%% loop is short enough that a fully-precomputed table isn't worth
%% the static overhead. Returns the unsigned 32-bit integer value.
crc32c(Data) when is_binary(Data) ->
Crc = crc32c_loop(Data, 16#FFFFFFFF),
Crc bxor 16#FFFFFFFF.
crc32c_loop(<<>>, Crc) -> Crc;
crc32c_loop(<<B, Rest/binary>>, Crc) ->
crc32c_loop(Rest, crc32c_byte(Crc bxor B)).
%% Single-byte CRC-32C step using the reflected polynomial. Eight
%% conditional shifts per byte; runs ~25 MB/s on a modern laptop
%% which is enough for SDK checksum use (payloads bounded by S3
%% put-object size + memory anyway).
crc32c_byte(Crc) ->
crc32c_bits(Crc, 8).
crc32c_bits(Crc, 0) -> Crc;
crc32c_bits(Crc, N) ->
NewCrc = case Crc band 1 of
1 -> (Crc bsr 1) bxor 16#82F63B78;
0 -> Crc bsr 1
end,
crc32c_bits(NewCrc, N - 1).
hex_encode(Bin) ->
binary:encode_hex(Bin, lowercase).
%% Lowercase-hex SHA-1 of the input binary. Used by the SSO provider to
%% derive the cache-file name from a session name / start URL.
sha1_hex(Bin) ->
binary:encode_hex(crypto:hash(sha, Bin), lowercase).
%% Return {ok, Value} if the env var is set, {error, nil} otherwise.
%% os:getenv/1 returns the string value or the atom false; we coerce to the
%% Gleam Result(String, Nil) shape that the credentials module expects.
get_env(Name) ->
case os:getenv(unicode:characters_to_list(Name)) of
false -> {error, nil};
Value -> {ok, unicode:characters_to_binary(Value)}
end.
%% Set an OS environment variable. The Lambda runtime uses this to copy
%% the per-invocation X-Ray trace id from the `Lambda-Runtime-Trace-Id`
%% response header into `_X_AMZ_TRACE_ID`, the variable the AWS SDKs read
%% to attach downstream calls to the active trace. Returns nil.
set_env(Name, Value) when is_binary(Name), is_binary(Value) ->
os:putenv(unicode:characters_to_list(Name), unicode:characters_to_list(Value)),
nil.
%% Plain command-line arguments — everything after `--` in
%% `gleam run -- ...` — as binaries. The Lambda local-run path reads
%% `--event <json>` from these; empty list under a release/Lambda boot.
plain_args() ->
[unicode:characters_to_binary(A) || A <- init:get_plain_arguments()].
%% Run a 0-arity fun inside try/catch and reflect the outcome as a Gleam
%% Result. Success becomes {ok, Value}; any raise/throw/exit becomes
%% {error, {handler_crash, Class, Message, StackLines}} — i.e. the Gleam
%% value Error(HandlerCrash(class, message, stack_trace)). The Lambda
%% runtime wraps each user handler call in this so an exception is
%% reported to the Runtime API /error endpoint (and the loop survives to
%% serve the next invocation) instead of taking the whole process down.
rescue_call(Fun) when is_function(Fun, 0) ->
try {ok, Fun()}
catch
Class:Reason:Stack ->
{error, {handler_crash,
atom_to_binary(Class, utf8),
reason_to_binary(Reason),
stack_to_lines(Stack)}}
end.
%% Gleam `panic`, `todo`, and failed `let assert` raise a map carrying a
%% `message` binary; surface that verbatim. Everything else gets a
%% depth-bounded ~p rendering so a deeply nested term can't produce an
%% unbounded error body.
reason_to_binary(Reason) when is_map(Reason) ->
case Reason of
#{message := M} when is_binary(M) -> M;
_ -> iolist_to_binary(io_lib:format("~tP", [Reason, 12]))
end;
reason_to_binary(Reason) ->
iolist_to_binary(io_lib:format("~tP", [Reason, 12])).
stack_to_lines(Stack) ->
[iolist_to_binary(io_lib:format("~tP", [Frame, 8])) || Frame <- Stack].
%% Read a file as text. Path is a binary; return {ok, Binary} | {error, nil}.
%% Used by the default profile-provider reader so the runtime doesn't take
%% on simplifile as a transitive dependency just to read ~/.aws/credentials.
read_file(Path) ->
case file:read_file(binary_to_list(Path)) of
{ok, Bin} -> {ok, Bin};
{error, _} -> {error, nil}
end.
%% Unix seconds since epoch. Default production clock for the credentials
%% cache. system_time(second) is monotonic-corrected and matches AWS's
%% credential-expiration timestamps (also unix seconds).
unix_seconds() ->
erlang:system_time(second).
%% Current UTC time formatted as a SigV4 `X-Amz-Date` value, e.g.
%% "20240315T143022Z". The signer uses opts.timestamp verbatim in both the
%% Authorization scope and the X-Amz-Date header, so they're guaranteed to
%% agree.
aws_timestamp() ->
{{Y, Mo, D}, {H, Mi, S}} = calendar:universal_time(),
iolist_to_binary(
io_lib:format(
"~4..0w~2..0w~2..0wT~2..0w~2..0w~2..0wZ",
[Y, Mo, D, H, Mi, S]
)
).
%% Uniform random float in [0.0, 1.0). The default backoff jitter source for
%% the retry strategy. Real callers can substitute a deterministic RNG in
%% tests so synthetic 429/5xx sequences produce reproducible sleep amounts.
random_float() ->
rand:uniform().
%% Encode an arbitrary decoded JSON term (the Erlang shape gleam_json
%% returns: integers/floats/atoms/binaries/lists/maps) back into a JSON
%% binary string. Used by the protocol-test loader to round-trip the
%% `params` blob from the Smithy AST into something the runner can
%% structurally compare. Returns {ok, Binary} | {error, nil}.
encode_dynamic_to_json(Term) ->
try iolist_to_binary(do_encode_json(Term)) of
Bin -> {ok, Bin}
catch
_:_ -> {error, nil}
end.
do_encode_json(null) -> <<"null">>;
do_encode_json(true) -> <<"true">>;
do_encode_json(false) -> <<"false">>;
do_encode_json(undefined) -> <<"null">>;
do_encode_json(N) when is_integer(N) -> integer_to_binary(N);
do_encode_json(F) when is_float(F) -> float_to_binary(F, [{decimals, 17}, compact]);
do_encode_json(B) when is_binary(B) -> [<<"\"">>, escape_json_string(B), <<"\"">>];
do_encode_json(L) when is_list(L) ->
Items = [do_encode_json(I) || I <- L],
[<<"[">>, lists:join(<<",">>, Items), <<"]">>];
do_encode_json(M) when is_map(M) ->
Pairs = maps:fold(
fun(K, V, Acc) ->
KB = if is_binary(K) -> K; is_atom(K) -> atom_to_binary(K, utf8); true -> iolist_to_binary(io_lib:format("~p", [K])) end,
Pair = [<<"\"">>, escape_json_string(KB), <<"\":">>, do_encode_json(V)],
[Pair | Acc]
end, [], M),
[<<"{">>, lists:join(<<",">>, Pairs), <<"}">>].
escape_json_string(B) when is_binary(B) ->
escape_json_string(B, <<>>).
escape_json_string(<<>>, Acc) -> Acc;
escape_json_string(<<$\\, R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, $\\, $\\>>);
escape_json_string(<<$", R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, $\\, $">>);
escape_json_string(<<$\n, R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, $\\, $n>>);
escape_json_string(<<$\r, R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, $\\, $r>>);
escape_json_string(<<$\t, R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, $\\, $t>>);
escape_json_string(<<C, R/binary>>, Acc) when C < 32 ->
escape_json_string(R, <<Acc/binary, $\\, $u, "00", (hex_digit(C div 16)):8, (hex_digit(C rem 16)):8>>);
escape_json_string(<<C, R/binary>>, Acc) -> escape_json_string(R, <<Acc/binary, C>>).
hex_digit(N) when N < 10 -> $0 + N;
hex_digit(N) -> $a + N - 10.
%% IEEE 754 special-float helpers used by awsJson serializers.
%% Erlang floats are IEEE 754; we just need to produce the three sentinel
%% values and recognise them on the way in.
%%
%% NaN is the canonical "not a number"; Inf is +∞; -Inf is −∞. The OTP
%% standard library exposes these via `float_to_*` formatting, but for
%% Gleam consumers we need plain constructors.
%% awsJson encodes IEEE 754 specials (NaN / Infinity / -Infinity) as JSON
%% strings. Unlike Rust's `f64`, the Erlang `float()` type CANNOT hold
%% these values — `0.0/0.0` raises `badarith`, `<<F/float>> = <<NaN_bits>>`
%% bombs on Badmatch, `binary_to_term` rejects the NaN tag with Badarg.
%% This is a BEAM platform constraint, documented in
%% docs/audits/m5.md (§7 "Platform limitations").
%%
%% The helpers below DEGRADE GRACEFULLY: constructors return 0.0 when
%% the underlying VM refuses to allocate the special value, so call
%% sites don't crash. SimpleScalarProperties special-float protocol-test
%% cases consequently produce body mismatches (not crashes) — a real,
%% honest signal that this codec corner is not representable on Erlang.
%% First-class support requires a wrapper type (e.g. `type Float { F(Float)
%% NaN PosInf NegInf }`), tracked in the audit's next-iteration roadmap.
float_nan() ->
try binary_to_term(<<131, 70, 16#7FF8000000000000:64>>) of
F when is_float(F) -> F
catch
_:_ -> 0.0
end.
float_infinity() ->
try binary_to_term(<<131, 70, 16#7FF0000000000000:64>>) of
F when is_float(F) -> F
catch
_:_ -> 0.0
end.
float_neg_infinity() ->
try binary_to_term(<<131, 70, 16#FFF0000000000000:64>>) of
F when is_float(F) -> F
catch
_:_ -> 0.0
end.
float_is_nan(F) when is_float(F) ->
%% IEEE 754: NaN is the only value not equal to itself. On Erlang
%% we never actually hold NaN, so this is reliably false; kept for
%% API symmetry with platforms that do.
F =/= F;
float_is_nan(_) -> false.
float_is_infinite(F) when is_float(F) ->
%% Comparison against the (degraded) sentinels. Reliably false on
%% Erlang.
%% `+0.0` (rather than the literal `0.0`) silences an OTP 27 warning:
%% as of 27, `0.0` only matches +0.0, not -0.0. We're testing
%% `=/= 0.0` (positive OR negative), so explicitly say +0.0 to
%% acknowledge the asymmetry — the comparison is still correct
%% because IEEE 754 says +0.0 =:= -0.0 evaluates equal anyway.
F =:= float_infinity() andalso F =/= +0.0;
float_is_infinite(_) -> false.
%% Parse a JSON string and re-encode it canonically — object keys sorted,
%% all whitespace stripped. The Smithy protocol-test spec compares JSON
%% bodies structurally; this is how we collapse the byte-level
%% representation to a stable shape before equality testing.
%% Returns {ok, CanonicalBinary} | {error, nil}.
json_canonicalize(Bin) when is_binary(Bin) ->
case decode_json_string(Bin) of
{ok, Term} ->
try iolist_to_binary(do_encode_canonical(Term)) of
Out -> {ok, Out}
catch
_:_ -> {error, nil}
end;
error -> {error, nil}
end.
%% Use Erlang OTP 27+ json module for parsing.
decode_json_string(Bin) ->
try json:decode(Bin) of
T -> {ok, T}
catch
_:_ -> error
end.
%% Canonical encoder: sorts object keys, no whitespace. Identical
%% semantics to do_encode_json above except that maps emit pairs in
%% key-sorted order.
do_encode_canonical(null) -> <<"null">>;
do_encode_canonical(true) -> <<"true">>;
do_encode_canonical(false) -> <<"false">>;
do_encode_canonical(undefined) -> <<"null">>;
do_encode_canonical(N) when is_integer(N) -> integer_to_binary(N);
do_encode_canonical(F) when is_float(F) -> float_to_binary(F, [{decimals, 17}, compact]);
do_encode_canonical(B) when is_binary(B) -> [<<"\"">>, escape_json_string(B), <<"\"">>];
do_encode_canonical(L) when is_list(L) ->
Items = [do_encode_canonical(I) || I <- L],
[<<"[">>, lists:join(<<",">>, Items), <<"]">>];
do_encode_canonical(M) when is_map(M) ->
Sorted = lists:sort(maps:to_list(M)),
Pairs = [
begin
KB = if is_binary(K) -> K;
is_atom(K) -> atom_to_binary(K, utf8);
true -> iolist_to_binary(io_lib:format("~p", [K]))
end,
[<<"\"">>, escape_json_string(KB), <<"\":">>, do_encode_canonical(V)]
end
|| {K, V} <- Sorted
],
[<<"{">>, lists:join(<<",">>, Pairs), <<"}">>].
%% Parse an AWS-style ISO 8601 UTC timestamp ("2023-11-30T15:30:00Z" or with
%% fractional seconds like "2023-11-30T15:30:00.000Z") into unix seconds.
%% Returns {ok, Seconds} | {error, nil}. Used by IMDS/STS/SSO providers to
%% turn the Expiration field of a credentials response into expires_at.
parse_iso8601(Bin) when is_binary(Bin) ->
%% Accept either `Z` or `±HH:MM` zone suffix, with optional fractional
%% seconds. The offset is folded into the returned epoch seconds.
%%
%% `calendar:datetime_to_gregorian_seconds/1` raises `function_clause`
%% on out-of-range fields (e.g. month 13, hour 25) that pass the regex
%% shape check; this parses server-controlled credential `Expiration`
%% and response timestamps, so catch and report it as `{error, nil}`
%% rather than crash the consumer's process.
try
Re = "^([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):([0-9]{2})(?:\\.[0-9]+)?(Z|([+-])([0-9]{2}):([0-9]{2}))$",
case re:run(Bin, Re, [{capture, all_but_first, binary}]) of
{match, [Y, Mo, D, H, Mi, S | Zone]} ->
DateTime = {
{binary_to_integer(Y), binary_to_integer(Mo), binary_to_integer(D)},
{binary_to_integer(H), binary_to_integer(Mi), binary_to_integer(S)}
},
Epoch = 62167219200,
Base = calendar:datetime_to_gregorian_seconds(DateTime) - Epoch,
OffsetSec = case Zone of
[<<"Z">>] -> 0;
[<<"Z">>, _, _, _] -> 0;
[_, Sign, OffH, OffM] ->
Sec = binary_to_integer(OffH) * 3600 + binary_to_integer(OffM) * 60,
case Sign of <<"-">> -> Sec; <<"+">> -> -Sec end
end,
{ok, Base + OffsetSec};
_ ->
{error, nil}
end
catch
_:_ -> {error, nil}
end.
%% Run an external command and capture its stdout + exit status.
%% Args is a list of binary arguments (not interpolated through a shell, so
%% there's no shell-injection surface). Returns {ok, {ExitCode, Stdout}} on
%% success, {error, nil} if the command could not be launched.
run_process(Command, Args) when is_binary(Command), is_list(Args) ->
CmdStr = binary_to_list(Command),
ArgStrs = [binary_to_list(A) || A <- Args],
try
Port = open_port(
{spawn_executable, os:find_executable(CmdStr)},
[
{args, ArgStrs},
binary,
exit_status,
stderr_to_stdout,
use_stdio,
stream
]
),
collect_port(Port, <<>>)
catch
_:_ -> {error, nil}
end.
collect_port(Port, Acc) ->
receive
{Port, {data, Data}} -> collect_port(Port, <<Acc/binary, Data/binary>>);
{Port, {exit_status, Status}} -> {ok, {Status, Acc}}
after 30000 ->
try port_close(Port) catch _:_ -> ok end,
{error, nil}
end.
%% Parse an XML document into a simple nested tuple tree usable from
%% Gleam without pulling in the xmerl record headers. Returns
%% {ok, {element, Name, Attrs, Children}}
%% where:
%% Name :: binary()
%% Attrs :: [{binary(), binary()}]
%% Children:: [{element, _, _, _} | {text, binary()}]
%%
%% Whitespace-only text nodes between elements are dropped, so the
%% generated decoders can address members by element name without
%% accounting for layout whitespace. Returns {error, nil} on parse
%% failure. Backed by xmerl_scan; xmerl is in the OTP standard library
%% and pulls in no extra dependencies.
xml_parse(Bin) when is_binary(Bin) ->
try
Str = binary_to_list(Bin),
{Doc, _} = xmerl_scan:string(Str, [{quiet, true}]),
{ok, convert_xmerl(Doc)}
catch
_:_ -> {error, nil}
end.
convert_xmerl(#xmlElement{name = Name, attributes = Attrs, content = Content}) ->
AttrTuples = [convert_attr(A) || A <- Attrs],
Children = lists:filtermap(
fun
(#xmlElement{} = X) -> {true, convert_xmerl(X)};
(#xmlText{value = V}) ->
Bin = unicode:characters_to_binary(V),
case is_blank(Bin) of
true -> false;
false -> {true, {text, Bin}}
end;
(_) -> false
end,
Content
),
{element, name_to_binary(Name), AttrTuples, Children}.
convert_attr(#xmlAttribute{name = Name, value = Value}) ->
{name_to_binary(Name), unicode:characters_to_binary(Value)}.
name_to_binary(N) when is_atom(N) -> atom_to_binary(N, utf8);
name_to_binary(N) when is_list(N) -> list_to_binary(N);
name_to_binary(N) when is_binary(N) -> N.
is_blank(<<>>) -> true;
is_blank(<<C, R/binary>>) when C =:= $\s; C =:= $\t; C =:= $\n; C =:= $\r ->
is_blank(R);
is_blank(_) -> false.