%%%% Lossless, comment-preserving tokenizer for the LFE source formatter.
%%%% Unlike lfe_scan, this keeps every comment and every whitespace character so
%%%% the formatter can preserve them. See docs/design/022-arc1-lexer.md.
-module(lfmt_fezzik_lexer).
-export([tokens/1, to_iolist/1, kind/1, text/1, line/1, col/1]).
-export_type([token/0, kind/0]).
-record(tok, {kind :: kind(),
text :: string(), %% verbatim codepoints this token spans
line :: pos_integer(),
col :: pos_integer()}).
-opaque token() :: #tok{}.
-type kind() :: lparen | rparen | lbracket | rbracket
| tuple_open | map_open | binary_open | eval_open
| quote | quasiquote | unquote | unquote_splicing | fun_ref
| symbol | qsymbol | number | char
| string | bstring | tqstring | tqbstring
| line_comment | block_comment
| whitespace | newline
| dot.
%%====================================================================
%% Exported API
%%====================================================================
-spec tokens(binary() | string()) -> {ok, [token()]} | {error, term()}.
tokens(Bin) when is_binary(Bin) ->
case unicode:characters_to_list(Bin, utf8) of
Cs when is_list(Cs) ->
scan(Cs, 1, 1, []);
{error, _, _} ->
{error, {invalid_encoding, Bin}};
{incomplete, _, _} ->
{error, {invalid_encoding, Bin}}
end;
tokens(Cs) when is_list(Cs) ->
scan(Cs, 1, 1, []).
-spec to_iolist([token()]) -> iolist().
to_iolist(Tokens) ->
[T#tok.text || T <- Tokens].
-spec kind(token()) -> kind().
kind(#tok{kind = K}) -> K.
-spec text(token()) -> string().
text(#tok{text = T}) -> T.
-spec line(token()) -> pos_integer().
line(#tok{line = L}) -> L.
-spec col(token()) -> pos_integer().
col(#tok{col = C}) -> C.
%%====================================================================
%% Internal: main scan loop — §5 priority order
%%====================================================================
%% Done.
scan([], _L, _C, Acc) ->
{ok, lists:reverse(Acc)};
%% §5.1 Newline: one token per \n.
scan([$\n | Rest], Line, Col, Acc) ->
Tok = #tok{kind = newline, text = "\n", line = Line, col = Col},
scan(Rest, Line + 1, 1, [Tok | Acc]);
%% §5.1 Whitespace run: maximal run of non-newline whitespace.
scan([C | _] = Cs, Line, Col, Acc) when C >= 0, C =< $\s, C =/= $\n ->
{Text, Rest, Len} = collect_ws(Cs, [], 0),
Tok = #tok{kind = whitespace, text = Text, line = Line, col = Col},
scan(Rest, Line, Col + Len, [Tok | Acc]);
%% §5.2 Line comment: ; through end of line (excluding \n).
scan([$; | _] = Cs, Line, Col, Acc) ->
{Text, Rest, Len} = collect_line_comment(Cs, [], 0),
Tok = #tok{kind = line_comment, text = Text, line = Line, col = Col},
scan(Rest, Line, Col + Len, [Tok | Acc]);
%% §5.3 / §5.2 Block comment: #|...|#
scan([$#, $| | Rest], Line, Col, Acc) ->
case scan_block_comment(Rest, [$|, $#], Line, Col + 2, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = block_comment,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.3 fun_ref: #' (emit the two chars; name/arity falls out as symbol).
scan([$#, $' | Rest], Line, Col, Acc) ->
Tok = #tok{kind = fun_ref, text = "#'", line = Line, col = Col},
scan(Rest, Line, Col + 2, [Tok | Acc]);
%% §5.5 Char literal: #\ followed by hex form or a single codepoint.
scan([$#, $\\ | Rest], Line, Col, Acc) ->
case scan_char_body(Rest) of
{ok, Body, Rest2, BodyLen} ->
Tok = #tok{kind = char,
text = [$#, $\\ | Body],
line = Line, col = Col},
scan(Rest2, Line, Col + 2 + BodyLen, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.4 Triple-quoted binary string: #"""..."""
scan([$#, $", $", $" | Rest], Line, Col, Acc) ->
case scan_tq_string(Rest, [$", $", $", $#], Line, Col + 4, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = tqbstring,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.4 Binary string: #"..."
scan([$#, $" | Rest], Line, Col, Acc) ->
case scan_sq_string(Rest, [$", $#], Line, Col + 2, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = bstring,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.3 Tuple open: #(
scan([$#, $( | Rest], Line, Col, Acc) ->
Tok = #tok{kind = tuple_open, text = "#(", line = Line, col = Col},
scan(Rest, Line, Col + 2, [Tok | Acc]);
%% §5.3 Map open: #m( or #M(
scan([$#, M, $( | Rest], Line, Col, Acc) when M =:= $m; M =:= $M ->
Tok = #tok{kind = map_open, text = [$#, M, $(], line = Line, col = Col},
scan(Rest, Line, Col + 3, [Tok | Acc]);
%% §5.3 Binary open: #b( or #B(
scan([$#, B, $( | Rest], Line, Col, Acc) when B =:= $b; B =:= $B ->
Tok = #tok{kind = binary_open, text = [$#, B, $(], line = Line, col = Col},
scan(Rest, Line, Col + 3, [Tok | Acc]);
%% §5.3 Eval open: #.(
scan([$#, $., $( | Rest], Line, Col, Acc) ->
Tok = #tok{kind = eval_open, text = "#.(", line = Line, col = Col},
scan(Rest, Line, Col + 3, [Tok | Acc]);
%% §5.3 # fallthrough: radix number or other run starting with #.
scan([$# | _] = Cs, Line, Col, Acc) ->
{Text, Rest, Len} = collect_run(Cs, [], 0),
Tok = #tok{kind = classify_run(Text), text = Text, line = Line, col = Col},
scan(Rest, Line, Col + Len, [Tok | Acc]);
%% §5.4 Triple-quoted string: """...""" — check before single-quote.
scan([$", $", $" | Rest], Line, Col, Acc) ->
case scan_tq_string(Rest, [$", $", $"], Line, Col + 3, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = tqstring,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.4 Regular string: "..."
scan([$" | Rest], Line, Col, Acc) ->
case scan_sq_string(Rest, [$"], Line, Col + 1, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = string,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.7 Bar-quoted symbol: |...|
scan([$| | Rest], Line, Col, Acc) ->
case scan_qsymbol(Rest, [$|], Line, Col + 1, Line) of
{ok, Body, Rest2, NewLine, NewCol} ->
Tok = #tok{kind = qsymbol,
text = lists:reverse(Body),
line = Line, col = Col},
scan(Rest2, NewLine, NewCol, [Tok | Acc]);
{error, _} = Err ->
Err
end;
%% §5.8 Unquote-splicing: ,@ — must precede unquote.
scan([$,, $@ | Rest], Line, Col, Acc) ->
Tok = #tok{kind = unquote_splicing, text = ",@", line = Line, col = Col},
scan(Rest, Line, Col + 2, [Tok | Acc]);
%% §5.8 Unquote: ,
scan([$, | Rest], Line, Col, Acc) ->
Tok = #tok{kind = unquote, text = ",", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
%% §5.8 Quote: '
scan([$' | Rest], Line, Col, Acc) ->
Tok = #tok{kind = quote, text = "'", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
%% §5.8 Quasiquote: `
scan([$` | Rest], Line, Col, Acc) ->
Tok = #tok{kind = quasiquote, text = "`", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
%% §5.8 Parens and brackets.
scan([$( | Rest], Line, Col, Acc) ->
Tok = #tok{kind = lparen, text = "(", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
scan([$) | Rest], Line, Col, Acc) ->
Tok = #tok{kind = rparen, text = ")", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
scan([$[ | Rest], Line, Col, Acc) ->
Tok = #tok{kind = lbracket, text = "[", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
scan([$] | Rest], Line, Col, Acc) ->
Tok = #tok{kind = rbracket, text = "]", line = Line, col = Col},
scan(Rest, Line, Col + 1, [Tok | Acc]);
%% §5.6 Symbol / number run: any remaining symbol-constituent char.
scan([C | _] = Cs, Line, Col, Acc) ->
case symbol_char(C) of
true ->
{Text, Rest, Len} = collect_run(Cs, [], 0),
Tok = #tok{kind = classify_run(Text), text = Text,
line = Line, col = Col},
scan(Rest, Line, Col + Len, [Tok | Acc]);
false ->
{error, {unexpected_char, C, Line, Col}}
end.
%%====================================================================
%% Internal: trivia collectors
%%====================================================================
%% collect_ws: maximal run of non-newline whitespace chars.
collect_ws([C | Rest], Acc, N) when C >= 0, C =< $\s, C =/= $\n ->
collect_ws(Rest, [C | Acc], N + 1);
collect_ws(Rest, Acc, N) ->
{lists:reverse(Acc), Rest, N}.
%% collect_line_comment: from ; through end of line (excludes \n).
collect_line_comment([$\n | _] = Rest, Acc, N) ->
{lists:reverse(Acc), Rest, N};
collect_line_comment([], Acc, N) ->
{lists:reverse(Acc), [], N};
collect_line_comment([C | Rest], Acc, N) ->
collect_line_comment(Rest, [C | Acc], N + 1).
%%====================================================================
%% Internal: delimited-token scanners (accumulate reversed; caller reverses)
%%====================================================================
%% scan_block_comment: Acc starts as reversed "#|"; text ends with "|#".
scan_block_comment([$|, $# | Rest], Acc, Line, Col, _SLine) ->
{ok, [$#, $| | Acc], Rest, Line, Col + 2};
scan_block_comment([$\n | Rest], Acc, Line, _Col, SLine) ->
scan_block_comment(Rest, [$\n | Acc], Line + 1, 1, SLine);
scan_block_comment([C | Rest], Acc, Line, Col, SLine) ->
scan_block_comment(Rest, [C | Acc], Line, Col + 1, SLine);
scan_block_comment([], _Acc, _Line, _Col, SLine) ->
{error, {unterminated_block_comment, SLine}}.
%% scan_sq_string: Acc starts as reversed opening delimiter; keeps escapes verbatim.
scan_sq_string([$\\, C | Rest], Acc, Line, Col, SLine) ->
scan_sq_string(Rest, [C, $\\ | Acc], Line, Col + 2, SLine);
scan_sq_string([$" | Rest], Acc, Line, Col, _SLine) ->
{ok, [$" | Acc], Rest, Line, Col + 1};
scan_sq_string([$\n | Rest], Acc, Line, _Col, SLine) ->
scan_sq_string(Rest, [$\n | Acc], Line + 1, 1, SLine);
scan_sq_string([C | Rest], Acc, Line, Col, SLine) ->
scan_sq_string(Rest, [C | Acc], Line, Col + 1, SLine);
scan_sq_string([], _Acc, _Line, _Col, SLine) ->
{error, {unterminated_string, SLine}}.
%% scan_tq_string: opening-line phase — only spaces valid before the required \n.
%% Mirrors lfe_scan's scan_tq_string_1. Spaces are kept verbatim in Acc.
scan_tq_string([$\s | Rest], Acc, Line, Col, SLine) ->
scan_tq_string(Rest, [$\s | Acc], Line, Col + 1, SLine);
scan_tq_string([$\n | Rest], Acc, Line, _Col, SLine) ->
scan_tq_string_body(Rest, [$\n | Acc], true, Line + 1, 1, SLine);
scan_tq_string([], _Acc, _Line, _Col, SLine) ->
{error, {unterminated_string, SLine}};
scan_tq_string(_Cs, _Acc, _Line, _Col, SLine) ->
{error, {bad_tq_string, SLine}}.
%% scan_tq_string_body: content phase.
%% AllSpaces tracks whether every char on the current line so far is a space;
%% """ is the closer iff AllSpaces is true (mirrors lfe_scan's blank_line/1 check).
%% 4+ consecutive quotes on a blank line: the first """ closes; the rest are
%% returned as the next input — matching lfe_scan's behaviour.
scan_tq_string_body([$", $", $" | Rest], Acc, AllSpaces, Line, Col, SLine) ->
case AllSpaces of
true ->
{ok, [$", $", $" | Acc], Rest, Line, Col + 3};
false ->
scan_tq_string_body(Rest, [$", $", $" | Acc],
false, Line, Col + 3, SLine)
end;
scan_tq_string_body([$\n | Rest], Acc, _AllSpaces, Line, _Col, SLine) ->
scan_tq_string_body(Rest, [$\n | Acc], true, Line + 1, 1, SLine);
scan_tq_string_body([$\s | Rest], Acc, AllSpaces, Line, Col, SLine) ->
scan_tq_string_body(Rest, [$\s | Acc], AllSpaces, Line, Col + 1, SLine);
scan_tq_string_body([C | Rest], Acc, _AllSpaces, Line, Col, SLine) ->
scan_tq_string_body(Rest, [C | Acc], false, Line, Col + 1, SLine);
scan_tq_string_body([], _Acc, _AllSpaces, _Line, _Col, SLine) ->
{error, {unterminated_string, SLine}}.
%% scan_qsymbol: Acc starts as reversed "|"; \| and \\ escape sequences kept verbatim.
scan_qsymbol([$\\, C | Rest], Acc, Line, Col, SLine) ->
scan_qsymbol(Rest, [C, $\\ | Acc], Line, Col + 2, SLine);
scan_qsymbol([$| | Rest], Acc, Line, Col, _SLine) ->
{ok, [$| | Acc], Rest, Line, Col + 1};
scan_qsymbol([$\n | Rest], Acc, Line, _Col, SLine) ->
scan_qsymbol(Rest, [$\n | Acc], Line + 1, 1, SLine);
scan_qsymbol([C | Rest], Acc, Line, Col, SLine) ->
scan_qsymbol(Rest, [C | Acc], Line, Col + 1, SLine);
scan_qsymbol([], _Acc, _Line, _Col, SLine) ->
{error, {unterminated_qsymbol, SLine}}.
%%====================================================================
%% Internal: char literal body (after #\)
%%====================================================================
%% scan_char_body: returns {ok, Body, Rest, BodyLen}.
scan_char_body([$x | Rest]) ->
case collect_hex_for_char(Rest, [], 0) of
{[_ | _] = Hex, [$; | Rest2], HexLen} ->
Body = [$x | Hex] ++ [$;],
{ok, Body, Rest2, 1 + HexLen + 1};
_ ->
{ok, [$x], Rest, 1}
end;
scan_char_body([C | Rest]) ->
{ok, [C], Rest, 1};
scan_char_body([]) ->
{error, unterminated_char}.
collect_hex_for_char([C | Rest], Acc, N) when
(C >= $0 andalso C =< $9) orelse
(C >= $a andalso C =< $f) orelse
(C >= $A andalso C =< $F) ->
collect_hex_for_char(Rest, [C | Acc], N + 1);
collect_hex_for_char(Rest, Acc, N) ->
{lists:reverse(Acc), Rest, N}.
%%====================================================================
%% Internal: run collection and classification — §5.6
%%====================================================================
collect_run([C | Rest], Acc, N) ->
case symbol_char(C) of
true -> collect_run(Rest, [C | Acc], N + 1);
false -> {lists:reverse(Acc), [C | Rest], N}
end;
collect_run([], Acc, N) ->
{lists:reverse(Acc), [], N}.
%% classify_run: decide number, dot, or symbol for a maximal run of symbol chars.
classify_run(".") -> dot;
classify_run([$# | _] = Text) ->
case is_radix_number(Text) of
true -> number;
false -> symbol
end;
classify_run(Text) ->
try list_to_integer(Text) of
_ -> number
catch error:badarg ->
try list_to_float(Text) of
_ -> number
catch error:badarg -> symbol
end
end.
%%====================================================================
%% Internal: number classification predicates
%%====================================================================
is_radix_number([$#, C | Rest]) when C =:= $b; C =:= $B ->
valid_based_digits(Rest, 2);
is_radix_number([$#, C | Rest]) when C =:= $o; C =:= $O ->
valid_based_digits(Rest, 8);
is_radix_number([$#, C | Rest]) when C =:= $d; C =:= $D ->
valid_based_digits(Rest, 10);
is_radix_number([$#, C | Rest]) when C =:= $x; C =:= $X ->
valid_based_digits(Rest, 16);
is_radix_number([$#, $* | Rest]) ->
valid_based_digits(Rest, 2);
is_radix_number([$# | Rest]) ->
case split_at_r(Rest, []) of
{[_ | _] = Digits, More} ->
try list_to_integer(Digits) of
Base when Base >= 2, Base =< 36 ->
valid_based_digits(More, Base);
_ ->
false
catch error:badarg -> false
end;
_ ->
false
end;
is_radix_number(_) ->
false.
%% valid_based_digits: optional sign then at least one valid digit.
valid_based_digits([$+ | Rest], Base) -> valid_digits(Rest, Base);
valid_based_digits([$- | Rest], Base) -> valid_digits(Rest, Base);
valid_based_digits(Rest, Base) -> valid_digits(Rest, Base).
valid_digits([_ | _] = Cs, Base) ->
lists:all(fun(C) -> is_base_digit(C, Base) end, Cs);
valid_digits([], _Base) ->
false.
is_base_digit(C, Base) when Base >= 2, Base =< 10 ->
C >= $0 andalso C =< $0 + Base - 1;
is_base_digit(C, Base) when Base > 10, Base =< 36 ->
(C >= $0 andalso C =< $9) orelse
(C >= $a andalso C =< $a + Base - 11) orelse
(C >= $A andalso C =< $A + Base - 11).
%% split_at_r: split a list at the first 'r' or 'R' that follows digits.
split_at_r([$r | Rest], Acc) when Acc =/= [] -> {lists:reverse(Acc), Rest};
split_at_r([$R | Rest], Acc) when Acc =/= [] -> {lists:reverse(Acc), Rest};
split_at_r([C | Rest], Acc) when C >= $0, C =< $9 -> split_at_r(Rest, [C | Acc]);
split_at_r(_, _) -> {[], []}.
%%====================================================================
%% Internal: symbol_char — mirrors lfe_scan's definition
%%====================================================================
symbol_char($() -> false;
symbol_char($)) -> false;
symbol_char($[) -> false;
symbol_char($]) -> false;
symbol_char(${) -> false;
symbol_char($}) -> false;
symbol_char($") -> false;
symbol_char($;) -> false;
symbol_char(C) -> (C > $\s andalso C =< $~) orelse (C > 16#A0).