Skip to main content

src/lfmt_fezzik_cst.erl

%%%% CST parser: flat lossless token stream -> tree with comment/trivia attachment.
%%%% See docs/design/022-lfe-format/arc2-cst/cc-prompt.md.
%%%%
%%%% Note: Erlang reserves node/0 and document/0 as built-in or common names,
%%%% so the exported opaque types are cst_node/0 and cst_document/0.
-module(lfmt_fezzik_cst).

-export([parse/1,
         significant_tokens/1, comments/1,
         type/1, open/1, close/1, prefix/1, children/1,
         leading/1, trailing/1, dangling/1,
         nl_before/1, multiline/1, dot_token/1,
         document_children/1, document_dangling/1]).

-export_type([cst_node/0, cst_document/0, node_type/0, trivia/0]).

-type token() :: lfmt_fezzik_lexer:token().

-type trivia() :: {comment, token()} | blank.

-type node_type() :: list | tuple | map | binary | eval
                   | symbol | number | string | char
                   | prefixed.

-record(node, {
    type      :: node_type(),
    open      :: token() | undefined,
    close     :: token() | undefined,
    prefix    :: token() | undefined,
    children  :: [#node{}],
    leading   :: [trivia()],
    trailing  :: [trivia()],
    dangling  :: [trivia()],
    nl_before = false :: boolean(),
    dot_token = undefined :: token() | undefined
}).

-record(document, {
    children :: [#node{}],
    dangling :: [trivia()]
}).

-opaque cst_node()     :: #node{}.
-opaque cst_document() :: #document{}.

%%====================================================================
%% Exported API
%%====================================================================

-spec parse([token()]) -> {ok, cst_document()} | {error, term()}.
parse(Tokens) ->
    case parse_seq_loop(Tokens, eof, [], false, []) of
        {ok, Nodes, none, Dangling, []} ->
            {ok, #document{children = Nodes, dangling = Dangling}};
        {ok, _Nodes, {_, _}, _Dangling, _} ->
            {error, {unexpected_dot, 0}};
        {ok, _Nodes, _DotInfo, _Dangling, [Tok | _]} ->
            {error, {unbalanced, eof, lfmt_fezzik_lexer:line(Tok)}};
        {error, _} = Err ->
            Err
    end.

-spec significant_tokens(cst_document()) -> [token()].
significant_tokens(#document{children = Nodes}) ->
    lists:flatmap(fun node_significant_tokens/1, Nodes).

-spec comments(cst_document()) -> [token()].
comments(#document{children = Nodes, dangling = Dangling}) ->
    lists:flatmap(fun node_comments/1, Nodes) ++ trivia_comments(Dangling).

-spec type(cst_node()) -> node_type().
type(#node{type = T}) -> T.

-spec open(cst_node()) -> token() | undefined.
open(#node{open = O}) -> O.

-spec close(cst_node()) -> token() | undefined.
close(#node{close = C}) -> C.

-spec prefix(cst_node()) -> token() | undefined.
prefix(#node{prefix = P}) -> P.

-spec children(cst_node()) -> [cst_node()].
children(#node{children = Ch}) -> Ch.

-spec leading(cst_node()) -> [trivia()].
leading(#node{leading = L}) -> L.

-spec trailing(cst_node()) -> [trivia()].
trailing(#node{trailing = T}) -> T.

-spec dangling(cst_node()) -> [trivia()].
dangling(#node{dangling = D}) -> D.

%% nl_before/1: true if a newline preceded this node among its siblings in the
%% source (i.e. the author started a new line here). Single \n is sufficient.
-spec nl_before(cst_node()) -> boolean().
nl_before(#node{nl_before = NL}) -> NL.

%% dot_token/1: the cons-dot token for an improper list, or undefined.
%% When present, the last child of the node is the cons-tail.
-spec dot_token(cst_node()) -> token() | undefined.
dot_token(#node{dot_token = D}) -> D.

%% multiline/1: true if any direct child of the node/document has nl_before=true.
-spec multiline(cst_node() | cst_document()) -> boolean().
multiline(#document{children = Children}) ->
    lists:any(fun(N) -> nl_before(N) end, Children);
multiline(#node{children = Children}) ->
    lists:any(fun(N) -> nl_before(N) end, Children).

-spec document_children(cst_document()) -> [cst_node()].
document_children(#document{children = Ch}) -> Ch.

-spec document_dangling(cst_document()) -> [trivia()].
document_dangling(#document{dangling = D}) -> D.

%%====================================================================
%% Internal: main parse loop
%%====================================================================

%% parse_seq_loop: parse tokens until matching CloserKind (rparen|rbracket|eof)
%% or EOF. Returns {ok, Nodes, Dangling, Rest} where Rest starts with the
%% (unconsumed) closer token, or [] at EOF. Pending holds leading trivia
%% accumulated for the next node, in source order. NlBefore tracks whether at
%% least one newline has been seen since the last node (or start of container),
%% and is stamped onto the next node then reset to false.
%%
%% Pending uses ++ for append: it is bounded by file-level trivia density
%% (never accumulates proportionally to input size) so O(n) per append is fine.
parse_seq_loop([], _CloserKind, Pending, _NlBefore, Nodes) ->
    {ok, lists:reverse(Nodes), none, Pending, []};
parse_seq_loop([Tok | Rest], CloserKind, Pending, NlBefore, Nodes) ->
    Kind = lfmt_fezzik_lexer:kind(Tok),
    case Kind of
        whitespace ->
            parse_seq_loop(Rest, CloserKind, Pending, NlBefore, Nodes);
        newline ->
            {Pending2, Rest2} = consume_newlines_inner(Rest, Pending, 1),
            parse_seq_loop(Rest2, CloserKind, Pending2, true, Nodes);
        line_comment ->
            parse_seq_loop(Rest, CloserKind, Pending ++ [{comment, Tok}], NlBefore, Nodes);
        block_comment ->
            parse_seq_loop(Rest, CloserKind, Pending ++ [{comment, Tok}], NlBefore, Nodes);
        rparen ->
            case CloserKind of
                rparen -> {ok, lists:reverse(Nodes), none, Pending, [Tok | Rest]};
                _      -> {error, {unbalanced, CloserKind, lfmt_fezzik_lexer:line(Tok)}}
            end;
        rbracket ->
            case CloserKind of
                rbracket -> {ok, lists:reverse(Nodes), none, Pending, [Tok | Rest]};
                _        -> {error, {unbalanced, CloserKind, lfmt_fezzik_lexer:line(Tok)}}
            end;
        dot when CloserKind =/= eof ->
            %% Cons-dot inside a container: the next node is the improper tail.
            case parse_one_node(Rest, [], false) of
                {ok, TailNode, Rest2} ->
                    {ok, lists:reverse([TailNode | Nodes]), {Tok, TailNode}, Pending, Rest2};
                {error, _} = Err -> Err
            end;
        dot ->
            {error, {unexpected_dot, lfmt_fezzik_lexer:line(Tok)}};
        K when K =:= lparen; K =:= lbracket; K =:= tuple_open;
               K =:= map_open; K =:= binary_open; K =:= eval_open ->
            case parse_container(Tok, K, Rest, Pending, NlBefore) of
                {ok, CNode, Rest2} ->
                    {CNode2, Rest3} = try_attach_trailing(CNode, Rest2),
                    parse_seq_loop(Rest3, CloserKind, [], false, [CNode2 | Nodes]);
                {error, _} = Err -> Err
            end;
        K when K =:= quote; K =:= quasiquote; K =:= unquote;
               K =:= unquote_splicing; K =:= fun_ref ->
            case parse_one_node(Rest, [], false) of
                {ok, Inner, Rest2} ->
                    PNode = #node{type = prefixed, prefix = Tok,
                                  open = undefined, close = undefined,
                                  children = [Inner], leading = Pending,
                                  trailing = [], dangling = [],
                                  nl_before = NlBefore},
                    {PNode2, Rest3} = try_attach_trailing(PNode, Rest2),
                    parse_seq_loop(Rest3, CloserKind, [], false, [PNode2 | Nodes]);
                {error, _} = Err -> Err
            end;
        _ ->
            %% Leaf: symbol, qsymbol, number, char, string, bstring, tqstring, tqbstring
            LNode = #node{type = leaf_type(Kind), open = Tok,
                          close = undefined, prefix = undefined,
                          children = [], leading = Pending,
                          trailing = [], dangling = [],
                          nl_before = NlBefore},
            {LNode2, Rest2} = try_attach_trailing(LNode, Rest),
            parse_seq_loop(Rest2, CloserKind, [], false, [LNode2 | Nodes])
    end.

%%====================================================================
%% Internal: parse exactly one node (for prefix inner-form parsing)
%%====================================================================

%% parse_one_node: skip trivia (accumulating into Pending), then parse one node.
%% Trivia between a prefix and its target attaches to the inner node's leading.
%% NlBefore tracks whether a newline occurred between the prefix token and the
%% inner form (e.g. '\n(foo) has NlBefore=true on the inner list node).
parse_one_node([], _Pending, _NlBefore) ->
    {error, {missing_inner_node, 0}};
parse_one_node([Tok | Rest], Pending, NlBefore) ->
    Kind = lfmt_fezzik_lexer:kind(Tok),
    case Kind of
        whitespace ->
            parse_one_node(Rest, Pending, NlBefore);
        newline ->
            {Pending2, Rest2} = consume_newlines_inner(Rest, Pending, 1),
            parse_one_node(Rest2, Pending2, true);
        K when K =:= line_comment; K =:= block_comment ->
            parse_one_node(Rest, Pending ++ [{comment, Tok}], NlBefore);
        K when K =:= lparen; K =:= lbracket; K =:= tuple_open;
               K =:= map_open; K =:= binary_open; K =:= eval_open ->
            case parse_container(Tok, K, Rest, Pending, NlBefore) of
                {ok, CNode, Rest2} ->
                    {CNode2, Rest3} = try_attach_trailing(CNode, Rest2),
                    {ok, CNode2, Rest3};
                {error, _} = Err -> Err
            end;
        K when K =:= quote; K =:= quasiquote; K =:= unquote;
               K =:= unquote_splicing; K =:= fun_ref ->
            case parse_one_node(Rest, [], false) of
                {ok, Inner, Rest2} ->
                    PNode = #node{type = prefixed, prefix = Tok,
                                  open = undefined, close = undefined,
                                  children = [Inner], leading = Pending,
                                  trailing = [], dangling = [],
                                  nl_before = NlBefore},
                    {PNode2, Rest3} = try_attach_trailing(PNode, Rest2),
                    {ok, PNode2, Rest3};
                {error, _} = Err -> Err
            end;
        K when K =:= rparen; K =:= rbracket ->
            {error, {missing_inner_node, lfmt_fezzik_lexer:line(Tok)}};
        _ ->
            LNode = #node{type = leaf_type(Kind), open = Tok,
                          close = undefined, prefix = undefined,
                          children = [], leading = Pending,
                          trailing = [], dangling = [],
                          nl_before = NlBefore},
            {LNode2, Rest2} = try_attach_trailing(LNode, Rest),
            {ok, LNode2, Rest2}
    end.

%%====================================================================
%% Internal: container parsing
%%====================================================================

parse_container(OpenerTok, OpenerKind, Rest, Leading, NlBefore) ->
    {ContainerType, CloserKind} = container_type(OpenerKind),
    case parse_seq_loop(Rest, CloserKind, [], false, []) of
        {ok, Children, DotInfo, Dangling, [CloserTok | Rest2]} ->
            DotTok = case DotInfo of
                none          -> undefined;
                {DotTok1, _}  -> DotTok1
            end,
            CNode = #node{type = ContainerType,
                          open = OpenerTok, close = CloserTok,
                          prefix = undefined, children = Children,
                          dangling = Dangling, leading = Leading,
                          trailing = [], nl_before = NlBefore,
                          dot_token = DotTok},
            {ok, CNode, Rest2};
        {ok, _Children, _DotInfo, _Dangling, []} ->
            {error, {unbalanced, CloserKind, lfmt_fezzik_lexer:line(OpenerTok)}};
        {error, _} = Err ->
            Err
    end.

container_type(lparen)      -> {list,   rparen};
container_type(lbracket)    -> {list,   rbracket};
container_type(tuple_open)  -> {tuple,  rparen};
container_type(map_open)    -> {map,    rparen};
container_type(binary_open) -> {binary, rparen};
container_type(eval_open)   -> {eval,   rparen}.

%%====================================================================
%% Internal: trailing-comment attachment (ยง4)
%%====================================================================

%% try_attach_trailing: skip whitespace (not newlines); if next is a comment,
%% attach it as the node's single trailing trivia and consume it.
try_attach_trailing(ANode, [Tok | Rest]) ->
    case lfmt_fezzik_lexer:kind(Tok) of
        whitespace ->
            try_attach_trailing(ANode, Rest);
        K when K =:= line_comment; K =:= block_comment ->
            {ANode#node{trailing = [{comment, Tok}]}, Rest};
        _ ->
            {ANode, [Tok | Rest]}
    end;
try_attach_trailing(ANode, []) ->
    {ANode, []}.

%%====================================================================
%% Internal: blank-line detection
%%====================================================================

%% consume_newlines_inner: called after consuming one newline (Count=1).
%% Consumes further newlines and interleaved whitespace. Appends a single
%% `blank` to Pending if total newline count >= 2.
consume_newlines_inner([Tok | Rest], Pending, Count) ->
    case lfmt_fezzik_lexer:kind(Tok) of
        newline    -> consume_newlines_inner(Rest, Pending, Count + 1);
        whitespace -> consume_newlines_inner(Rest, Pending, Count);
        _          -> {append_blank_if(Count >= 2, Pending), [Tok | Rest]}
    end;
consume_newlines_inner([], Pending, Count) ->
    {append_blank_if(Count >= 2, Pending), []}.

append_blank_if(true,  P) -> P ++ [blank];
append_blank_if(false, P) -> P.

%%====================================================================
%% Internal: helpers
%%====================================================================

leaf_type(symbol)    -> symbol;
leaf_type(qsymbol)   -> symbol;
leaf_type(number)    -> number;
leaf_type(char)      -> char;
leaf_type(string)    -> string;
leaf_type(bstring)   -> string;
leaf_type(tqstring)  -> string;
leaf_type(tqbstring) -> string.

%%====================================================================
%% Internal: oracle helpers
%%====================================================================

%% node_significant_tokens: in-order significant tokens for one node.
node_significant_tokens(#node{type = prefixed, prefix = Pfx,
                               children = Children}) ->
    [Pfx | lists:flatmap(fun node_significant_tokens/1, Children)];
node_significant_tokens(#node{type = T, open = Open, close = Close,
                               dot_token = DotTok, children = Children})
  when T =:= list; T =:= tuple; T =:= map; T =:= binary; T =:= eval ->
    case DotTok of
        undefined ->
            [Open | lists:flatmap(fun node_significant_tokens/1, Children)] ++ [Close];
        _ when Children =/= [] ->
            AllButLast = lists:droplast(Children),
            Tail = lists:last(Children),
            [Open]
            ++ lists:flatmap(fun node_significant_tokens/1, AllButLast)
            ++ [DotTok]
            ++ node_significant_tokens(Tail)
            ++ [Close]
    end;
node_significant_tokens(#node{open = Open}) ->
    [Open].

%% node_comments: comment tokens in source order
%% (leading โ†’ children โ†’ dangling โ†’ trailing).
node_comments(#node{leading = L, children = Ch, dangling = D, trailing = T}) ->
    trivia_comments(L)
    ++ lists:flatmap(fun node_comments/1, Ch)
    ++ trivia_comments(D)
    ++ trivia_comments(T).

trivia_comments(Trivia) ->
    [Tok || {comment, Tok} <- Trivia].