%% src/mesv@parse.erl

-module(mesv@parse).
-compile([no_auto_import, nowarn_unused_vars, nowarn_unused_function, nowarn_nomatch, inline]).
-define(FILEPATH, "src\\mesv\\parse.gleam").
-export([build/1, column/2, expect_headers/2, set_row_sep/2, set_escaper/2, set_trim_whitespace/3, set_col_sep/2, set_strict_columns/1, partition_on_unescaped_/2, parse/2]).
-export_type([parsing_error/0, parser/1]).

-if(?OTP_RELEASE >= 27).
-define(MODULEDOC(Str), -moduledoc(Str)).
-define(DOC(Str), -doc(Str)).
-else.
-define(MODULEDOC(Str), -compile([])).
-define(DOC(Str), -compile([])).
-endif.

?MODULEDOC(
    " Module containing the functions for creating a `Parser`, and using the `Parser`\n"
    " to parse an input CSV String into a `List` of some data types.\n"
    " \n"
    " ### Important!\n"
    " At this stage, everything is still in flux, and breaking changes can occur\n"
    " on minor version updates. Be careful and check for possible issues before updating!\n"
    " \n"
    " ## Examples\n"
    " A full example of parsing an example CSV String.\n"
    " ```gleam\n"
    " import gleam/int\n"
    " import mesv\n"
    " import mesv/parse\n"
    " \n"
    " const expected_data: List(#(String, Int, Bool)) = [\n"
    "   #(\"Andrew\", 20, True),\n"
    "   #(\"Blake\", 25, True),\n"
    "   #(\"Cassandra\", 2, False),\n"
    " ]\n"
    " \n"
    " pub fn main() -> Nil {\n"
    "   let parsed_data =\n"
    "     parse.build({\n"
    "       // Create a parsing function using `mesv.parsed`\n"
    "       // to construct a curried parsing function\n"
    "       use name <- mesv.parsed\n"
    "       use age <- mesv.parsed\n"
    "       use adult <- mesv.parsed\n"
    " \n"
    "       // If any value fails (ie, returns Error(Nil)),\n"
    "       // the parsing of a row will stop.\n"
    "       // However, if it reaches here,\n"
    "       // it returns the following data type\n"
    "       #(name, age, adult)\n"
    "     })\n"
    "     |> parse.column(Ok)\n"
    "     |> parse.column(int.parse)\n"
    "     |> parse.column(fn(val: String) -> Result(Bool, Nil) {\n"
    "       case val {\n"
    "         \"true\" | \"True\" -> Ok(True)\n"
    "         \"false\" | \"False\" -> Ok(False)\n"
    "         _ -> Error(Nil)\n"
    "       }\n"
    "     })\n"
    "     // Specify that the first row is the headers,\n"
    "     // and if they don't match what is specified, \n"
    "     // the parsing will fail\n"
    "     |> parse.expect_headers([\"Name\", \"Age\", \"Is an adult\"])\n"
    "     // Pass in the CSV String to parse\n"
    "     |> parse.parse(\n"
    "       \"Name,Age,Is an adult\\n\"\n"
    "       <> \"Andrew,20,true\\n\"\n"
    "       <> \"Blake,25,True\\n\"\n"
    "       <> \"Cassandra,2,False\",\n"
    "     )\n"
    " \n"
    "   assert parsed_data == Ok(#(expected_data, []))\n"
    " }\n"
    " ```\n"
    " \n"
    " Parsing a CSV and performing some operations on the data immediately after parsing\n"
    " ```gleam\n"
    " // [...]\n"
    " const expected_data: List(#(String, Int, Bool)) = [\n"
    "   #(\"Anna\", 20, True),\n"
    "   #(\"Bob\", 25, True),\n"
    "   #(\"Cleopatra\", 2095, False),\n"
    "   // She's dead, she can't be an adult.\n"
    "   // But alas, our parser is too simple to understand\n"
    "   // this fact, so it will throw an error.\n"
    " ]\n"
    " \n"
    " pub fn main() -> Nil {\n"
    "   let parsed_data =\n"
    "     parse.build({\n"
    "       use name <- mesv.parsed\n"
    "       use age <- mesv.parsed\n"
    "       // As long as the operation is guaranteed to result\n"
    "       // in the data type specified in the Parser,\n"
    "       // you can do anything in here!\n"
    "       #(name, age, age >= 18)\n"
    "     })\n"
    "     |> parse.column(Ok)\n"
    "     |> parse.column(int.parse)\n"
    "     // Pass in the CSV String to parse\n"
    "     |> parse.parse(\n"
    "       \"Anna,20\\n\"\n"
    "       <> \"Bob,25\\n\"\n"
    "       <> \"Cleopatra,2095\",\n"
    "     )\n"
    " \n"
    "   assert parsed_data == Ok(#(expected_data, []))\n"
    " }\n"
    " ```\n"
    " \n"
).

%% Errors that parsing can produce.
%%
%% - `cant_parse_row`: built by column/2 when a column's parse function
%%   returns an error; column/2 fills the integer with -1, followed by the
%%   offending token and a message.
%% - `expected_headers_mismatch`: built by process_headers/2; carries the
%%   expected headers first and the headers that were found second.
%% - `ran_out_of_values`: built by column/2 when a column needs a token but
%%   the row has none left.
%% - `strict_parsed_with_leftovers`: built by parse/2 when strict columns are
%%   enabled and cells remain after a row was parsed; carries those cells.
%% - `encountered_malformed_element`: built by parse/2 when a cell starts or
%%   ends with the escaper, but not both; carries the cell and a reason.
-type parsing_error() :: {cant_parse_row, integer(), binary(), binary()} |
    {expected_headers_mismatch, list(binary()), list(binary())} |
    ran_out_of_values |
    {strict_parsed_with_leftovers, list(binary())} |
    {encountered_malformed_element, binary(), binary()}.

%% Runtime representation of a parser: an 8-element tuple tagged `parser`.
%% The functions in this module read and write its fields by position.
-opaque parser(DPL) :: {parser,
        binary(), % 2: column separator
        binary(), % 3: row separator
        binary(), % 4: escaper
        gleam@option:option(list(binary())), % 5: expected headers, if any
        fun((list(binary())) -> {ok, {DPL, list(binary())}} |
            {error, parsing_error()}), % 6: row step: cells in, value plus leftover cells out
        boolean(), % 7: strict columns flag
        {boolean(), boolean()}}. % 8: {trim start, trim end}

-file("src\\mesv\\parse.gleam", 175).
?DOC(
    " Function for directly building a `Parser` that uses the subsequent elements in order.\n"
    " \n"
    " The function passed in should be a curried one - ie, a function that returns a\n"
    " function, and so on, with every subsequent function taking in some type of argument.\n"
    " \n"
    " To build the parser, transform it using the `parse.column` function to specify\n"
    " how to parse each subsequent value in a row.\n"
    " \n"
    " ## Examples\n"
    " The simplest parser is one element:\n"
    " ```gleam\n"
    " parse.build(fn(str) { str })\n"
    "   |> parse.column(Ok)\n"
    " ```\n"
    " When used, it will create a `List(String)` containing the first cell of each\n"
    " row of the input CSV String.\n"
    " \n"
    " Infallible transformation of the data can be done both inside of the initial\n"
    " function that is passed to `parse.build` and in `parse.column`, but fallible\n"
    " transformations (those that output a `Result` or `Option` when the argument\n"
    " requires what's inside the `Option`) must reside in the `parse.column` call.\n"
    " \n"
    " A more complex `Parser` would be something like this:\n"
    " ```gleam\n"
    " parse.build({\n"
    "   use name: String <- mesv.parsed\n"
    "   use age: Int <- mesv.parsed\n"
    "   use adult: Bool <- mesv.parsed\n"
    "\n"
    "   #(name, age, adult)\n"
    " })\n"
    " ```\n"
    " and to parse the arguments to construct the result, again, use the\n"
    " `parse.column` function.\n"
).
%% Creates the initial parser tuple with the default settings: `,` as the
%% column separator, a newline as the row separator, `"` as the escaper, no
%% expected headers, non-strict columns and trimming enabled on both ends.
-spec build(fun((DPM) -> DPN)) -> parser(fun((DPM) -> DPN)).
build(F) ->
    Column_separator = <<","/utf8>>,
    Row_separator = <<"\n"/utf8>>,
    Escaper = <<"\""/utf8>>,
    %% The first step consumes no cells: it hands `F` over, together with the
    %% untouched cells, to the steps that column/2 stacks on top of it.
    Initial_step = fun(Cells) -> {ok, {F, Cells}} end,
    {parser,
        Column_separator,
        Row_separator,
        Escaper,
        none,
        Initial_step,
        false,
        {true, true}}.

-file("src\\mesv\\parse.gleam", 208).
?DOC(
    " Transform a `Parser`, by passing in a parsing function for a specified column.\n"
    " \n"
    " This function will be called for every row, and the output of this function,\n"
    " if it's `Ok(a)`, will be passed to the `Parser`'s internal function,\n"
    " and the parsing of the row continued;\n"
    " \n"
    " If it's `Error(Nil)`, the parsing of the row will fail.\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
    " // Parser(fn(String) -> a)\n"
    " parser\n"
    "   |> parse.column(Ok)\n"
    "   // Parser(a)\n"
    " ```\n"
).
%% Stacks one more column on top of the parser's current row step (field 6).
%%
%% The new step first runs the previous step, then feeds the next remaining
%% token to `Parse` and applies the constructor produced so far to the parsed
%% value. All other parser fields are carried over unchanged.
-spec column(
    parser(fun((DPP) -> DPQ)),
    fun((binary()) -> {ok, DPP} | {error, nil})
) -> parser(DPQ).
column(Parser, Parse) ->
    Previous_step = erlang:element(6, Parser),
    Next_step = fun(Tokens) ->
        case Previous_step(Tokens) of
            {error, _} = Failure ->
                %% An earlier column already failed; pass the error along.
                Failure;

            {ok, {_Constructor, []}} ->
                %% This column needs a token, but the row is exhausted.
                {error, ran_out_of_values};

            {ok, {Constructor, [Token | Rest]}} ->
                case Parse(Token) of
                    {ok, Parsed} ->
                        {ok, {Constructor(Parsed), Rest}};

                    {error, _} ->
                        {error,
                            {cant_parse_row,
                                -1,
                                Token,
                                <<"idk, think of a better error system."/utf8>>}}
                end
        end
    end,
    erlang:setelement(6, Parser, Next_step).

-file("src\\mesv\\parse.gleam", 256).
?DOC(
    " Configure the parser to treat the first parsed row as the headers,\n"
    " and specify that we expect the CSV headers to equal these headers.\n"
    " \n"
    " If the first row is not **strictly identical** to the contents of\n"
    " the arguments to this function, the parser will return an `Error`.\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
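    " parser\n"
    "   |> parse.expect_headers([\"Name\", \"Age\"])\n"
    "   |> parse.parse(\"Name,Age\\nAndrew,20\")\n"
    "   // -> the first row matches the expected headers and is skipped,\n"
    "   //    parse returns Ok(#([#(\"Andrew\", 20)], []))\n"
    " \n"
    " parser\n"
    "   |> parse.expect_headers([\"Name\", \"Age\"])\n"
    "   |> parse.parse(\"name,age\\nAndrew,20\")\n"
    "   // -> the first row differs from the expected headers, parse returns\n"
    "   //    Error(ExpectedHeadersMismatch([\"Name\", \"Age\"], [\"name\", \"age\"]))\n"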
    " ```\n"
).
%% Stores `{some, Headers}` in the expected-headers field (position 5);
%% every other parser field is carried over as-is.
-spec expect_headers(parser(DPV), list(binary())) -> parser(DPV).
expect_headers(Parser, Headers) ->
    erlang:setelement(5, Parser, {some, Headers}).

-file("src\\mesv\\parse.gleam", 279).
?DOC(
    " Function to set a specific row separator, instead of the default newline (`\\n`)\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
    " parser\n"
    "   |> parse.parse(\"a,1,c\\nd,4,a\")\n"
    "   // -> parse returns [#(\"a\", 1, \"c\"), #(\"d\", 4, \"a\")]\n"
    " \n"
    " parser\n"
    "   |> set_row_sep(\"|\")\n"
    "   |> parse.parse(\"a,1,c\\nd,4,a\")\n"
    "   // -> parse returns [#(\"a\", 1, \"c\\nd\")]\n"
    "   // the two cells \"4\" and \"a\" are treated as leftovers\n"
    " parser\n"
    "   |> set_row_sep(\"|\")\n"
    "   |> parse.parse(\"a,1,c|d,4,a\")\n"
    "   // -> parse returns [#(\"a\", 1, \"c\"), #(\"d\", 4, \"a\")]\n"
    " ```\n"
).
%% Replaces the row-separator field (position 3); every other parser field
%% is carried over as-is.
-spec set_row_sep(parser(DPZ), binary()) -> parser(DPZ).
set_row_sep(Parser, New_row_separator) ->
    erlang:setelement(3, Parser, New_row_separator).

-file("src\\mesv\\parse.gleam", 315).
?DOC(
    " Function to set a specific value escaper, instead of the default doublequotes (`\"`)\n"
    " \n"
    " Escapers are wrapped around a cell if that cell contains any one or more of:\n"
    " - column separator (by default `,`)\n"
    " - row separator (by default `\\n`)\n"
    " - escaper itself\n"
    " \n"
    " In the event that a cell contains an escaper, the escaper is first replaced\n"
    " with two escapers.\n"
    " \n"
    " So `here's \" ` would first become `here's \"\" `, then be wrapped and become\n"
    " `\"here's \"\" \"`.\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
    " parser\n"
    "   |> parse.parse(\"a,'b','c'''\")\n"
    "   // -> row returns Ok(#(\"a\", \"'b'\", \"'c'''\"))\n"
    " parser\n"
    "   |> parse.parse(\"a,\\\"b\\\",\\\"c\\\"\\\"\\\"\")\n"
    "   // -> row returns Ok(#(\"a\", \"b\", \"c\\\"\"))\n"
    " \n"
    " parser\n"
    "   |> set_escaper(\"'\")\n"
    "   |> parse.parse(\"a,'b','c'''\")\n"
    "   // -> row returns Ok(#(\"a\", \"b\", \"c'\"))\n"
    " parser\n"
    "   |> set_escaper(\"'\")\n"
    "   |> parse.parse(\"a,\\\"b\\\",\\\"c\\\"\\\"\\\"\")\n"
    "   // -> row returns Ok(#(\"a\", \"\\\"b\\\"\", \"\\\"c\\\"\\\"\\\"\"))\n"
    " ```\n"
).
%% Replaces the escaper field (position 4); every other parser field is
%% carried over as-is.
-spec set_escaper(parser(DQC), binary()) -> parser(DQC).
set_escaper(Parser, New_escaper) ->
    erlang:setelement(4, Parser, New_escaper).

-file("src\\mesv\\parse.gleam", 329).
?DOC(
    " Function to set whether the parser should trim the whitespace on both ends of each value.\n"
    " \n"
    " This operation is performed before the cell is unwrapped (escapers removed), so if the CSV\n"
    " file was modified somehow\n"
    " (for example, using VSCode plugin [Rainbow CSV](https://marketplace.visualstudio.com/items?itemName=mechatroner.rainbow-csv) to align the columns),\n"
    " the cell can be correctly unescaped and parsed.\n"
    " \n"
    " I think the behaviour of this function and internal order of operations will change\n"
    " in the future, so no examples yet.\n"
).
%% Replaces the trimming field (position 8) with `{Trim_start, Trim_end}`;
%% every other parser field is carried over as-is.
-spec set_trim_whitespace(parser(DQF), boolean(), boolean()) -> parser(DQF).
set_trim_whitespace(Parser, Trim_start, Trim_end) ->
    Trimming = {Trim_start, Trim_end},
    erlang:setelement(8, Parser, Trimming).

-file("src\\mesv\\parse.gleam", 355).
?DOC(
    " Function to set a specific column separator, instead of the default comma (`,`)\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
    " parser\n"
    "   |> parse.parse(\"a,1,c\")\n"
    "   // -> row returns Ok(#(\"a\", 1, \"c\"))\n"
    " \n"
    " parser\n"
    "   |> set_col_sep(\"|\")\n"
    "   |> parse.parse(\"a,1,c\")\n"
    "   // -> row returns Error(RanOutOfValues)\n"
    " parser\n"
    "   |> set_col_sep(\"|\")\n"
    "   |> parse.parse(\"a|1|c\")\n"
    "   // -> row returns Ok(#(\"a\", 1, \"c\"))\n"
    " ```\n"
).
%% Replaces the column-separator field (position 2); every other parser field
%% is carried over as-is.
-spec set_col_sep(parser(DQI), binary()) -> parser(DQI).
set_col_sep(Parser, New_column_separator) ->
    erlang:setelement(2, Parser, New_column_separator).

-file("src\\mesv\\parse.gleam", 381).
?DOC(
    " Function to make the parser strict in terms of columns.\n"
    " \n"
    " This means that when parsing a row, there must be exactly as many cells as there were\n"
    " arguments for the internal `Parser` function. Once this function has been called, any\n"
    " leftover values remaining after a row is parsed make that row return an `Error`,\n"
    " even if the parsing itself produced a value.\n"
    " \n"
    " ## Examples\n"
    " ```gleam\n"
    " parser\n"
    "   |> parse.parse(\"a,1,c\")\n"
    "   // -> row returns Ok(#(\"a\", 1))\n"
    " \n"
    " parser\n"
    "   |> set_strict_columns()\n"
    "   |> parse.parse(\"a,1,c\")\n"
    "   // -> row returns Error(StrictParsedWithLeftovers([\"c\"]))\n"
    " ```\n"
).
%% Switches the strict-columns flag (position 7) to `true`; every other
%% parser field is carried over as-is.
-spec set_strict_columns(parser(DQL)) -> parser(DQL).
set_strict_columns(Parser) ->
    erlang:setelement(7, Parser, true).

-file("src\\mesv\\parse.gleam", 554).
?DOC(
    " Internal helper function to check whether the CSV headers that were found match\n"
    " the expected pattern that was specified in the Parser building process.\n"
).
%% Checks the headers found in the source against the expected ones.
%%
%% With no expectation (`none`) the found headers are always accepted. With
%% `{some, Pattern}` they are accepted only when exactly equal to `Pattern`;
%% otherwise the error carries the expected headers first, the found second.
-spec process_headers(gleam@option:option(list(binary())), list(binary())) -> {ok,
        list(binary())} |
    {error, parsing_error()}.
process_headers(none, Found) ->
    {ok, Found};
process_headers({some, Pattern}, Found) when Found =:= Pattern ->
    {ok, Found};
process_headers({some, Pattern}, Found) ->
    {error, {expected_headers_mismatch, Pattern, Found}}.

-file("src\\mesv\\parse.gleam", 539).
?DOC(
    " Internal helper function for creating a function for 'unescaping' an element\n"
    " (for each `rule`, replacing the second element in the tuple with the first).\n"
    " \n"
    " This function takes in a String that is guaranteed to be a value - that is,\n"
    " it's either unescaped, or it starts with an escaper and ends with an escaper.\n"
    " \n"
    " It's a curried function because I like functional programming, and because it *should*\n"
    " give some performance improvements if I create such a function before any looping\n"
    " instead of constructing one for each iteration.\n"
).
%% Builds the unescaping function for a list of `{Replacement, Escaped}` rules.
%%
%% The per-rule replacer closures are created once, when unescape/1 itself is
%% called, so the returned function only folds the prepared replacers over its
%% input instead of rebuilding them on every invocation.
%%
%% Rules are applied in list order. For each rule the text is passed to
%% gleam@string:replace/3 with the rule's second element as the pattern and
%% its first element as the substitute.
-spec unescape(list({binary(), binary()})) -> fun((binary()) -> binary()).
unescape(Rules) ->
    Replacers = gleam@list:map(
        Rules,
        fun(Rule) ->
            Substitute = erlang:element(1, Rule),
            Pattern = erlang:element(2, Rule),
            fun(Text) -> gleam@string:replace(Text, Pattern, Substitute) end
        end
    ),
    fun(El) ->
        gleam@list:fold(
            Replacers,
            El,
            fun(Acc, Replace) -> Replace(Acc) end
        )
    end.

-file("src\\mesv\\parse.gleam", 576).
?DOC(
    " Internal helper function for constructing a function that splits a `String`\n"
    " on `separator`, as long as the `separator` is not between two `not_in`.\n"
    " \n"
    " It is public because I created unit tests for it.\n"
    " \n"
    " Feel free to use it, but it is not part of the API, so a breaking change\n"
    " can occur in every version change, without prior notice.\n"
).
%% Returns a function that splits its argument on `El` and then hands the
%% pieces to mesv@util:list_merge_map/2 together with a merge callback.
%%
%% The callback proposes re-joining two neighbouring pieces (with `El` put
%% back between them) when the left piece contains an odd number of
%% `Escaper` occurrences, and answers `none` otherwise.
-spec partition_on_unescaped_(binary(), binary()) -> fun((binary()) -> list(binary())).
partition_on_unescaped_(El, Escaper) ->
    Merge = fun(Left, Right) ->
        case mesv@util:count_occurences(Escaper, Left) rem 2 of
            1 ->
                {some, <<Left/binary, El/binary, Right/binary>>};

            _ ->
                none
        end
    end,
    fun(To_split) ->
        Pieces = gleam@string:split(To_split, El),
        mesv@util:list_merge_map(Pieces, Merge)
    end.

-file("src\\mesv\\parse.gleam", 399).
?DOC(
    " Function to use the specified `Parser(a)` to transform the source into a `List(a)`\n"
    " \n"
    " If the headers specified in the `expect_headers` function did not match the specified pattern,\n"
    " a `ParsingError` will be returned, of the type `ExpectedHeadersMismatch`, containing both\n"
    " the expected headers, and what was found.\n"
    " \n"
    " If the headers weren't specified, or were specified and match the expected pattern, the\n"
    " function will return `Ok(#(List(parsed_type), List(ParsingError)))`;\n"
    " The first is the list of all rows that were successfully parsed, while the second is a list\n"
    " of `ParsingError`s that were thrown due to a row failing to parse.\n"
    " \n"
    " What to do with both of these Lists is up to the user, whether to ignore all errors or abort\n"
    " if any errors occur.\n"
).
%% Runs the parser over `Source`.
%%
%% Steps:
%%   1. split the source into rows on the row separator;
%%   2. check the first row against the expected headers (when configured) and
%%      abort with that error on a mismatch;
%%   3. for every data row: split into cells, trim, unwrap the escapers,
%%      unescape doubled escapers and run the row step built by column/2;
%%   4. return the successfully parsed values and the per-row errors as two
%%      lists inside `{ok, {Values, Errors}}`.
%%
%% When headers are expected, the first row is not parsed as data.
-spec parse(parser(DQO), binary()) -> {ok, {list(DQO), list(parsing_error())}} |
    {error, parsing_error()}.
parse(Parser, Source) ->
    {parser,
        Col_sep,
        Row_sep,
        Escaper,
        Headers,
        Parse_cells,
        Strict_columns,
        {Trim_start, Trim_end}} = Parser,
    Split_rows = partition_on_unescaped_(Row_sep, Escaper),
    case Split_rows(Source) of
        [] ->
            {ok, {[], []}};

        [First_row | Other_rows] ->
            Split_columns = partition_on_unescaped_(Col_sep, Escaper),
            case process_headers(Headers, Split_columns(First_row)) of
                {error, _} = Header_error ->
                    Header_error;

                {ok, _} ->
                    %% Pick the trimming functions according to the settings.
                    Trim_left = case Trim_start of
                        true ->
                            fun gleam@string:trim_start/1;

                        false ->
                            fun gleam@function:identity/1
                    end,
                    Trim_right = case Trim_end of
                        true ->
                            fun gleam@string:trim_end/1;

                        false ->
                            fun gleam@function:identity/1
                    end,
                    %% Strip the surrounding escapers from a wrapped cell. A
                    %% cell with an escaper on one side only is malformed.
                    Unwrap = fun(Cell) ->
                        Opens = gleam_stdlib:string_starts_with(Cell, Escaper),
                        Closes = gleam_stdlib:string_ends_with(Cell, Escaper),
                        case {Opens, Closes} of
                            {false, false} ->
                                {ok, Cell};

                            {true, true} ->
                                Without_prefix = gleam_stdlib:string_remove_prefix(
                                    Cell,
                                    Escaper
                                ),
                                {ok,
                                    gleam_stdlib:string_remove_suffix(
                                        Without_prefix,
                                        Escaper
                                    )};

                            {_, _} ->
                                {error,
                                    {encountered_malformed_element,
                                        Cell,
                                        <<"Mismatched escapers"/utf8>>}}
                        end
                    end,
                    %% A doubled escaper inside a cell stands for one escaper.
                    Unescape = unescape(
                        [{Escaper, <<Escaper/binary, Escaper/binary>>}]
                    ),
                    Clean_cell = fun(Raw) ->
                        Unwrap(Trim_right(Trim_left(Raw)))
                    end,
                    Process_row = fun(Row) ->
                        Cleaned = [Clean_cell(Raw) || Raw <- Split_columns(Row)],
                        case gleam@result:all(Cleaned) of
                            {error, _} = Malformed ->
                                Malformed;

                            {ok, Unwrapped} ->
                                Cells = [Unescape(Text) || Text <- Unwrapped],
                                case Parse_cells(Cells) of
                                    {error, _} = Row_error ->
                                        Row_error;

                                    {ok, {Value, Leftovers}} ->
                                        %% Leftover cells only matter in
                                        %% strict-columns mode.
                                        case Strict_columns andalso Leftovers =/= [] of
                                            true ->
                                                {error,
                                                    {strict_parsed_with_leftovers,
                                                        Leftovers}};

                                            false ->
                                                {ok, Value}
                                        end
                                end
                        end
                    end,
                    %% With expected headers the first row is not data.
                    Data_rows = case Headers of
                        {some, _} ->
                            Other_rows;

                        none ->
                            [First_row | Other_rows]
                    end,
                    Results = gleam@list:map(Data_rows, Process_row),
                    {Values, Errors} = gleam@result:partition(Results),
                    {ok, {lists:reverse(Values), lists:reverse(Errors)}}
            end
    end.