defmodule Dsqlex.Lexer do
  @moduledoc """
  Tokenizer for a small SQL-like expression language.

  Recognizes keywords and built-in functions (case-insensitively),
  dot-access identifiers (`config.pricing.margin_rate`), integer and
  decimal numbers, single-quoted strings, comparison/arithmetic
  operators, parentheses and commas. Line comments (`-- …`, `# …`) and
  block comments (`/* … */`) are skipped.
  """

  # Upcased word -> token lookup; any word not listed here becomes an
  # {:identifier, word} token (original spelling preserved).
  @word_tokens %{
    "SELECT" => {:keyword, :select},
    "CASE" => {:keyword, :case},
    "WHEN" => {:keyword, :when},
    "THEN" => {:keyword, :then},
    "ELSE" => {:keyword, :else},
    "END" => {:keyword, :end},
    "AND" => {:keyword, :and},
    "OR" => {:keyword, :or},
    "NOT" => {:keyword, :not},
    "NULL" => {:keyword, :null},
    "TRUE" => {:keyword, true},
    "FALSE" => {:keyword, false},
    "IS" => {:keyword, :is},
    "IN" => {:keyword, :in},
    "LIKE" => {:keyword, :like},
    "UPPER" => {:function, :upper},
    "LOWER" => {:function, :lower},
    "ROUND" => {:function, :round},
    "COALESCE" => {:function, :coalesce},
    # NVL is an alias for COALESCE
    "NVL" => {:function, :coalesce},
    "ABS" => {:function, :abs},
    "CONCAT" => {:function, :concat},
    "EVENT" => {:function, :event}
  }

  @doc """
  Tokenizes `expr`.

  Returns `{:ok, tokens}` with tokens in source order, or `{:error, reason}`
  on the first unrecognized character, unterminated string, or unterminated
  block comment.
  """
  @spec tokenize(binary()) :: {:ok, [tuple()]} | {:error, String.t()}
  def tokenize(expr) when is_binary(expr) do
    case do_tokenize(expr, []) do
      # tokens are accumulated head-first (O(1) prepend); restore source order
      {:ok, tokens} -> {:ok, Enum.reverse(tokens)}
      error -> error
    end
  end

  # base case: whole input consumed
  defp do_tokenize("", tokens), do: {:ok, tokens}

  # whitespace — includes ?\r so CRLF line endings are accepted
  # (previously a bare \r fell through to the "Unexpected character" clause)
  defp do_tokenize(<<char, rest::binary>>, tokens) when char in [?\s, ?\n, ?\t, ?\r] do
    do_tokenize(rest, tokens)
  end

  # commas
  defp do_tokenize(<<",", rest::binary>>, tokens), do: do_tokenize(rest, [{:comma} | tokens])

  # single-quoted string literals (no escape sequences supported)
  defp do_tokenize(<<"'", rest::binary>>, tokens) do
    case consume_string(rest) do
      {:ok, string_content, rest} -> do_tokenize(rest, [{:string, string_content} | tokens])
      {:error, reason} -> {:error, reason}
    end
  end

  # numbers: integer or decimal literal starting with a digit
  defp do_tokenize(<<c, _rest::binary>> = input, tokens) when c in ?0..?9 do
    {number_str, rest} = consume_number(input)
    do_tokenize(rest, [{:number, number_str} | tokens])
  end

  # words: keywords, function names, and (dot-access) identifiers
  defp do_tokenize(<<c, _rest::binary>> = input, tokens)
       when c in ?a..?z or c in ?A..?Z or c == ?_ do
    {word, rest} = consume_identifier(input)
    do_tokenize(rest, [classify_word(word) | tokens])
  end

  # SQL line comments: -- ... <newline or eof> (must precede the "-" clause)
  defp do_tokenize(<<"--", rest::binary>>, tokens), do: do_tokenize(skip_to_newline(rest), tokens)

  # MySQL-style line comments: # ... <newline or eof>
  defp do_tokenize(<<"#", rest::binary>>, tokens), do: do_tokenize(skip_to_newline(rest), tokens)

  # block comments: /* ... */ — may span lines (must precede the "/" clause)
  defp do_tokenize(<<"/*", rest::binary>>, tokens) do
    case skip_block_comment(rest) do
      {:ok, rest} -> do_tokenize(rest, tokens)
      {:error, _} = error -> error
    end
  end

  # two-character operators (must precede their one-character prefixes)
  defp do_tokenize(<<"!=", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :neq} | tokens])
  defp do_tokenize(<<"<=", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :lte} | tokens])
  defp do_tokenize(<<">=", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :gte} | tokens])

  # punctuation and one-character operators
  defp do_tokenize(<<"(", rest::binary>>, tokens), do: do_tokenize(rest, [{:lparen} | tokens])
  defp do_tokenize(<<")", rest::binary>>, tokens), do: do_tokenize(rest, [{:rparen} | tokens])
  defp do_tokenize(<<"+", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :plus} | tokens])
  defp do_tokenize(<<"-", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :minus} | tokens])
  defp do_tokenize(<<"*", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :multiply} | tokens])
  defp do_tokenize(<<"/", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :divide} | tokens])
  defp do_tokenize(<<"=", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :eq} | tokens])
  defp do_tokenize(<<"<", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :lt} | tokens])
  defp do_tokenize(<<">", rest::binary>>, tokens), do: do_tokenize(rest, [{:operator, :gt} | tokens])

  # catch-all for unrecognized characters (e.g. trailing dot while the user is
  # still typing). Match a full UTF-8 codepoint first so the error message stays
  # valid UTF-8 for non-ASCII input; fall back to a hex byte for invalid UTF-8.
  defp do_tokenize(<<cp::utf8, _rest::binary>>, _tokens),
    do: {:error, "Unexpected character: '#{<<cp::utf8>>}'"}

  defp do_tokenize(<<byte, _rest::binary>>, _tokens) do
    hex = byte |> Integer.to_string(16) |> String.pad_leading(2, "0") |> String.upcase()
    {:error, "Unexpected byte: 0x#{hex}"}
  end

  # Consume a numeric literal, returning {number_string, rest}. The third
  # argument tracks whether a decimal point was already consumed, so at most
  # one is accepted — previously "1.2.3" was folded into a single number token.
  defp consume_number(input), do: consume_number(input, "", false)

  defp consume_number(<<c, rest::binary>>, acc, dot?) when c in ?0..?9 do
    consume_number(rest, acc <> <<c>>, dot?)
  end

  # a '.' only joins the number when none was seen yet AND a digit follows;
  # otherwise it is left in `rest` for the caller to report
  defp consume_number(<<".", c, rest::binary>>, acc, false) when c in ?0..?9 do
    consume_number(rest, acc <> "." <> <<c>>, true)
  end

  defp consume_number(rest, acc, _dot?), do: {acc, rest}

  # Consume a word (letters, digits, underscores), returning {word, rest}.
  defp consume_identifier(input), do: consume_identifier(input, "")

  defp consume_identifier(<<c, rest::binary>>, acc)
       when c in ?a..?z or c in ?A..?Z or c == ?_ or c in ?0..?9 do
    consume_identifier(rest, acc <> <<c>>)
  end

  # dot-access: consume '.' only when followed by a letter or underscore
  # (e.g. config.pricing.margin_rate); a trailing dot stays in `rest`
  defp consume_identifier(<<".", c, rest::binary>>, acc)
       when c in ?a..?z or c in ?A..?Z or c == ?_ do
    consume_identifier(rest, acc <> "." <> <<c>>)
  end

  defp consume_identifier(rest, acc), do: {acc, rest}

  # Consume bytes up to the closing quote; bytes are copied verbatim, so
  # multi-byte UTF-8 content survives unchanged. No escape sequences.
  defp consume_string(input), do: consume_string(input, "")

  defp consume_string(<<"'", rest::binary>>, acc), do: {:ok, acc, rest}

  defp consume_string(<<c, rest::binary>>, acc) do
    consume_string(rest, acc <> <<c>>)
  end

  # only the empty binary reaches here: the opening quote was never closed
  defp consume_string(_rest, _acc), do: {:error, "Unterminated string"}

  # skip until newline or end of binary; comment bodies are discarded as raw
  # bytes, so non-ASCII content is safely ignored without UTF-8 validation
  defp skip_to_newline(""), do: ""
  defp skip_to_newline(<<"\n", rest::binary>>), do: rest
  defp skip_to_newline(<<_byte, rest::binary>>), do: skip_to_newline(rest)

  # skip until "*/" or fail with an unterminated-block-comment error
  defp skip_block_comment(""), do: {:error, "Unterminated block comment"}
  defp skip_block_comment(<<"*/", rest::binary>>), do: {:ok, rest}
  defp skip_block_comment(<<_byte, rest::binary>>), do: skip_block_comment(rest)

  # Map an upcased word to its keyword/function token, defaulting to an
  # identifier token that keeps the original (case-preserved) spelling.
  defp classify_word(word), do: Map.get(@word_tokens, String.upcase(word), {:identifier, word})
end