defmodule Lexical.Document.LineParser do
@moduledoc """
A parser that parses a binary into `Lexical.Document.Line` records.
The approach taken by the parser is to first go through the binary to find out where
the lines break, what their endings are and if the line is ascii. As we go through the
binary, we store this information, and when we're done, go back and split up the binary
using `binary_slice`. This performs 3x faster than iterating through the binary and collecting
IOlists that represent each line.
I determines if a line is ascii (and what it really means is utf8 ascii) by checking to see if
each byte is greater than 0 and less than 128. UTF-16 files won't be marked as ascii, which
allows us to skip a lot of byte conversions later in the process.
"""
import Lexical.Document.Line
# it's important that "\r\n" comes before \r here, otherwise the generated pattern
# matches won't match.
@endings ["\r\n", "\r", "\n"]
@max_ascii_character 127
@doc """
Parses the text into lines
Parses the given text into lines, and uses `starting_index` as the first line's line number.
Passing 0 as starting_index yields a zero-based collection, while passing 1 yields a 1-based
collection.
"""
def parse(text, starting_index) do
text
|> traverse(starting_index)
|> Enum.reduce([], fn index, acc -> [extract_line(text, index) | acc] end)
end
defp extract_line(text, {line_number, start, stop, is_ascii?, ending}) do
line_text = binary_part(text, start, stop)
line(line_number: line_number, text: line_text, ascii?: is_ascii?, ending: ending)
end
defp traverse(text, starting_index) do
traverse(text, 0, starting_index, 0, true, [])
end
for ending <- @endings,
ending_length = byte_size(ending) do
defp traverse(
<<unquote(ending)>>,
current_index,
line_number,
line_start_index,
is_ascii?,
acc
) do
line_length = current_index - line_start_index
line_index = {line_number, line_start_index, line_length, is_ascii?, unquote(ending)}
[line_index | acc]
end
defp traverse(
<<unquote(ending), rest::binary>>,
current_index,
line_number,
line_start_index,
is_ascii?,
acc
) do
line_length = current_index - line_start_index
acc = [{line_number, line_start_index, line_length, is_ascii?, unquote(ending)} | acc]
next_index = current_index + unquote(ending_length)
traverse(rest, next_index, line_number + 1, next_index, is_ascii?, acc)
end
end
defp traverse(
<<c, rest::binary>>,
current_index,
line_number,
line_start_index,
is_ascii?,
acc
) do
# Note, this heuristic assumes the NUL character won't occur in elixir source files.
# if this isn't true, then we need a better heuristic for detecting utf16 text.
is_still_ascii? = is_ascii? and c <= @max_ascii_character and c > 0
traverse(
rest,
current_index + 1,
line_number,
line_start_index,
is_still_ascii?,
acc
)
end
defp traverse(<<>>, same_index, _line_number, same_index, _is_ascii, acc) do
# this is a line at the end of the document with no content
# I'm choosing not to represent it as a line to simplify things
# and to make the line count what we expect
acc
end
defp traverse(<<>>, current_index, line_number, line_start_index, is_ascii?, acc) do
# file doesn't end with a newline
line_length = current_index - line_start_index
[{line_number, line_start_index, line_length, is_ascii?, ""} | acc]
end
end