Skip to main content

src/shlex.gleam

import gleam/list
import gleam/string

// A list of single graphemes split from a string input
type Graphemes =
  List(String)

// A list of complete, lexed token
type Tokens =
  List(String)

// A working buffer that will be assembled into a token 
type Buffer =
  List(String)

/// An error that occurs when lexing a shell input
pub type LexError {
  /// A quote was opened without a matching closing quote
  UnclosedQuotation
  // An escape was encountered without a following character to escape
  NoEscapedCharacter
}

/// Split a shell input into a list of string tokens.
/// 
/// This aims to follow the POSIX standard defined by IEEE Std 1003.1-2024.
/// 
/// ## Examples
/// 
/// ```gleam
/// let assert Ok(tokens) = split("git commit -m 'hello world!'")
/// assert tokens == ["git", "commit", "-m", "hello worlds!"]
/// ```
pub fn split(input: String) -> Result(Tokens, LexError) {
  input |> string.to_graphemes |> continue([])
}

fn continue(input: Graphemes, acc: Tokens) -> Result(Tokens, LexError) {
  case input {
    [] -> acc |> list.reverse |> Ok

    // Skip whitespace between words
    [" ", ..rest] | ["\t", ..rest] | ["\n", ..rest] -> continue(rest, acc)

    // Consume comment lines
    ["#", ..rest] -> comment(rest, acc)

    _ -> word(input, acc, [])
  }
}

fn comment(input: Graphemes, acc: Tokens) -> Result(Tokens, LexError) {
  case input {
    [] -> continue([], acc)

    // Comment is ended by newline
    ["\n", ..rest] -> continue(rest, acc)

    [_, ..rest] -> comment(rest, acc)
  }
}

fn word(
  input: Graphemes,
  acc: Tokens,
  buf: Buffer,
) -> Result(Tokens, LexError) {
  case input {
    [] -> continue([], push_buffer(buf, acc))

    // <backslash> at EOF has nothing to escape
    ["\\"] -> Error(NoEscapedCharacter)

    // <newline> immediately following <backslash> is a line continuation
    ["\\", "\n", ..rest] -> word(rest, acc, buf)

    // Any other <backslash> preserves the literal
    ["\\", next, ..rest] -> word(rest, acc, [next, ..buf])

    // Begin a quoted token
    ["'", ..rest] -> single_quote(rest, acc, buf)
    ["\"", ..rest] -> double_quote(rest, acc, buf)

    // Word ended by un-escaped whitespace
    [" ", ..rest] | ["\t", ..rest] | ["\n", ..rest] ->
      continue(rest, push_buffer(buf, acc))

    [hd, ..rest] -> word(rest, acc, [hd, ..buf])
  }
}

fn single_quote(
  input: Graphemes,
  acc: Tokens,
  buf: Buffer,
) -> Result(Tokens, LexError) {
  case input {
    [] -> Error(UnclosedQuotation)

    // Ended by single-quote
    ["'", ..rest] -> word(rest, acc, buf)

    // Treat everything else as a literal
    [hd, ..rest] -> single_quote(rest, acc, [hd, ..buf])
  }
}

fn double_quote(
  input: Graphemes,
  acc: Tokens,
  buf: Buffer,
) -> Result(Tokens, LexError) {
  case input {
    [] -> Error(UnclosedQuotation)

    // Special-case escaped characters
    ["\\", c, ..rest] -> {
      case c {
        // Consume newlines
        "\n" -> double_quote(rest, acc, buf)
        // Escape only a subset of characters, e.g. \$ -> $
        "$" | "`" | "\"" | "\\" -> double_quote(rest, acc, [c, ..buf])
        // Treat everything else as literals, e.g. \t -> \t 
        _ -> double_quote(rest, acc, ["\\" <> c, ..buf])
      }
    }

    // Ended by double-quote. This must match AFTER the escaped form \" 
    ["\"", ..rest] -> word(rest, acc, buf)

    [hd, ..rest] -> double_quote(rest, acc, [hd, ..buf])
  }
}

/// Merge the character buffer into a token and push it onto the accumulator
fn push_buffer(buf: Buffer, acc: Tokens) -> Tokens {
  case buf {
    [] -> acc
    _ -> [buf |> list.reverse |> string.join(""), ..acc]
  }
}