lib/lib_judge/tokenizer.ex

defmodule LibJudge.Tokenizer do
  @moduledoc """
  Tokenizer for the MTG Comprehensive Rules
  """
  import LibJudge.Tokenizer.Guards
  alias LibJudge.Rule
  alias LibJudge.Util

  @type title :: {:title, String.t()}
  @type effective_date :: {:effective_date, Date.t()}
  @type intro :: {:intro, String.t()}
  @type contents :: {:contents, [rule | String.t()]}
  @type rule ::
          {:rule,
           {type :: Rule.rule_type(), rule :: Rule.t(), body :: String.t(),
            examples :: [String.t()]}}
  @type glossary :: {:glossary, [glossary_item]}
  @type glossary_item :: {name :: String.t(), definition :: String.t()}
  @type token ::
          title
          | effective_date
          | intro
          | contents
          | rule
          | glossary
  @type token_type ::
          :title
          | :effective_date
          | :intro
          | :contents
          | :rule
          | :glossary

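  @doc """
  Tokenizes the plain-text Comprehensive Rules into a list of tokens.

  UTF-8, UTF-16, and UTF-32 input is accepted; a leading BOM is detected and
  stripped, and line endings are normalized before parsing.

  A minimal sketch of the expected shape (the rule text is illustrative, not
  taken from a real rules file):

      LibJudge.Tokenizer.tokenize("100. General\\n100.1. These rules apply to all games.\\n")
      #=> [
      #=>   {:rule,
      #=>    {:subcategory, %LibJudge.Rule{category: "1", subcategory: "00", type: :subcategory},
      #=>     "General", []}},
      #=>   {:rule,
      #=>    {:rule, %LibJudge.Rule{category: "1", subcategory: "00", rule: "1", type: :rule},
      #=>     "These rules apply to all games.", []}}
      #=> ]
  """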
  @spec tokenize(binary) :: [token]
  # Normalize UTF-16/UTF-32 input to UTF-8, using the BOM to detect the
  # encoding and stripping it. The UTF-32 clauses must come first: the UTF-32 LE
  # BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE), so the shorter
  # pattern would otherwise match it.
  def tokenize(<<0x00, 0x00, 0xFE, 0xFF, rest::binary>>),
    do: rest |> :unicode.characters_to_binary({:utf32, :big}) |> tokenize()

  def tokenize(<<0xFF, 0xFE, 0x00, 0x00, rest::binary>>),
    do: rest |> :unicode.characters_to_binary({:utf32, :little}) |> tokenize()

  def tokenize(<<0xFE, 0xFF, rest::binary>>),
    do: rest |> :unicode.characters_to_binary({:utf16, :big}) |> tokenize()

  def tokenize(<<0xFF, 0xFE, rest::binary>>),
    do: rest |> :unicode.characters_to_binary({:utf16, :little}) |> tokenize()

  # ignore UTF-8 BOMs
  def tokenize(<<0xEF, 0xBB, 0xBF, rest::binary>>), do: tokenize(rest)

  def tokenize(string) when is_binary(string) do
    string
    |> String.replace("\r\n", "\n")
    |> String.replace("\r", "\n")
    |> tokenize([])
    |> Enum.reverse()
  end

  defp tokenize(text, tokens)

  defp tokenize(<<"Magic: The Gathering Comprehensive Rules", rest::binary>>, tokens) do
    tokenize(rest, [{:title, "Magic: The Gathering Comprehensive Rules"} | tokens])
  end

  defp tokenize(<<"These rules are effective as of ", rest_with_date::binary>>, tokens) do
    [date, rest] = String.split(rest_with_date, ".", parts: 2)

    [month, day, year] =
      date
      |> String.split(" ", parts: 3)
      |> Stream.map(&String.trim(&1, ","))
      |> Enum.map(&Util.date_map(&1))

    {:ok, parsed_date} = Date.new(year, month, day)

    tokenize(rest, [{:effective_date, parsed_date} | tokens])
  end

  defp tokenize(<<"Introduction\n\n", rest_with_intro::binary>>, tokens) do
    [intro, rest] = take_until(rest_with_intro, "\n\nContents")
    tokenize(rest, [{:intro, intro} | tokens])
  end

  defp tokenize(<<"Contents", rest_with_contents::binary>>, tokens) do
    [contents_string, rest] = take_until(rest_with_contents, "1. Game Concepts\n\n")
    contents = tokenize(String.trim(contents_string, "\n"))
    tokenize(rest, [{:contents, contents} | tokens])
  end

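  # In the rule clauses below, `cat in 48..57` matches the ASCII digits ?0..?9
  # and `subrule in 97..122` matches the lowercase letters ?a..?z. The
  # is_rule_1/2/3 guards from LibJudge.Tokenizer.Guards validate the one-,
  # two-, or three-character rule-number segment.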
  # category headings like: 1. <name>
  defp tokenize(<<cat::utf8, ". ", rest_with_body::binary>>, tokens) when cat in 48..57 do
    category_tokenize(cat, rest_with_body, tokens)
  end

  # subcategory headings like: 100. <name>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ". ", rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 do
    subcategory_tokenize(cat, subcat, rest_with_body, tokens)
  end

  # rules like: 100.1. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(1), ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_1(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # rules like: 100.10. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(2), ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_2(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # rules like: 100.100. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(3), ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_3(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # rules like: 100.1a <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(1), subrule::utf8, " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_1(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # rules like: 100.10a <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(2), subrule::utf8, " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_2(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # rules like: 100.100a <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(3), subrule::utf8, " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_3(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # The clauses below work around formatting errors in the published rules
  # text: rule numbers missing the trailing period, missing the space after
  # the period, or carrying a stray period after the subrule letter.

  # bugged rules like: 100.1 <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(1), " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_1(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.10 <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(2), " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_2(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.100 <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(3), " ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_3(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.1.<body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(1), ".",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_1(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.10.<body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(2), ".",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_2(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.100.<body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(3), ".",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and is_rule_3(rule) do
    rule_tokenize(cat, subcat, rule, rest_with_body, tokens)
  end

  # bugged rules like: 100.1a. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(1), subrule::utf8, ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_1(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # bugged rules like: 100.10a. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(2), subrule::utf8, ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_2(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # bugged rules like: 100.100a. <body>
  defp tokenize(
         <<cat::utf8, subcat::binary-size(2), ".", rule::binary-size(3), subrule::utf8, ". ",
           rest_with_body::binary>>,
         tokens
       )
       when cat in 48..57 and subrule in 97..122 and is_rule_3(rule) do
    subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens)
  end

  # strip leading newlines
  defp tokenize(<<"\n", rest::binary>>, tokens) do
    tokenize(rest, tokens)
  end

  # end tokenization
  defp tokenize(_string, tokens) do
    # TODO: when glossary parsing is added, _string should be ""
    tokens
  end

  defp category_tokenize(cat, rest_with_body, tokens) do
    [body, rest] = String.split(rest_with_body, "\n", parts: 2)

    rule = %Rule{
      category: <<cat>>,
      type: :category
    }

    tokenize(rest, [{:rule, {:category, rule, body, []}} | tokens])
  end

  defp subcategory_tokenize(cat, subcat, rest_with_body, tokens) do
    [body, rest_with_examples] = String.split(rest_with_body, "\n", parts: 2)
    {examples, rest} = take_examples(rest_with_examples)

    rule = %Rule{
      category: <<cat>>,
      subcategory: subcat,
      type: :subcategory
    }

    tokenize(rest, [
      {:rule, {:subcategory, rule, body, examples}}
      | tokens
    ])
  end

  defp rule_tokenize(cat, subcat, rule, rest_with_body, tokens) do
    [body_part, rest_with_continuation] = String.split(rest_with_body, "\n", parts: 2)
    {body, rest_with_examples} = continue(body_part, rest_with_continuation)
    {examples, rest} = take_examples(rest_with_examples)

    rule = %Rule{
      category: <<cat>>,
      subcategory: subcat,
      rule: rule,
      type: :rule
    }

    tokenize(rest, [
      {:rule, {:rule, rule, body, examples}}
      | tokens
    ])
  end

  defp subrule_tokenize(cat, subcat, rule, subrule, rest_with_body, tokens) do
    [body_part, rest_with_continuation] = String.split(rest_with_body, "\n", parts: 2)
    {body, rest_with_examples} = continue(body_part, rest_with_continuation)
    {examples, rest} = take_examples(rest_with_examples)

    rule = %Rule{
      category: <<cat>>,
      subcategory: subcat,
      rule: rule,
      subrule: <<subrule>>,
      type: :subrule
    }

    tokenize(rest, [
      {:rule, {:subrule, rule, body, examples}}
      | tokens
    ])
  end

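  # Splits `string` at the first occurrence of `marker`, returning the text
  # before the marker and the remainder with the marker reattached, e.g.
  # take_until("intro text\n\nContents ...", "\n\nContents") returns
  # ["intro text", "\n\nContents ..."].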
  defp take_until(string, marker) do
    [taken, rest_no_marker] = String.split(string, marker, parts: 2)
    rest = marker <> rest_no_marker
    [taken, rest]
  end

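  # Collects consecutive "Example: " lines that follow a rule, skipping blank
  # lines in between. Examples are prepended to the accumulator, so the
  # returned list is in reverse document order.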
  defp take_examples(bin), do: take_examples([], bin)

  defp take_examples(ex, <<"\n", bin::binary>>), do: take_examples(ex, bin)
  defp take_examples(ex, <<" \n", bin::binary>>), do: take_examples(ex, bin)

  defp take_examples(ex, <<"Example: ", rest::binary>>) do
    [example, rest] = take_until(rest, "\n")
    take_examples([example | ex], rest)
  end

  defp take_examples(ex, bin), do: {ex, bin}

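  # Re-attaches wrapped lines to the rule body: lines indented with five
  # spaces are treated as continuations of the preceding rule and joined with
  # newlines.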
  defp continue(rule, next) do
    case take_continuation(next) do
      {"", rest} -> {rule, rest}
      {cont, rest} -> {"#{rule}\n#{cont}", rest}
    end
  end

  defp take_continuation(bin), do: take_continuation([], bin)

  defp take_continuation(cont, <<"     ", bin::binary>>) do
    [continuation, rest] = take_until(bin, "\n")
    take_continuation([continuation | cont], rest)
  end

  defp take_continuation(cont, bin) do
    continuation = reverse_join(cont, "\n")

    {continuation, bin}
  end

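  # Joins the reverse-accumulated continuation lines back into document order.
  # Because the accumulator starts as "", the result carries a trailing `join`
  # whenever the list is non-empty.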
  defp reverse_join(list, join) do
    reverse_join(list, join, "")
  end

  defp reverse_join([], _join, acc), do: acc
  defp reverse_join([this | rest], join, acc), do: reverse_join(rest, join, this <> join <> acc)
end