# lib/ref_parser.ex

defmodule BibleEx.RefParser do
  @moduledoc """
  Parses general strings for Bible references.

  This module scans arbitrary text for Bible references in a variety of
  formats (full book names, abbreviations, ranges, etc.) and returns
  `%BibleEx.Reference{}` structs.
  """

  alias BibleEx.BibleData
  alias BibleEx.Reference

  @doc ~S"""
  Parses a binary for all Bible references and returns a list of `%BibleEx.Reference{}`.

  References are returned in the order they appear in the text. Text without
  any recognizable reference yields `[]`.

  The parser supports:

    * Full book names with chapter and verse.
    * Common abbreviations and variants (e.g. `Matt`, `Jn`, `1 Tim`).
    * Same-chapter verse ranges (e.g. `John 3:16-18`).
    * Cross-chapter ranges (e.g. `John 3:16-4:3`).
    * Chapter ranges (e.g. `John 3-4`).
    * Chapter-to-verse ranges (e.g. `Genesis 1-2:3`, read as `1:1-2:3`).
    * Dot separators with optional spaces (e.g. `James 1 . 2 -  2 . 4`).
    * Bare book names (e.g. `Isaiah`, `1 Timothy`) which are normalized by `BibleEx.Reference`.

  A dotted start followed by a bare number (e.g. `James 1.2 - 2`) keeps only
  the start verse; the trailing number is not used.

  ## Examples

      iex> alias BibleEx.RefParser
      iex> [ref] = RefParser.parse_references("John 3:16")
      iex> ref.book
      "John"
      iex> ref.start_chapter
      3
      iex> ref.start_verse
      16

      iex> refs = RefParser.parse_references("Matt 2:4 and James 5:1-5")
      iex> Enum.map(refs, & &1.book)
      ["Matthew", "James"]
      iex> Enum.map(refs, & &1.reference_type) |> Enum.sort()
      [:verse, :verse_range]

      iex> [ref] = RefParser.parse_references("James 1 . 2 -  2 . 4")
      iex> ref.book
      "James"
      iex> {ref.start_chapter, ref.start_verse, ref.end_chapter, ref.end_verse}
      {1, 2, 2, 4}

      iex> [ref] = RefParser.parse_references("is. 1 Timothy 6, 1.")
      iex> ref.book
      "1 Timothy"
      iex> {ref.start_chapter, ref.start_verse}
      {6, 1}

  """
  @spec parse_references(String.t()) :: [struct()]
  def parse_references(string) when is_binary(string) do
    # Fetch the lookup tables once instead of once per regex match.
    lookup_maps = [
      BibleData.books(),
      BibleData.osis_books(),
      BibleData.shortened_books(),
      BibleData.variants()
    ]

    # Each scan result is [full_match, book | numeric_captures]. Trailing
    # captures that did not participate are omitted; a skipped capture in the
    # middle shows up as "".
    full_regex()
    |> Regex.scan(string)
    |> Enum.filter(fn [_full, book | _] -> known_book?(book, lookup_maps) end)
    |> Enum.flat_map(&to_references/1)
  end

  # True when the trimmed, downcased book token is a key of any lookup map.
  defp known_book?(token, lookup_maps) do
    key = token |> String.trim() |> String.downcase()
    Enum.any?(lookup_maps, &Map.has_key?(&1, key))
  end

  # Converts one raw regex match into a list holding zero or one reference.
  # Returning a list lets unexpected shapes be dropped instead of leaking
  # `nil` into the result.

  # Book only, e.g. ["Matthew ", "Matthew"]. Only `:book` is passed so that
  # Reference.new/1 applies its own normalization and defaults.
  defp to_references([_full, book]) do
    [Reference.new(book: String.trim(book))]
  end

  # Book + chapter, e.g. ["Judges 19", "Judges", "19"].
  defp to_references([_full, book, chapter]) do
    [new_reference(book, chapter, nil, nil, nil)]
  end

  # Book + chapter + verse, e.g. ["Jn 3:16", "Jn", "3", "16"].
  defp to_references([_full, book, chapter, verse]) do
    [new_reference(book, chapter, verse, nil, nil)]
  end

  # Chapter range, e.g. ["James 1 - 2", "James", "1", "", "2"]. The start
  # verse capture is empty, so the number after the dash is an end chapter.
  defp to_references([_full, book, chapter, "", end_chapter]) do
    [new_reference(book, chapter, nil, end_chapter, nil)]
  end

  # Start verse followed by a single number after the dash.
  defp to_references([full, book, chapter, verse, trailing]) do
    if String.contains?(full, ":") do
      # "John 4:5-10" -> same-chapter verse range.
      [new_reference(book, chapter, verse, nil, trailing)]
    else
      # "James 1.2 - 2" -> dotted form keeps only the start verse.
      # NOTE(review): unlike the ":" form, the trailing number is dropped here;
      # this mirrors the long-standing behavior - confirm it is still intended.
      [new_reference(book, chapter, verse, nil, nil)]
    end
  end

  # Chapter-to-verse range, e.g. ["Genesis 1-2:3", "Genesis", "1", "", "2", "3"].
  # No start verse was written, so the range begins at verse 1 of the start
  # chapter. (Converting the empty capture used to raise ArgumentError.)
  defp to_references([_full, book, chapter, "", end_chapter, end_verse]) do
    [new_reference(book, chapter, 1, end_chapter, end_verse)]
  end

  # Cross-chapter range, e.g. ["John 3:16-4:3", "John", "3", "16", "4", "3"].
  defp to_references([_full, book, chapter, verse, end_chapter, end_verse]) do
    [new_reference(book, chapter, verse, end_chapter, end_verse)]
  end

  # Any other shape is ignored.
  defp to_references(_other), do: []

  # Builds a reference from a raw book token and chapter/verse values, each of
  # which may be a digit string, an integer, or nil.
  defp new_reference(book, start_chapter, start_verse, end_chapter, end_verse) do
    Reference.new(
      book: String.trim(book),
      start_chapter: to_int(start_chapter),
      start_verse: to_int(start_verse),
      end_chapter: to_int(end_chapter),
      end_verse: to_int(end_verse)
    )
  end

  defp to_int(nil), do: nil
  defp to_int(value) when is_integer(value), do: value
  defp to_int(digits) when is_binary(digits), do: String.to_integer(digits)

  # Builds the reference-matching regex from every key of the BibleData maps.
  defp full_regex do
    # One alternation of all known book tokens:
    #   * downcased and deduplicated (matching itself is case-insensitive),
    #   * longest first so multi-word / longer tokens win over their prefixes,
    #   * regex-escaped; Regex.escape/1 also escapes spaces and '#', which is
    #     required because the pattern is compiled in extended mode.
    book_alternation =
      [
        BibleData.books(),
        BibleData.osis_books(),
        BibleData.shortened_books(),
        BibleData.variants()
      ]
      |> Enum.flat_map(&Map.keys/1)
      |> Enum.map(&String.downcase/1)
      |> Enum.uniq()
      |> Enum.sort_by(&String.length/1, :desc)
      |> Enum.map_join("|", &Regex.escape/1)

    # Digits are matched with [0-9]: under the `u` flag the digit shorthand
    # also matches non-ASCII decimal digits, which String.to_integer/1 rejects.
    pattern = """
    (?<![A-Za-z0-9])              # left boundary: no letter/digit directly before the book
    (#{book_alternation})         # 1: book token (any known key)
    (?![A-Za-z])                  # right boundary: no letter directly after (avoid 'Jos' in 'Joseph')
    \\s*                          # optional whitespace after the book

    (?:
      ([0-9]+)                    # 2: start chapter
      (?:\\s*[.:]\\s*([0-9]+))?   # 3: start verse after ':' or '.'

      (?:
        \\s*[—-]\\s*              # range separator: hyphen or em dash
        ([0-9]+)                  # 4: end chapter, or end verse when nothing follows
        (?:\\s*[.:]\\s*([0-9]+))? # 5: end verse after ':' or '.'
      )?
    )?                            # the numeric part is optional (book-only matches)
    """

    # x - extended mode (whitespace/comments allowed), i - case-insensitive,
    # u - Unicode mode.
    Regex.compile!(pattern, "xiu")
  end
end