lib/haystack/tokenizer.ex

Select File:
lib/haystack/tokenizer.ex

defmodule Haystack.Tokenizer do
  @moduledoc """
  A module for tokenizing values.

  This module provides utilities for tokenizing values. The default tokenizer
  removes anything but alphanumeric characters and extracts the positions of
  words using a start offset and length.

  There's also a `:full` tokenizer that can be used when the full value should
  be treated as a single token. For example, a serial code.
  """

  alias Haystack.Tokenizer.Token

  @default ~r/([[:alnum:]]+)/
  @full ~r/(.+)/

  # Public

  @doc """
  Return the seperator

  ## Examples

      iex> Tokenizer.separator(:default)
      ~r/([[:alnum:]]+)/

      iex> Tokenizer.separator(:full)
      ~r/(.+)/

  """
  @spec separator(atom) :: Regex.t()
  def separator(:default), do: @default
  def separator(:full), do: @full

  @doc """
  Tokenize a value.

  ## Examples

      iex> tokens = Tokenizer.tokenize("Needle in a Haystack")
      iex> Enum.map(tokens, & &1.v)
      ~w{needle in a haystack}

  """
  @spec tokenize(term) :: list(Token.t())
  def tokenize(v), do: tokenize(v, separator(:default))

  @doc """
  Tokenize a value with a given separator.

  ## Examples

      iex> tokens = Tokenizer.tokenize("Needle in a Haystack", Tokenizer.separator(:default))
      iex> Enum.map(tokens, & &1.v)
      ~w{needle in a haystack}

  """
  @spec tokenize(term, Regex.t()) :: list(Token.t())
  def tokenize(v, r) when is_integer(v),
    do: v |> to_string() |> tokenize(r)

  def tokenize(v, r) when is_binary(v) do
    v = String.downcase(v)

    [words(v, r), positions(v, r)]
    |> Enum.zip()
    |> Enum.map(fn {v, {o, l}} -> Token.new(v, offset: o, length: l) end)
  end

  # Private

  # Extract the tokenized words from the value.
  defp words(v, r),
    do: Regex.scan(r, v, capture: :first) |> List.flatten()

  # Extract the positions from the value.
  defp positions(v, r),
    do: Regex.scan(r, v, capture: :first, return: :index) |> List.flatten()
end