# lib/bubble_match/token.ex

defmodule BubbleMatch.Token do
  @moduledoc """
  A token is a single word or a part of the sentence. A sentence is a sequence of tokens.

  Each token contains information and metadata that is used to match
  sentences on, and to extract information from.
  """

  @typedoc """
  Tokens contain the following fields:

  * `raw` - the raw text value of the token, including any surrounding
    whitespace.

  * `value` - the normalized value of the token. In the case of word
    tokens, this is usually the normalized, lowercased version of the
    word. In the case of entities, this value holds a map with keys
    `kind`, `provider` and `value`. Tokens built by `from_spacy/1` hold a
    map with whichever of the string keys `"lemma"`, `"pos"`, `"norm"` and
    `"tag"` were present in the input (plus `"emoji"` for emoji tokens).

  * `start` - the start index; where in the original sentence the
    token starts.

  * `end` - the end index; where in the original sentence the
    token ends.

  * `index` - the (zero-based) token index number; 0 if it's the first
    token, 1 if it's the second, etc.

  * `type` - the type of the token; an atom, holding either `:entity`,
    `:spacy`, `:naive`, depending on the way the token was
    originally created. `punct?/1` additionally recognises tokens of
    type `:punct`.

  """
  @type t :: %__MODULE__{}

  # NOTE(review): presumably `use BubbleLib.DslStruct` defines the token
  # struct from the fields below, each defaulting to nil — confirm in
  # BubbleLib.DslStruct.
  use BubbleLib.DslStruct,
    raw: nil,
    value: nil,
    start: nil,
    end: nil,
    type: nil,
    index: nil

  alias BubbleMatch.{Entity, Unidekode}
  # `M` is shorthand for this module; it is used below as `%M{}`.
  alias __MODULE__, as: M

  # Anchored pattern over a single `[:Emoji:]` set, compiled when the module
  # is compiled; `from_spacy/1` uses it to tag emoji tokens.
  # NOTE(review): the Unicode `Emoji` property also covers the ASCII digits,
  # `#` and `*` — confirm that such single-character tokens are meant to
  # match here.
  @emoji Unicode.Regex.compile!("^[[:Emoji:]]$")

  @doc """
  Builds a `:spacy` token from a single token map in Spacy's JSON format.

  The token's `value` is a map holding whichever of the `"lemma"`, `"pos"`,
  `"norm"` and `"tag"` entries are present in the input, each passed through
  `Unidekode.to_ascii/1`. When the token text matches the module's emoji
  pattern, `"pos"` is overwritten with `"EMOJI"` and the text itself is
  stored under `"emoji"`.

  `raw`, `index`, `start` and `end` are taken from the input's `"text"`
  (or `"string"`), `"id"`, `"start"` and `"end"` entries.
  """
  @spec from_spacy(spacy_json_token :: map()) :: t()
  def from_spacy(t) do
    attrs =
      t
      |> Map.take(~w(lemma pos norm tag))
      |> Map.new(fn {key, val} -> {key, Unidekode.to_ascii(val)} end)

    # Spacy versions before 3.0 use the "string" key instead of "text".
    text = t["text"] || t["string"]

    value =
      case Regex.match?(@emoji, text) do
        true -> Map.merge(attrs, %{"pos" => "EMOJI", "emoji" => text})
        false -> attrs
      end

    %M{type: :spacy, value: value, raw: text, index: t["id"], start: t["start"], end: t["end"]}
  end

  @doc """
  Tells whether a token counts as punctuation.

  That is the case for tokens of type `:punct`, and for tokens for which
  `pos?/2` holds for one of `"PUNCT"`, `"SYM"` or `"EMOJI"`.
  """
  @spec punct?(term()) :: boolean()
  def punct?(%M{type: :punct}), do: true
  def punct?(token), do: Enum.any?(~w(PUNCT SYM EMOJI), &pos?(token, &1))

  @doc """
  Tells whether a token matches the given POS (part-of-speech) tag.

  Only `:spacy` tokens can match: the tag is compared against the `"pos"`
  and the `"tag"` entries of the token's value. Anything else yields `false`.
  """
  @spec pos?(term(), term()) :: boolean()
  def pos?(%M{type: :spacy, value: value}, tag) do
    case value do
      %{"pos" => ^tag} -> true
      %{"tag" => ^tag} -> true
      _other -> false
    end
  end

  def pos?(_token, _tag), do: false

  @doc """
  Tells whether a token matches the given (optionally normalized) word.

  `:spacy` tokens are compared on the `"norm"` and `"lemma"` entries of
  their value; all other tokens on their `value` and `raw` fields.
  """
  @spec word?(t(), term()) :: boolean()
  def word?(%M{type: :spacy, value: value}, word) do
    value["norm"] == word or value["lemma"] == word
  end

  def word?(%M{value: value, raw: raw}, word) do
    value == word or raw == word
  end

  @doc """
  Tells whether a token is an entity of the given kind.

  Tokens whose type is not `:entity` never match; for entity tokens the
  `kind` field of the token's value is compared with `kind`.
  """
  @spec entity?(t(), term()) :: boolean()
  def entity?(%M{type: :entity, value: value}, kind), do: value.kind == kind
  def entity?(%M{}, _kind), do: false

  @doc """
  Builds an `:entity` token from a Spacy entity definition.

  The entity's `"start"` and `"end"` offsets are used to cut the raw text
  out of `sentence_text`. That text is passed to `Entity.new/4` as its last
  two arguments, together with the `"spacy"` provider and the underscored
  `"label"` of the entity.
  """
  @spec from_spacy_entity(map(), String.t()) :: t()
  def from_spacy_entity(spacy_entity_json, sentence_text) do
    start = spacy_entity_json["start"]
    end_ = spacy_entity_json["end"]

    # NOTE(review): String.slice/3 counts graphemes — confirm the offsets
    # delivered by the Spacy side use the same unit.
    raw = String.slice(sentence_text, start, end_ - start)
    kind = Inflex.underscore(spacy_entity_json["label"])

    %M{
      type: :entity,
      raw: raw,
      start: start,
      end: end_,
      value: Entity.new("spacy", kind, raw, raw)
    }
  end

  @doc """
  Builds an `:entity` token from a Duckling entity definition.

  The entity value is the nested `"value"` entry of the definition's
  `"value"` map; the remaining entries of that map are handed to
  `Entity.new/5` as extra data. The `"body"` entry becomes the raw text and
  the underscored `"dim"` entry the entity kind.
  """
  @spec from_duckling_entity(map()) :: t()
  def from_duckling_entity(duckling_entity) do
    start = duckling_entity["start"]
    end_ = duckling_entity["end"]

    payload = duckling_entity["value"]
    value = payload["value"]
    raw = duckling_entity["body"]
    extra = Map.delete(payload, "value")

    kind = Inflex.underscore(duckling_entity["dim"])

    %M{
      type: :entity,
      raw: raw,
      start: start,
      end: end_,
      value: Entity.new("duckling", kind, value, raw, extra)
    }
  end

  @doc """
  Returns the base form of the given string: trimmed, downcased, passed
  through `Unidekode.drop_accented/1`, and with typographic apostrophes
  replaced by a plain `'`.
  """
  @spec base_form(String.t()) :: String.t()
  def base_form(str) do
    normalized =
      str
      |> String.trim()
      |> String.downcase()
      |> Unidekode.drop_accented()

    # Both typographic apostrophe variants map to the ASCII apostrophe.
    String.replace(normalized, ["’", "ʼ"], "'")
  end
end

# Lets tokens be passed to `to_string/1` and used in string interpolation:
# a token renders as its raw text.
defimpl String.Chars, for: BubbleMatch.Token do
  # `raw` defaults to nil in the token struct. Returning the field as-is would
  # hand back nil, while `String.Chars.to_string/1` has to return a binary
  # (interpolating such a token would raise). Delegating to the protocol keeps
  # binaries untouched and renders a nil `raw` as "".
  def to_string(%BubbleMatch.Token{raw: raw}), do: String.Chars.to_string(raw)
end

# `require` makes the macros of BubbleLib.DslStruct available at the top level.
# NOTE(review): `jason_derive/1` presumably derives a Jason encoder
# implementation for the token struct — confirm in BubbleLib.DslStruct.
require BubbleLib.DslStruct
BubbleLib.DslStruct.jason_derive(BubbleMatch.Token)