lib/bubble_match/sentence.ex

defmodule BubbleMatch.Sentence do
  @moduledoc """
  A data structure which holds a tokenized sentence.

  The struct contains the text of the sentence (in the *text*
  property), and a list of *tokenizations*. Normally, a sentence has
  just one tokenization, but adding entities to the sentence might
  cause several tokens in the sentence to be replaced with an entity
  token, thus creating the need for multiple tokenizations (as you
  still might want to match on the original sentence, e.g. in the case
  of a falsely identified entity).

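  For example (an illustrative sketch; the token values below are
  placeholders, not actual `BubbleMatch.Token` structs), a sentence in
  which "next friday" was recognized as a date entity could hold both
  the entity-based and the original, word-level tokenization:

      %BubbleMatch.Sentence{
        text: "see you next friday",
        tokenizations: [
          # entity tokenization: "next friday" replaced by one entity token
          [see_token, you_token, date_entity_token],
          # original word-level tokenization
          [see_token, you_token, next_token, friday_token]
        ]
      }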
  """

  use BubbleLib.DslStruct,
    text: nil,
    tokenizations: []

  alias BubbleMatch.Sentence.Tokenizer
  alias BubbleMatch.Token

  @type t :: %__MODULE__{text: String.t() | nil, tokenizations: [[Token.t()]]}

  alias __MODULE__, as: M

  @doc """
  Tokenize an input string into a sentence of individual tokens.

  As the name suggests, this tokenization is quite naive: it only
  splits strings on whitespace and punctuation, disregarding any
  language-specific information. However, for basic use cases and for
  our test suite, it is good enough.
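
  A minimal usage sketch:

      sentence = BubbleMatch.Sentence.naive_tokenize("Hello world")
      sentence.text
      #=> "Hello world"
      # sentence.tokenizations now holds one or two lists of
      # BubbleMatch.Token structs: a punctuation-stripped variant and,
      # if it differs, the original one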
  """
  @spec naive_tokenize(input :: String.t()) :: t()
  def naive_tokenize(input)

  def naive_tokenize("") do
    %M{text: "", tokenizations: [[]]}
  end

  def naive_tokenize(input) when is_binary(input) do
    tokens = Tokenizer.tokenize(input)
    %M{text: input, tokenizations: both_if_different(no_punct(tokens), tokens)}
  end

  @doc """
  Convert a JSON blob from Spacy NLP data into a sentence.

  This function takes the output of Spacy's [Doc.to_json][spacy]
  function and converts it into a sentence.

  Note that the Spacy tokenizer may detect multiple sentences in the
  input. In many cases this split is suboptimal for our use case of
  chat messages, and therefore we always construct a single sentence
  spanning all detected sentences.

  [spacy]: https://spacy.io/api/doc#to_json
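
  The relevant parts of the Spacy JSON look roughly like this
  (heavily abbreviated; see the Spacy documentation for the full
  `Doc.to_json` format):

      %{
        "text" => "see you tomorrow",
        "sents" => [%{"start" => 0, "end" => 16}],
        "tokens" => [
          %{"id" => 0, "start" => 0, "end" => 3, "pos" => "VERB"},
          %{"id" => 1, "start" => 4, "end" => 7, "pos" => "PRON"},
          %{"id" => 2, "start" => 8, "end" => 16, "pos" => "NOUN"}
        ],
        "ents" => [%{"start" => 8, "end" => 16, "label" => "DATE"}]
      }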
  """
  @spec from_spacy(spacy_json :: map()) :: t()
  def from_spacy(%{"sents" => []} = s) do
    %M{text: s["text"]}
  end

  def from_spacy(spacy_json) do
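    # Spacy may have split the input into several sentences; take the
    # character span from the first to the last detected sentence and
    # treat it as a single sentence.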
    sents = spacy_json["sents"]
    start = sents |> Enum.map(& &1["start"]) |> Enum.min()
    end_ = sents |> Enum.map(& &1["end"]) |> Enum.max()
    text = String.slice(spacy_json["text"], start, end_ - start)

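    # Convert each Spacy token, then stretch it: each token's `end` is
    # extended to just before the next token's start and its raw text is
    # padded with trailing spaces, so the raw values stay aligned with the
    # original character offsets.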
    tokens =
      spacy_json["tokens"]
      |> Enum.map(&Token.from_spacy/1)
      |> reindex()
      |> Enum.chunk_every(2, 1, [nil])
      |> Enum.map(fn
        [tok, nil] -> tok
        [tok, next] -> %{tok | end: next.start - 1}
      end)
      |> Enum.map(fn tok ->
        %{tok | raw: String.pad_trailing(tok.raw, tok.end - tok.start + 1)}
      end)

    entities = spacy_json["ents"]

    %M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)}
    |> add_spacy_entities(entities, spacy_json)
  end

  defp add_spacy_entities(%M{} = m, [], _), do: m

  defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do
    sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)])
    add_tokenization(m, sequences)
  end

  @doc """
  Enrich the given sentence with entities extracted via Duckling.

  This function takes a list of entities in the [Duckling JSON
  format][duckling] and enriches the given sentence with the entities
  that Duckling found.

  [duckling]: https://github.com/facebook/duckling
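
  The entities argument is the decoded list of entities as returned by
  Duckling, roughly of this shape (abbreviated):

      [
        %{
          "body" => "tomorrow",
          "dim" => "time",
          "start" => 8,
          "end" => 16,
          "value" => %{"type" => "value", "grain" => "day", "value" => "2020-01-02T00:00:00.000-02:00"}
        }
      ]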
  """
  @spec add_duckling_entities(sentence :: t(), entities :: list()) :: t()
  def add_duckling_entities(%M{} = sentence, []), do: sentence

  def add_duckling_entities(%M{} = sentence, entities) do
    sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)])
    add_tokenization(sentence, sequences)
  end

  @doc false
  def add_tokenization(%M{} = m, replace_token_sequences) do
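    # The last tokenization is the original one (including punctuation);
    # new tokenizations are prepended below, so it stays last. Use it as
    # the base on which the replacement sequences are applied.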
    raw_tokens = List.last(m.tokenizations)

    tokenization =
      replace_token_sequences
      |> Enum.reduce(raw_tokens, fn seq, toks ->
        replace_tokens(toks, seq)
      end)

    tokenizations = both_if_different(no_punct(tokenization), tokenization)
    %M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)}
  end

  defp replace_tokens(token_sequence, replace_tokens) do
    # determine the character span covered by the replacement token sequence
    start = List.first(replace_tokens).start
    end_ = List.last(replace_tokens).end

    start_idx = Enum.find_index(token_sequence, &(&1.start == start))
    end_idx = Enum.find_index(token_sequence, &(&1.end == end_))

    cond do
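      # both boundaries match existing tokens: splice the replacement
      # sequence over the covered span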
      start_idx != nil and end_idx != nil and end_idx >= start_idx ->
        {a, _} = Enum.split(token_sequence, start_idx)
        {_, b} = Enum.split(token_sequence, end_idx + 1)

        (a ++ replace_tokens ++ b)
        |> reindex()

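      # only the start boundary matches: replace from there to the end of
      # the token sequence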
      start_idx != nil and end_idx == nil ->
        {a, _} = Enum.split(token_sequence, start_idx)

        (a ++ replace_tokens)
        |> reindex()

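      # only the end boundary matches: replace everything up to and
      # including the matching token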
      start_idx == nil and end_idx != nil ->
        {_, b} = Enum.split(token_sequence, end_idx + 1)

        (replace_tokens ++ b)
        |> reindex()

      true ->
        # neither boundary matched an existing token; keep the original
        # tokenization instead of raising
        # raise RuntimeError, "Token not found at start = #{start}, end = #{end_}"
        token_sequence
    end
  end

  defp reindex(tokens) do
    tokens
    |> Enum.with_index()
    |> Enum.map(fn {t, index} ->
      %{t | index: index}
    end)
  end

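  # Returns [a | rest] when both token lists are equal, otherwise
  # [a, b | rest]; used to keep the punctuation-stripped and original
  # tokenizations side by side only when they actually differ.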
  defp both_if_different(a, b, rest \\ [])
  defp both_if_different(a, a, rest), do: [a | rest]
  defp both_if_different(a, b, rest), do: [a, b | rest]

  defp no_punct(tokens) do
    tokens |> Enum.reject(&Token.punct?/1)
  end
end

defimpl String.Chars, for: BubbleMatch.Sentence do
  def to_string(%BubbleMatch.Sentence{text: text}), do: text
end

require BubbleLib.DslStruct
BubbleLib.DslStruct.jason_derive(BubbleMatch.Sentence)