lib/akin.ex

defmodule Akin do
  @moduledoc """
  Akin
  =======

  Functions for comparing two strings for similarity using a collection of string comparison algorithms for Elixir. Algorithms can be called independently or in total to return a map of metrics.

  ## Options

  Options accepted in a keyword list (i.e. [ngram_size: 3]).

  1. `algorithms`: algorithms to use in comparison. Accepts the name or a keyword list. Default is algorithms/0.
      1. `metric` - algorithm metric. Default is both
        - "string": uses string algorithms
        - "phonetic": uses phonetic algorithms
      1. `unit` - algorithm unit. Default is both.
        - "whole": uses algorithms best suited for whole string comparison (distance)
        - "partial": uses algorithms best suited for partial string comparison (substring)
  1. `level` - level for double phonetic matching. Default is "normal".
      - "strict": both encodings for each string must match
      - "strong": the primary encoding for each string must match
      - "normal": the primary encoding of one string must match either encoding of other string (default)
      - "weak":   either primary or secondary encoding of one string must match one encoding of other string
  1. `match_at`: an algorithm score equal to or above this value is considered a match. Default is 0.9
  1. `ngram_size`: number of contiguous letters to split strings into. Default is 2.
  1. `short_length`: qualifies as "short" to receive a shortness boost. Used by Name Metric. Default is 8.
  1. `stem`: boolean representing whether to compare the stemmed version of the strings; uses Stemmer. Default `false`
  """
  import Akin.Util,
    only: [list_algorithms: 1, modulize: 1, compose: 1, opts: 2, r: 1, default_opts: 0]

  alias Akin.Corpus
  alias Akin.Names

  @doc """
  Compares two strings and returns a map of algorithm metrics.

  Accepts either two binaries or two `Akin.Corpus` structs. Binaries are converted
  with `compose/1` before comparison. When the `:stem` option is truthy, the stems
  of each string (joined with a single space) are compared instead of the strings
  themselves.

  The result holds one entry per algorithm returned by `list_algorithms/1` for the
  given options. Each key is the algorithm name with any "." removed, converted to
  an atom; each value is the algorithm's score passed through `r/1`. Algorithms
  whose comparison returns `nil` are left out of the result.

  Options are accepted as a keyword list. If no options are given, default values
  will be used.
  """
  @spec compare(binary() | %Corpus{}, binary() | %Corpus{}, keyword()) :: %{
          optional(atom()) => any()
        }
  def compare(left, right, opts \\ default_opts())

  def compare(left, right, opts) when is_binary(left) and is_binary(right) do
    if opts(opts, :stem) do
      compare(stemmed_corpus(left), stemmed_corpus(right), opts)
    else
      compare(compose(left), compose(right), opts)
    end
  end

  def compare(%Corpus{} = left, %Corpus{} = right, opts) do
    opts
    |> list_algorithms()
    |> Enum.map(fn algorithm ->
      module = modulize(algorithm)
      {algorithm, module.compare(left, right, opts)}
    end)
    # An algorithm that cannot score the pair returns nil; those are omitted.
    |> Enum.reject(fn {_algorithm, score} -> is_nil(score) end)
    # NOTE(review): atoms are created from algorithm names here. That is only safe
    # while list_algorithms/1 yields a bounded set of names - confirm.
    |> Map.new(fn {algorithm, score} ->
      {algorithm |> String.replace(".", "") |> String.to_atom(), r(score)}
    end)
  end

  # Builds the corpus for the stemmed form of `string`: the stems of the composed
  # string are joined with spaces and composed again.
  defp stemmed_corpus(string) do
    compose(string).stems
    |> Enum.join(" ")
    |> compose()
  end

  @doc """
  Compare a string against a list of strings using name-specific logic.

  `left` may be a binary or an `Akin.Corpus`. When `left` is a binary, `left` and
  every element of `rights` are converted with `compose/1` first; when `left` is
  already a corpus, `rights` is used as given and each element is handed to
  `Akin.Names.compare/3` directly.

  A candidate is a match when at least one of its scores is equal to or higher
  than the `match_at` option. Returns the `original` value of every matching
  candidate. Matches are accumulated by prepending, so they appear in the reverse
  of their order in `rights`. An empty `rights` list returns `[]`.
  """
  @spec match_names(binary() | %Corpus{}, [binary() | %Corpus{}], keyword()) :: list()
  def match_names(left, rights, opts \\ default_opts())

  def match_names(_, [], _), do: []

  def match_names(left, rights, opts) when is_binary(left) and is_list(rights) do
    corpora = Enum.map(rights, &compose/1)
    match_names(compose(left), corpora, opts)
  end

  def match_names(%Corpus{} = left, rights, opts) do
    # Looked up once; the threshold does not change between candidates.
    threshold = opts(opts, :match_at)

    Enum.reduce(rights, [], fn right, matches ->
      %{scores: scores} = Names.compare(left, right, opts)

      # `>=` so that a score exactly at `match_at` counts, as documented.
      if Enum.any?(scores, fn {_algo, score} -> score >= threshold end) do
        [right.original | matches]
      else
        matches
      end
    end)
  end

  @doc """
  Compare a string against each string in a list using name-specific logic.

  Every element of `rights` is scored against `left` with `match_name_metrics/3`.
  The result contains one map per candidate - matches and non-matches alike - with
  the keys `:left`, `:right`, `:metrics` and `:match`, where `:match` is the `1`/`0`
  flag decided by `match_name_metrics/3`.

  Results are accumulated by prepending, so they appear in the reverse of their
  order in `rights`. An empty `rights` list returns `[]`.
  """
  @spec match_names_metrics(binary(), list(), keyword()) :: [map()]
  def match_names_metrics(left, rights, opts \\ default_opts())

  def match_names_metrics(left, rights, opts) when is_binary(left) and is_list(rights) do
    Enum.reduce(rights, [], fn right, results ->
      # The map returned by match_name_metrics/3 already carries the match flag,
      # so it is collected as-is.
      [match_name_metrics(left, right, opts) | results]
    end)
  end

  @doc """
  Compare a string to a string with logic specific to names.

  Both strings are converted with `compose/1` and scored by `Akin.Names.compare/3`.
  The pair is a match when at least one score is equal to or higher than the
  `match_at` option.

  Returns a map with:

    * `:left` - the `list` of the composed left string, joined with spaces
    * `:right` - the `list` of the composed right string, joined with spaces
    * `:metrics` - the scores returned by `Akin.Names.compare/3`
    * `:match` - `1` when the pair is a match, `0` otherwise
  """
  @spec match_name_metrics(binary(), binary(), Keyword.t()) :: %{
          :left => binary(),
          :match => 0 | 1,
          :metrics => [any()],
          :right => binary()
        }
  def match_name_metrics(left, right, opts \\ default_opts())

  def match_name_metrics(left, right, opts) when is_binary(left) and is_binary(right) do
    left_corpus = compose(left)
    right_corpus = compose(right)

    %{scores: scores} = Names.compare(left_corpus, right_corpus, opts)
    threshold = opts(opts, :match_at)

    # `>=` so that a score exactly at `match_at` counts, as documented.
    match = if Enum.any?(scores, fn {_algo, score} -> score >= threshold end), do: 1, else: 0

    %{
      left: Enum.join(left_corpus.list, " "),
      right: Enum.join(right_corpus.list, " "),
      metrics: scores,
      match: match
    }
  end

  @doc """
  Returns the list of unique phonetic encodings produced by the single and
  double metaphone algorithms.

  Accepts either a binary, which is first converted with `compose/1`, or an
  `Akin.Corpus`, whose `string` field is encoded. The single metaphone result and
  the elements of the double metaphone tuple are combined, flattened and
  de-duplicated.
  """
  @spec phonemes(binary() | %Corpus{}) :: list()
  def phonemes(string) when is_binary(string) do
    string
    |> compose()
    |> phonemes()
  end

  def phonemes(%Corpus{string: string}) do
    single = Akin.Metaphone.Single.compute(string)
    double = string |> Akin.Metaphone.Double.parse() |> Tuple.to_list()

    [single | double]
    |> List.flatten()
    |> Enum.uniq()
  end
end