lib/paasaa.ex

defmodule Paasaa do
  @moduledoc """
  Provides language detection functions.

  ## Examples

      iex> Paasaa.detect "Detect this!"
      "eng"

  """

  @scripts Paasaa.Data.scripts()

  @languages Paasaa.Data.languages()

  # Distance penalty added for every input trigram that a language model lacks;
  # also used by `normalize/2` to derive the upper bound of the score scale.
  @max_difference 300

  @type result :: [{language :: String.t(), score :: number}]

  @type options :: [
          min_length: integer,
          max_length: integer,
          whitelist: [String.t()],
          blacklist: [String.t()]
        ]

  @typep trigram_counts :: [{trigram :: String.t(), count :: pos_integer}]

  @default_options [
    min_length: 10,
    max_length: 2048,
    whitelist: [],
    blacklist: []
  ]

  # Result returned whenever the language cannot be determined.
  @und [{"und", 1}]

  @doc """
  Detects a language. Returns a string with an ISO 639-3 language code (e.g. "eng").

  This is the first (best scoring) entry of `all/2`.

  ## Parameters

    - `str` - a text string
    - `options` - a keyword list with options:
      - `:min_length` - If the text is shorter than `:min_length` it will return `und`. Default: `10`.
      - `:max_length` - Maximum length to analyze. Default: `2048`.
      - `:whitelist` - Allow languages. Default: `[]`.
      - `:blacklist` - Disallow languages. Default: `[]`.

  ## Examples

  Detect a string:

      iex> Paasaa.detect "Detect this!"
      "eng"

  With the `:blacklist` option:

      iex> Paasaa.detect "Detect this!", blacklist: ["eng"]
      "sco"

  With the `:min_length` option:

      iex> Paasaa.detect "Привет", min_length: 6
      "rus"

  It returns `und` for undetermined language:

      iex> Paasaa.detect "1234567890"
      "und"
  """

  @spec detect(str :: String.t() | nil, options) :: language :: String.t()
  def detect(str, options \\ @default_options) do
    # `all/2` never returns an empty list: every branch yields either the
    # undetermined result or at least one scored language.
    [{language, _score} | _rest] = all(str, options)
    language
  end

  @doc """
  Detects a language. Returns a list of languages scored by probability.

  An empty string, `nil`, or a string shorter than `:min_length` yields
  `[{"und", 1}]`.

  ## Parameters

    - `str` - a text string
    - `options` - a keyword list with options, see `detect/2` for details.

  ## Examples

  Detect language and limit results to 5:

      iex> Paasaa.all("Detect this!") |> Enum.take(5)
      [
        {"eng", 1.0},
        {"sco", 0.8230731943771207},
        {"nob", 0.6030053320407174},
        {"nno", 0.5525933107125545},
        {"swe", 0.508482792050412}
      ]
  """

  @spec all(str :: String.t() | nil, options) :: result
  def all(str, options \\ @default_options)
  def all("", _options), do: @und
  def all(nil, _options), do: @und

  def all(str, options) do
    # Caller options override the defaults key by key.
    options = Keyword.merge(@default_options, options)

    if String.length(str) < options[:min_length] do
      @und
    else
      process(str, options)
    end
  end

  # Truncates the text to `:max_length`, detects its script and then either
  # scores the languages known for that script or reports the script itself.
  @spec process(str :: String.t(), options) :: result
  defp process(str, options) do
    str = String.slice(str, 0, options[:max_length])

    {script, count} = detect_script(str)

    cond do
      # No script matched a single character (also the case for an empty slice).
      count == 0 ->
        @und

      Map.has_key?(@languages, script) ->
        str
        |> get_clean_trigrams()
        |> get_distances(@languages[script], options)
        |> normalize(str)

      # The script has no trigram models: its name is the answer, provided the
      # whitelist/blacklist allow it.
      true ->
        if allowed?(script, options) do
          [{script, 1}]
        else
          @und
        end
    end
  end

  # A language is allowed when it is not blacklisted and the whitelist is
  # either empty or contains it.
  @spec allowed?(String.t(), options) :: boolean
  defp allowed?(lang, options) do
    white = options[:whitelist]
    black = options[:blacklist]
    (Enum.empty?(white) || Enum.member?(white, lang)) && !Enum.member?(black, lang)
  end

  @doc """
  Detects a script.

  Returns a `{script, ratio}` tuple for the script with the highest ratio of
  matches to the length of `str`. For an empty string the ratio is `0.0`.

  ## Parameters

    - `str` - a text string

  ## Examples

      iex> Paasaa.detect_script("Detect this!")
      {"Latin", 0.8333333333333334}
  """

  @spec detect_script(str :: String.t()) :: {String.t(), number}
  def detect_script(str) do
    len = String.length(str)

    @scripts
    |> Enum.map(fn {name, re} -> {name, get_occurrence(str, re, len)} end)
    |> Enum.max_by(fn {_, count} -> count end)
  end

  @spec get_occurrence(str :: String.t(), re :: Regex.t(), str_len :: non_neg_integer) :: float
  # An empty string cannot match anything; short-circuit so the ratio below
  # never divides by zero.
  defp get_occurrence(_str, _re, 0), do: 0.0

  defp get_occurrence(str, re, str_len) do
    length(Regex.scan(re, str)) / str_len
  end

  # Computes the distance of the input trigrams to every allowed language model
  # and orders the result ascending, so the closest language comes first.
  @spec get_distances(trigram_counts, Enumerable.t(), options) :: result
  defp get_distances(trigrams, languages, options) do
    languages
    |> filter_languages(options)
    |> Enum.map(fn {lang, model} -> {lang, get_distance(trigrams, model)} end)
    |> Enum.sort(fn {_, left}, {_, right} -> left < right end)
  end

  # Sums, over all input trigrams, the absolute difference between the trigram
  # count and the model's value for it (offset by one), or `@max_difference`
  # when the model does not contain the trigram.
  @spec get_distance(trigram_counts, map) :: number
  defp get_distance(trigrams, model) do
    Enum.reduce(trigrams, 0, fn {trigram, count}, distance ->
      case Map.fetch(model, trigram) do
        {:ok, model_val} -> distance + abs(count - model_val - 1)
        :error -> distance + @max_difference
      end
    end)
  end

  # Drops the languages rejected by the whitelist/blacklist. Skips the
  # traversal entirely when neither list is set.
  @spec filter_languages(Enumerable.t(), options) :: Enumerable.t()
  defp filter_languages(languages, options) do
    white = options[:whitelist]
    black = options[:blacklist]

    if Enum.empty?(white) && Enum.empty?(black) do
      languages
    else
      Enum.filter(languages, fn {lang, _} ->
        allowed?(lang, options)
      end)
    end
  end

  # Converts sorted distances into scores relative to the best match: the
  # closest language gets 1.0 and larger distances get smaller scores.
  # An empty list (everything filtered out) is reported as undetermined.
  @spec normalize(result, String.t()) :: result
  defp normalize([], _str), do: @und

  defp normalize(distances, str) do
    # The list is sorted ascending, so the head carries the smallest distance.
    min = distances |> List.first() |> elem(1)
    max = String.length(str) * @max_difference - min

    Enum.map(distances, fn {lang, dist} ->
      # Guard against dividing by zero when the scale collapses.
      dist = if max == 0, do: 0, else: 1 - (dist - min) / max

      {lang, dist}
    end)
  end

  # trigram stuff

  # Cleans and pads the text, splits it into trigrams and counts how often each
  # trigram occurs.
  @spec get_clean_trigrams(String.t()) :: trigram_counts
  defp get_clean_trigrams(str) do
    str
    |> clean()
    |> pad()
    |> n_grams()
    |> Enum.reduce(%{}, fn trigram, counts ->
      Map.update(counts, trigram, 1, &(&1 + 1))
    end)
    |> Map.to_list()
  end

  # Replaces runs of characters in U+0021..U+0040 (ASCII punctuation and
  # digits) with a space, collapses whitespace, trims and downcases.
  @spec clean(str :: String.t()) :: String.t()
  defp clean(str) do
    expression_symbols = ~r/[\x{0021}-\x{0040}]+/u

    # The `u` modifier makes the whitespace class Unicode-aware, so e.g.
    # non-breaking spaces are collapsed too, consistent with `String.trim/1`.
    whitespace = ~r/\s+/u

    str
    |> String.replace(expression_symbols, " ")
    |> String.replace(whitespace, " ")
    |> String.trim()
    |> String.downcase()
  end

  # Surrounds the text with single spaces so that the first and last characters
  # also take part in boundary trigrams.
  @spec pad(String.t()) :: String.t()
  defp pad(str), do: " #{str} "

  # Splits the text into overlapping grapheme windows of size `n`; text shorter
  # than `n` graphemes produces no windows.
  @spec n_grams(str :: String.t(), n :: pos_integer) :: [String.t()]
  defp n_grams(str, n \\ 3) do
    str
    |> String.graphemes()
    |> Enum.chunk_every(n, 1, :discard)
    |> Enum.map(&Enum.join/1)
  end
end