lib/probability/freqdist.ex

defmodule Needlepoint.Probability.FreqDist do
  @moduledoc """
  A frequency distribution for the outcomes of an experiment.

  This basically just adds some functions for use with `Enum.frequencies`

  Based on the NLTK class which is a subclass of the Counter
  in Python's collections class, sometimes called a bag
  or multiset.

  ## Examples

      iex> alias Needlepoint.Probability.FreqDist

      iex> FreqDist.new()
      %Needlepoint.Probability.FreqDist{samples: %{}}

      iex> FreqDist.new("abracadabra")
      %Needlepoint.Probability.FreqDist{
          samples: %{"a" => 5, "b" => 2, "c" => 1, "d" => 1, "r" => 2}
      }

      iex> FreqDist.new("ABCABC") |> FreqDist.elements() |> Enum.sort() |> Enum.join()
      "AABBCC"

      iex> FreqDist.new("abracadabra") |> FreqDist.most_common()
      [{"a", 5}, {"r", 2}, {"b", 2}, {"d", 1}, {"c", 1}]

      iex> FreqDist.new("abracadabra") |> FreqDist.most_common(3)
      [{"a", 5}, {"r", 2}, {"b", 2}]

      iex> FreqDist.new("abracadabra") |> FreqDist.update("simsalabim") |> FreqDist.most_common()
      [
        {"a", 7},
        {"b", 3},
        {"s", 2},
        {"r", 2},
        {"m", 2},
        {"i", 2},
        {"l", 1},
        {"d", 1},
        {"c", 1}
      ]

      iex> FreqDist.new("abracadabra") |> FreqDist.subtract("aaaaa")
      %Needlepoint.Probability.FreqDist{
          samples: %{"a" => 0, "b" => 2, "c" => 1, "d" => 1, "r" => 2}
      }

  """
  alias __MODULE__

  # samples are stored like key => count
  defstruct samples: %{}
  @type t :: %__MODULE__{samples: map()}

  @doc "Make a new empty `FreqDict`"
  def new(), do: %FreqDist{}

  @doc """
  Make a new `FreqDict` from a string, list, map or another `FreqDist`

  ## Examples

      iex> FreqDist.new("gallahad")
      %Needlepoint.Probability.FreqDist{
          samples: %{"a" => 3, "d" => 1, "g" => 1, "h" => 1, "l" => 2}
      }

      iex> FreqDist.new(%{"a" => 4, "b" => 2})
      %Needlepoint.Probability.FreqDist{samples: %{"a" => 4, "b" => 2}}

      iex> FreqDist.new(["a","a","a","a","b","b"])
      %Needlepoint.Probability.FreqDist{samples: %{"a" => 4, "b" => 2}}

  """
  def new(samples) when is_binary(samples) do
    samples = samples |> String.graphemes |> Enum.frequencies

    %FreqDist{samples: samples}
  end
  def new(%FreqDist{} = fd), do: %FreqDist{samples: fd.samples}
  def new(samples) when is_map(samples), do: %FreqDist{samples: samples}
  def new(samples), do: %FreqDist{samples: samples |> Enum.frequencies}

  @doc """
  List all counts from the most common to the least.

  ## Examples

      iex> alias Needlepoint.Probability.FreqDist
      Needlepoint.Probability.FreqDist
      iex> FreqDist.new("aabbbcccddddd") |> FreqDist.most_common()
      [{"d", 5}, {"c", 3}, {"b", 3}, {"a", 2}]
  """
  def most_common(%FreqDist{} = fd), do: Enum.sort(fd.samples, &(elem(&1,1) > elem(&2,1)))


  @doc """
  List n counts from the most common to the least.

  ## Examples

      iex> alias Needlepoint.Probability.FreqDist
      Needlepoint.Probability.FreqDist
      iex> FreqDist.new("aabbbcccddddd") |> FreqDist.most_common(1)
      [{"d", 5}]
  """
  def most_common(%FreqDist{} = fd, n), do: most_common(fd) |> Enum.take(n)

  @doc """
  Iterate over elements repeating each as many times as its count.

  ## Examples

      iex> FreqDist.new("ABCABC") |> FreqDist.elements() |> Enum.sort()
      ["A", "A", "B", "B", "C", "C"]

      # Knuth's example for prime factors of 1836:  2**2 * 3**3 * 17**1
      iex> FreqDist.new(%{2 => 2, 3 => 3, 17 => 1}) |> FreqDist.elements() |> Enum.reduce(1, fn x, acc -> x * acc end)
      1836


  Note, if an element's count has been set to zero or is a negative number, `elements()` will ignore it.
  """
  def elements(%FreqDist{} = fd) do
    Enum.map(fd.samples, fn {key,count} -> List.duplicate(key, count) end) |> List.flatten
  end

  @doc """
  Update the FreqDict `fd` with the new set of samples.

  ## Examples

      iex> alias Needlepoint.Probability.FreqDist
      Needlepoint.Probability.FreqDist
      iex> FreqDist.new("aaa") |> FreqDist.update("bbb")
      %Needlepoint.Probability.FreqDist{samples: %{"a" => 3, "b" => 3}}

  """
  def update(%FreqDist{} = fd, samples) do
    fd2 = FreqDist.new(samples)
    Map.merge(fd.samples, fd2.samples, fn _k, v1, v2 -> v1 + v2 end)
    |> FreqDist.new()
  end

  @doc """
  Update the FreqDict `fd` by subtracting values in the samples.

  ## Examples

      iex> FreqDist.new("aaabbb") |> FreqDist.subtract(FreqDist.new("aba"))
      %Needlepoint.Probability.FreqDist{samples: %{"a" => 1, "b" => 2}}
  """
  def subtract(%FreqDist{} = fd, samples) do
    fd2 = FreqDist.new(samples)

    Map.merge(fd.samples, fd2.samples, fn _k, v1, v2 -> v1 - v2 end)
    |> FreqDist.new()
  end

  @doc """
  union is the maximum of value in either of the input counters.

  ## Examples

      iex> FreqDist.new("abbb") |> FreqDist.union(FreqDist.new("bcc"))
      %Needlepoint.Probability.FreqDist{samples: %{"a" => 1, "b" => 3, "c" => 2}}
  """
  def union(%FreqDist{} = fd, samples) do
    fd2 = FreqDist.new(samples)

    Map.merge(fd.samples, fd2.samples, fn _k, v1, v2 -> Enum.max([v1, v2]) end)
    |> FreqDist.new()
  end

  @doc """
  intersection is the minimum of corresponding counts.

  Values that only appear in one count are dropped.

  ## Examples

      iex> FreqDist.new("abbb") |> FreqDist.intersection(FreqDist.new("bcc"))
      %Needlepoint.Probability.FreqDist{samples: %{"b" => 1}}
  """
  def intersection(%FreqDist{} = fd, samples) do
    fd2 = FreqDist.new(samples)

    common =
      MapSet.new(Map.keys(fd.samples))
      |> MapSet.intersection(MapSet.new(Map.keys(fd2.samples)))
      |> MapSet.to_list()

    Map.merge(
      Map.take(fd.samples, common),
      Map.take(fd2.samples, common),
      fn _k, v1, v2 -> min(v1,v2) end
    )
    |> FreqDist.new()
  end

  @doc """
  Return the total number of sample outcomes that have been recorded.

  ## Examples

      iex> FreqDist.n(FreqDist.new("aabbccdd"))
      8
  """
  def n(%FreqDist{} = fd) do
    Enum.sum(Map.values(fd.samples))
  end

  @doc """
  Return the total number of sample values ("bins") that have counts greater than zero.
  Called `B` in nltk.

  ## Examples

      iex> FreqDist.bins(FreqDist.new(%{"a" => 0, "b" => 1}))
      1
  """
  def bins(%FreqDist{} = fd) do
    length(Enum.filter(Map.values(fd.samples), &(&1 > 0)))
  end


  @doc """
  Return a list of all samples that occur once (hapax legomena)

  ## Examples

      iex> FreqDist.hapaxes(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2}))
      ["b"]
  """
  def hapaxes(%FreqDist{} = fd) do
    fd.samples
      |> Enum.filter(fn {_,v} -> v == 1 end)
      |> Map.new
      |> Map.keys
  end

  @doc """
  Return the dictionary mapping r to Nr, the number of samples with frequency r, where Nr > 0.

  ## Examples

      iex> FreqDist.r_nr(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2, "d" => 2}))
      %{0 => 1, 1 => 1, 2 => 2}
  """
  def r_nr(%FreqDist{} = fd) do
    fd.samples
    |> Map.values()
    |> Enum.reduce(Map.new, fn x,acc -> Map.update(acc, x, 1, fn existing -> existing + 1 end) end)
  end

  @doc """
  Return the frequency of a given sample.

  The frequency of a sample is defined as the count of that sample divided by the
  total number of sample outcomes that have been recorded by
  this FreqDist.  The count of a sample is defined as the number of times that
  sample outcome was recorded by this `FreqDist`.

  Frequencies are always real numbers in the range [0, 1]

  ## Examples

      iex> FreqDist.freq(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2, "d" => 2}), "z")
      0.0

      iex> FreqDist.freq(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2, "d" => 2}), "a")
      0.0

      iex> FreqDist.freq(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2, "d" => 2}), "b")
      0.2
  """
  def freq(%FreqDist{} = fd, sample) do
    n = FreqDist.n(fd)
    cond do
      n == 0 -> 0
      n -> Map.get(fd.samples, sample, 0) / n
    end
  end

  @doc """
  Return the sample with the greatest number of outcomes.

  If two or more samples have the same number of outcomes,
  return one of them; which sample is returned is undefined.

  If no outcomes have occurred in this frequency distribution, return nil.

  ## Examples

      iex> FreqDist.max(FreqDist.new())
      nil

      iex> FreqDist.max(FreqDist.new(%{"a" => 0, "b" => 1, "c" => 2}))
      "c"
  """
  def max(%FreqDist{} = fd) do
    try do
      {sample, _count} = Enum.max_by(fd.samples, fn {_x,y} -> y end)
      sample
    rescue
      Enum.EmptyError -> nil
    end
  end
end