# lib/scidata/imdb_reviews.ex

defmodule Scidata.IMDBReviews do
  @moduledoc """
  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
  """

  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
  @dataset_file "aclImdb_v1.tar.gz"

  alias Scidata.Utils

  @type train_sentiment :: :pos | :neg | :unsup
  @type test_sentiment :: :pos | :neg

  @doc """
  Downloads the IMDB reviews training dataset or fetches it locally.

  Returns a map with two index-aligned lists: `:review` holds the raw review
  texts and `:sentiment` holds the matching labels (`1` for positive, `0` for
  negative, `nil` for unlabeled).

  ## Options

    * `:example_types` - Which examples in the dataset should be returned
      according to each example's label: `:pos` for positive examples, `:neg`
      for negative examples, and `:unsup` for unlabeled examples.

      Defaults to `[:pos, :neg]`

    * `:base_url` - Dataset base URL.

      Defaults to `"http://ai.stanford.edu/~amaas/data/sentiment/"`

    * `:dataset_file` - Dataset filename.

      Defaults to `"aclImdb_v1.tar.gz"`

    * `:cache_dir` - Cache directory.

      Defaults to `System.tmp_dir!()`

  """
  @spec download(
          example_types: [train_sentiment],
          base_url: String.t(),
          dataset_file: String.t(),
          cache_dir: String.t()
        ) :: %{
          review: [binary()],
          sentiment: [1 | 0 | nil]
        }
  def download(opts \\ []), do: download_dataset(:train, opts)

  @doc """
  Downloads the IMDB reviews test dataset or fetches it locally.

  `:example_types` is the same as in `download/1`, but `:unsup` is
  unavailable because all unlabeled examples are in the training set.

  Accepts the same options as `download/1`.
  """
  @spec download_test(
          example_types: [test_sentiment],
          base_url: String.t(),
          dataset_file: String.t(),
          cache_dir: String.t()
        ) :: %{
          review: [binary()],
          sentiment: [1 | 0]
        }
  def download_test(opts \\ []), do: download_dataset(:test, opts)

  # Fetches the archive contents through `Utils.get!/2` and keeps only the
  # entries located under "<dataset_type>/<example_type>/".
  defp download_dataset(dataset_type, opts) do
    example_types = opts[:example_types] || [:pos, :neg]
    base_url = opts[:base_url] || @base_url
    dataset_file = opts[:dataset_file] || @dataset_file

    files = Utils.get!(base_url <> dataset_file, opts).body

    # The capture group is the sentiment directory of the entry; it is used
    # both to filter entries and to derive the label.
    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"

    {reviews, sentiments} =
      for {fname, contents} <- files,
          # `Regex.run/2` returns `nil` for non-matching paths, which fails
          # the pattern below and drops the entry.
          [_match, sentiment_dir] <- [Regex.run(regex, List.to_string(fname))],
          reduce: {[], []} do
        {reviews, sentiments} ->
          # Prepending keeps both lists index-aligned (in reverse entry order).
          {[contents | reviews], [get_label(sentiment_dir) | sentiments]}
      end

    %{review: reviews, sentiment: sentiments}
  end

  # Maps the captured sentiment directory to its numeric label. Matching on
  # the directory name alone (instead of searching the whole path for "pos" or
  # "neg") prevents unrelated path components from deciding the label.
  defp get_label("pos"), do: 1
  defp get_label("neg"), do: 0
  defp get_label("unsup"), do: nil
end