# lib/scidata/yelp_full_reviews.ex

defmodule Scidata.YelpFullReviews do
  @moduledoc """
  Module for downloading the [Yelp Reviews dataset](https://www.yelp.com/dataset).

  Both `download/1` and `download_test/1` return a map with two parallel
  lists: `:review` holds the review texts and `:rating` holds the matching
  integer ratings.
  """

  @base_url "https://s3.amazonaws.com/fast-ai-nlp/"

  @dataset_file "yelp_review_full_csv.tgz"

  alias Scidata.Utils
  alias NimbleCSV.RFC4180, as: CSV

  @doc """
  Downloads the Yelp Reviews training dataset or fetches it locally.

  Returns a map with the keys `:review` (list of review texts) and `:rating`
  (list of integer ratings, in the same order as the reviews).

  Raises `ArgumentError` if the downloaded archive contains no training
  records.

  ## Options

    * `:base_url` - Dataset base URL.

      Defaults to `"https://s3.amazonaws.com/fast-ai-nlp/"`

    * `:dataset_file` - Dataset filename.

      Defaults to `"yelp_review_full_csv.tgz"`

    * `:cache_dir` - Cache directory.

      Defaults to `System.tmp_dir!()`

  """
  @spec download(Keyword.t()) :: %{review: [binary(), ...], rating: [5 | 4 | 3 | 2 | 1]}
  def download(opts \\ []), do: download_dataset(:train, opts)

  @doc """
  Downloads the Yelp Reviews test dataset or fetches it locally.

  Returns a map of the same shape as `download/1` and raises `ArgumentError`
  if the downloaded archive contains no test records.

  Accepts the same options as `download/1`.
  """
  @spec download_test(Keyword.t()) :: %{
          review: [binary(), ...],
          rating: [5 | 4 | 3 | 2 | 1]
        }
  def download_test(opts \\ []), do: download_dataset(:test, opts)

  # Fetches the archive through `Utils.get!/2`, keeps the entries whose name
  # contains the dataset type ("train" or "test"), parses them as header-less
  # CSV and splits every row into its rating (first column, converted to an
  # integer) and its review (last column).
  defp download_dataset(dataset_type, opts) do
    base_url = opts[:base_url] || @base_url
    dataset_file = opts[:dataset_file] || @dataset_file

    # NOTE(review): the response body is assumed to be an enumerable of
    # `{charlist_name, contents}` archive entries - confirm against `Utils.get!/2`.
    files = Utils.get!(base_url <> dataset_file, opts).body
    name_fragment = Atom.to_string(dataset_type)

    # Rows from every matching entry are concatenated, so no matching file is
    # silently dropped when the archive holds more than one.
    records =
      for {fname, contents} <- files,
          String.contains?(List.to_string(fname), name_fragment),
          row <- CSV.parse_string(contents, skip_headers: false),
          do: row

    # Fail with an explicit message instead of an obscure error from parsing
    # the rating of a non-existent row.
    if records == [] do
      raise ArgumentError,
            "no #{dataset_type} records found in dataset file #{inspect(dataset_file)}"
    end

    # Single pass over the rows; ratings and reviews stay index-aligned.
    {ratings, reviews} =
      records
      |> Enum.map(fn [rating | _] = row -> {String.to_integer(rating), List.last(row)} end)
      |> Enum.unzip()

    %{review: reviews, rating: ratings}
  end
end