lib/arangox_ecto/analyzer.ex

Select File:
lib/arangox_ecto/analyzer.ex

defmodule ArangoXEcto.Analyzer do
  @moduledoc """
  Defines an analyzer for use in views

  Since analyzer definitions are short and you may have many of them, you can just define multiple
  analyzer modules in one file, e.g. named analyzers.ex.

  ## Example

      defmodule MyApp.Analyzers do
        use ArangoXEcto.Analyzer

        norm :norm_en, [:frequency, :norm, :position], %{
          locale: "en",
          accent: false,
          case: :lower
        }

        # this exists by default but is just used as an example
        text :text_en, [:frequency, :norm, :position], %{
          locale: "en",
          accent: false,
          stemming: true,
          case: :lower
        }

        # Needed to compile the analyzers
        build()
      end

  ## Features

  The following are the features available to all the analyzers. Some analyzers and functions need certain features enabled,
  refer to the [ArangoDB docs](https://www.arangodb.com/docs/stable/analyzers.html#analyzer-features) for more info.

      * `:frequency` - (boolean) - track how often a term occurs.
      * `:norm` - (boolean) - calculate and store the field normalization factor that is used to score fairer if the same term is repeated, reducing its importance.
      * `:position` - (boolean) - enumerate the tokens for position-dependent queries.
  """
  @moduledoc since: "1.3.0"

  @doc false
  defmacro __using__(_opts) do
    quote do
      # Every analyzer definition declared in the using module is collected in
      # the accumulating :analyzers attribute.
      Module.register_attribute(__MODULE__, :analyzers, accumulate: true)

      # Bring the analyzer macros defined by this module into scope.
      import ArangoXEcto.Analyzer

      # :pipeline starts out as nil; the analyzer macros check it to decide
      # where a new definition is stored.
      Module.put_attribute(__MODULE__, :pipeline, nil)
    end
  end

  @doc """
  Compiles analyzers
  """
  defmacro build() do
    quote do
      # Exposes whatever has been accumulated in @analyzers at the point where
      # this macro is expanded, so it has to come after the analyzer definitions.
      def __analyzers__ do
        @analyzers
      end
    end
  end

  @doc """
  Defines an identity type analyzer.

  An Analyzer applying the identity transformation, i.e. returning the input unmodified.

  Refer to the [ArangoDB Identity Docs](https://www.arangodb.com/docs/stable/analyzers.html#identity) for more info.

  This does not accept any properties.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
  """
  defmacro identity(name, features) do
    quote do
      # Validate the arguments first; identity analyzers take no properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      definition = %{
        name: analyzer_name,
        type: :identity,
        features: feature_list
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a delimiter type analyzer.

  An Analyzer capable of breaking up delimited text into tokens as per RFC 4180 (without starting new records on newlines).

  Refer to the [ArangoDB Delimiter Docs](https://www.arangodb.com/docs/stable/analyzers.html#delimiter) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `delimiter` - (string) - the delimiting character(s). The whole string is considered as one delimiter.
  """
  defmacro delimiter(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Only the :delimiter key is allowed for this analyzer type.
      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          [:delimiter]
        )

      definition = %{
        name: analyzer_name,
        type: :delimiter,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a stem type analyzer.

  An Analyzer capable of stemming the text, treated as a single token, for supported languages.

  Refer to the [ArangoDB Stem Docs](https://www.arangodb.com/docs/stable/analyzers.html#stem) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `locale` - (string) - a locale in the format language[_COUNTRY] (square brackets denote optional parts), e.g. "de" or "en_US".
  """
  defmacro stem(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Only the :locale key is allowed for this analyzer type.
      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          [:locale]
        )

      definition = %{
        name: analyzer_name,
        type: :stem,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a norm type analyzer.

  An Analyzer capable of normalizing the text, treated as a single token, i.e. case conversion and accent removal.

  Refer to the [ArangoDB Norm Docs](https://www.arangodb.com/docs/stable/analyzers.html#norm) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `locale` - (string) - a locale in the format language[_COUNTRY] (square brackets denote optional parts), e.g. "de" or "en_US".
      * `accent` - (boolean) - whether to preserve accented characters or convert them to the base characters.
      * `case` - (atom) - option of how to store case
        * `:lower` - to convert to all lower-case characters
        * `:upper` - to convert to all upper-case characters
        * `:none` - to not change character case (default)
  """
  defmacro norm(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the norm analyzer.
      allowed_keys = [:locale, :accent, :case]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :norm,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a ngram type analyzer.

  An Analyzer capable of producing n-grams from a specified input in a range of min..max (inclusive). Can optionally preserve the original input.

  Refer to the [ArangoDB NGram Docs](https://www.arangodb.com/docs/stable/analyzers.html#ngram) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `min` - (integer) - minimum n-gram length.
      * `max` - (integer) - maximum n-gram length.
      * `preserveOriginal` - (boolean) - whether to include the original value or just use the min & max values.
      * `startMarker` - (string) - this value will be prepended to n-grams which include the beginning of the input.
      * `endMarker` - (string) - this value will be appended to n-grams which include the end of the input.
      * `streamType` - (atom) - type of the input stream.
        * `:binary` - one byte is considered as one character (default).
        * `:utf8` - one Unicode codepoint is treated as one character.
  """
  defmacro ngram(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the ngram analyzer.
      allowed_keys = [:min, :max, :preserveOriginal, :startMarker, :endMarker, :streamType]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :ngram,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a text type analyzer.

  An Analyzer capable of breaking up strings into individual words while also optionally filtering out stop-words,
  extracting word stems, applying case conversion and accent removal.

  Refer to the [ArangoDB Text Docs](https://www.arangodb.com/docs/stable/analyzers.html#text) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `locale` - (string) - a locale in the format language[_COUNTRY] (square brackets denote optional parts), e.g. "de" or "en_US".
      * `accent` - (boolean) - whether to preserve accented characters or convert them to the base characters.
      * `case` - (atom) - option of how to store case
        * `:lower` - to convert to all lower-case characters
        * `:upper` - to convert to all upper-case characters
        * `:none` - to not change character case (default)
      * `stemming` - (boolean) - whether to apply stemming on returned words or leave as-is
      * `edgeNgram` - (map) - if present, then edge n-grams are generated for each token (word). 
        * `min` - (integer) - minimum n-gram length.
        * `max` - (integer) - maximum n-gram length.
        * `preserveOriginal` - (boolean) - whether to include the original value or just use the min & max values.
      * `stopwords` - (list of strings) - a list of strings with words to omit from result.
      * `stopwordsPath` - (string) - path with a language sub-directory (e.g. en for a locale en_US) containing files with words to omit.
  """
  defmacro text(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the text analyzer.
      allowed_keys = [
        :locale,
        :accent,
        :case,
        :stemming,
        :edgeNgram,
        :stopwords,
        :stopwordsPath
      ]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :text,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a collation type analyzer.

  An Analyzer capable of converting the input into a set of language-specific tokens. This makes comparisons follow the
  rules of the respective language, most notable in range queries against Views.

  Refer to the [ArangoDB Collation Docs](https://www.arangodb.com/docs/stable/analyzers.html#collation) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `locale` - (string) - a locale in the format language[_COUNTRY] (square brackets denote optional parts), e.g. "de" or "en_US".
  """
  defmacro collation(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Only the :locale key is allowed for this analyzer type.
      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          [:locale]
        )

      definition = %{
        name: analyzer_name,
        type: :collation,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines an aql type analyzer.

  An Analyzer capable of running a restricted AQL query to perform data manipulation / filtering.

  Refer to the [ArangoDB AQL Docs](https://www.arangodb.com/docs/stable/analyzers.html#aql) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `queryString` - (string) - AQL query to be executed.
      * `collapsePositions` - (boolean) - whether to set the position to 0 for all members of the query result array (true) or
        set the position corresponding to the index of the result array member (false).
      * `keepNull` - (boolean) - either treat null like an empty string or discard null.
      * `batchSize` - (integer) - number between 1 and 1000 (default = 1) that determines the batch size for reading data from the query.
      * `memoryLimit` - (integer) - memory limit for query execution in bytes. (default is 1048576 = 1Mb) Maximum is 33554432U (32Mb).
      * `returnType` - (atom) - data type of the returned tokens.
        * `:string` - convert emitted tokens to strings.
        * `:number` - convert emitted tokens to numbers.
        * `:bool` - convert emitted tokens to booleans.
  """
  defmacro aql(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the aql analyzer.
      allowed_keys = [
        :queryString,
        :collapsePositions,
        :keepNull,
        :batchSize,
        :memoryLimit,
        :returnType
      ]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :aql,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a pipeline type analyzer.

  An Analyzer capable of chaining effects of multiple Analyzers into one. The pipeline is a list of Analyzers, where the output
  of an Analyzer is passed to the next for further processing. The final token value is determined by last Analyzer in the pipeline.

  Refer to the [ArangoDB Pipeline Docs](https://www.arangodb.com/docs/stable/analyzers.html#pipeline) for more info.

  > ### Note {: .info}
  >
  > Features are only required on the pipeline and not on the individual analyzers within. Any features on sub analyzers will be ignored if supplied.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `analyzers` - a block with other analyzers

  ## Example

      pipeline :my_pipeline, [:frequency, :norm, :position] do
        norm "norm_en",  %{
          locale: "en",
          accent: false,
          case: :lower
        }

        text "text_en", %{
          locale: "en",
          accent: false,
          stemming: true,
          case: :lower
        }
      end
  """
  defmacro pipeline(name, features, block) do
    quote do
      pipeline_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      pipeline_features = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Switch :pipeline from nil to a list so the analyzer macros expanded in
      # the block add themselves to it instead of to :analyzers.
      Module.put_attribute(__MODULE__, :pipeline, [])

      # Bindings made while evaluating the block stay inside the `try`.
      try do
        unquote(block)
      after
        :ok
      end

      # NOTE(review): the analyzer macros prepend to :pipeline, so this list is
      # in reverse declaration order - confirm the consumer accounts for that.
      collected = Module.get_attribute(__MODULE__, :pipeline)

      definition = %{
        name: pipeline_name,
        type: :pipeline,
        features: pipeline_features,
        properties: %{pipeline: collected}
      }

      Module.put_attribute(__MODULE__, :analyzers, definition)

      # Back to nil so that later analyzers are registered as standalone again.
      Module.put_attribute(__MODULE__, :pipeline, nil)
    end
  end

  @doc """
  Defines a stopwords type analyzer.

  An Analyzer capable of removing specified tokens from the input.

  Refer to the [ArangoDB Stopwords Docs](https://www.arangodb.com/docs/stable/analyzers.html#stopwords) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `stopwords` - (list of strings) - array of strings that describe the tokens to be discarded.
      * `hex` - (boolean) - If false (default), then each string in stopwords is used verbatim.
  """
  defmacro stopwords(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the stopwords analyzer.
      allowed_keys = [:stopwords, :hex]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :stopwords,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a segmentation type analyzer.

  An Analyzer capable of breaking up the input text into tokens in a language-agnostic manner as per Unicode Standard Annex #29,
  making it suitable for mixed language strings.

  Refer to the [ArangoDB Segmentation Docs](https://www.arangodb.com/docs/stable/analyzers.html#segmentation) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `break` - (atom) - character to break at
        * `:all` - return all tokens
        * `:alpha` - return tokens composed of alphanumeric characters only (default). Alphanumeric characters are Unicode codepoints from the
          Letter and Number categories, see Unicode Technical Note #36.
        * `:graphic` - return tokens composed of non-whitespace characters only. Note that the list of whitespace characters does not include line breaks:
          * `U+0009` Character Tabulation
          * `U+0020` Space
          * `U+0085` Next Line
          * `U+00A0` No-break Space
          * `U+1680` Ogham Space Mark
          * `U+2000` En Quad
          * `U+2028` Line Separator
          * `U+202F` Narrow No-break Space
          * `U+205F` Medium Mathematical Space
          * `U+3000` Ideographic Space
      * `case` - (atom) - option of how to store case
        * `:lower` - to convert to all lower-case characters
        * `:upper` - to convert to all upper-case characters
        * `:none` - to not change character case (default)
  """
  defmacro segmentation(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # NOTE(review): the macro documentation lists only `break` and `case` as
      # properties, with `:graphic` being a value of `break`; the :graphic key
      # is kept here as-is - confirm whether it is intentional.
      allowed_keys = [:break, :graphic, :case]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :segmentation,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a minhash type analyzer.

  An Analyzer that computes so called MinHash signatures using a locality-sensitive hash function. It applies an Analyzer of
  your choice before the hashing, for example, to break up text into words.

  Refer to the [ArangoDB MinHash Docs](https://www.arangodb.com/docs/stable/analyzers.html#minhash) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below
      * `analyzer` - block with one analyzer (if more than one is supplied, only the last will be used)

  ## Properties

      * `numHashes` - (number) - the size of the MinHash signature.
  """
  defmacro minhash(name, features, properties, block) do
    quote do
      name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      features = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Only :numHashes may be supplied by the caller; the nested analyzer is
      # taken from the block below.
      properties =
        ArangoXEcto.Analyzer.validate_properties(unquote(properties), name, [:numHashes])

      # Switch :pipeline from nil to a list so the analyzer macros expanded in
      # the block add themselves to it instead of to :analyzers.
      Module.put_attribute(__MODULE__, :pipeline, [])

      # Bindings made while evaluating the block stay inside the `try`.
      try do
        unquote(block)
      after
        :ok
      end

      pipeline = Module.get_attribute(__MODULE__, :pipeline)

      if Enum.empty?(pipeline) do
        raise ArgumentError,
              "no analyzer was provided for analyzer '#{name}', an analyzer in the do block is required"
      end

      # The analyzer macros prepend to :pipeline, so the head of the list is
      # the analyzer that was declared last in the block.
      inner_analyzer = List.first(pipeline)

      # Previously the validated properties were computed and then dropped, so
      # :numHashes never reached the stored definition. They are now stored
      # together with the nested analyzer under :properties. The top-level
      # :analyzer key is kept so existing readers of the definition still work.
      # Assumes validate_properties/3 returns the properties map - TODO confirm.
      Module.put_attribute(__MODULE__, :analyzers, %{
        name: name,
        type: :minhash,
        features: features,
        analyzer: inner_analyzer,
        properties: Map.put(properties, :analyzer, inner_analyzer)
      })

      # Back to nil so that later analyzers are registered as standalone again.
      Module.put_attribute(__MODULE__, :pipeline, nil)
    end
  end

  @doc """
  Defines a classification type analyzer.

  An Analyzer capable of classifying tokens in the input text.

  Refer to the [ArangoDB Classification Docs](https://www.arangodb.com/docs/stable/analyzers.html#classification) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `model_location` - (string) - the on-disk path to the trained fastText supervised model.
      * `top_k` - (number) - the number of class labels that will be produced per input (default: 1).
      * `threshold` - (number) - the probability threshold for which a label will be assigned to an input. A fastText
        model produces a probability per class label, and this is what will be filtered (default: 0.99).
  """
  defmacro classification(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the classification analyzer.
      allowed_keys = [:model_location, :top_k, :threshold]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :classification,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a nearest_neighbors type analyzer.

  An Analyzer capable of finding nearest neighbors of tokens in the input.

  Refer to the [ArangoDB Nearest Neighbors Docs](https://www.arangodb.com/docs/stable/analyzers.html#nearest_neighbors) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `model_location` - (string) - the on-disk path to the trained fastText supervised model.
      * `top_k` - (number) - the number of class labels that will be produced per input (default: 1).
  """
  defmacro nearest_neighbors(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the nearest_neighbors analyzer.
      allowed_keys = [:model_location, :top_k]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :nearest_neighbors,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a geojson type analyzer.

  An Analyzer capable of breaking up a GeoJSON object or coordinate array in [longitude, latitude] order into a set
  of indexable tokens for further usage with ArangoSearch Geo functions.

  Refer to the [ArangoDB GeoJSON Docs](https://www.arangodb.com/docs/stable/analyzers.html#geojson) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `type` - (atom) - type of geojson object
        * `:shape` (default) - index all GeoJSON geometry types (Point, Polygon etc.)
        * `:centroid` - compute and only index the centroid of the input geometry
        * `:point` - only index GeoJSON objects of type Point, ignore all other geometry types
      * `options` - (map) - options for fine-tuning geo queries. These options should generally remain unchanged 
        * `:maxCells` (number, optional) - maximum number of S2 cells (default: 20)
        * `:minLevel` (number, optional) - the least precise S2 level (default: 4)
        * `:maxLevel` (number, optional) - the most precise S2 level (default: 23)

  """
  defmacro geojson(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the geojson analyzer.
      allowed_keys = [:type, :options]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :geojson,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a geo_s2 type analyzer.

  An Analyzer capable of breaking up a GeoJSON object or coordinate array in [longitude, latitude] order into a set of indexable
  tokens for further usage with ArangoSearch Geo functions.

  Refer to the [ArangoDB Geo S2 Docs](https://www.arangodb.com/docs/stable/analyzers.html#geo_s2) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `format` - (atom) - the internal binary representation to use for storing the geo-spatial data in an index 
        * `:latLngDouble` (default) - store each latitude and longitude value as an 8-byte floating-point value
          (16 bytes per coordinate pair). This format preserves numeric values exactly and is more compact than the
          VelocyPack format used by the geojson Analyzer.
        * `:latLngInt` - store each latitude and longitude value as a 4-byte integer value (8 bytes per coordinate
          pair). This is the most compact format but the precision is limited to approximately 1 to 10 centimeters.
        * `:s2Point` - store each longitude-latitude pair in the native format of Google S2 which is used for
          geo-spatial calculations (24 bytes per coordinate pair). This is not a particularly compact format but it
          reduces the number of computations necessary when you execute geo-spatial queries. This format preserves
          numeric values exactly.
      * `type` - (atom) - type of geojson object
        * `:shape` (default) - index all GeoJSON geometry types (Point, Polygon etc.)
        * `:centroid` - compute and only index the centroid of the input geometry
        * `:point` - only index GeoJSON objects of type Point, ignore all other geometry types
      * `options` - (map) - options for fine-tuning geo queries. These options should generally remain unchanged 
        * `:maxCells` (number, optional) - maximum number of S2 cells (default: 20)
        * `:minLevel` (number, optional) - the least precise S2 level (default: 4)
        * `:maxLevel` (number, optional) - the most precise S2 level (default: 23)

  """
  defmacro geo_s2(name, features, properties \\ %{}) do
    quote do
      # Validate the arguments in order: name, features, then properties.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      feature_list = ArangoXEcto.Analyzer.validate_features(unquote(features))

      # Property keys accepted by the geo_s2 analyzer.
      allowed_keys = [:format, :type, :options]

      props =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          allowed_keys
        )

      definition = %{
        name: analyzer_name,
        type: :geo_s2,
        features: feature_list,
        properties: props
      }

      # A nil :pipeline attribute means this is a standalone analyzer; any
      # other value is the list of analyzers collected by an enclosing block.
      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          Module.put_attribute(__MODULE__, :analyzers, definition)

        collected ->
          Module.put_attribute(__MODULE__, :pipeline, [definition | collected])
      end
    end
  end

  @doc """
  Defines a geopoint type analyzer.

  An Analyzer that breaks up either a coordinate array in `[latitude, longitude]` order or a JSON object
  describing a coordinate pair through two separate attributes into a set of indexable tokens, for further
  usage with ArangoSearch Geo functions.

  Refer to the [ArangoDB Geo Point Docs](https://www.arangodb.com/docs/stable/analyzers.html#geopoint) for more info.

  ## Parameters

      * `name` - atom of the analyzer name
      * `features` - the features options to be set (see [Analyzer Features](https://hexdocs.pm/arangox_ecto/ArangoXEcto.Analyzer.html#module-features))
      * `properties` - a map of the properties to be set, see below

  ## Properties

      * `latitude` - (list of string) - the attribute path of the latitude value, given as a list of strings
        and relative to the field for which the Analyzer is defined in the View
      * `longitude` - (list of string) - the attribute path of the longitude value, given as a list of strings
        and relative to the field for which the Analyzer is defined in the View
      * `options` - (map) - options for fine-tuning geo queries. These options should generally remain unchanged.
        * `:maxCells` (number, optional) - maximum number of S2 cells (default: 20)
        * `:minLevel` (number, optional) - the least precise S2 level (default: 4)
        * `:maxLevel` (number, optional) - the most precise S2 level (default: 23)

  """
  defmacro geopoint(name, features, properties \\ %{}) do
    # Property keys accepted by a geopoint analyzer definition.
    allowed_keys = [:latitude, :longitude, :options]

    quote do
      # Validation happens while the calling module is being compiled; each
      # validator raises ArgumentError on bad input.
      analyzer_name = ArangoXEcto.Analyzer.validate_name(unquote(name))
      analyzer_features = ArangoXEcto.Analyzer.validate_features(unquote(features))

      analyzer_properties =
        ArangoXEcto.Analyzer.validate_properties(
          unquote(properties),
          analyzer_name,
          unquote(allowed_keys)
        )

      definition = %{
        name: analyzer_name,
        type: :geopoint,
        features: analyzer_features,
        properties: analyzer_properties
      }

      case Module.get_attribute(__MODULE__, :pipeline) do
        nil ->
          # No pipeline is being collected: store as a standalone analyzer.
          Module.put_attribute(__MODULE__, :analyzers, definition)

        steps ->
          # in pipeline: prepend this analyzer to the steps gathered so far
          Module.put_attribute(__MODULE__, :pipeline, [definition | steps])
      end
    end
  end

  # Returns `name` unchanged when it is an atom; any other term raises
  # ArgumentError. Public only so the analyzer macros can call it from the
  # code they expand into.
  @doc false
  @spec validate_name(atom()) :: atom()
  def validate_name(name) do
    if is_atom(name) do
      name
    else
      raise ArgumentError, "the name for analyzer must be an atom, got: #{inspect(name)}"
    end
  end

  # The only feature flags an analyzer definition may list.
  @valid_keys [:frequency, :norm, :position]

  # Returns `features` unchanged when it is a list made up solely of the flags
  # in @valid_keys (an empty list is fine); otherwise raises ArgumentError.
  @doc false
  @spec validate_features([atom()]) :: [atom()]
  def validate_features(features) do
    if is_list(features) and Enum.all?(features, &(&1 in @valid_keys)) do
      features
    else
      raise ArgumentError,
            "the features provided are invalid, only accepts keys [:frequency, :norm, :position], got: #{inspect(features)}"
    end
  end

  # Validates the `properties` supplied for the analyzer called `name`.
  #
  # Every `{key, value}` entry must use a key listed in `valid_keys` and carry
  # a value that `valid_key?/2` accepts for that key. The membership test runs
  # first, so `valid_key?/2` is never asked about a key the analyzer does not
  # allow. An empty collection is valid.
  #
  # Returns `properties` unchanged on success and raises ArgumentError naming
  # the analyzer, the accepted keys and the offending properties otherwise.
  # Public only so the analyzer macros can call it from the code they expand
  # into.
  @doc false
  @spec validate_properties(map(), atom(), [atom()]) :: map()
  def validate_properties(properties, name, valid_keys) do
    valid? =
      Enum.all?(properties, fn {key, value} ->
        key in valid_keys and valid_key?(key, value)
      end)

    if valid? do
      properties
    else
      raise ArgumentError,
            "the properties provided for analyzer '#{name}' are invalid, only accepts keys #{inspect(valid_keys)}, got: #{inspect(properties)}"
    end
  end

  # Checks whether `value` is acceptable for the analyzer property `key`.
  # Clauses are grouped by the kind of value expected. There is no catch-all
  # clause, so a key without a rule raises FunctionClauseError.

  # Properties holding a single string.
  defp valid_key?(key, value)
       when key in [
              :delimiter,
              :locale,
              :startMarker,
              :endMarker,
              :stopwordsPath,
              :queryString,
              :model_location
            ] do
    is_binary(value)
  end

  # Boolean switches.
  defp valid_key?(key, value)
       when key in [
              :accent,
              :preserveOriginal,
              :stemming,
              :collapsePositions,
              :keepNull,
              :hex
            ] do
    is_boolean(value)
  end

  # Integers without a range restriction.
  defp valid_key?(key, value) when key in [:min, :max, :top_k] do
    is_integer(value)
  end

  # Lists whose elements must all be strings.
  defp valid_key?(key, value) when key in [:stopwords, :latitude, :longitude] do
    is_list(value) and Enum.all?(value, &is_binary/1)
  end

  # Properties restricted to a fixed set of atoms.
  defp valid_key?(key, value)
       when key in [:case, :streamType, :returnType, :break, :type, :format] do
    allowed =
      case key do
        :case -> [:none, :lower, :upper]
        :streamType -> [:binary, :utf8]
        :returnType -> [:string, :number, :bool]
        :break -> [:all, :alpha, :graphic]
        :type -> [:shape, :centroid, :point]
        :format -> [:latLngDouble, :latLngInt, :s2Point]
      end

    value in allowed
  end

  # Numeric properties with bounds.
  defp valid_key?(:batchSize, value), do: is_integer(value) and value in 1..1000
  defp valid_key?(:numHashes, value), do: is_integer(value) and value > 0
  defp valid_key?(:threshold, value), do: is_number(value)

  # Accepts integers from 1_048_576 (2^20) through 33_554_432 (2^25).
  defp valid_key?(:memoryLimit, value) do
    is_integer(value) and value in 1_048_576..33_554_432
  end

  # Nested map: only :min, :max and :preserveOriginal are allowed, each checked
  # with the rule for the property of the same name.
  defp valid_key?(:edgeNgram, value) do
    is_map(value) and
      Enum.all?(value, fn {sub_key, sub_value} ->
        sub_key in [:min, :max, :preserveOriginal] and valid_key?(sub_key, sub_value)
      end)
  end

  # Nested map of S2 tuning options; every value must be an integer.
  defp valid_key?(:options, value) do
    is_map(value) and
      Enum.all?(value, fn {option, setting} ->
        option in [:maxCells, :minLevel, :maxLevel] and is_integer(setting)
      end)
  end
end