# lib/exonerate.ex

defmodule Exonerate do
  @moduledoc """
  An opinionated JSONSchema compiler for elixir.

  Currently supports JSONSchema drafts 4, 6, 7, 2019, and 2020.  *except:*

  - multipleOf is not supported for number types.  This is because
  elixir does not support a floating point remainder guard, and also
  because it is impossible for a floating point to guarantee sane results
  (e.g. for IEEE Float64, `1.2 / 0.1 != 12`)
  - id fields with fragments in their uri identifier (draft 7 and earlier only)
  - dynamicRefs and anchors.
  - contentMediaType, contentEncoding, contentSchema

  For details, see:  http://json-schema.org

  Exonerate is automatically tested against the JSONSchema test suite.

  Note that Exonerate does *not* generally validate that the schema presented to it
  is valid, unless the violation results in an uncompilable entity.

  ## Usage

  Exonerate yields 100% compile-time generated code.  You may include Exonerate
  with the `runtime: false` option in `mix.exs`, unless you believe you will
  need to edit and recompile modules with Exonerate at runtime.

  ### In your module:

  ```
  defmodule MyModule do
    require Exonerate

    Exonerate.function_from_string(:def, :function_name, \"""
    {
      "type": "string"
    }
    \""")
  end
  ```

  The above module generates a function `MyModule.function_name/1` that takes a BEAM JSON term
  (`string | number | list | map | bool | nil`) and validates it based on the JSONschema.  If
  the term validates, it produces `:ok`.  If the term fails to validate, it produces
  `{:error, keyword}`, where `:instance_location` points to the error location in the passed
  parameter, `:absolute_keyword_location` points to the validation that failed, and
  `:error_value` is the failing inner term.

  ## Error keywords

  The following error keywords conform to the JSONSchema spec
  (https://json-schema.org/draft/2020-12/json-schema-core.html#name-format):

  - `:absolute_keyword_location`: a JSON pointer to the keyword in the schema that failed.
  - `:instance_location`: a JSON pointer to the location in the instance that failed.
  - `:errors`: a list of errors generated when a combining filter fails to match.

  The following error keywords are not standard and are specific to Exonerate:

  - `:error_value`: the innermost term that failed to validate.
  - `:matches`: a list of JSON pointers to the keywords that matched a combining filter.
  - `:reason`: a string describing the error, when the failing filter can fail for nonobvious
    reasons.  For example `oneOf` will fail with the reason "no matches" when none of the
    child schemas match; but it will fail with the reason "multiple matches" when more than
    one of the child schemas match.
  - `:required`: a list of object keys that were required but missing.
  - `:ref_trace`: a list of `$ref` keywords that were followed to get to the failing keyword.

  ## Options

  The following options are available:

  - `:dump`: `true` to dump the generated code to the console.  Note that the
    dumped code uses function names that differ from the normally compiled ones,
    adjusted so that you can copy/paste them into the elixir console.  This could
    cause collisions when more than one dumped template is present in the same module.

  - `:metadata`: `true` to enable all metadata decorator functions or a list of
    atoms parameters to enable.  The following metadata are accessible by passing
    the corresponding atom to the generated function in lieu of a JSON term to
    validate.

    | JSONschema tag  | atom parameter |
    |-----------------|----------------|
    | $id or id       | `:id`          |
    | $schema         | `:schema_id`   |
    | default         | `:default`     |
    | examples        | `:examples`    |
    | description     | `:description` |
    | title           | `:title`       |
    | <entire schema> | `:schema`      |

  - `:format`: instructions for using (optional) format filters.  Pass `true`
    to enable all default format filters, or a keyword list for fine-grained
    control.  See the [Format Filters guide](guides/formatting.md) for complete
    documentation of all available format types and custom filter configuration.

  - `:entrypoint`: a JSONpointer to the internal location inside of a json
    document where you would like to start the JSONschema.  This should be in
    JSONPointer form (not URI form).  See https://datatracker.ietf.org/doc/html/rfc6901
    for more information about JSONPointer

  - `:decoders`: a list of `{<mimetype>, <decoder>}` tuples.  `<mimetype>`
    should be a string that matches the `content-type` of the schema. `<decoder>`
    should be one of the following:
    - `Jason` (default) for json parsing
    - `:yamerl` for yaml parsing (requires the `yamerl` dependency)
    - `{module, function}` for custom parsing; the function should accept a
      string and return json term, raising if the string is not valid input
      for the decoder.

    Defaults to `[{"application/json", Jason}, {"application/yaml", :yamerl}]`.
    Tuples specified in this option will override or add to the defaults.

  - `:draft`: specifies any special draft information.  Defaults to `"2020-12"`;
    `"2019"`, `"4"`, `"6"`, and `"7"` are also supported. This overrides draft
    information provided in the schema.

    > ### Validation {: .warning}
    >
    > Validation is NOT performed on the schema, so intermingling draft
    > components is possible (but not recommended).  In the future, using
    > components in the wrong draft may cause a compile-time warning.

  ### remoteRef schema retrieval options

  - `:remote_fetch_adapter`: specifies the module to use for fetching remote
    resources.  This module must export a `fetch_remote!/2` function which
    is passed a `t:URI.t/0` struct and returns `{<body>, <content-type>}` pair.
    content-type may be `nil`.  Defaults to `Exonerate.Remote`, which uses the
    `Req` library to perform the http request.

  - `:force_remote`: bypasses the manual prompt confirming if remote resources
    should be downloaded.  Use with caution!  Defaults to `false`.

  - `:cache`: if remote JSONs should be cached to the local filesystem.
    Defaults to `false`

  - `:cache_app`: specifies the otp app whose priv directory cached remote
    JSONs are stored. Defaults to `:exonerate`.

  - `:cache_path`: specifies the subdirectory of priv where cached remote JSONs
    are stored.  Defaults to `/`.

  - `:proxy`: a string proplist which describes string substitution of url
    resources for proxied remote content.

    #### Example

    ``` elixir
    [proxy: [{"https://my.remote.resource/", "http://localhost:4000"}]]
    ```
  """

  alias Exonerate.Cache
  alias Exonerate.Draft
  alias Exonerate.Tools
  alias Exonerate.Metadata
  alias Exonerate.Schema

  @doc """
  saves in the compile-time registry a schema under the given name.  The schema
  can then be used to generate a validation function with
  `function_from_resource/3`.  This is useful for clearly reusing a string
  schema across multiple functions with potentially different entrypoints, but
  without having to repeat the (potentially large) schema string literal in
  your module code.

  > ### Note {: .info}
  >
  > this function is optional, `function_from_string/4` will also create a
  > resource for the string and reuse private functions between calls.

  > ### File schemas {: .info}
  >
  > `function_from_file/4` will perform the equivalent of this process under
  > the hood, so don't run this function for file functions.

  ### Extra options

  - `:content_type`: specifies the MIME type used to parse the *schema definition
    itself* (not the data being validated at runtime). This tells Exonerate how
    to decode the schema string into an Elixir map. Supported values:
    - `"application/json"` (default for `.json` files) - parse schema as JSON
    - `"application/yaml"` (default for `.yaml` files) - parse schema as YAML
      (requires the `yamerl` dependency)

    > ### Important distinction {: .info}
    >
    > This option controls how the *schema file* is parsed at compile time, not
    > how validated data is parsed at runtime. The generated validation function
    > always works on already-decoded Elixir terms (maps, lists, strings, etc.).

    #### Example: YAML schema

    ```elixir
    # Using a YAML-formatted schema file
    Exonerate.function_from_file(:def, :validate, "schema.yaml")
    # content_type is auto-detected from .yaml extension

    # Using a YAML string directly
    Exonerate.function_from_string(:def, :validate, \"\"\"
    type: object
    properties:
      name:
        type: string
    \"\"\", content_type: "application/yaml")
    ```

  - `:mimetype_mapping`: a proplist of `{<extension>, <mimetype>}` tuples that
    maps file extensions to their content type. Use this when working with
    non-standard file extensions.

    #### Example

    ```elixir
    # Parse .schema files as JSON
    Exonerate.function_from_file(
      :def,
      :validate,
      "types.schema",
      mimetype_mapping: [{".schema", "application/json"}]
    )
    ```

    The built-in mappings `{".json", "application/json"}` and
    `{".yaml", "application/yaml"}` cannot be overridden.
  """
  defmacro register_resource(schema, name, opts \\ []) do
    # resolve the schema AST to its literal value in the caller's context.
    resolved_schema = Macro.expand(schema, __CALLER__)
    resolved_opts = set_resource_opts(__CALLER__, opts)

    # registration is a compile-time side effect; the macro expands to nothing.
    Cache.register_resource(__CALLER__.module, resolved_schema, name, resolved_opts)

    quote do
    end
  end

  @doc """
  generates a series of functions that validates a provided JSONSchema.

  Note that the `schema` parameter must be a string literal.

  ### Extra options

  The options described at the top of the module are available to this macro,
  in addition to the options described in `register_resource/3`
  """
  defmacro function_from_string(type, function_name, schema_ast, opts \\ []) do
    opts = set_resource_opts(__CALLER__, opts)

    # find or register the resource so that repeated calls with the same
    # schema string can share private validation functions.
    resource = Cache.find_or_make_resource(__CALLER__.module, schema_ast, opts)

    # entrypoint (JSONPointer) into the schema document; defaults to the root.
    root_pointer = Tools.entrypoint(opts)

    # TODO: also attempt to obtain this from the schema.
    opts = Keyword.put_new(opts, :draft, "2020-12")

    schema_string = Macro.expand(schema_ast, __CALLER__)

    build_code(
      __CALLER__,
      schema_string,
      type,
      function_name,
      "#{resource.uri}",
      root_pointer,
      opts
    )
  end

  # extracts the schema's own id, preferring "$id" over the (draft <= 7) "id".
  # Non-map schemas (e.g. boolean schemas) carry no id.
  defp id_from(schema) when is_map(schema) do
    Map.get(schema, "$id") || Map.get(schema, "id")
  end

  defp id_from(_), do: nil

  @doc """
  generates a series of functions that validates a JSONschema in a file at
  the provided path.

  Note that the `path` parameter must be a `t:Path.t/0` value.  The function
  names will contain the file url.

  ### Options

  The options described at the top of the module are available to this macro,
  in addition to the options described in `register_resource/3`
  """
  defmacro function_from_file(type, function_name, path, opts \\ []) do
    # Read (and, for ".gz" paths, decompress) the schema file at compile time.
    {schema_string, actual_path} = read_and_decompress(path)

    # expand literals (aliases) in the opts AST.
    # Use actual_path (with .gz stripped) for content_type detection.
    opts =
      opts
      |> Macro.expand_literals(__CALLER__)
      |> set_content_type(actual_path)
      |> Tools.set_decoders()

    # entrypoint (JSONPointer) into the schema document; defaults to the root.
    root_pointer = Tools.entrypoint(opts)

    # TODO: also attempt to obtain this from the schema.
    opts = Keyword.put_new(opts, :draft, "2020-12")

    # the generated function names embed the schema file's url.
    function_resource = to_string(%URI{scheme: "file", host: "", path: Path.absname(path)})

    build_code(
      __CALLER__,
      schema_string,
      type,
      function_name,
      function_resource,
      root_pointer,
      opts
    )
  end

  @doc """
  generates a series of functions from a previously provided JSONSchema found
  registered using `register_resource/3`.

  Note that the `resource` parameter must be a string literal defined earlier
  in a `register_resource/3` call

  ### Options

  Only supply options described in the module section.
  """

  defmacro function_from_resource(type, function_name, resource, opts \\ []) do
    # expand literals (aliases) in the opts AST.
    opts = Macro.expand_literals(opts, __CALLER__)

    # entrypoint (JSONPointer) into the schema document; defaults to the root.
    root_pointer = Tools.entrypoint(opts)

    # TODO: also attempt to obtain this from the schema.
    opts = Keyword.put_new(opts, :draft, "2020-12")

    # raises at compile time when the resource was never registered.
    resource = Cache.fetch_resource!(__CALLER__.module, resource)

    build_code(
      __CALLER__,
      resource.schema,
      type,
      function_name,
      "#{resource.uri}",
      root_pointer,
      # options registered with the resource take precedence over call-site opts.
      Keyword.merge(opts, resource.opts)
    )
  end

  # Shared code-generation backend for all `function_from_*` macros.  Ingests
  # the schema, resolves the canonical resource name, and emits: the metadata
  # accessor functions, the public entry point (with its @spec), and the
  # validation context functions.
  defp build_code(
         caller,
         schema_string,
         type,
         function_name,
         resource_uri,
         root_pointer,
         opts
       ) do
    schema = Schema.ingest(schema_string, caller, resource_uri, opts)

    opts = Draft.set_opts(opts, schema)

    # when the schema declares its own id, register the schema under that id
    # and use it as the resource name; otherwise keep the caller-derived uri.
    resource =
      if id = id_from(schema) do
        Cache.put_schema(caller.module, id, schema)
        id
      else
        resource_uri
      end

    # Phase 1: Collect all declarations before generating code
    Exonerate.Context.collect_declarations(caller.module, resource, root_pointer, opts)

    # Phase 2: Generate code (all declarations are now known)
    schema_fn = Metadata.schema(schema_string, type, function_name, opts)

    call = Tools.call(resource, root_pointer, opts)

    # Extract types from root schema for @spec generation
    root_schema = JsonPtr.resolve_json!(schema, root_pointer)
    input_typespec = build_input_typespec(root_schema)

    Tools.maybe_dump(
      quote do
        require Exonerate.Metadata

        unquote(schema_fn)

        Exonerate.Metadata.functions(
          unquote(type),
          unquote(function_name),
          unquote(resource),
          unquote(root_pointer),
          unquote(opts)
        )

        @spec unquote(function_name)(unquote(input_typespec)) :: :ok | {:error, list()}
        unquote(type)(unquote(function_name)(data), do: unquote(call)(data, "/"))

        require Exonerate.Context
        Exonerate.Context.filter(unquote(resource), unquote(root_pointer), unquote(opts))
      end,
      caller,
      opts
    )
  end

  # Builds the input typespec for the generated function's @spec from the root
  # schema's "type" keyword.  Any schema without a usable "type" constraint
  # (including a malformed one — schemas are not validated) falls back to the
  # permissive JSON type rather than raising at compile time.
  defp build_input_typespec(%{"type" => types}) when is_list(types) do
    Tools.spec_from_only(types)
  end

  defp build_input_typespec(%{"type" => type}) when is_binary(type) do
    Tools.spec_from_only([type])
  end

  defp build_input_typespec(_), do: quote(do: Exonerate.Type.json())

  # Resolves `:content_type` for file-based schemas.  An explicit option wins;
  # the deprecated `:encoding` option is honored next (with a compile-time
  # warning); failing both, the type is inferred from the file extension.
  defp set_content_type(opts, path) do
    Keyword.put_new_lazy(opts, :content_type, fn ->
      case Keyword.get(opts, :encoding) do
        empty when empty in [nil, false] ->
          Tools.content_type_from_extension(path, opts)

        encoding ->
          IO.warn("the `:encoding` option is deprecated.  use `:content_type` instead")
          encoding
      end
    end)
  end

  # Normalizes options at a macro call site: expands AST literals in the
  # caller's context, migrates the deprecated `:encoding` option, defaults the
  # content type to JSON, and installs the decoder configuration.
  defp set_resource_opts(caller, opts) do
    expanded =
      opts
      |> Macro.expand(caller)
      |> Macro.expand_literals(caller)

    expanded
    |> handle_encoding_deprecation()
    |> Keyword.put_new(:content_type, "application/json")
    |> Tools.set_decoders()
  end

  # Migrates the deprecated `:encoding` option to `:content_type`, emitting a
  # compile-time warning.  An explicit `:content_type` still takes precedence.
  defp handle_encoding_deprecation(opts) do
    case Keyword.get(opts, :encoding) do
      empty when empty in [nil, false] ->
        opts

      encoding ->
        IO.warn("the `:encoding` option is deprecated.  use `:content_type` instead")
        Keyword.put_new(opts, :content_type, encoding)
    end
  end

  # Reads a schema file, transparently gunzipping ".gz" paths.  Returns the
  # file contents alongside the effective path (".gz" suffix stripped), which
  # callers use for content-type detection.
  defp read_and_decompress(path) do
    contents = File.read!(path)

    if String.ends_with?(path, ".gz") do
      # strip ".gz" so the underlying extension drives content_type detection.
      {:zlib.gunzip(contents), String.replace_suffix(path, ".gz", "")}
    else
      {contents, path}
    end
  end
end