lib/exonerate/formats/iri_reference.ex

defmodule Exonerate.Formats.IriReference do
  @moduledoc """
  Module which provides a macro that generates special code for an iri-reference
  filter.  This is a uri-reference with internationalization support.

  the format is governed by appendix A of RFC 3986:
  https://www.rfc-editor.org/rfc/rfc3986.txt
  """

  alias Exonerate.Cache

  @doc """
  Creates a `NimbleParsec` parser `~iri-reference/1`.

  This function returns `{:ok, ...}` if the passed string is a valid iri
  reference, or `{:error, reason, ...}` if it is not.  See `NimbleParsec` for
  more information on the return tuples.

  The function will only be created once per module, and it is safe to call
  the macro more than once.

  ## Options:
  - `:name` (atom): the name of the function to create.  Defaults to
    `:"~iri-reference"`
  """
  defmacro filter do
    if Cache.register_context(__CALLER__.module, :"~iri-reference") do
      quote do
        require Pegasus
        import NimbleParsec

        Pegasus.parser_from_string(~S"""
        IRI_REFERENCE            <- IRI_REF_URI / IRI_REF_irelative_ref

        IRI_REF_irelative_ref    <- IRI_REF_irelative_part ("?" IRI_REF_iquery)? ("#" IRI_REF_ifragment)?

        IRI_REF_irelative_part   <- "//" IRI_REF_iauthority IRI_REF_ipath_abempty
                                    / IRI_REF_ipath_absolute
                                    / IRI_REF_ipath_no_scheme
                                    / IRI_REF_ipath_empty

        IRI_REF_URI              <- IRI_REF_scheme ":" IRI_REF_ihier_part ("?" IRI_REF_iquery)? ("#" IRI_REF_ifragment)?

        IRI_REF_ihier_part       <- "//" IRI_REF_iauthority IRI_REF_ipath_abempty
                                    / IRI_REF_ipath_absolute
                                    / IRI_REF_ipath_rootless
                                    / IRI_REF_ipath_empty

        IRI_REF_scheme           <- IRI_REF_ALPHA ( IRI_REF_ALPHA / IRI_REF_DIGIT / "+" / "-" / "." )*

        IRI_REF_iauthority       <- (IRI_REF_iuserinfo "@")? IRI_REF_ihost (":" IRI_REF_port)?
        IRI_REF_iuserinfo        <- ( IRI_REF_iunreserved / IRI_REF_pct_encoded / IRI_REF_sub_delims / ":" )*
        IRI_REF_ihost            <- IRI_REF_IP_literal / IRI_REF_IPv4address / IRI_REF_ireg_name
        IRI_REF_port             <- IRI_REF_DIGIT*

        IRI_REF_IP_literal       <- "[" ( IRI_REF_IPv6address / IRI_REF_IPvFuture  ) "]"

        IRI_REF_IPvFuture        <- "v" (IRI_REF_HEXDIG)+ "." ( IRI_REF_unreserved / IRI_REF_sub_delims / ":" )+

        IRI_REF_DIGIT            <- [0-9]
        IRI_REF_HEXDIG           <- [0-9A-Fa-f]
        IRI_REF_ALPHA            <- [A-Za-z]

        IRI_REF_Snum             <-  IRI_REF_DIGIT IRI_REF_DIGIT IRI_REF_DIGIT

        IRI_REF_IPv4address      <- IRI_REF_Snum "." IRI_REF_Snum "." IRI_REF_Snum "." IRI_REF_Snum

        IRI_REF_IPv6address      <- IRI_REF_IPv6_full / IRI_REF_IPv6_comp / IRI_REF_IPv6v4_full / IRI_REF_IPv6v4_comp

        IRI_REF_IPv6_hex         <- IRI_REF_HEXDIG IRI_REF_HEXDIG IRI_REF_HEXDIG IRI_REF_HEXDIG

        IRI_REF_IPv6_full        <- IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex  ":" IRI_REF_IPv6_hex

        IRI_REF_IPv6_comp        <- (IRI_REF_IPv6_hex (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)?)? "::"
                                    (IRI_REF_IPv6_hex (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)?)?

        IRI_REF_IPv6v4_full      <- IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex ":" IRI_REF_IPv6_hex ":" IRI_REF_IPv4address

        IRI_REF_IPv6v4_comp      <- (IRI_REF_IPv6_hex (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)?)? "::"
                                    (IRI_REF_IPv6_hex (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? (":" IRI_REF_IPv6_hex)? ":")?
                                    IRI_REF_IPv4address

        IRI_REF_ireg_name        <- ( IRI_REF_iunreserved / IRI_REF_pct_encoded / IRI_REF_sub_delims )*

        IRI_REF_ipath            <- IRI_REF_ipath_abempty       # begins with "/" or is empty
                                    / IRI_REF_ipath_absolute    # begins with "/" but not "//"
                                    / IRI_REF_ipath_no_scheme   # begins with a non-colon IRI_REF_segment
                                    / IRI_REF_ipath_rootless    # begins with a IRI_REF_segment
                                    / IRI_REF_ipath_empty       # zero characters

        IRI_REF_ipath_abempty    <- ( "/" IRI_REF_isegment )*
        IRI_REF_ipath_absolute   <- "/" ( IRI_REF_isegment_nz ( "/" IRI_REF_isegment )*)?
        IRI_REF_ipath_no_scheme  <- IRI_REF_isegment_nz_nc ( "/" IRI_REF_isegment )*
        IRI_REF_ipath_rootless   <- IRI_REF_isegment_nz ( "/" IRI_REF_isegment )*
        IRI_REF_ipath_empty      <- ""

        IRI_REF_isegment         <- IRI_REF_ipchar*
        IRI_REF_isegment_nz      <- IRI_REF_ipchar+
        IRI_REF_isegment_nz_nc   <- ( IRI_REF_iunreserved / IRI_REF_pct_encoded / IRI_REF_sub_delims / "@" )+
                                    # non-zero-length IRI_REF_segment without any colon ":"

        IRI_REF_ipchar           <- IRI_REF_iunreserved / IRI_REF_pct_encoded / IRI_REF_sub_delims / ":" / "@"

        IRI_REF_iquery           <- ( IRI_REF_ipchar / IRI_REF_iprivate / "/" / "?" )*

        IRI_REF_ifragment        <- ( IRI_REF_ipchar / "/" / "?" )*

        IRI_REF_pct_encoded      <- "%" IRI_REF_HEXDIG IRI_REF_HEXDIG

        IRI_REF_unreserved       <- IRI_REF_ALPHA / IRI_REF_DIGIT / "-" / "." / "_" / "~" # needed for ipVfuture
        IRI_REF_iunreserved      <- IRI_REF_ALPHA / IRI_REF_DIGIT / "-" / "." / "_" / "~" / IRI_REF_ucschar
        IRI_REF_reserved         <- IRI_REF_gen_delims / IRI_REF_sub_delims
        IRI_REF_gen_delims       <- ":" / "/" / "?" / "#" / "[" / "]" / "@"
        IRI_REF_sub_delims       <- "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        """)

        defcombinatorp(
          :IRI_REF_ucschar,
          utf8_char(
            not: 0..127,
            not: 0xE000..0xF8FF,
            not: 0xF0000..0xFFFFD,
            not: 0x100000..0x10FFFD
          )
        )

        defcombinatorp(
          :IRI_REF_iprivate,
          utf8_char([0xE000..0xF8FF, 0xF0000..0xFFFFD, 0x100000..0x10FFFD])
        )

        defparsec(:"~iri-reference", parsec(:IRI_REFERENCE) |> eos)
      end
    end
  end
end