# lib/mix/tasks/php.extract.ex

defmodule Mix.Tasks.Php.Extract do
  @shortdoc "Re-vendor the PHP FILTER_VALIDATE_EMAIL regex from php-src"

  @moduledoc """
  Downloads `ext/filter/logical_filters.c` from php-src for a given ref and
  re-vendors the two regex strings into `priv/php/`, refreshing
  `priv/php/MANIFEST.json` with new checksums.

      mix php.extract            # defaults to the PHP-8.5 branch
      mix php.extract PHP-8.4    # a specific branch
      mix php.extract php-8.4.4  # a specific release tag

  The vendored `regexp{0,1}.full` literals are the single source of truth the
  library compiles (it derives both the pattern and the `:re` options from
  them). This task makes updating them auditable: it re-extracts from the
  canonical upstream and shows you exactly what changed (diff the resulting
  `priv/php` files in git).
  """
  use Mix.Task

  alias ElixirPhpEmailValidator.PhpSource

  @default_ref "PHP-8.5"
  @priv "priv/php"

  # Task entry point for `mix php.extract [REF]`.
  #
  # Takes the upstream ref from the first CLI argument (falling back to
  # @default_ref), downloads logical_filters.c for it, extracts both regex
  # literals, rewrites every vendored file under priv/php and finally prints the
  # new pattern checksums. Arguments after the first are ignored.
  @impl Mix.Task
  def run(args) do
    ref =
      case List.first(args) do
        absent when absent in [nil, false] -> @default_ref
        given -> given
      end

    url = raw_url(ref)
    Mix.shell().info("Fetching #{url}")
    source = fetch!(url)

    fulls = extract_fulls(source)
    %{"regexp0" => full0, "regexp1" => full1} = fulls

    # Pass 1: translate every literal's inline flags purely for the fail-loud
    # check (an unvetted flag raises) before anything is derived or written.
    fulls |> Map.values() |> Enum.each(&PhpSource.options/1)

    # Pass 2: derive the inner patterns through the same PhpSource module the
    # library compiles from, so literal, options and pattern stay in lockstep.
    inners = for {name, full} <- fulls, into: %{}, do: {name, PhpSource.pattern(full)}
    %{"regexp0" => inner0, "regexp1" => inner1} = inners

    vendored = [
      {"logical_filters.c", source},
      {"regexp1.full", full1},
      {"regexp0.full", full0},
      {"regexp1.pattern", inner1},
      {"regexp0.pattern", inner0}
    ]

    Enum.each(vendored, fn {name, content} -> write!(name, content) end)

    # The manifest is rendered last, once the files it describes are on disk.
    write!("MANIFEST.json", render_manifest(ref, url, inners, fulls))

    Mix.shell().info("""
    Re-vendored from #{ref}:
      regexp1.pattern sha256 = #{sha(inner1)}
      regexp0.pattern sha256 = #{sha(inner0)}
    Review `git diff priv/php`, then run `mix php.test` to confirm parity.
    """)
  end

  # --- Parsing (shared with php.drift) -------------------------------------

  @doc """
  Pulls both `const char regexpN[] = "...";` string literals out of the C
  `source` and undoes their C backslash escaping.

  Returns `%{"regexp0" => full, "regexp1" => full}`, where each `full` is the
  regex exactly as PHP uses it, `/.../iD` delimiter and flags included. Raises
  through `Mix.raise/1` when a literal is missing or uses an unexpected escape.
  """
  def extract_fulls(source) do
    # regexp1 is looked up first, so it is the one reported if both are missing.
    Map.new(["regexp1", "regexp0"], fn name -> {name, extract_one(source, name)} end)
  end

  # Finds the declaration `const char NAME[] = "....";` in `source` and returns
  # its un-escaped string body; raises via Mix.raise/1 when nothing matches.
  #
  # The capture group is the C string body, (?:[^"\\]|\\.)* — any char other
  # than quote/backslash, or a backslash plus one arbitrary char — so an escaped
  # quote cannot end the literal early. The tail is kept in a raw ~S sigil so
  # the regex reads exactly as PCRE sees it, without Elixir-level escaping.
  defp extract_one(source, name) do
    declaration = "const char " <> name <> ~S{\[\]\s*=\s*"((?:[^"\\]|\\.)*)"\s*;}
    re = Regex.compile!(declaration, "s")

    case Regex.run(re, source, capture: :all_but_first) do
      [raw] ->
        # Check the escaping assumption first, then collapse the escapes.
        assert_only_backslash_doubling!(raw, name)
        unescape_c(raw)

      _no_match ->
        Mix.raise("could not find the regexp literal '#{name}' in the PHP source")
    end
  end

  # Undoes the C escaping of a captured literal body. The PHP literals escape
  # only by doubling backslashes (regex escapes appear as \\x.., \\pL, \\\\, …),
  # so turning every "\\\\" pair into a single "\\" is the whole job: cut the
  # text at each pair and glue the pieces back together with one backslash.
  defp unescape_c(raw) do
    raw
    |> String.split("\\\\")
    |> Enum.join("\\")
  end

  # Safety net for the single-rule unescape_c/1: the vendored literals C-escape
  # every backslash as a doubled "\\\\", so once those pairs are stripped no
  # backslash may be left over. A leftover one means upstream started using a
  # different C escape (\", \n, \t, …) that unescape_c/1 would silently get
  # wrong — so raise and force a human review (the same fail-loud stance as
  # PhpSource.flags_to_re_opts/1) rather than emit wrong bytes.
  #
  # Returns nil when the literal is fine; raises via Mix.raise/1 otherwise.
  defp assert_only_backslash_doubling!(raw, name) do
    without_pairs = String.replace(raw, "\\\\", "")

    case :binary.match(without_pairs, "\\") do
      :nomatch ->
        nil

      {_position, _length} ->
        Mix.raise(
          "the C string literal '#{name}' contains a backslash escape other than `\\\\`; " <>
            "review the upstream change and extend unescape_c/1 before re-vendoring"
        )
    end
  end

  @doc "Returns the SHA-256 digest of `bin` as a lowercase hexadecimal string."
  def sha(bin) do
    digest = :crypto.hash(:sha256, bin)
    Base.encode16(digest, case: :lower)
  end

  @doc "Builds the raw.githubusercontent.com URL of `logical_filters.c` at `ref`."
  def raw_url(ref) do
    base = "https://raw.githubusercontent.com/php/php-src/"
    base <> to_string(ref) <> "/ext/filter/logical_filters.c"
  end

  @doc """
  Fetches `url` by shelling out to `curl`, or to `wget` when `curl` is not on
  `PATH`, and returns the response body as a binary.

  Using an external downloader keeps the task dependency-free and sidesteps OTP
  `:inets`/`:ssl` version quirks; the working assumption is that any dev/CI
  machine with `php` installed also has `curl`. Raises through `Mix.raise/1`
  when neither tool is available or the download fails.
  """
  def fetch!(url) do
    # Tried in order; the lookup stops at the first tool found on PATH.
    downloaders = [{"curl", ["-fsSL", url]}, {"wget", ["-qO-", url]}]

    located =
      Enum.find_value(downloaders, fn {tool, args} ->
        case System.find_executable(tool) do
          nil -> nil
          bin -> {bin, args}
        end
      end)

    case located do
      {bin, args} -> run_fetch!(bin, args, url)
      nil -> Mix.raise("need `curl` or `wget` on PATH to fetch #{url}")
    end
  end

  # Runs the downloader `bin` with `args` and returns everything it printed when
  # it exits 0; any other exit status raises via Mix.raise/1.
  #
  # stderr is folded into stdout on purpose, so the tool's own diagnostic
  # (e.g. curl's "(22) The requested URL returned error: 404" or "Could not
  # resolve host") ends up in the raise message. This relies on `curl -fsSL` /
  # `wget -qO-` printing nothing but the body when they succeed, so the returned
  # source is not polluted.
  defp run_fetch!(bin, args, url) do
    {output, status} = System.cmd(bin, args, stderr_to_stdout: true)

    if status == 0 do
      output
    else
      Mix.raise("fetch of #{url} failed (exit #{status}): #{output}")
    end
  end

  # --- Writing -------------------------------------------------------------

  # Writes `content` to the file `name` inside @priv, resolved against the
  # current working directory. Raises (File.write!/2) if the write fails.
  defp write!(name, content) do
    @priv
    |> Path.join(name)
    |> Path.expand(File.cwd!())
    |> File.write!(content)
  end

  # Turns a derived :re option list into the body of a JSON string array, e.g.
  # [:caseless, :dollar_endonly] -> ~s("caseless", "dollar_endonly"). The
  # surrounding brackets are supplied by the caller; an empty list yields "".
  defp render_opts(opts) do
    opts
    |> Enum.map_intersperse(", ", fn opt -> [~s("), to_string(opt), ~s(")] end)
    |> IO.iodata_to_binary()
  end

  # Renders the full text of priv/php/MANIFEST.json.
  #
  #   * ref    - upstream ref the source was fetched from (comes from the CLI)
  #   * url    - raw download URL that was fetched
  #   * inners - %{"regexp0" => pattern, "regexp1" => pattern}, the inner patterns
  #   * fulls  - %{"regexp0" => full, "regexp1" => full}, the delimited regexes
  #
  # The document is assembled by hand (no JSON dependency), so every value that
  # can carry user input — `ref` and the two URLs built from it — goes through
  # json_escape/1 before being spliced in. The remaining interpolations are hex
  # digests, an ISO date and option names, which are spliced in unescaped.
  defp render_manifest(ref, url, inners, fulls) do
    blob = "https://github.com/php/php-src/blob/#{ref}/ext/filter/logical_filters.c"

    # Derive the re_options from the vendored flags (same path the library
    # compiles from), so this provenance can never drift from the actual options.
    default_opts = render_opts(PhpSource.options(fulls["regexp1"]))
    unicode_opts = render_opts(PhpSource.options(fulls["regexp0"]))

    """
    {
      "_comment": "Provenance for the vendored PHP FILTER_VALIDATE_EMAIL regexes. Regenerate with `mix php.extract`. The library does not parse this file at runtime; ElixirPhpEmailValidator.source_info/0 derives its checksums from the actual vendored bytes. This file is the canonical record for humans and CI/jq tooling.",
      "upstream": {
        "repo": "https://github.com/php/php-src",
        "file": "ext/filter/logical_filters.c",
        "function": "php_filter_validate_email",
        "ref": "#{json_escape(ref)}",
        "blob_url": "#{json_escape(blob)}",
        "raw_url": "#{json_escape(url)}"
      },
      "vendored_at": "#{Date.utc_today()}",
      "max_length_octets": 320,
      "match_flags": {
        "default": "/iD  (i = caseless, D = PCRE_DOLLAR_ENDONLY)",
        "unicode": "/iDu (adds u = PCRE_UTF8 + PCRE_UCP, used with FILTER_FLAG_EMAIL_UNICODE)"
      },
      "re_options": {
        "default": [#{default_opts}],
        "unicode": [#{unicode_opts}]
      },
      "files": {
        "regexp1.pattern": { "role": "default (ASCII) inner pattern", "sha256": "#{sha(inners["regexp1"])}" },
        "regexp0.pattern": { "role": "FILTER_FLAG_EMAIL_UNICODE inner pattern", "sha256": "#{sha(inners["regexp0"])}" },
        "regexp1.full": { "role": "default regex with /.../iD", "sha256": "#{sha(fulls["regexp1"])}" },
        "regexp0.full": { "role": "unicode regex with /.../iDu", "sha256": "#{sha(fulls["regexp0"])}" }
      }
    }
    """
  end

  # Escapes `value` for use inside a double-quoted JSON string: the quote, the
  # backslash and every control character below U+0020 are escaped, all other
  # bytes pass through unchanged. Working byte-wise keeps multi-byte UTF-8
  # sequences intact, since each of their bytes is >= 0x80.
  defp json_escape(value) do
    for <<byte <- to_string(value)>>, into: "" do
      case byte do
        ?" ->
          "\\\""

        ?\\ ->
          "\\\\"

        ?\n ->
          "\\n"

        ?\r ->
          "\\r"

        ?\t ->
          "\\t"

        control when control < 0x20 ->
          "\\u" <> String.pad_leading(Integer.to_string(control, 16), 4, "0")

        other ->
          <<other>>
      end
    end
  end
end