# lib/mix/tasks/tesseract_js.download.ex

defmodule Mix.Tasks.TesseractJs.Download do
  @shortdoc "Downloads tesseract.js core WASM + traineddata language files"
  @moduledoc """
  Downloads the tesseract.js core WASM bundle and one or more traineddata
  language files into your app's `priv/static/assets/vendor/tesseract/`,
  so you can run `tesseract_js` in local mode (no jsDelivr at runtime).

  ## Examples

      mix tesseract_js.download                       # core + "eng" + :standard
      mix tesseract_js.download eng jpn               # core + two langs
      mix tesseract_js.download --tier best eng jpn   # from tessdata_best
      mix tesseract_js.download --list                # prints curated registry
      mix tesseract_js.download --core-only           # just the WASM core

  ## Options

    * `--tier` — one of `standard` (default) or `best`.
    * `--core-variant` — `simd_lstm` (default), `simd`, `basic`.
    * `--out` — output dir, defaults to `priv/static/assets/vendor/tesseract`.
    * `--core-only` — download only the WASM core; any language arguments
      are ignored.
    * `--list` — print the curated language registry and exit.
    * `--force` — re-download even if file exists. Use it when switching
      `--tier`: the destination file name is derived from the language
      only, so a file fetched for another tier counts as already present.

  Files are first written to a temporary `<name>.part` file next to the
  destination and renamed into place once fully written, so an interrupted
  run does not leave a truncated file that a later run would accept.
  """
  use Mix.Task

  alias TesseractJs.Models

  @switches [
    tier: :string,
    core_variant: :string,
    out: :string,
    core_only: :boolean,
    list: :boolean,
    force: :boolean
  ]

  # Entry point. Starts the OTP applications `:httpc` needs, parses `argv`
  # strictly against `@switches` (`OptionParser.parse!/2` raises on unknown
  # or malformed switches), then either prints the registry or downloads.
  @impl true
  def run(argv) do
    {:ok, _} = Application.ensure_all_started(:inets)
    {:ok, _} = Application.ensure_all_started(:ssl)

    {opts, positional} = OptionParser.parse!(argv, strict: @switches)

    if opts[:list] do
      print_registry()
    else
      do_download(positional, opts)
    end
  end

  # Validates the options, creates the output directory, fetches the core
  # bundle followed by every requested language, and finally prints the
  # config snippet for local mode. Invalid `--tier` / `--core-variant`
  # values raise before anything is written to disk.
  defp do_download(positional, opts) do
    tier = parse_tier(opts[:tier] || "standard")
    variant = parse_variant(opts[:core_variant] || "simd_lstm")
    out_dir = resolve_out_dir(opts[:out])
    force? = !!opts[:force]

    File.mkdir_p!(out_dir)

    langs = resolve_langs(positional, opts[:core_only])

    Mix.shell().info("→ Output: #{out_dir}")
    Mix.shell().info("→ Tier:   #{tier}, core variant: #{variant}")

    fetch_core(variant, out_dir, force?)
    Enum.each(langs, fn lang -> fetch_lang(lang, tier, out_dir, force?) end)

    Mix.shell().info("")
    Mix.shell().info("Done. Add this to config/config.exs to switch to local mode:")
    Mix.shell().info("")
    Mix.shell().info("    config :tesseract_js,")
    Mix.shell().info("      source: :local,")
    Mix.shell().info("      tessdata_repo: #{inspect(tier)},")
    Mix.shell().info("      core_variant: #{inspect(variant)}")
  end

  # Turns the positional arguments into a de-duplicated list of language
  # codes. `--core-only` yields no languages at all; no arguments means
  # just "eng". Each argument is expanded through `Models.split_langs/1`
  # (presumably so combined specs can be given as one argument — confirm
  # against `TesseractJs.Models`).
  defp resolve_langs(_positional, true), do: []
  defp resolve_langs([], core_only?), do: resolve_langs(["eng"], core_only?)

  defp resolve_langs(specs, _core_only?) do
    specs
    |> Enum.flat_map(&Models.split_langs/1)
    |> Enum.uniq()
  end

  # Downloads the WASM core for `variant`. No checksum is passed, so the
  # file is never verified and an existing copy is always accepted unless
  # `--force` is given.
  defp fetch_core(variant, out_dir, force?) do
    url = Models.core_cdn_url(variant)
    filename = Models.core_filename(variant)
    dest = Path.join(out_dir, filename)
    fetch(url, dest, force?, nil)
  end

  # Downloads the traineddata file for `lang`, verifying it against the
  # registry checksum when `Models.get/1` returns an entry for the language.
  #
  # NOTE(review): the expected checksum is looked up by language only, not
  # by tier — confirm the registry hashes apply to the tier being fetched.
  defp fetch_lang(lang, tier, out_dir, force?) do
    url = Models.cdn_url(lang, tier)
    filename = Models.filename(lang)
    dest = Path.join(out_dir, filename)
    model = Models.get(lang)
    expected = model && model.sha256
    fetch(url, dest, force?, expected)
  end

  # Ensures `dest` holds the content of `url`.
  #
  # The download is skipped when the file already exists, `force?` is false
  # and the checksum matches (a `nil` checksum always matches). Otherwise
  # the file is downloaded and verified. Raises `Mix.Error` when the
  # download fails or the checksum does not match.
  defp fetch(url, dest, force?, expected_sha256) do
    name = Path.basename(dest)

    if not force? and File.exists?(dest) and sha_ok?(dest, expected_sha256) do
      Mix.shell().info("✓ already present: #{name}")
    else
      Mix.shell().info("↓ downloading #{name} from #{url}")

      case download_to(url, dest) do
        :ok ->
          verify_download(dest, expected_sha256)

        {:error, reason} ->
          Mix.raise("download failed for #{url}: #{inspect(reason)}")
      end
    end
  end

  # Reports the outcome of a finished download. With a checksum, a mismatch
  # deletes the file and raises; without one, verification is skipped and
  # the message says so.
  defp verify_download(dest, expected_sha256) do
    name = Path.basename(dest)

    if expected_sha256 do
      if sha_ok?(dest, expected_sha256) do
        Mix.shell().info("✓ verified #{name}")
      else
        File.rm(dest)
        Mix.raise("checksum mismatch for #{name} — file removed")
      end
    else
      Mix.shell().info(
        "✓ downloaded #{name} (no checksum in registry — skipped verification)"
      )
    end
  end

  # Performs an HTTPS GET of `url` (following redirects, verifying the peer
  # certificate against the OS trust store) and stores a 200 response body
  # at `dest`. Returns `:ok` or `{:error, reason}`; any non-200 status is
  # reported as `{:error, "HTTP <code>"}` and nothing is written.
  defp download_to(url, dest) do
    headers = [{~c"user-agent", ~c"tesseract_js mix task"}]
    request = {String.to_charlist(url), headers}

    http_opts = [
      ssl: [
        verify: :verify_peer,
        cacerts: :public_key.cacerts_get(),
        depth: 4,
        # HTTPS-style hostname matching, so wildcard certificates validate.
        customize_hostname_check: [
          match_fun: :public_key.pkix_verify_hostname_match_fun(:https)
        ]
      ],
      autoredirect: true
    ]

    opts = [body_format: :binary]

    case :httpc.request(:get, request, http_opts, opts) do
      {:ok, {{_, 200, _}, _, body}} ->
        write_atomically(dest, body)

      {:ok, {{_, code, _}, _, _}} ->
        {:error, "HTTP #{code}"}

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Writes `body` to a sibling `.part` file and renames it onto `dest`, so
  # `dest` is only ever replaced by a completely written file. A truncated
  # file at `dest` would otherwise be treated as "already present" on the
  # next run whenever there is no checksum to reject it. The temp file lives
  # in the same directory as `dest` so the rename stays on one filesystem.
  defp write_atomically(dest, body) do
    tmp = dest <> ".part"

    with :ok <- File.write(tmp, body),
         :ok <- File.rename(tmp, dest) do
      :ok
    else
      {:error, reason} ->
        # Best effort: do not leave the partial file behind.
        _ = File.rm(tmp)
        {:error, {:write_failed, dest, reason}}
    end
  end

  # Returns true when the SHA-256 of the file at `path` equals `expected`
  # (hex, compared case-insensitively). A `nil` expectation always passes;
  # an unreadable file never does.
  defp sha_ok?(_path, nil), do: true

  defp sha_ok?(path, expected) when is_binary(expected) do
    case File.read(path) do
      {:ok, body} ->
        actual = :sha256 |> :crypto.hash(body) |> Base.encode16(case: :lower)
        actual == String.downcase(expected)

      _ ->
        false
    end
  end

  # Maps the `--tier` string onto its atom; raises `Mix.Error` otherwise.
  defp parse_tier("standard"), do: :standard
  defp parse_tier("best"), do: :best

  defp parse_tier(other),
    do: Mix.raise("unknown --tier #{inspect(other)} (expected standard|best)")

  # Maps the `--core-variant` string onto its atom; raises `Mix.Error`
  # otherwise.
  defp parse_variant("simd_lstm"), do: :simd_lstm
  defp parse_variant("simd"), do: :simd
  defp parse_variant("basic"), do: :basic
  defp parse_variant(other), do: Mix.raise("unknown --core-variant #{inspect(other)}")

  # When invoked from the host app, Mix.Project is the host's, so `priv`
  # resolves to the host's `priv/`. From inside the package itself, this
  # would write into the package's priv — not useful, but harmless.
  defp resolve_out_dir(nil), do: Path.join(["priv", "static", "assets", "vendor", "tesseract"])
  defp resolve_out_dir(custom), do: custom

  # Prints the entries returned by `Models.list/0` as a table sorted by
  # language code. Each entry is expected to be `{code, %{name: _, size_mb: _}}`.
  defp print_registry do
    Mix.shell().info("Curated tesseract_js language registry:")
    Mix.shell().info("")
    Mix.shell().info("  code        size  name")
    Mix.shell().info("  ----------  ----  ----")

    Models.list()
    |> Enum.sort_by(fn {code, _} -> code end)
    |> Enum.each(fn {code, %{name: name, size_mb: size}} ->
      Mix.shell().info(
        "  #{String.pad_trailing(code, 10)}  #{String.pad_leading("#{size}M", 4)}  #{name}"
      )
    end)

    Mix.shell().info("")

    Mix.shell().info(
      "Any tesseract language code works at runtime; the registry is just for help text + checksums."
    )
  end
end