lib/matchers/text.ex

defmodule Infer.Text do
  @moduledoc """
  Text type matchers based on the [magic number](https://en.wikipedia.org/wiki/Magic_number_(programming))
  """

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's html.

  See: https://mimesniff.spec.whatwg.org/

  ## Examples

      iex> Infer.Text.html?("<!DOCTYPE html>")
      true

      iex> Infer.Text.html?("     <BODY>")
      true

      iex> Infer.Text.html?("<")
      false

      iex> binary = File.read!("test/archives/sample.zip")
      iex> Infer.Text.html?(binary)
      false

  """
  @spec html?(binary()) :: boolean()
  def html?(binary) do
    values = [
      '<!DOCTYPE HTML',
      '<HTML',
      '<HEAD',
      '<SCRIPT',
      '<IFRAME',
      '<H1',
      '<DIV',
      '<FONT',
      '<TABLE',
      '<A',
      '<STYLE',
      '<TITLE',
      '<B',
      '<BODY',
      '<BR',
      '<P',
      '<!--'
    ]

    char_list =
      binary
      |> String.trim()
      |> :binary.bin_to_list()

    Enum.any?(values, fn val ->
      if starts_with_ignore_ascii_case(char_list, val) do
        case Enum.at(char_list, length(val)) do
          0x20 -> true
          0x3E -> true
          _ -> false
        end
      end
    end)
  end

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's xml.

  See: https://mimesniff.spec.whatwg.org/

  ## Examples

      iex> Infer.Text.xml?(~s(<?xml version="1.0" encoding="ISO-8859-1"?>))
      true

      iex> binary = File.read!("test/archives/sample.zip")
      iex> Infer.Text.xml?(binary)
      false

  """
  @spec xml?(binary()) :: boolean()
  def xml?(binary) do
    char_list =
      binary
      |> String.trim()
      |> :binary.bin_to_list()

    starts_with_ignore_ascii_case(char_list, '<?xml')
  end

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's a shell script.

  ## Examples

    iex> Infer.Text.shell_script?("#!/bin/sh")
    true

    iex> binary = File.read!("test/archives/sample.zip")
    iex> Infer.Text.shell_script?(binary)
    false

  """
  @spec shell_script?(binary()) :: boolean()
  def shell_script?(<<"#!", _rest::binary>>), do: true
  def shell_script?(_binary), do: false

  defp starts_with_ignore_ascii_case(char_list, needle) when length(char_list) >= length(needle) do
    char_list = Enum.take(char_list, length(needle))

    String.downcase(to_string(char_list), :ascii) == String.downcase(to_string(needle), :ascii)
  end

  defp starts_with_ignore_ascii_case(_binary, _needle), do: false
end