lib/matchers/doc.ex

defmodule Infer.Doc do
  @moduledoc """
  Document type matchers based on the [magic number](https://en.wikipedia.org/wiki/Magic_number_(programming)).

  Every public function takes the raw file contents as a binary and returns a boolean.
  Three families of formats are recognised:

    * Office Open XML (`docx?/1`, `xlsx?/1`, `pptx?/1`) - ZIP archives identified by the name of
      their first entry or, failing that, by the first `word/`, `ppt/` or `xl/` path found in the data.
    * OpenDocument (`odt?/1`, `ods?/1`, `odp?/1`) - ZIP archives whose first entry is `mimetype`
      followed by the matching media type.
    * Legacy Microsoft Office (`doc?/1`, `ppt?/1`, `xls?/1`) - compound files identified by the
      16-byte class identifier read from the root directory entry.
  """

  import Bitwise

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft Word Open XML Format Document (DOCX) data.

  ## Examples

      iex> binary = File.read!("test/docs/sample.docx")
      iex> Infer.Doc.docx?(binary)
      true

      iex> binary = File.read!("test/docs/sample.xlsx")
      iex> Infer.Doc.docx?(binary)
      false

  """
  @spec docx?(binary()) :: boolean()
  def docx?(<<"PK", 0x03, 0x04, _part::binary-size(26), "word/", _rest::binary>>), do: true
  def docx?(binary), do: msooxml_type(binary) == :docx

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft Excel Open XML Format Spreadsheet (XLSX) data.

  ## Examples

      iex> binary = File.read!("test/docs/sample.xlsx")
      iex> Infer.Doc.xlsx?(binary)
      true

      iex> binary = File.read!("test/docs/sample.docx")
      iex> Infer.Doc.xlsx?(binary)
      false

  """
  @spec xlsx?(binary()) :: boolean()
  # The ZIP signature is the uppercase bytes "PK"; a lowercase match could never succeed.
  def xlsx?(<<"PK", 0x03, 0x04, _part::binary-size(26), "xl/", _rest::binary>>), do: true
  def xlsx?(binary), do: msooxml_type(binary) == :xlsx

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft PowerPoint Open XML Presentation (PPTX) data.

  ## Examples

      iex> binary = File.read!("test/docs/sample.pptx")
      iex> Infer.Doc.pptx?(binary)
      true

      iex> binary = File.read!("test/docs/sample.xlsx")
      iex> Infer.Doc.pptx?(binary)
      false

  """
  @spec pptx?(binary()) :: boolean()
  # The ZIP signature is the uppercase bytes "PK"; a lowercase match could never succeed.
  def pptx?(<<"PK", 0x03, 0x04, _part::binary-size(26), "ppt/", _rest::binary>>), do: true
  def pptx?(binary), do: msooxml_type(binary) == :pptx

  # Classifies a ZIP archive whose first entry name (at byte offset 30) is one of the
  # generic Office Open XML parts. Returns `:docx`, `:pptx`, `:xlsx`, `:ooxml` (generic
  # part found but no format-specific path) or `nil` when the archive does not start
  # with one of those entries.
  defp msooxml_type(<<"PK", 0x03, 0x04, _part::binary-size(26), "[Content_Types].xml", _rest::binary>> = binary) do
    search_x_format(binary)
  end

  defp msooxml_type(<<"PK", 0x03, 0x04, _part::binary-size(26), "docProps", _rest::binary>> = binary) do
    search_x_format(binary)
  end

  defp msooxml_type(<<"PK", 0x03, 0x04, _part::binary-size(26), "_rels/.rels", _rest::binary>> = binary) do
    search_x_format(binary)
  end

  defp msooxml_type(_binary), do: nil

  # Finds the earliest occurrence of "word/", "ppt/" or "xl/" anywhere in the binary.
  # The three needles have distinct lengths, so the length of the match reported by
  # `:binary.match/2` is enough to tell which one was found.
  defp search_x_format(binary) do
    case :binary.match(binary, ["word/", "ppt/", "xl/"]) do
      {_pos, 5} -> :docx
      {_pos, 4} -> :pptx
      {_pos, 3} -> :xlsx
      :nomatch -> :ooxml
    end
  end

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's an OpenDocument Text Document.

  ## Examples

      iex> binary = File.read!("test/docs/sample.odt")
      iex> Infer.Doc.odt?(binary)
      true

      iex> binary = File.read!("test/docs/sample.odt")
      iex> Infer.Doc.pptx?(binary)
      false

  """
  @spec odt?(binary()) :: boolean()
  def odt?(<<"PK", 0x03, 0x04, _part::binary-size(26), "mimetype", "application/vnd.oasis.opendocument.text", _rest::binary>>),
    do: true

  def odt?(_binary), do: false

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's an OpenDocument Spreadsheet Document.

  ## Examples

      iex> binary = File.read!("test/docs/sample.ods")
      iex> Infer.Doc.ods?(binary)
      true

      iex> binary = File.read!("test/docs/sample.ods")
      iex> Infer.Doc.odt?(binary)
      false

  """
  @spec ods?(binary()) :: boolean()
  def ods?(<<"PK", 0x03, 0x04, _part::binary-size(26), "mimetype", "application/vnd.oasis.opendocument.spreadsheet", _rest::binary>>),
    do: true

  def ods?(_binary), do: false

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's an OpenDocument Presentation Document.

  ## Examples

      iex> binary = File.read!("test/docs/sample.odp")
      iex> Infer.Doc.odp?(binary)
      true

      iex> binary = File.read!("test/docs/sample.odp")
      iex> Infer.Doc.odt?(binary)
      false

  """
  @spec odp?(binary()) :: boolean()
  def odp?(<<"PK", 0x03, 0x04, _part::binary-size(26), "mimetype", "application/vnd.oasis.opendocument.presentation", _rest::binary>>),
    do: true

  def odp?(_binary), do: false

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft Word Document (DOC) data.

  Returns `false` (rather than raising) when the data starts with the compound-file
  signature but is too short to contain a complete header.

  ## Examples

      iex> binary = File.read!("test/docs/sample.doc")
      iex> Infer.Doc.doc?(binary)
      true

      iex> binary = File.read!("test/docs/sample.docx")
      iex> Infer.Doc.doc?(binary)
      false

  """
  @spec doc?(binary()) :: boolean()
  def doc?(<<0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1, _rest::binary>> = doc), do: search_format(doc) == :doc
  def doc?(_binary), do: false

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft Power Point Document (PPT) data.

  Returns `false` (rather than raising) when the data starts with the compound-file
  signature but is too short to contain a complete header.

  ## Examples

      iex> binary = File.read!("test/docs/sample.ppt")
      iex> Infer.Doc.ppt?(binary)
      true

      iex> binary = File.read!("test/docs/sample.doc")
      iex> Infer.Doc.ppt?(binary)
      false

  """
  @spec ppt?(binary()) :: boolean()
  def ppt?(<<0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1, _rest::binary>> = ppt), do: search_format(ppt) == :ppt
  def ppt?(_binary), do: false

  @doc """
  Takes the binary file contents as arguments. Returns `true` if it's Microsoft Excel (XLS) data.

  Returns `false` (rather than raising) when the data starts with the compound-file
  signature but is too short to contain a complete header.

  ## Examples

      iex> binary = File.read!("test/docs/sample.xls")
      iex> Infer.Doc.xls?(binary)
      true

      iex> binary = File.read!("test/docs/sample.doc")
      iex> Infer.Doc.xls?(binary)
      false

  """
  @spec xls?(binary()) :: boolean()
  def xls?(<<0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1, _rest::binary>> = xls), do: search_format(xls) == :xls
  def xls?(_binary), do: false

  # Determines the legacy Office format of a compound file.
  #
  # Reads two little-endian header fields - a 16-bit shift at byte offset 30 (the sector
  # size is `1 <<< shift`) and a 32-bit sector index at byte offset 48 - then inspects the
  # 16 bytes located 80 bytes into the sector that index points at. Returns `:doc`,
  # `:ppt`, `:xls`, or `nil` when the identifier is unknown or the data is truncated.
  defp search_format(
         <<_header::binary-size(30), sector_shift::little-unsigned-integer-size(16), _dir_offset::binary-size(16),
           root_directory_index::little-unsigned-integer-size(32), _rest::binary>> = doc
       ) do
    sector_size = 1 <<< sector_shift

    # One sector-sized chunk is skipped up front before indexing into the sectors.
    root_directory_address = sector_size + root_directory_index * sector_size
    position = root_directory_address + 80

    case doc do
      <<_offset::binary-size(position), clsid::binary-size(16), _rest::binary>> -> clsid_format(clsid)
      # The computed offset lies beyond the end of the data.
      _ -> nil
    end
  end

  # The data is shorter than the 52 header bytes needed above; report "unknown"
  # so the public predicates return `false` instead of raising.
  defp search_format(_binary), do: nil

  # Maps the 16-byte identifier read by `search_format/1` to a format atom.
  defp clsid_format(<<16, 141, 129, 100, 155, 79, 207, 17, 134, 234, 0, 170, 0, 185, 41, 232>>), do: :ppt
  defp clsid_format(<<6, 9, 2, 0, 0, 0, 0, 0, 192, 0, 0, 0, 0, 0, 0, 70>>), do: :doc
  defp clsid_format(<<32, 8, 2, 0, 0, 0, 0, 0, 192, 0, 0, 0, 0, 0, 0, 70>>), do: :xls
  defp clsid_format(<<32, 8, 1, 0, 0, 0, 0, 0, 192, 0, 0, 0, 0, 0, 0, 70>>), do: :xls
  defp clsid_format(_clsid), do: nil
end