lib/extr_text.ex

defmodule ExtrText do
  @moduledoc """
  ExtrText is an Elixir library for extracting text and meta information from `.docx`, `.xlsx`,
  `.pptx` files.
  """

  @doc """
  Extracts properties (metadata) from the specified OOXML data.
  """
  @spec get_metadata(binary()) :: {:ok, ExtrText.Metadata.t()} | {:error, String.t()}
  def get_metadata(data) do
    case unzip(data) do
      {:ok, subdir, paths} -> do_get_metadata(subdir, paths)
      {:error, reason} -> {:error, reason}
    end
  end

  @doc """
  Extracts plain texts from the body of specified OOXML data.

  The return value is a double nested list of strings.

  Each element of outer list represents the sheets of `.xsls` data and the slides of `.pptx` data.
  For `.docx` data, the outer list has only one element.

  Each element of inner list represents the paragraphs or lines of a spreadsheet.
  """
  def get_texts(data) do
    case unzip(data) do
      {:ok, subdir, paths} -> do_get_texts(subdir, paths)
      {:error, reason} -> {:error, reason}
    end
  end

  defp unzip(data) do
    tmpdir = System.tmp_dir!()
    now = DateTime.utc_now()
    {usec, _} = now.microsecond
    subdir = tmpdir <> "/extr-text-" <> Integer.to_string(usec)

    case File.mkdir_p(subdir) do
      :ok -> do_unzip(data, subdir)
      {:error, _reason} -> {:error, "Can't create #{subdir}."}
    end
  end

  defp do_unzip(data, subdir) do
    case :zip.unzip(data, cwd: String.to_charlist(subdir)) do
      {:ok, paths} -> {:ok, subdir, Enum.map(paths, &List.to_string/1)}
      {:error, _reason} -> {:error, "Can't unzip the given data."}
    end
  end

  defp get_worksheets(subdir, paths) do
    Enum.filter(paths, fn path ->
      String.starts_with?(path, subdir <> "/xl/worksheets/") &&
        String.ends_with?(path, ".xml")
    end)
  end

  defp get_slides(subdir, paths) do
    Enum.filter(paths, fn path ->
      String.starts_with?(path, subdir <> "/ppt/slides/") &&
        String.ends_with?(path, ".xml")
    end)
  end

  defp do_get_metadata(subdir, _paths) do
    result =
      case File.read(Path.join(subdir, "docProps/core.xml")) do
        {:ok, xml} -> extract_metadata(xml)
        {:error, _} -> {:error, "Can't read docProps/core.xml."}
      end

    File.rm_rf!(subdir)
    result
  end

  defp extract_metadata(xml) do
    {:ok, %{metadata: metadata}} =
      Saxy.parse_string(xml, ExtrText.MetadataHandler, %{
        name: nil,
        metadata: %ExtrText.Metadata{}
      })

    {:ok, metadata}
  end

  defp do_get_texts(subdir, paths) do
    type =
      cond do
        Enum.any?(paths, fn path -> path == subdir <> "/word/document.xml" end) -> :docx
        Enum.any?(paths, fn path -> path == subdir <> "/xl/workbook.xml" end) -> :xlsx
        Enum.any?(paths, fn path -> path == subdir <> "/ppt/presentation.xml" end) -> :pptx
        true -> :unknown
      end

    result = do_get_texts(subdir, paths, type)
    File.rm_rf!(subdir)
    result
  end

  defp do_get_texts(_subdir, _paths, :unknown) do
    {:error, "Could not find a target XML file."}
  end

  defp do_get_texts(subdir, paths, :xlsx) do
    strings =
      if File.exists?(subdir <> "/xl/sharedStrings.xml") do
        ss_xml = File.read!(subdir <> "/xl/sharedStrings.xml")

        {:ok, strings} = Saxy.parse_string(ss_xml, ExtrText.ExcelSharedStringsHandler, [])
        Enum.reverse(strings)
      else
        []
      end

    st_xml = File.read!(subdir <> "/xl/styles.xml")

    {:ok, %{num_formats: num_formats, cell_style_xfs: cell_style_xfs}} =
      Saxy.parse_string(st_xml, ExtrText.ExcelStylesHandler, %{
        num_formats: [],
        cell_style_xfs: [],
        name: nil
      })

    num_formats = Enum.reverse(num_formats)
    cell_style_xfs = Enum.reverse(cell_style_xfs)

    worksheets = get_worksheets(subdir, paths)

    text_sets =
      worksheets
      |> Enum.map(fn path ->
        case File.read(path) do
          {:ok, xml} -> extract_texts(:xslx, xml, strings, num_formats, cell_style_xfs)
          {:error, _} -> nil
        end
      end)
      |> Enum.reject(fn doc -> is_nil(doc) end)

    {:ok, text_sets}
  end

  defp do_get_texts(subdir, paths, type) when type in ~w(docx pptx)a do
    {handler, paths} =
      case type do
        :docx -> {ExtrText.WordDocumentHandler, [subdir <> "/word/document.xml"]}
        :pptx -> {ExtrText.PresentationSlideHandler, get_slides(subdir, paths)}
      end

    text_sets =
      paths
      |> Enum.map(fn path ->
        case File.read(path) do
          {:ok, xml} -> extract_texts(handler, xml)
          {:error, _} -> nil
        end
      end)
      |> Enum.reject(fn doc -> is_nil(doc) end)

    {:ok, text_sets}
  end

  defp extract_texts(:xslx, xml, strings, num_formats, cell_style_xfs) do
    {:ok, %{texts: texts}} =
      Saxy.parse_string(xml, ExtrText.ExcelWorksheetHandler, %{
        texts: [],
        buffer: [],
        strings: strings,
        num_formats: num_formats,
        cell_style_xfs: cell_style_xfs,
        type: nil,
        style: nil
      })

    Enum.reverse(texts)
  end

  defp extract_texts(handler, xml) do
    {:ok, %{texts: texts}} = Saxy.parse_string(xml, handler, %{texts: [], buffer: []})
    Enum.reverse(texts)
  end
end