# lib/gbk_to_utf8.ex

defmodule GbkToUtf8 do
  @moduledoc """
  Converts GBK (code page 936) encoded binaries to UTF-8 by looking every
  character up in the CP936 mapping table.

  Usage:

  1. `GbkToUtf8.start()` - load the mapping table (once, before decoding).
  2. `GbkToUtf8.get_utf_by_gbk_string(File.read!("/mnt/d/hello.gbk.txt"))`
  """

  # Hex string returned for codes that have no entry in the table; it parses
  # to code point 0, so unmapped input decodes to a NUL character.
  @unmapped "0"

  @doc """
  Loads the CP936 mapping returned by `GbkToUtf8Map.cp936string/0` into
  `:persistent_term`.

  Lines starting with `#` are skipped. Every other line is expected to hold
  tab-separated fields: the GBK code, the Unicode code point and a comment.
  Each entry is stored under the key `{GbkToUtf8, gbk_hex}` with the code
  point hex string as value; both have their `0x` prefix removed (single-digit
  GBK codes also lose their leading zero, so they match
  `Integer.to_string(code, 16)`).

  Both CRLF and LF line endings are accepted. Returns `:ok`.
  """
  @spec start() :: :ok
  def start do
    GbkToUtf8Map.cp936string()
    |> String.split(["\r\n", "\n"], trim: true)
    |> Stream.reject(&String.starts_with?(&1, "#"))
    |> Enum.each(&store_mapping_line/1)
  end

  @doc """
  Decodes the GBK binary `gbk_in` and returns the result as a UTF-8 string.

  Characters without a mapping are emitted as code point 0 (see
  `get_utf_by_gbk/1`). `start/0` must have been called beforehand.
  """
  @spec get_utf_by_gbk_string(binary()) :: String.t()
  def get_utf_by_gbk_string(gbk_in) do
    gbk_in
    |> get_utf_by_gbk()
    |> List.to_string()
  end

  @doc """
  Decodes the GBK binary `gbk_in` into a list of Unicode code points, in
  input order.

  Bytes in `0x81..0xFE` followed by another byte are decoded as one
  double-byte character; every other byte (including a lead byte that is the
  very last byte of the input) is decoded on its own. Codes that are missing
  from the table, or whose table value is not a hex number, become `0`.
  An empty binary yields `[]`.
  """
  @spec get_utf_by_gbk(binary()) :: [non_neg_integer()]
  def get_utf_by_gbk(gbk_in) do
    gbk_in
    |> decode([])
    |> Enum.map(&hex_to_code_point/1)
  end

  # Parses one mapping line and stores it in :persistent_term.
  defp store_mapping_line(line) do
    [gbk, unicode | _comment] = String.split(line, "\t")

    # "0x0A" -> "A", "0x00" -> "0", "0x8140" -> "8140": the same text that
    # Integer.to_string/2 produces for the decoded byte value(s).
    key = String.replace(gbk, ["0x0", "0x"], "")
    value = String.replace(unicode, "0x", "")

    :persistent_term.put({__MODULE__, key}, value)
  end

  # Walks the binary front to back, collecting the looked-up hex strings.
  # The accumulator is built by prepending and reversed once at the end.
  defp decode(<<>>, acc), do: Enum.reverse(acc)

  # Double-byte character: a lead byte followed by a trail byte.
  defp decode(<<lead, trail, rest::binary>>, acc) when lead in 0x81..0xFE do
    decode(rest, [lookup(lead * 0x100 + trail) | acc])
  end

  # Single-byte character. A dangling lead byte at the end of the input also
  # lands here and is looked up as-is instead of crashing.
  defp decode(<<byte, rest::binary>>, acc) do
    decode(rest, [lookup(byte) | acc])
  end

  # Fetches the code point hex string stored for a numeric GBK code.
  defp lookup(code) do
    :persistent_term.get({__MODULE__, Integer.to_string(code, 16)}, @unmapped)
  end

  # Turns a stored hex string into an integer code point; anything that is
  # not hex (e.g. the empty value of a lead-byte table row) becomes 0.
  defp hex_to_code_point(hex) do
    case Integer.parse(hex, 16) do
      {code_point, _rest} -> code_point
      :error -> 0
    end
  end
end