defmodule CSV do
use CSV.Defaults
alias CSV.Decoding.Decoder
alias CSV.Encoding.Encoder
@moduledoc ~S"""
RFC 4180 compliant CSV parsing and encoding for Elixir. Allows specifying
other separators, so it can also be used for TSV and other delimiter-separated formats.
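A minimal in-memory round trip (decoding a stream of lines, then re-encoding
the resulting rows) might look like this:
    ["a,b\n", "c,d\n"]
    |> CSV.decode()
    |> Enum.map(fn {:ok, row} -> row end)
    |> CSV.encode()
    |> Enum.to_list()
    # => ["a,b\r\n", "c,d\r\n"]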
"""
@doc """
Decode a stream of comma-separated lines into a stream of result tuples.
Decoding errors are inlined into the stream as `{:error, message}` tuples.
## Options
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: `?` followed by the separator character, e.g. `?;`).
* `:escape_character` – The escape character token to use, defaults to `?"`.
Must be a codepoint (syntax: `?` followed by the escape character).
* `:field_transform` – A function with arity 1 that is called with each
field and can transform it. Defaults to the identity function. It is
invoked for every field, so it should return quickly.
* `:headers` – When set to `true`, will take the first row of
the csv and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows will be maps
instead of lists.
* `:validate_row_length` – When set to `true`, will take the first row of
the csv or its headers and validate that following rows are of the same
length. Defaults to `false` (see the last example below).
* `:unescape_formulas` – When set to `true`, will remove formula escaping
inserted to prevent [CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Convert a file stream into a stream of rows in the order of the given stream:
iex> \"../test/fixtures/docs/valid.csv\"
iex> |> Path.expand(__DIR__)
iex> |> File.stream!
iex> |> CSV.decode
iex> |> Enum.take(2)
[ok: [\"a\",\"b\",\"c\"], ok: [\"d\",\"e\",\"f\"]]
Read from a file with a Byte Order Mark (BOM):
iex> \"../test/fixtures/utf8-with-bom.csv\"
...> |> Path.expand(__DIR__)
...> |> File.stream!([:trim_bom])
...> |> CSV.decode()
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"d\", \"e\"]]
Errors will show up as error tuples:
iex> \"../test/fixtures/docs/escape-errors.csv\"
iex> |> Path.expand(__DIR__)
iex> |> File.stream!
iex> |> CSV.decode
iex> |> Enum.take(2)
[
ok: [\"a\",\"b\",\"c\"],
error: "Escape sequence started on line 2:\\n\\n\\"d,e,f\\n\\n\
did not terminate before the stream halted. Parsing will continue on line 3.\\n"
]
Map an existing stream of lines separated by a token to a stream of rows
with a header row:
iex> [\"a;b\\n\",\"c;d\\n\", \"e;f\\n\"]
iex> |> Stream.map(&(&1))
iex> |> CSV.decode(separator: ?;, headers: true)
iex> |> Enum.take(2)
[
ok: %{\"a\" => \"c\", \"b\" => \"d\"},
ok: %{\"a\" => \"e\", \"b\" => \"f\"}
]
Map a stream with custom escape characters:
iex> [\"@a@,@b@\\n\",\"@c@,@d@\\n\"]
...> |> Stream.map(&(&1))
...> |> CSV.decode(escape_character: ?@)
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]
Map a stream with custom separator characters:
iex> [\"a;b\\n\",\"c;d\\n\"]
...> |> Stream.map(&(&1))
...> |> CSV.decode(separator: ?;)
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]
Trim each field:
iex> [\" a , b \\n\",\" c , d \\n\"]
...> |> Stream.map(&(&1))
...> |> CSV.decode(field_transform: &String.trim/1)
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]
Map an existing stream of lines separated by a token to a stream of rows
with a given header row:
iex> [\"a;b\\n\",\"c;d\\n\", \"e;f\\n\"]
iex> |> Stream.map(&(&1))
iex> |> CSV.decode(separator: ?;, headers: [:x, :y])
iex> |> Enum.take(2)
[
ok: %{:x => \"a\", :y => \"b\"},
ok: %{:x => \"c\", :y => \"d\"}
]
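Validate that every row has the same length as the first row. Rows with a
differing length become error tuples (illustrative sketch; the exact error
message is produced by the row length validator):
    [\"a,b\\n\", \"c\\n\"]
    |> Stream.map(&(&1))
    |> CSV.decode(validate_row_length: true)
    |> Enum.to_list()
    # => [ok: [\"a\", \"b\"], error: \"...\"]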
"""
@type decode_options ::
{:separator, char}
| {:escape_character, char}
| {:field_transform, (String.t() -> String.t())}
| {:headers, [String.t() | atom()] | boolean()}
| {:unescape_formulas, boolean()}
| {:validate_row_length, boolean()}
@spec decode(Enumerable.t(), [decode_options()]) :: Enumerable.t()
def decode(stream, options \\ []) do
stream |> Decoder.decode(options) |> inline_errors!(options)
end
@doc """
Decode a stream of comma-separated lines into a stream of rows. Errors
encountered while decoding are raised immediately.
## Options
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: `?` followed by the separator character, e.g. `?;`).
* `:escape_character` – The escape character token to use, defaults to `?"`.
Must be a codepoint (syntax: `?` followed by the escape character).
* `:field_transform` – A function with arity 1 that is called with each
field and can transform it. Defaults to the identity function. It is
invoked for every field, so it should return quickly.
* `:headers` – When set to `true`, will take the first row of
the csv and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows will be maps
instead of lists.
* `:validate_row_length` – When set to `true`, will take the first row of
the csv or its headers and validate that following rows are of the same
length. Will raise an error if validation fails. Defaults to `false`.
* `:unescape_formulas` – When set to `true`, will remove formula escaping
inserted to prevent [CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Convert a file stream into a stream of rows in the order of the given stream:
iex> \"../test/fixtures/docs/valid.csv\"
iex> |> Path.expand(__DIR__)
iex> |> File.stream!()
iex> |> CSV.decode!()
iex> |> Enum.take(2)
[[\"a\",\"b\",\"c\"], [\"d\",\"e\",\"f\"]]
Read from a file with a Byte Order Mark (BOM):
iex> \"../test/fixtures/utf8-with-bom.csv\"
...> |> Path.expand(__DIR__)
...> |> File.stream!([:trim_bom])
...> |> CSV.decode!()
...> |> Enum.take(2)
[[\"a\", \"b\"], [\"d\", \"e\"]]
Map an existing stream of lines separated by a token to a stream of rows
with a header row:
iex> [\"a;b\\n\",\"c;d\\n\", \"e;f\"]
iex> |> Stream.map(&(&1))
iex> |> CSV.decode!(separator: ?;, headers: true)
iex> |> Enum.take(2)
[
%{\"a\" => \"c\", \"b\" => \"d\"},
%{\"a\" => \"e\", \"b\" => \"f\"}
]
Map a stream with custom escape characters:
iex> [\"@a@,@b@\\n\",\"@c@,@d@\\n\"]
...> |> Stream.map(&(&1))
...> |> CSV.decode!(escape_character: ?@)
...> |> Enum.take(2)
[[\"a\", \"b\"], [\"c\", \"d\"]]
Map an existing stream of lines separated by a token to a stream of rows
with a given header row:
iex> [\"a;b\\n\",\"c;d\\n\", \"e;f\"]
iex> |> Stream.map(&(&1))
iex> |> CSV.decode!(separator: ?;, headers: [:x, :y])
iex> |> Enum.take(2)
[
%{:x => \"a\", :y => \"b\"},
%{:x => \"c\", :y => \"d\"}
]
Trim each field:
iex> [\" a , b \\n\",\" c , d \\n\"]
...> |> Stream.map(&(&1))
...> |> CSV.decode!(field_transform: &String.trim/1)
...> |> Enum.take(2)
[[\"a\", \"b\"], [\"c\", \"d\"]]
Replace invalid codepoints:
iex> "../test/fixtures/broken-encoding.csv"
...> |> Path.expand(__DIR__)
...> |> File.stream!()
...> |> CSV.decode!(field_transform: fn field ->
...> if String.valid?(field) do
...> field
...> else
...> field
...> |> String.codepoints()
...> |> Enum.map(fn codepoint -> if String.valid?(codepoint), do: codepoint, else: "?" end)
...> |> Enum.join()
...> end
...> end)
...> |> Enum.take(2)
[["a", "b", "c", "?_?"], ["ಠ_ಠ"]]
"""
@spec decode!(Enumerable.t(), [decode_options()]) :: Enumerable.t()
def decode!(stream, options \\ []) do
stream |> Decoder.decode(options) |> raise_errors!(options)
end
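# Maps every decoded element, raising as soon as an error tuple is
# encountered (strict mode).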
defp raise_errors!(stream, options) do
escape_max_lines = options |> Keyword.get(:escape_max_lines, @escape_max_lines)
stream |> Stream.map(&yield_or_raise!(&1, escape_max_lines))
end
defp yield_or_raise!({:error, mod, args}, _) do
raise mod, args ++ [mode: :strict]
end
defp yield_or_raise!({:ok, row}, _), do: row
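# Maps every decoded element, turning error tuples into {:error, message}
# tuples so errors stay inline in the result stream (normal mode).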
defp inline_errors!(stream, options) do
escape_max_lines = options |> Keyword.get(:escape_max_lines, @escape_max_lines)
stream |> Stream.map(&yield_or_inline!(&1, escape_max_lines))
end
defp yield_or_inline!({:error, mod, args}, _) do
{:error, mod.exception(args ++ [mode: :normal]).message}
end
defp yield_or_inline!(value, _), do: value
@doc """
Encode a table stream into a stream of RFC 4180 compliant CSV lines for
writing to a file or other IO.
## Options
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: `?` followed by the separator character, e.g. `?;`).
* `:escape_character` – The escape character token to use, defaults to `?"`.
Must be a codepoint (syntax: `?` followed by the escape character).
* `:delimiter` – The delimiter token to use, defaults to `\\r\\n`.
Must be a string.
* `:force_escaping` – When set to `true`, will escape fields even if
they do not contain characters that require escaping.
* `:escape_formulas` – When set to `true`, will escape formulas
to prevent [CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Convert a stream of rows with fields into a stream of lines:
iex> [~w(a b), ~w(c d)]
iex> |> CSV.encode
iex> |> Enum.take(2)
[\"a,b\\r\\n\", \"c,d\\r\\n\"]
Convert a stream of rows whose fields contain characters that need escaping
into a stream of lines:
iex> [[\"a\\nb\", \"\\tc\"], [\"de\", \"\\tf\\\"\"]]
iex> |> CSV.encode(separator: ?\\t, delimiter: \"\\n\")
iex> |> Enum.take(2)
[\"\\\"a\\nb\\\"\\t\\\"\\tc\\\"\\n\", \"de\\t\\\"\\tf\\\"\\\"\\\"\\n\"]
Convert a stream of rows with fields into a stream of lines forcing escaping
with a custom character:
iex> [~w(a b), ~w(c d)]
iex> |> CSV.encode(force_escaping: true, escape_character: ?@)
iex> |> Enum.take(2)
[\"@a@,@b@\\r\\n\", \"@c@,@d@\\r\\n\"]
Convert a stream of rows containing formulas into a stream of lines with
formula escaping:
iex> [~w(@a =b), ~w(-c +d)]
iex> |> CSV.encode(escape_formulas: true)
iex> |> Enum.take(2)
[\"\\\"'@a\\\",\\\"'=b\\\"\\r\\n\", \"\\\"'-c\\\",\\\"'+d\\\"\\r\\n\"]
"""
@type encode_options ::
{:separator, char()}
| {:escape_character, char()}
| {:delimiter, String.t()}
| {:force_escaping, boolean()}
| {:escape_formulas, boolean()}
@spec encode(Enumerable.t(), [encode_options()]) :: Enumerable.t()
def encode(stream, options \\ []) do
Encoder.encode(stream, options)
end
end