Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: allow users to manage a user-defined state while parsing rows #77

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 45 additions & 20 deletions lib/nimble_csv.ex
Original file line number Diff line number Diff line change
Expand Up @@ -334,31 +334,39 @@ defmodule NimbleCSV do
## Parser

def parse_stream(stream, opts \\ []) when is_list(opts) do
{state, separator, escape} = init_parser(opts)
{state, state_transform, separator, escape} = init_parser(opts)

Stream.transform(
stream,
fn -> state end,
&parse(maybe_to_utf8(&1), &2, separator, escape),
&to_enum(
parse(maybe_to_utf8(&1), &2, separator, escape),
&2,
state_transform
),
&finalize_parser/1
)
end

def parse_enumerable(enumerable, opts \\ []) when is_list(opts) do
{state, separator, escape} = init_parser(opts)
{state, state_transform, separator, escape} = init_parser(opts)

{lines, state} =
{lines, {user_state, state}} =
Enum.flat_map_reduce(
enumerable,
state,
&parse(maybe_to_utf8(&1), &2, separator, escape)
&to_enum(
parse(maybe_to_utf8(&1), &2, separator, escape),
&2,
state_transform
)
)

finalize_parser(state)
lines
finalize_parser({user_state, state})
{lines, user_state}
end

def parse_string(string, opts \\ []) when is_binary(string) and is_list(opts) do
def parse_string_with_state(string, opts \\ []) when is_binary(string) and is_list(opts) do
newline = :binary.compile_pattern(@encoded_newlines)
string = string |> maybe_trim_bom()

Expand All @@ -382,6 +390,12 @@ defmodule NimbleCSV do
|> parse_enumerable(opts)
end

# Parses `string` and returns only the parsed rows; the user state that
# `parse_string_with_state/2` returns next to them is discarded.
def parse_string(string, opts \\ []) when is_binary(string) and is_list(opts) do
  {rows, _user_state} = parse_string_with_state(string, opts)
  rows
end

def to_line_stream(stream) do
newline = :binary.compile_pattern(@encoded_newlines)

Expand Down Expand Up @@ -429,33 +443,44 @@ defmodule NimbleCSV do
# Nothing is buffered: continue without emitting anything.
defp to_line_stream_after_fun("") do
  {:cont, []}
end

# A non-empty buffer remains: emit it as the last element.
defp to_line_stream_after_fun(leftover) do
  {:cont, [leftover], []}
end

# Fallback state transform: hands the user state back untouched and ignores
# the second argument.
defp default_state_transform(user_state, _kind), do: user_state

defp init_parser(opts) do
state = if Keyword.get(opts, :skip_headers, true), do: :header, else: :line
{state, :binary.compile_pattern(@separator), :binary.compile_pattern(@escape)}
line_parse_state = if Keyword.get(opts, :skip_headers, true), do: :header, else: :line
state_transform_function = case Keyword.get(opts, :state_transform_function, nil) do
nil -> &default_state_transform/2
x -> x
end
init_user_state = Keyword.get(opts, :init_user_state, :unused)

{{init_user_state, line_parse_state},
state_transform_function,
:binary.compile_pattern(@separator),
:binary.compile_pattern(@escape)}
end

defp finalize_parser({:escape, _, _, _}) do
defp finalize_parser({_user_state, {:escape, _, _, _}}) do
raise ParseError, "expected escape character #{@escape} but reached the end of file"
end

# Catch-all clause: any state not handled by an earlier clause finishes cleanly.
defp finalize_parser(_state), do: :ok

defp to_enum(result) do
defp to_enum(result, {user_state, _state}, fstate) do
case result do
{:line, row} -> {[row], :line}
{:header, _} -> {[], :line}
{:escape, _, _, _} = escape -> {[], escape}
{:header, _} -> {[], {fstate.(user_state, :header), :line}}
{:line, row} -> {[row], {fstate.(user_state, :line), :line}}
{:escape, _, _, _} = escape -> {[], {user_state, escape}}
end
end

defp parse(line, {:escape, entry, row, state}, separator, escape) do
to_enum(escape(line, entry, row, state, separator, escape))
defp parse(line, {_user_state, {:escape, entry, row, state}}, separator, escape) do
escape(line, entry, row, state, separator, escape)
end

defp parse(line, state, separator, escape) do
to_enum(separator(line, [], state, separator, escape))
defp parse(line, {_user_state, state}, separator, escape) do
separator(line, [], state, separator, escape)
end

defmacrop newlines_separator!() do
Expand Down Expand Up @@ -588,7 +613,7 @@ defmodule NimbleCSV do
newlines_escape!(:binary.match(line, escape))
end

@compile {:inline, init_parser: 1, to_enum: 1, parse: 4}
@compile {:inline, init_parser: 1, to_enum: 3, parse: 4}

## Dumper

Expand Down
30 changes: 25 additions & 5 deletions test/nimble_csv_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,15 @@ defmodule NimbleCSVTest do
assert CSV.parse_enumerable([
"name,last,year\n",
"john,doe,1986\n"
]) == [~w(john doe 1986)]
]) == {[~w(john doe 1986)], :unused}

assert CSV.parse_enumerable(
[
"name,last,year\n",
"john,doe,1986\n"
],
skip_headers: false
) == [~w(name last year), ~w(john doe 1986)]
) == {[~w(name last year), ~w(john doe 1986)], :unused}

assert_raise NimbleCSV.ParseError,
~s(expected escape character " but reached the end of file),
Expand All @@ -198,9 +198,7 @@ defmodule NimbleCSVTest do
assert Spreadsheet.parse_enumerable([
utf16le("name\tage\n"),
utf16le("\"doe\tjohn\"\t27\n")
]) == [
["doe\tjohn", "27"]
]
]) == {[["doe\tjohn", "27"]], :unused}
end

test "parse_stream/2" do
Expand Down Expand Up @@ -487,4 +485,26 @@ defmodule NimbleCSVTest do

# Re-encodes a UTF-8 binary as little-endian UTF-16.
defp utf16le(utf8) do
  :unicode.characters_to_binary(utf8, :utf8, {:utf16, :little})
end

# Returns the byte-order mark for little-endian UTF-16.
defp utf16le_bom do
  :unicode.encoding_to_bom({:utf16, :little})
end

describe "user state management" do
  NimbleCSV.define(CSVWithUserState, separator: [",", ";", "\t"])

  test "parse_string_with_state/2" do
    # Counts data rows only: a header row leaves the counter unchanged.
    count_rows = fn
      count, :line -> count + 1
      count, :header -> count
    end

    csv = """
    name,last\tyear
    john;doe,1986
    """

    result =
      CSVWithUserState.parse_string_with_state(
        csv,
        init_user_state: 0,
        state_transform_function: count_rows
      )

    assert result == {[~w(john doe 1986)], 1}
  end
end
end