Skip to content

Commit

Permalink
WIP: allow the user to control the parse state.
Browse files Browse the repository at this point in the history
  • Loading branch information
diasbruno committed Oct 7, 2023
1 parent 0b23904 commit 6b9c1eb
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 25 deletions.
79 changes: 59 additions & 20 deletions lib/nimble_csv.ex
Original file line number Diff line number Diff line change
Expand Up @@ -334,31 +334,53 @@ defmodule NimbleCSV do
## Parser

# Lazily parses the elements of `stream` into rows with Stream.transform.
# The accumulator starts as the `{user_state, parse_state}` pair returned by
# init_parser/1. Each element is passed through maybe_to_utf8, parsed by
# parse/4, and the result is folded by to_enum/3, which also applies
# `state_transform` to the user state. finalize_parser/1 is the last callback
# and raises ParseError if the final state is an unfinished escape.
# NOTE(review): this is a diff rendering — the 3-tuple `init_parser` match and
# the bare `&parse(...)` reducer look like the removed lines, while the 4-tuple
# match and the `&to_enum(...)` reducer look like their replacements; confirm.
# NOTE(review): the final user state is not exposed to the caller of this
# function in the code shown here.
def parse_stream(stream, opts \\ []) when is_list(opts) do
{state, separator, escape} = init_parser(opts)
{state, state_transform, separator, escape} = init_parser(opts)

Stream.transform(
stream,
fn -> state end,
&parse(maybe_to_utf8(&1), &2, separator, escape),
&to_enum(
parse(maybe_to_utf8(&1), &2, separator, escape),
&2,
state_transform
),
&finalize_parser/1
)
# Commented-out sketch of a state transform with debug output.
# NOTE(review): it takes and returns a `{count, parse_state}` tuple, whereas
# to_enum/3 calls the transform as `fstate.(user_state, kind)` and uses the
# result as the new user state only — the sketch looks stale.
# fn {count, parse_state}, kind ->
# :io.format("state: ~p ~p ~p~n" , [count, parse_state, kind])
# case kind do
# :line -> {count + 1, :line}
# :header -> {count, :line}
# end
# end
end

# Eagerly parses every element of `enumerable` with Enum.flat_map_reduce/3,
# threading the `{user_state, parse_state}` accumulator built by init_parser/1.
# Each element goes through maybe_to_utf8 and parse/4; to_enum/3 turns the
# result into emitted rows plus the next accumulator. After the reduce,
# finalize_parser/1 raises ParseError if an escape was left unfinished.
# NOTE(review): this is a diff rendering — the lines matching a 3-tuple from
# init_parser, `{lines, state} =`, the bare `&parse(...)` reducer,
# `finalize_parser(state)` and the bare `lines` result look like the removed
# version; confirm before copying.
# NOTE(review): the newer lines return `{lines, user_state}` instead of the
# bare `lines` list, which changes the result shape for existing callers of
# this public function — confirm this is intended.
def parse_enumerable(enumerable, opts \\ []) when is_list(opts) do
{state, separator, escape} = init_parser(opts)
{state, state_transform, separator, escape} = init_parser(opts)

{lines, state} =
{lines, {user_state, state}} =
Enum.flat_map_reduce(
enumerable,
state,
&parse(maybe_to_utf8(&1), &2, separator, escape)
&to_enum(
parse(maybe_to_utf8(&1), &2, separator, escape),
&2,
state_transform
)
)

finalize_parser(state)
lines
# Commented-out sketch of a state transform with debug output.
# NOTE(review): it takes and returns a `{count, parse_state}` tuple, which does
# not match the `fstate.(user_state, kind)` call made in to_enum/3.
# fn {count, parse_state}, kind ->
# :io.format("state: ~p ~p ~p~n" , [count, parse_state, kind])
# case kind do
# :line -> {count + 1, :line}
# :header -> {count, :line}
# end
# end
finalize_parser({user_state, state})
{lines, user_state}
end

def parse_string(string, opts \\ []) when is_binary(string) and is_list(opts) do
def parse_string_with_state(string, opts \\ []) when is_binary(string) and is_list(opts) do
newline = :binary.compile_pattern(@encoded_newlines)
string = string |> maybe_trim_bom()

Expand All @@ -382,6 +404,12 @@ defmodule NimbleCSV do
|> parse_enumerable(opts)
end

# Parses `string` and returns only the parsed rows. It delegates to
# parse_string_with_state/2 and drops the user state from the result, so
# callers keep receiving a plain list of rows.
def parse_string(string, opts \\ []) when is_binary(string) and is_list(opts) do
  {rows, _user_state} = parse_string_with_state(string, opts)
  rows
end

def to_line_stream(stream) do
newline = :binary.compile_pattern(@encoded_newlines)

Expand Down Expand Up @@ -429,33 +457,44 @@ defmodule NimbleCSV do
# Builds the final `{:cont, ...}` tuple from the leftover buffer: an empty
# binary emits nothing, anything else is emitted as one last chunk.
# NOTE(review): presumably the after-callback of to_line_stream/1, whose body
# is not fully visible here — confirm.
defp to_line_stream_after_fun(leftover) do
  case leftover do
    "" -> {:cont, []}
    _ -> {:cont, [leftover], []}
  end
end

# Fallback state transform used when no :state_transform_function option is
# given: ignores the event kind and hands the user state back unchanged.
defp default_state_transform(user_state, _kind), do: user_state

# Builds the initial parser configuration from `opts`.
#
# Options read here:
#   * :skip_headers (default true) - start in the :header state, otherwise :line
#   * :state_transform_function (default &default_state_transform/2) - called by
#     to_enum/3 as `fun.(user_state, kind)`
#   * :init_user_state (default :unused) - initial user state
#
# The newer lines return
# `{{user_state, line_parse_state}, transform_fun, separator_pattern, escape_pattern}`.
# NOTE(review): this is a diff rendering — the `state = ...` binding and the
# 3-tuple result look like the removed version; confirm before copying.
defp init_parser(opts) do
state = if Keyword.get(opts, :skip_headers, true), do: :header, else: :line
{state, :binary.compile_pattern(@separator), :binary.compile_pattern(@escape)}
line_parse_state = if Keyword.get(opts, :skip_headers, true), do: :header, else: :line
# An explicit `state_transform_function: nil` also falls back to the default.
state_transform_function = case Keyword.get(opts, :state_transform_function, nil) do
nil -> &default_state_transform/2
x -> x
end
init_user_state = Keyword.get(opts, :init_user_state, :unused)

{{init_user_state, line_parse_state},
state_transform_function,
:binary.compile_pattern(@separator),
:binary.compile_pattern(@escape)}
end

# Validates the final parser accumulator. Raises ParseError when parsing ended
# while still in an `{:escape, _, _, _}` state (the escape was never closed);
# any other accumulator yields :ok.
# NOTE(review): this is a diff rendering — the head matching a bare
# `{:escape, _, _, _}` looks like the removed line and the head matching
# `{_user_state, {:escape, _, _, _}}` its replacement; confirm.
defp finalize_parser({:escape, _, _, _}) do
defp finalize_parser({_user_state, {:escape, _, _, _}}) do
raise ParseError, "expected escape character #{@escape} but reached the end of file"
end

defp finalize_parser(_) do
:ok
end

# Converts a raw parse result into the `{emitted_rows, next_accumulator}` pair
# expected by the Stream.transform / Enum.flat_map_reduce reducers.
#
# Newer (arity-3) clauses, with accumulator `{user_state, parse_state}`:
#   * {:header, _}        - emits nothing; user state becomes
#                           `fstate.(user_state, :header)`; next state is :line
#   * {:line, row}        - emits `row`; user state becomes
#                           `fstate.(user_state, :line)`; next state is :line
#   * {:escape, _, _, _}  - emits nothing; user state is left untouched and the
#                           escape tuple is carried as the parse state
# NOTE(review): this is a diff rendering — the arity-1 head and the clauses
# returning a bare :line / escape state look like the removed version; confirm.
defp to_enum(result) do
defp to_enum(result, {user_state, _state}, fstate) do
case result do
{:line, row} -> {[row], :line}
{:header, _} -> {[], :line}
{:escape, _, _, _} = escape -> {[], escape}
{:header, _} -> {[], {fstate.(user_state, :header), :line}}
{:line, row} -> {[row], {fstate.(user_state, :line), :line}}
{:escape, _, _, _} = escape -> {[], {user_state, escape}}
end
end

# Parses one `line` given the current accumulator.
#   * When the parse state is a pending `{:escape, entry, row, state}`, parsing
#     resumes through escape/6 with the carried entry and row.
#   * Otherwise a fresh row is started through separator/5.
# In the newer clauses the accumulator is `{user_state, parse_state}` and the
# raw result is returned; the reducers in parse_stream/2 and
# parse_enumerable/2 wrap it with to_enum/3.
# NOTE(review): this is a diff rendering — the heads matching a bare state and
# the bodies calling `to_enum(...)` directly look like the removed version;
# confirm before copying.
defp parse(line, {:escape, entry, row, state}, separator, escape) do
to_enum(escape(line, entry, row, state, separator, escape))
defp parse(line, {_user_state, {:escape, entry, row, state}}, separator, escape) do
escape(line, entry, row, state, separator, escape)
end

defp parse(line, state, separator, escape) do
to_enum(separator(line, [], state, separator, escape))
defp parse(line, {_user_state, state}, separator, escape) do
separator(line, [], state, separator, escape)
end

defmacrop newlines_separator!() do
Expand Down Expand Up @@ -588,7 +627,7 @@ defmodule NimbleCSV do
newlines_escape!(:binary.match(line, escape))
end

@compile {:inline, init_parser: 1, to_enum: 1, parse: 4}
@compile {:inline, init_parser: 1, to_enum: 3, parse: 4}

## Dumper

Expand Down
30 changes: 25 additions & 5 deletions test/nimble_csv_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,15 @@ defmodule NimbleCSVTest do
assert CSV.parse_enumerable([
"name,last,year\n",
"john,doe,1986\n"
]) == [~w(john doe 1986)]
]) == {[~w(john doe 1986)], :unused}

assert CSV.parse_enumerable(
[
"name,last,year\n",
"john,doe,1986\n"
],
skip_headers: false
) == [~w(name last year), ~w(john doe 1986)]
) == {[~w(name last year), ~w(john doe 1986)], :unused}

assert_raise NimbleCSV.ParseError,
~s(expected escape character " but reached the end of file),
Expand All @@ -198,9 +198,7 @@ defmodule NimbleCSVTest do
assert Spreadsheet.parse_enumerable([
utf16le("name\tage\n"),
utf16le("\"doe\tjohn\"\t27\n")
]) == [
["doe\tjohn", "27"]
]
]) == {[["doe\tjohn", "27"]], :unused}
end

test "parse_stream/2" do
Expand Down Expand Up @@ -487,4 +485,26 @@ defmodule NimbleCSVTest do

defp utf16le(binary), do: :unicode.characters_to_binary(binary, :utf8, {:utf16, :little})
defp utf16le_bom(), do: :unicode.encoding_to_bom({:utf16, :little})

describe "user state management" do
  # Parser accepting comma, semicolon and tab as field separators.
  NimbleCSV.define(CSVWithUserState, separator: [",", ";", "\t"])

  test "parse_string_with_state/2" do
    # User state is a counter: a :line event bumps it, a :header event leaves
    # it as is.
    count_rows = fn
      count, :line -> count + 1
      count, :header -> count
    end

    input = """
    name,last\tyear
    john;doe,1986
    """

    opts = [init_user_state: 0, state_transform_function: count_rows]

    assert CSVWithUserState.parse_string_with_state(input, opts) ==
             {[~w(john doe 1986)], 1}
  end
end
end

0 comments on commit 6b9c1eb

Please sign in to comment.