From ff5ed33cf94fc3304bb18a0237976744772da495 Mon Sep 17 00:00:00 2001 From: zeljko predjeskovic Date: Sun, 8 Dec 2024 11:17:42 +0100 Subject: [PATCH 1/3] CSV.parse - Support Headers --- spec/std/csv/csv_parse_spec.cr | 11 +++++++++++ src/csv.cr | 18 ++++++++++++++++++ src/csv/parser.cr | 18 ++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/spec/std/csv/csv_parse_spec.cr b/spec/std/csv/csv_parse_spec.cr index 5f9906f35333..c93ae5be2b53 100644 --- a/spec/std/csv/csv_parse_spec.cr +++ b/spec/std/csv/csv_parse_spec.cr @@ -49,6 +49,17 @@ describe CSV do CSV.parse(%("","")).should eq([["", ""]]) end + it "parses to hashes" do + csv_text = "Index,Customer Id,First Name,Last Name + 1,DD37Cf93aecA6Dc,Sheryl,Baxter + 2,1Ef7b82A4CAAD10,Preston,Lozano + 3,6F94879bDAfE5a6,,Berry" + + CSV.parse_to_h(csv_text).should eq([{"Index" => "1", "Customer Id" => "DD37Cf93aecA6Dc", "First Name" => "Sheryl", "Last Name" => "Baxter"}, + {"Index" => "2", "Customer Id" => "1Ef7b82A4CAAD10", "First Name" => "Preston", "Last Name" => "Lozano"}, + {"Index" => "3", "Customer Id" => "6F94879bDAfE5a6", "First Name" => "", "Last Name" => "Berry"}]) + end + it "raises if single quote in the middle" do expect_raises CSV::MalformedCSVError, "Unexpected quote at line 1, column 4" do CSV.parse(%(hel"lo)) diff --git a/src/csv.cr b/src/csv.cr index 6751085d28cc..2a3c5dd79f67 100644 --- a/src/csv.cr +++ b/src/csv.cr @@ -78,6 +78,24 @@ class CSV Parser.new(string_or_io, separator, quote_char).parse end + # Parses a CSV or `IO` into an array of hashes using the first row as headers. + # + # Takes optional *separator* and *quote_char* arguments for defining + # non-standard csv cell separators and quote characters. + # + # ``` + # require "csv" + # + # CSV.parse_to_hashes("name,age,city\nJohn,30,New York\nJane,25,San Francisco") + # # => [ + # # {"name" => "John", "age" => "30", "city" => "New York"}, + # # {"name" => "Jane", "age" => "25", "city" => "San Francisco"} + # # ] + # ``` + def self.parse_to_h(string_or_io : String | IO, separator : Char = DEFAULT_SEPARATOR, quote_char : Char = DEFAULT_QUOTE_CHAR) : Array(Hash(String, String)) + Parser.new(string_or_io, separator, quote_char).parse_to_h + end + # Yields each of a CSV's rows as an `Array(String)`. # # See `CSV.parse` about the *separator* and *quote_char* arguments. diff --git a/src/csv/parser.cr b/src/csv/parser.cr index 57491b726dce..32d0931d921e 100644 --- a/src/csv/parser.cr +++ b/src/csv/parser.cr @@ -19,6 +19,16 @@ class CSV::Parser rows end + def parse_to_h : Array(Hash(String, String)) + rows = [] of Hash(String, String) + if headers = next_row + while row = next_row + rows << parse_row_to_h_internal(headers, row) + end + end + rows + end + # Yields each of the remaining rows as an `Array(String)`. def each_row(&) : Nil while row = next_row @@ -71,6 +81,14 @@ class CSV::Parser end end + private def parse_row_to_h_internal(headers : Array(String), row : Array(String)) : Hash(String, String) + h = {} of String => String + headers.each_with_index do |header, i| + h[header] = row[i].strip || "" + end + h + end + private struct RowIterator include Iterator(Array(String)) From 6ef3352fb81970f925dd33eac52b20d035bc1051 Mon Sep 17 00:00:00 2001 From: Zeljko Predjeskovic Date: Mon, 9 Dec 2024 20:11:13 +0100 Subject: [PATCH 2/3] added updated parser_to_h and added test cases --- spec/std/csv/csv_parse_spec.cr | 31 +++++++++++++++++++++++++++---- src/csv/lexer.cr | 3 +++ src/csv/parser.cr | 20 +++++++++++++------- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/spec/std/csv/csv_parse_spec.cr b/spec/std/csv/csv_parse_spec.cr index c93ae5be2b53..3e505cd349ea 100644 --- a/spec/std/csv/csv_parse_spec.cr +++ b/spec/std/csv/csv_parse_spec.cr @@ -50,16 +50,39 @@ describe CSV do end it "parses to hashes" do - csv_text = "Index,Customer Id,First Name,Last Name - 1,DD37Cf93aecA6Dc,Sheryl,Baxter - 2,1Ef7b82A4CAAD10,Preston,Lozano - 3,6F94879bDAfE5a6,,Berry" + csv_text = "Index,Customer Id,First Name,Last Name\n\n1,DD37Cf93aecA6Dc,Sheryl,Baxter\n2,1Ef7b82A4CAAD10,Preston,Lozano\n3,6F94879bDAfE5a6,,Berry, Jerry, \n" CSV.parse_to_h(csv_text).should eq([{"Index" => "1", "Customer Id" => "DD37Cf93aecA6Dc", "First Name" => "Sheryl", "Last Name" => "Baxter"}, {"Index" => "2", "Customer Id" => "1Ef7b82A4CAAD10", "First Name" => "Preston", "Last Name" => "Lozano"}, {"Index" => "3", "Customer Id" => "6F94879bDAfE5a6", "First Name" => "", "Last Name" => "Berry"}]) end + it "parses to hashes with no headers" do + csv_text = "\n1,DD37Cf93aecA6Dc,Sheryl,Baxter\n2,1Ef7b82A4CAAD10,Preston,Lozano\n3,6F94879bDAfE5a6,,Berry" + + actual = [{} of String => String, {} of String => String, {} of String => String] + + CSV.parse_to_h(csv_text).should eq(actual) + end + + it "parses to hashes with only headers" do + csv_text = "Index,Customer Id,First Name,Last Name" + + CSV.parse_to_h(csv_text).should eq([] of Hash(String, String)) + end + + it "parses to hashes remaining rows" do + csv_text = "Index,Customer Id,First Name,Last Name\n1,DD37Cf93aecA6Dc,Sheryl,Baxter\n2,1Ef7b82A4CAAD10,Preston,Lozano\n3,6F94879bDAfE5a6,,Berry" + parser = CSV::Parser.new(csv_text) + # skip header + parser.next_row + # skip rows + parser.next_row + parser.next_row + + parser.parse_to_h.should eq([{"Index" => "3", "Customer Id" => "6F94879bDAfE5a6", "First Name" => "", "Last Name" => "Berry"}]) + end + it "raises if single quote in the middle" do expect_raises CSV::MalformedCSVError, "Unexpected quote at line 1, column 4" do CSV.parse(%(hel"lo)) diff --git a/src/csv/lexer.cr b/src/csv/lexer.cr index 9d3d04c68c0f..5109f630a7e0 100644 --- a/src/csv/lexer.cr +++ b/src/csv/lexer.cr @@ -29,6 +29,9 @@ abstract class CSV::Lexer getter separator : Char getter quote_char : Char + # :nodoc: + protected getter line_number : Int32 + # :nodoc: def initialize(@separator : Char = DEFAULT_SEPARATOR, @quote_char : Char = DEFAULT_QUOTE_CHAR) @token = Token.new diff --git a/src/csv/parser.cr b/src/csv/parser.cr index 32d0931d921e..a3460b302fe7 100644 --- a/src/csv/parser.cr +++ b/src/csv/parser.cr @@ -21,9 +21,17 @@ class CSV::Parser def parse_to_h : Array(Hash(String, String)) rows = [] of Hash(String, String) + row_number = @lexer.line_number + + rewind if headers = next_row - while row = next_row - rows << parse_row_to_h_internal(headers, row) + while @lexer.line_number < row_number + next_row + end + each_row do |row| + if parsed_row = parse_row_to_h_internal(headers, row) + rows << parsed_row + end end end rows @@ -81,12 +89,10 @@ class CSV::Parser end end - private def parse_row_to_h_internal(headers : Array(String), row : Array(String)) : Hash(String, String) + private def parse_row_to_h_internal(headers : Array(String), row : Array(String)) : Hash(String, String) | Nil h = {} of String => String - headers.each_with_index do |header, i| - h[header] = row[i].strip || "" - end - h + row.empty? ? return nil : headers.each_with_index { |header, i| h[header] = row[i] } + return h end private struct RowIterator From 647aa3e999ba009c0be535b9f97903f0c8b37907 Mon Sep 17 00:00:00 2001 From: Zeljko Predjeskovic Date: Mon, 30 Dec 2024 11:13:26 +0100 Subject: [PATCH 3/3] removed rewinding rows in parser --- spec/std/csv/csv_parse_spec.cr | 9 +++------ src/csv/lexer.cr | 3 --- src/csv/parser.cr | 8 +------- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/spec/std/csv/csv_parse_spec.cr b/spec/std/csv/csv_parse_spec.cr index 3e505cd349ea..3118488ef4c5 100644 --- a/spec/std/csv/csv_parse_spec.cr +++ b/spec/std/csv/csv_parse_spec.cr @@ -72,15 +72,12 @@ describe CSV do end it "parses to hashes remaining rows" do - csv_text = "Index,Customer Id,First Name,Last Name\n1,DD37Cf93aecA6Dc,Sheryl,Baxter\n2,1Ef7b82A4CAAD10,Preston,Lozano\n3,6F94879bDAfE5a6,,Berry" + csv_text = "Index,Customer Id,First Name,Last Name\n1,DD37Cf93aecA6Dc,Sheryl,Baxter\n2,1Ef7b82A4CAAD10,Preston,Lozano" parser = CSV::Parser.new(csv_text) - # skip header - parser.next_row - # skip rows - parser.next_row + parser.next_row - parser.parse_to_h.should eq([{"Index" => "3", "Customer Id" => "6F94879bDAfE5a6", "First Name" => "", "Last Name" => "Berry"}]) + parser.parse_to_h.should eq([{"1" => "2", "DD37Cf93aecA6Dc" => "1Ef7b82A4CAAD10", "Sheryl" => "Preston", "Baxter" => "Lozano"}]) end it "raises if single quote in the middle" do diff --git a/src/csv/lexer.cr b/src/csv/lexer.cr index 5109f630a7e0..9d3d04c68c0f 100644 --- a/src/csv/lexer.cr +++ b/src/csv/lexer.cr @@ -29,9 +29,6 @@ abstract class CSV::Lexer getter separator : Char getter quote_char : Char - # :nodoc: - protected getter line_number : Int32 - # :nodoc: def initialize(@separator : Char = DEFAULT_SEPARATOR, @quote_char : Char = DEFAULT_QUOTE_CHAR) @token = Token.new diff --git a/src/csv/parser.cr b/src/csv/parser.cr index a3460b302fe7..60296db75839 100644 --- a/src/csv/parser.cr +++ b/src/csv/parser.cr @@ -21,14 +21,8 @@ class CSV::Parser def parse_to_h : Array(Hash(String, String)) rows = [] of Hash(String, String) - row_number = @lexer.line_number - - rewind if headers = next_row - while @lexer.line_number < row_number - next_row - end - each_row do |row| + while row = next_row if parsed_row = parse_row_to_h_internal(headers, row) rows << parsed_row end