From 0d73e6266a8841202938ba4e3b201c1e8c91c601 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 15 Jan 2025 23:21:07 +0100 Subject: [PATCH] CSV: fix parsing files with double-quote inside a field value Fixes #11660 --- .../double_quotes_in_middle_of_field_bis.csv | 3 + autotest/ogr/ogr_csv.py | 20 ++++++ port/cpl_csv.cpp | 63 ++++++++++++------- 3 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 autotest/ogr/data/csv/double_quotes_in_middle_of_field_bis.csv diff --git a/autotest/ogr/data/csv/double_quotes_in_middle_of_field_bis.csv b/autotest/ogr/data/csv/double_quotes_in_middle_of_field_bis.csv new file mode 100644 index 000000000000..050d95d81045 --- /dev/null +++ b/autotest/ogr/data/csv/double_quotes_in_middle_of_field_bis.csv @@ -0,0 +1,3 @@ +first,second,third +1,two"with quote,3 +10,twenty"with quote,30 diff --git a/autotest/ogr/ogr_csv.py b/autotest/ogr/ogr_csv.py index 0ab07472aed8..977c87952769 100755 --- a/autotest/ogr/ogr_csv.py +++ b/autotest/ogr/ogr_csv.py @@ -2800,6 +2800,26 @@ def test_ogr_csv_double_quotes_in_middle_of_field(): assert f["str"] == "foo" +############################################################################### +# Test bugfix for https://github.com/OSGeo/gdal/issues/11660 + + +def test_ogr_csv_double_quotes_in_middle_of_field_bis(): + + ds = ogr.Open("data/csv/double_quotes_in_middle_of_field_bis.csv") + lyr = ds.GetLayer(0) + + f = lyr.GetNextFeature() + assert f["first"] == "1" + assert f["second"] == """two"with quote""" + assert f["third"] == "3" + + f = lyr.GetNextFeature() + assert f["first"] == "10" + assert f["second"] == """twenty"with quote""" + assert f["third"] == "30" + + ############################################################################### diff --git a/port/cpl_csv.cpp b/port/cpl_csv.cpp index 6acf55fe234c..36e0216f39ff 100644 --- a/port/cpl_csv.cpp +++ b/port/cpl_csv.cpp @@ -647,45 +647,66 @@ CSVReadParseLineGeneric(void *fp, const char *(*pfnReadLine)(void *, size_t), return CSVSplitLine(pszLine, pszDelimiter, bKeepLeadingAndClosingQuotes, bMergeDelimiter); + const size_t nDelimiterLength = strlen(pszDelimiter); + bool bInString = false; // keep in that scope ! + std::string osWorkLine(pszLine); // keep in that scope ! + size_t i = 0; // keep in that scope ! + try { - // We must now count the quotes in our working string, and as - // long as it is odd, keep adding new lines. - std::string osWorkLine(pszLine); - - size_t i = 0; - int nCount = 0; - while (true) { - for (; i < osWorkLine.size(); i++) + for (; i < osWorkLine.size(); ++i) { if (osWorkLine[i] == '\"') - nCount++; + { + if (!bInString) + { + // Only consider " as the start of a quoted string + // if it is the first character of the line, or + // if it is immediately after the field delimiter. + if (i == 0 || + (i >= nDelimiterLength && + osWorkLine.compare(i - nDelimiterLength, + nDelimiterLength, pszDelimiter, + nDelimiterLength) == 0)) + { + bInString = true; + } + } + else if (i + 1 < osWorkLine.size() && + osWorkLine[i + 1] == '"') + { + // Escaped double quote in a quoted string + ++i; + } + else + { + bInString = false; + } + } } - if (nCount % 2 == 0) - break; + if (!bInString) + { + return CSVSplitLine(osWorkLine.c_str(), pszDelimiter, + bKeepLeadingAndClosingQuotes, + bMergeDelimiter); + } - pszLine = pfnReadLine(fp, nMaxLineSize); - if (pszLine == nullptr) + const char *pszNewLine = pfnReadLine(fp, nMaxLineSize); + if (pszNewLine == nullptr) break; osWorkLine.append("\n"); - osWorkLine.append(pszLine); + osWorkLine.append(pszNewLine); } - - char **papszReturn = - CSVSplitLine(osWorkLine.c_str(), pszDelimiter, - bKeepLeadingAndClosingQuotes, bMergeDelimiter); - - return papszReturn; } catch (const std::exception &e) { CPLError(CE_Failure, CPLE_OutOfMemory, "%s", e.what()); - return nullptr; } + return nullptr; } /************************************************************************/