Skip to content

Commit

Permalink
Merge pull request #10799 from rouault/recode_improvements
Browse files Browse the repository at this point in the history
CPLRecode(): make ISO-8859-2 and -15 and CP437/CP1250/CP1251/CP1252 to UTF-8 always available
  • Loading branch information
rouault authored Sep 16, 2024
2 parents ca5b639 + ce9bbb5 commit c637701
Show file tree
Hide file tree
Showing 7 changed files with 1,112 additions and 67 deletions.
55 changes: 50 additions & 5 deletions autotest/cpp/test_cpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2846,22 +2846,23 @@ TEST_F(test_cpl, CPLJSONDocument)
}

// Test CPLRecodeIconv() with re-allocation
// (this test also passed on Windows using its native recoding API)
TEST_F(test_cpl, CPLRecodeIconv)
{
#ifdef CPL_RECODE_ICONV
#if defined(CPL_RECODE_ICONV) || defined(_WIN32)
int N = 32800;
char *pszIn = static_cast<char *>(CPLMalloc(N + 1));
for (int i = 0; i < N; i++)
pszIn[i] = '\xE9';
pszIn[i] = '\xA1';
pszIn[N] = 0;
char *pszExpected = static_cast<char *>(CPLMalloc(N * 2 + 1));
for (int i = 0; i < N; i++)
{
pszExpected[2 * i] = '\xC3';
pszExpected[2 * i + 1] = '\xA9';
pszExpected[2 * i] = '\xD0';
pszExpected[2 * i + 1] = '\x81';
}
pszExpected[N * 2] = 0;
char *pszRet = CPLRecode(pszIn, "ISO-8859-2", CPL_ENC_UTF8);
char *pszRet = CPLRecode(pszIn, "ISO-8859-5", CPL_ENC_UTF8);
EXPECT_EQ(memcmp(pszExpected, pszRet, N * 2 + 1), 0);
CPLFree(pszIn);
CPLFree(pszRet);
Expand All @@ -2871,6 +2872,50 @@ TEST_F(test_cpl, CPLRecodeIconv)
#endif
}

// CP1252 -> UTF-8 on a string made of a single Euro sign (byte 0x80), whose
// UTF-8 form is 3 bytes long, i.e. longer than the input.
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_strict_alloc)
{
    // Start from a clean state: no pending error, warning flags re-armed.
    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    char *pszUTF8 = nullptr;
    {
        // Keep any message out of the test output; it is inspected below.
        CPLPushErrorHandler(CPLQuietErrorHandler);
        pszUTF8 = CPLRecode("\x80", "CP1252", CPL_ENC_UTF8);
        CPLPopErrorHandler();
    }

    // The conversion must not have reported anything.
    EXPECT_STREQ(CPLGetLastErrorMsg(), "");

    // Euro sign in UTF-8 (E2 82 AC) plus the terminating NUL.
    const size_t nBytesToCompare = 4;
    EXPECT_EQ(memcmp(pszUTF8, "\xE2\x82\xAC\x00", nBytesToCompare), 0);

    CPLFree(pszUTF8);
}

// CP1252 -> UTF-8 with a Euro sign (byte 0x80) surrounded by ASCII characters:
// the ASCII bytes are expected unchanged around the 3-byte Euro sequence.
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_ascii)
{
    // Start from a clean state: no pending error, warning flags re-armed.
    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    char *pszUTF8 = nullptr;
    {
        // Keep any message out of the test output; it is inspected below.
        CPLPushErrorHandler(CPLQuietErrorHandler);
        pszUTF8 = CPLRecode("x\x80y", "CP1252", CPL_ENC_UTF8);
        CPLPopErrorHandler();
    }

    // The conversion must not have reported anything.
    EXPECT_STREQ(CPLGetLastErrorMsg(), "");

    // 'x' + E2 82 AC + 'y' + terminating NUL.
    const size_t nBytesToCompare = 6;
    EXPECT_EQ(memcmp(pszUTF8, "x\xE2\x82\xACy\x00", nBytesToCompare), 0);

    CPLFree(pszUTF8);
}

// CP1252 -> UTF-8 with an input byte that has no CP1252 mapping: a warning is
// expected and only the valid character shows up in the output.
TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_warning)
{
    // Start from a clean state: no pending error, warning flags re-armed.
    CPLClearRecodeWarningFlags();
    CPLErrorReset();

    char *pszUTF8 = nullptr;
    {
        // Keep the warning out of the test output; it is inspected below.
        CPLPushErrorHandler(CPLQuietErrorHandler);
        // \x90 is an invalid CP1252 character and will be skipped;
        // \x80 is the Euro sign.
        pszUTF8 = CPLRecode("\x90\x80", "CP1252", CPL_ENC_UTF8);
        CPLPopErrorHandler();
    }

    const char *const pszExpectedWarning =
        "One or several characters couldn't be converted correctly from CP1252 "
        "to UTF-8. This warning will not be emitted anymore";
    EXPECT_STREQ(CPLGetLastErrorMsg(), pszExpectedWarning);

    // Only the Euro sign (E2 82 AC) remains, followed by the terminating NUL.
    const size_t nBytesToCompare = 4;
    EXPECT_EQ(memcmp(pszUTF8, "\xE2\x82\xAC\x00", nBytesToCompare), 0);

    CPLFree(pszUTF8);
}

// Test CPLHTTPParseMultipartMime()
TEST_F(test_cpl, CPLHTTPParseMultipartMime)
{
Expand Down
9 changes: 9 additions & 0 deletions port/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,12 @@ if (NOT CMAKE_CROSSCOMPILING AND BUILD_VSIPRELOAD AND "${CMAKE_SYSTEM}" MATCHES
endforeach()
endif ()
endif ()

# Utility to generate cpl_character_sets.h and cpl_character_sets.c.
# EXCLUDE_FROM_ALL keeps it out of the default build: it is only compiled when
# this target, or the generate_cpl_character_sets target below, is requested.
# Note that the generator source includes <iconv.h>.
add_executable(character_set_conv_table_generator EXCLUDE_FROM_ALL character_set_conv_table_generator.c)

# Custom target that must be manually invoked if character_set_conv_table_generator.c is modified.
# It runs the generator with this source directory as working directory. The
# generator opens its two output files by relative name, so they are
# (re)written in place next to this CMakeLists.txt.
add_custom_target(generate_cpl_character_sets
COMMAND $<TARGET_FILE:character_set_conv_table_generator>
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
DEPENDS character_set_conv_table_generator)
124 changes: 124 additions & 0 deletions port/character_set_conv_table_generator.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// SPDX-License-Identifier: MIT
// Copyright 2024 Even Rouault

#include <assert.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>

/* Size, terminating NUL included, of the buffers holding an encoding name. */
#define ENCODING_MAX_LEN 256

/*
 * Turn an encoding name into something usable inside a C identifier.
 *
 * Copies srcEncoding into srcEncodingLaundered (truncating it to
 * ENCODING_MAX_LEN - 1 characters if needed) and replaces every '-' of the
 * copy by '_'. For instance "ISO-8859-2" becomes "ISO_8859_2".
 */
static void launder_name(const char *srcEncoding,
                         char srcEncodingLaundered[ENCODING_MAX_LEN])
{
    snprintf(srcEncodingLaundered, ENCODING_MAX_LEN, "%s", srcEncoding);

    char *cursor = srcEncodingLaundered;
    while (*cursor != '\0')
    {
        if (*cursor == '-')
        {
            *cursor = '_';
        }
        ++cursor;
    }
}

/*
 * Write into c_file the table converting the single-byte encoding
 * srcEncoding to UTF-8, as a "static const CPLCodePageConvTable" definition
 * named CPL_<laundered encoding name>_to_UTF8.
 *
 * Each byte 0x00..0xFF is converted on its own with iconv():
 *  - bytes 0x00..0x7F produce no table row; they are only checked to convert
 *    to themselves;
 *  - bytes 0x80..0xFF produce one row each, made of the first 3 bytes of the
 *    UTF-8 output (unused trailing bytes are 0);
 *  - bytes 0x80..0xFF that iconv() fails to convert produce a {0, 0, 0} row
 *    tagged "invalid".
 *
 * Parameters:
 *  c_file       stream receiving the generated C source.
 *  h_file       stream of the generated header. Not written to by this
 *               function.
 *  srcEncoding  iconv name of the source encoding, e.g. "CP1252".
 *  comment      text emitted as a C comment just above the table.
 *
 * The process is terminated with exit(1), after a message on stderr, when
 * the encoding is unknown to iconv, when an ASCII byte does not map to
 * itself, or when a byte needs more than 3 UTF-8 bytes. Those checks are
 * explicit rather than assert()-based so that they are still performed when
 * the generator is compiled with NDEBUG.
 */
static void generate(FILE *c_file, FILE *h_file, const char *srcEncoding,
                     const char *comment)
{
    /* Tables are file-local to the generated .c: nothing goes to the header. */
    (void)h_file;

    iconv_t sConv = iconv_open("UTF-8", srcEncoding);
    if (sConv == (iconv_t)(-1))
    {
        fprintf(stderr, "iconv_open(%s) failed\n", srcEncoding);
        exit(1);
    }

    char srcEncodingLaundered[ENCODING_MAX_LEN];
    launder_name(srcEncoding, srcEncodingLaundered);
    fprintf(c_file, "/* %s */\n", comment);
    fprintf(c_file, "static const CPLCodePageConvTable CPL_%s_to_UTF8 = {\n",
            srcEncodingLaundered);

    for (int i = 0; i <= 255; ++i)
    {
        unsigned char c = (unsigned char)i;
        size_t size_in = 1;
        unsigned char out[4] = {0, 0, 0, 0};
        size_t size_out = sizeof(out);
        char *p_in = (char *)&c;
        char *p_out = (char *)out;
        const size_t nConverted =
            iconv(sConv, &p_in, &size_in, &p_out, &size_out);

        if (i <= 127)
        {
            /* The table only covers 0x80..0xFF: the lower half must be
             * plain ASCII for the generated table to be usable. */
            if (out[0] != i)
            {
                fprintf(stderr,
                        "%s: byte 0x%02X does not convert to itself\n",
                        srcEncoding, (unsigned)i);
                exit(1);
            }
            continue;
        }

        if (nConverted != (size_t)-1)
        {
            /* A table row holds 3 bytes; out[] has a 4th one only to be
             * able to detect longer sequences. */
            const size_t needed = sizeof(out) - size_out;
            if (needed > 3)
            {
                fprintf(stderr,
                        "%s: byte 0x%02X needs %zu UTF-8 bytes, "
                        "but a table row only holds 3\n",
                        srcEncoding, (unsigned)i, needed);
                exit(1);
            }
            fprintf(c_file, " {0x%02X, 0x%02X, 0x%02X},\n", out[0], out[1],
                    out[2]);
        }
        else
        {
            fprintf(c_file, " {0, 0, 0}, /* invalid */\n");
        }
    }

    fprintf(c_file, "};\n\n");
    iconv_close(sConv);
}

/*
 * Entry point of the generator.
 *
 * Creates (or overwrites) cpl_character_sets.c and cpl_character_sets.h in
 * the current working directory. The .c file receives one conversion table
 * per supported encoding plus the CPLGetConversionTableToUTF8() lookup
 * function; the .h file receives the CPLCodePageConvTable typedef and the
 * prototype of that function.
 *
 * Returns 0 on success, and 1 if an output file cannot be opened or if
 * writing to or closing one of them fails. generate() may also terminate
 * the process with exit(1).
 */
int main(void)
{
    /* Banner shared by both generated files. It names this source file so
     * that readers of the generated code can find the generator. */
    static const char banner[] =
        "/* This file has been generated by "
        "character_set_conv_table_generator.c */\n"
        "/* DO NOT EDIT !*/\n\n"
        "/* clang-format off */\n";

    FILE *c_file = fopen("cpl_character_sets.c", "wb");
    if (c_file == NULL)
    {
        perror("cpl_character_sets.c");
        return 1;
    }
    FILE *h_file = fopen("cpl_character_sets.h", "wb");
    if (h_file == NULL)
    {
        perror("cpl_character_sets.h");
        fclose(c_file);
        return 1;
    }

    fputs(banner, c_file);
    fprintf(c_file, "#include \"cpl_port.h\"\n");
    fprintf(c_file, "#include \"cpl_character_sets.h\"\n\n");

    fputs(banner, h_file);
    /* One row of 3 bytes per source byte in the 0x80..0xFF range. */
    fprintf(h_file, "typedef unsigned char CPLCodePageConvTable[128][3];\n");

    /* Supported encodings, terminated by a {NULL, NULL} sentinel. */
    const struct
    {
        const char *name;
        const char *comment;
    } encodings[] = {
        {"CP437", "Character set of original IBM PC"},
        {"CP1250", "Central and eastern Europe languages"},
        {"CP1251", "Cyrillic script"},
        {"CP1252",
         "Legacy Windows single-byte character set used in a lot of countries"},
        {"ISO-8859-2", "Central Europe languages"},
        {"ISO-8859-15", "New Western Europe"},
        {NULL, NULL}};

    /* First pass: emit one table per encoding. */
    for (int i = 0; encodings[i].name; ++i)
    {
        generate(c_file, h_file, encodings[i].name, encodings[i].comment);
    }

    fprintf(h_file, "\n");
    fprintf(h_file, "const CPLCodePageConvTable* "
                    "CPLGetConversionTableToUTF8(const char* pszEncoding);\n");

    /* Second pass: emit the function mapping an encoding name to its table. */
    fprintf(c_file, "\nconst CPLCodePageConvTable* "
                    "CPLGetConversionTableToUTF8(const char* pszEncoding)\n");
    fprintf(c_file, "{\n");
    for (int i = 0; encodings[i].name; ++i)
    {
        char srcEncodingLaundered[ENCODING_MAX_LEN];
        launder_name(encodings[i].name, srcEncodingLaundered);
        fprintf(c_file, " if (EQUAL(pszEncoding, \"%s\"))\n",
                encodings[i].name);
        fprintf(c_file, " return &CPL_%s_to_UTF8;\n",
                srcEncodingLaundered);
    }
    fprintf(c_file, " return CPL_NULLPTR;\n");
    fprintf(c_file, "}\n");
    fprintf(c_file, "/* clang-format on */\n");
    fprintf(h_file, "/* clang-format on */\n");

    /* A failed write sets the stream error indicator, and buffered data may
     * only hit the disk in fclose(): check both before reporting success. */
    int ret = 0;
    if (ferror(c_file))
    {
        fprintf(stderr, "Error while writing cpl_character_sets.c\n");
        ret = 1;
    }
    if (ferror(h_file))
    {
        fprintf(stderr, "Error while writing cpl_character_sets.h\n");
        ret = 1;
    }
    if (fclose(c_file) != 0)
    {
        perror("cpl_character_sets.c");
        ret = 1;
    }
    if (fclose(h_file) != 0)
    {
        perror("cpl_character_sets.h");
        ret = 1;
    }
    return ret;
}
Loading

0 comments on commit c637701

Please sign in to comment.