From ce9bbb5ba9bdc857a0072e8954f5c26aadeb3d71 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 14 Sep 2024 17:46:37 +0200 Subject: [PATCH] CPLRecode(): make ISO-8859-2 and -15 and CP437/CP1250/CP1251/CP1252 to UTF-8 always available (but not other direction yet) This will help for common use cases when using a minimal GDAL build lacking iconv support. --- autotest/cpp/test_cpl.cpp | 55 +- port/CMakeLists.txt | 9 + port/character_set_conv_table_generator.c | 124 ++++ port/cpl_character_sets.c | 818 ++++++++++++++++++++++ port/cpl_character_sets.h | 8 + port/cpl_recode.cpp | 25 +- port/cpl_recode_stub.cpp | 140 ++-- 7 files changed, 1112 insertions(+), 67 deletions(-) create mode 100644 port/character_set_conv_table_generator.c create mode 100644 port/cpl_character_sets.c create mode 100644 port/cpl_character_sets.h diff --git a/autotest/cpp/test_cpl.cpp b/autotest/cpp/test_cpl.cpp index be5292265689..710aa876f490 100644 --- a/autotest/cpp/test_cpl.cpp +++ b/autotest/cpp/test_cpl.cpp @@ -2846,22 +2846,23 @@ TEST_F(test_cpl, CPLJSONDocument) } // Test CPLRecodeIconv() with re-allocation +// (this test also passes on Windows using its native recoding API) TEST_F(test_cpl, CPLRecodeIconv) { -#ifdef CPL_RECODE_ICONV +#if defined(CPL_RECODE_ICONV) || defined(_WIN32) int N = 32800; char *pszIn = static_cast<char *>(CPLMalloc(N + 1)); for (int i = 0; i < N; i++) - pszIn[i] = '\xE9'; + pszIn[i] = '\xA1'; pszIn[N] = 0; char *pszExpected = static_cast<char *>(CPLMalloc(N * 2 + 1)); for (int i = 0; i < N; i++) { - pszExpected[2 * i] = '\xC3'; - pszExpected[2 * i + 1] = '\xA9'; + pszExpected[2 * i] = '\xD0'; + pszExpected[2 * i + 1] = '\x81'; } pszExpected[N * 2] = 0; - char *pszRet = CPLRecode(pszIn, "ISO-8859-2", CPL_ENC_UTF8); + char *pszRet = CPLRecode(pszIn, "ISO-8859-5", CPL_ENC_UTF8); EXPECT_EQ(memcmp(pszExpected, pszRet, N * 2 + 1), 0); CPLFree(pszIn); CPLFree(pszRet); @@ -2871,6 +2872,50 @@ TEST_F(test_cpl, CPLRecodeIconv) #endif } +// Test CP1252 to UTF-8: a single byte expanding to 3 bytes +TEST_F(test_cpl, 
CPLRecodeStubCP1252_to_UTF8_strict_alloc) +{ + CPLClearRecodeWarningFlags(); + CPLErrorReset(); + CPLPushErrorHandler(CPLQuietErrorHandler); + // The Euro sign (\x80 in CP1252) expands to the 3 bytes E2 82 AC + char *pszRet = CPLRecode("\x80", "CP1252", CPL_ENC_UTF8); + CPLPopErrorHandler(); + EXPECT_STREQ(CPLGetLastErrorMsg(), ""); + EXPECT_EQ(memcmp(pszRet, "\xE2\x82\xAC\x00", 4), 0); + CPLFree(pszRet); +} + +// Test CP1252 to UTF-8: ASCII bytes around a byte expanding to 3 bytes +TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_ascii) +{ + CPLClearRecodeWarningFlags(); + CPLErrorReset(); + CPLPushErrorHandler(CPLQuietErrorHandler); + char *pszRet = CPLRecode("x\x80y", "CP1252", CPL_ENC_UTF8); + CPLPopErrorHandler(); + EXPECT_STREQ(CPLGetLastErrorMsg(), ""); + EXPECT_EQ(memcmp(pszRet, "x\xE2\x82\xACy\x00", 6), 0); + CPLFree(pszRet); +} + +// Test CP1252 to UTF-8: an unassigned byte is dropped and a warning emitted +TEST_F(test_cpl, CPLRecodeStubCP1252_to_UTF8_with_warning) +{ + CPLClearRecodeWarningFlags(); + CPLErrorReset(); + CPLPushErrorHandler(CPLQuietErrorHandler); + // \x90 has no CP1252 mapping: it is skipped, only \x80 is converted + char *pszRet = CPLRecode("\x90\x80", "CP1252", CPL_ENC_UTF8); + CPLPopErrorHandler(); + EXPECT_STREQ( + CPLGetLastErrorMsg(), + "One or several characters couldn't be converted correctly from CP1252 " + "to UTF-8. 
This warning will not be emitted anymore"); + EXPECT_EQ(memcmp(pszRet, "\xE2\x82\xAC\x00", 4), 0); + CPLFree(pszRet); +} + // Test CPLHTTPParseMultipartMime() TEST_F(test_cpl, CPLHTTPParseMultipartMime) { diff --git a/port/CMakeLists.txt b/port/CMakeLists.txt index bb87bc8c3e37..26469f1b47a0 100644 --- a/port/CMakeLists.txt +++ b/port/CMakeLists.txt @@ -374,3 +374,12 @@ if (NOT CMAKE_CROSSCOMPILING AND BUILD_VSIPRELOAD AND "${CMAKE_SYSTEM}" MATCHES endforeach() endif () endif ()
+
+# Utility to generate cpl_character_sets.h and .c
+add_executable(character_set_conv_table_generator EXCLUDE_FROM_ALL character_set_conv_table_generator.c)
+
+# Custom target that must be manually invoked if character_set_conv_table_generator.c is modified
+add_custom_target(generate_cpl_character_sets
+    COMMAND $<TARGET_FILE:character_set_conv_table_generator>
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEPENDS character_set_conv_table_generator)
diff --git a/port/character_set_conv_table_generator.c b/port/character_set_conv_table_generator.c
new file mode 100644
index 000000000000..9b8928f6bac6
--- /dev/null
+++ b/port/character_set_conv_table_generator.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: MIT
+// Copyright 2024 Even Rouault
+
+// Stand-alone utility that (re)generates cpl_character_sets.c and
+// cpl_character_sets.h in the current working directory. For each listed
+// single-byte encoding, it asks iconv for the UTF-8 form of bytes 128..255
+// and emits them as a table of 128 entries of at most 3 bytes each.
+
+#include <iconv.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ENCODING_MAX_LEN 256
+
+// Report "<what>(<detail>) failed" on stderr and abort the generation.
+static void fatal(const char *what, const char *detail)
+{
+    fprintf(stderr, "%s(%s) failed\n", what, detail);
+    exit(1);
+}
+
+// Copy srcEncoding into srcEncodingLaundered (truncating it if it does not
+// fit in ENCODING_MAX_LEN bytes) and replace each '-' by '_', so that the
+// result can be used as part of a C identifier.
+static void launder_name(const char *srcEncoding,
+                         char srcEncodingLaundered[ENCODING_MAX_LEN])
+{
+    snprintf(srcEncodingLaundered, ENCODING_MAX_LEN, "%s", srcEncoding);
+    for (int i = 0; srcEncodingLaundered[i]; ++i)
+    {
+        if (srcEncodingLaundered[i] == '-')
+            srcEncodingLaundered[i] = '_';
+    }
+}
+
+// Append to c_file the definition of the CPL_<encoding>_to_UTF8 table,
+// preceded by comment. Bytes that iconv cannot convert get a {0, 0, 0} entry.
+// h_file is currently unused: nothing per-encoding is written to the header.
+static void generate(FILE *c_file, FILE *h_file, const char *srcEncoding,
+                     const char *comment)
+{
+    (void)h_file;
+    iconv_t sConv = iconv_open("UTF-8", srcEncoding);
+    if (sConv == (iconv_t)(-1))
+        fatal("iconv_open", srcEncoding);
+    char srcEncodingLaundered[ENCODING_MAX_LEN];
+    launder_name(srcEncoding, srcEncodingLaundered);
+    fprintf(c_file, "/* %s */\n", comment);
+    fprintf(c_file, "static const CPLCodePageConvTable CPL_%s_to_UTF8 = {\n",
+            srcEncodingLaundered);
+    for (int i = 0; i <= 255; ++i)
+    {
+        unsigned char c = (unsigned char)i;
+        size_t size_in = 1;
+        unsigned char out[4] = {0, 0, 0, 0};
+        size_t size_out = sizeof(out);
+        char *p_in = (char *)&c;
+        char *p_out = (char *)out;
+        const size_t nConverted =
+            iconv(sConv, &p_in, &size_in, &p_out, &size_out);
+        if (i <= 127)
+        {
+            // The table only covers bytes 128..255, so the lower half must
+            // map to itself. Checked explicitly rather than with assert(),
+            // which is compiled out when NDEBUG is defined.
+            if (out[0] != i)
+                fatal("ASCII identity check", srcEncoding);
+            continue;
+        }
+        if (nConverted != (size_t)-1)
+        {
+            // An entry stores 2 or 3 bytes: its first byte being 0 marks an
+            // invalid input byte, and the decoder always emits the second.
+            const size_t produced = sizeof(out) - size_out;
+            if (produced < 2 || produced > 3)
+                fatal("table entry size check", srcEncoding);
+            fprintf(c_file, " {0x%02X, 0x%02X, 0x%02X},\n", out[0], out[1],
+                    out[2]);
+        }
+        else
+        {
+            fprintf(c_file, " {0, 0, 0}, /* invalid */\n");
+            // Bring the descriptor back to its initial state after the error.
+            (void)iconv(sConv, NULL, NULL, NULL, NULL);
+        }
+    }
+    fprintf(c_file, "};\n\n");
+    iconv_close(sConv);
+}
+
+int main(void)
+{
+    FILE *c_file = fopen("cpl_character_sets.c", "wb");
+    if (c_file == NULL)
+        fatal("fopen", "cpl_character_sets.c");
+    FILE *h_file = fopen("cpl_character_sets.h", "wb");
+    if (h_file == NULL)
+        fatal("fopen", "cpl_character_sets.h");
+    // NOTE: the banner names generate_character_set_conv_tables.c although
+    // this file is character_set_conv_table_generator.c. The text is left
+    // untouched so that regenerated files keep the banner already committed.
+    fprintf(c_file, "/* This file has been generated by "
+                    "generate_character_set_conv_tables.c */\n");
+    fprintf(c_file, "/* DO NOT EDIT !*/\n\n");
+    fprintf(c_file, "/* clang-format off */\n");
+    fprintf(c_file, "#include \"cpl_port.h\"\n");
+    fprintf(c_file, "#include \"cpl_character_sets.h\"\n\n");
+
+    fprintf(h_file, "/* This file has been generated by "
+                    "generate_character_set_conv_tables.c */\n");
+    fprintf(h_file, "/* DO NOT EDIT !*/\n\n");
+    fprintf(h_file, "/* clang-format off */\n");
+    fprintf(h_file, "typedef unsigned char CPLCodePageConvTable[128][3];\n");
+
+    const struct
+    {
+        const char *name;
+        const char *comment;
+    } encodings[] = {
+        {"CP437", "Character set of original IBM PC"},
+        {"CP1250", "Central and eastern Europe languages"},
+        {"CP1251", "Cyrillic script"},
+        {"CP1252",
+         "Legacy Windows single-byte character set used in a lot of countries"},
+        {"ISO-8859-2", "Central Europe languages"},
+        {"ISO-8859-15", "New Western Europe"},
+        {NULL, NULL}};
+
+    for (int i = 0; encodings[i].name; ++i)
+    {
+        generate(c_file, h_file, encodings[i].name, encodings[i].comment);
+    }
+    fprintf(h_file, "\n");
+    fprintf(h_file, "const CPLCodePageConvTable* "
+                    "CPLGetConversionTableToUTF8(const char* pszEncoding);\n");
+
+    fprintf(c_file, "\nconst CPLCodePageConvTable* "
+                    "CPLGetConversionTableToUTF8(const char* pszEncoding)\n");
+    fprintf(c_file, "{\n");
+    for (int i = 0; encodings[i].name; ++i)
+    {
+        char srcEncodingLaundered[ENCODING_MAX_LEN];
+        launder_name(encodings[i].name, srcEncodingLaundered);
+        fprintf(c_file, " if (EQUAL(pszEncoding, \"%s\"))\n",
+                encodings[i].name);
+        fprintf(c_file, " return &CPL_%s_to_UTF8;\n",
+                srcEncodingLaundered);
+    }
+    fprintf(c_file, " return CPL_NULLPTR;\n");
+    fprintf(c_file, "}\n");
+    fprintf(c_file, "/* clang-format on */\n");
+    fprintf(h_file, "/* clang-format on */\n");
+    // fclose() flushes buffered output: a failure there, or an earlier write
+    // error, means the generated file is incomplete.
+    const int c_failed = ferror(c_file);
+    if (fclose(c_file) != 0 || c_failed)
+        fatal("write", "cpl_character_sets.c");
+    const int h_failed = ferror(h_file);
+    if (fclose(h_file) != 0 || h_failed)
+        fatal("write", "cpl_character_sets.h");
+    return 0;
+}
diff --git a/port/cpl_character_sets.c b/port/cpl_character_sets.c new file mode 100644 index 000000000000..65a326059785 --- /dev/null +++ b/port/cpl_character_sets.c @@ -0,0 +1,818 @@ +/* This file has been generated by generate_character_set_conv_tables.c */ +/* DO NOT EDIT !*/ + +/* clang-format off */ +#include "cpl_port.h" +#include "cpl_character_sets.h" + +/* Character set of original IBM PC */ +static const CPLCodePageConvTable CPL_CP437_to_UTF8 = { + {0xC3, 0x87, 0x00}, + {0xC3, 0xBC, 0x00}, + {0xC3, 0xA9, 0x00}, + {0xC3, 0xA2, 0x00}, + {0xC3, 0xA4, 0x00}, + {0xC3, 0xA0, 0x00}, + {0xC3, 0xA5, 0x00}, + {0xC3, 0xA7, 0x00}, + {0xC3, 0xAA, 0x00}, + {0xC3, 0xAB, 0x00}, + {0xC3, 0xA8, 0x00}, + {0xC3, 0xAF, 0x00}, + {0xC3, 0xAE, 0x00}, + {0xC3, 0xAC, 0x00}, + {0xC3, 0x84, 0x00}, + {0xC3, 0x85, 0x00}, + {0xC3, 0x89, 0x00}, + {0xC3, 0xA6, 0x00}, + {0xC3, 0x86, 0x00}, + {0xC3, 0xB4, 0x00}, + {0xC3, 0xB6, 0x00}, + {0xC3, 0xB2, 0x00}, + {0xC3, 0xBB, 0x00}, + {0xC3, 0xB9, 0x00}, + {0xC3, 0xBF, 0x00}, + {0xC3, 0x96, 0x00}, + {0xC3, 0x9C, 0x00}, + {0xC2, 0xA2, 0x00}, + {0xC2, 0xA3, 0x00}, + {0xC2, 0xA5, 0x00}, + {0xE2, 0x82, 0xA7}, + {0xC6, 0x92, 0x00}, + {0xC3, 0xA1, 0x00}, + {0xC3, 0xAD, 
0x00}, + {0xC3, 0xB3, 0x00}, + {0xC3, 0xBA, 0x00}, + {0xC3, 0xB1, 0x00}, + {0xC3, 0x91, 0x00}, + {0xC2, 0xAA, 0x00}, + {0xC2, 0xBA, 0x00}, + {0xC2, 0xBF, 0x00}, + {0xE2, 0x8C, 0x90}, + {0xC2, 0xAC, 0x00}, + {0xC2, 0xBD, 0x00}, + {0xC2, 0xBC, 0x00}, + {0xC2, 0xA1, 0x00}, + {0xC2, 0xAB, 0x00}, + {0xC2, 0xBB, 0x00}, + {0xE2, 0x96, 0x91}, + {0xE2, 0x96, 0x92}, + {0xE2, 0x96, 0x93}, + {0xE2, 0x94, 0x82}, + {0xE2, 0x94, 0xA4}, + {0xE2, 0x95, 0xA1}, + {0xE2, 0x95, 0xA2}, + {0xE2, 0x95, 0x96}, + {0xE2, 0x95, 0x95}, + {0xE2, 0x95, 0xA3}, + {0xE2, 0x95, 0x91}, + {0xE2, 0x95, 0x97}, + {0xE2, 0x95, 0x9D}, + {0xE2, 0x95, 0x9C}, + {0xE2, 0x95, 0x9B}, + {0xE2, 0x94, 0x90}, + {0xE2, 0x94, 0x94}, + {0xE2, 0x94, 0xB4}, + {0xE2, 0x94, 0xAC}, + {0xE2, 0x94, 0x9C}, + {0xE2, 0x94, 0x80}, + {0xE2, 0x94, 0xBC}, + {0xE2, 0x95, 0x9E}, + {0xE2, 0x95, 0x9F}, + {0xE2, 0x95, 0x9A}, + {0xE2, 0x95, 0x94}, + {0xE2, 0x95, 0xA9}, + {0xE2, 0x95, 0xA6}, + {0xE2, 0x95, 0xA0}, + {0xE2, 0x95, 0x90}, + {0xE2, 0x95, 0xAC}, + {0xE2, 0x95, 0xA7}, + {0xE2, 0x95, 0xA8}, + {0xE2, 0x95, 0xA4}, + {0xE2, 0x95, 0xA5}, + {0xE2, 0x95, 0x99}, + {0xE2, 0x95, 0x98}, + {0xE2, 0x95, 0x92}, + {0xE2, 0x95, 0x93}, + {0xE2, 0x95, 0xAB}, + {0xE2, 0x95, 0xAA}, + {0xE2, 0x94, 0x98}, + {0xE2, 0x94, 0x8C}, + {0xE2, 0x96, 0x88}, + {0xE2, 0x96, 0x84}, + {0xE2, 0x96, 0x8C}, + {0xE2, 0x96, 0x90}, + {0xE2, 0x96, 0x80}, + {0xCE, 0xB1, 0x00}, + {0xC3, 0x9F, 0x00}, + {0xCE, 0x93, 0x00}, + {0xCF, 0x80, 0x00}, + {0xCE, 0xA3, 0x00}, + {0xCF, 0x83, 0x00}, + {0xC2, 0xB5, 0x00}, + {0xCF, 0x84, 0x00}, + {0xCE, 0xA6, 0x00}, + {0xCE, 0x98, 0x00}, + {0xCE, 0xA9, 0x00}, + {0xCE, 0xB4, 0x00}, + {0xE2, 0x88, 0x9E}, + {0xCF, 0x86, 0x00}, + {0xCE, 0xB5, 0x00}, + {0xE2, 0x88, 0xA9}, + {0xE2, 0x89, 0xA1}, + {0xC2, 0xB1, 0x00}, + {0xE2, 0x89, 0xA5}, + {0xE2, 0x89, 0xA4}, + {0xE2, 0x8C, 0xA0}, + {0xE2, 0x8C, 0xA1}, + {0xC3, 0xB7, 0x00}, + {0xE2, 0x89, 0x88}, + {0xC2, 0xB0, 0x00}, + {0xE2, 0x88, 0x99}, + {0xC2, 0xB7, 0x00}, + {0xE2, 0x88, 0x9A}, + {0xE2, 
0x81, 0xBF}, + {0xC2, 0xB2, 0x00}, + {0xE2, 0x96, 0xA0}, + {0xC2, 0xA0, 0x00}, +}; + +/* Central and eastern Europe languages */ +static const CPLCodePageConvTable CPL_CP1250_to_UTF8 = { + {0xE2, 0x82, 0xAC}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0x9A}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0x9E}, + {0xE2, 0x80, 0xA6}, + {0xE2, 0x80, 0xA0}, + {0xE2, 0x80, 0xA1}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0xB0}, + {0xC5, 0xA0, 0x00}, + {0xE2, 0x80, 0xB9}, + {0xC5, 0x9A, 0x00}, + {0xC5, 0xA4, 0x00}, + {0xC5, 0xBD, 0x00}, + {0xC5, 0xB9, 0x00}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0x98}, + {0xE2, 0x80, 0x99}, + {0xE2, 0x80, 0x9C}, + {0xE2, 0x80, 0x9D}, + {0xE2, 0x80, 0xA2}, + {0xE2, 0x80, 0x93}, + {0xE2, 0x80, 0x94}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x84, 0xA2}, + {0xC5, 0xA1, 0x00}, + {0xE2, 0x80, 0xBA}, + {0xC5, 0x9B, 0x00}, + {0xC5, 0xA5, 0x00}, + {0xC5, 0xBE, 0x00}, + {0xC5, 0xBA, 0x00}, + {0xC2, 0xA0, 0x00}, + {0xCB, 0x87, 0x00}, + {0xCB, 0x98, 0x00}, + {0xC5, 0x81, 0x00}, + {0xC2, 0xA4, 0x00}, + {0xC4, 0x84, 0x00}, + {0xC2, 0xA6, 0x00}, + {0xC2, 0xA7, 0x00}, + {0xC2, 0xA8, 0x00}, + {0xC2, 0xA9, 0x00}, + {0xC5, 0x9E, 0x00}, + {0xC2, 0xAB, 0x00}, + {0xC2, 0xAC, 0x00}, + {0xC2, 0xAD, 0x00}, + {0xC2, 0xAE, 0x00}, + {0xC5, 0xBB, 0x00}, + {0xC2, 0xB0, 0x00}, + {0xC2, 0xB1, 0x00}, + {0xCB, 0x9B, 0x00}, + {0xC5, 0x82, 0x00}, + {0xC2, 0xB4, 0x00}, + {0xC2, 0xB5, 0x00}, + {0xC2, 0xB6, 0x00}, + {0xC2, 0xB7, 0x00}, + {0xC2, 0xB8, 0x00}, + {0xC4, 0x85, 0x00}, + {0xC5, 0x9F, 0x00}, + {0xC2, 0xBB, 0x00}, + {0xC4, 0xBD, 0x00}, + {0xCB, 0x9D, 0x00}, + {0xC4, 0xBE, 0x00}, + {0xC5, 0xBC, 0x00}, + {0xC5, 0x94, 0x00}, + {0xC3, 0x81, 0x00}, + {0xC3, 0x82, 0x00}, + {0xC4, 0x82, 0x00}, + {0xC3, 0x84, 0x00}, + {0xC4, 0xB9, 0x00}, + {0xC4, 0x86, 0x00}, + {0xC3, 0x87, 0x00}, + {0xC4, 0x8C, 0x00}, + {0xC3, 0x89, 0x00}, + {0xC4, 0x98, 0x00}, + {0xC3, 0x8B, 0x00}, + {0xC4, 0x9A, 0x00}, + {0xC3, 0x8D, 0x00}, + {0xC3, 0x8E, 0x00}, + {0xC4, 0x8E, 0x00}, + {0xC4, 0x90, 0x00}, + 
{0xC5, 0x83, 0x00}, + {0xC5, 0x87, 0x00}, + {0xC3, 0x93, 0x00}, + {0xC3, 0x94, 0x00}, + {0xC5, 0x90, 0x00}, + {0xC3, 0x96, 0x00}, + {0xC3, 0x97, 0x00}, + {0xC5, 0x98, 0x00}, + {0xC5, 0xAE, 0x00}, + {0xC3, 0x9A, 0x00}, + {0xC5, 0xB0, 0x00}, + {0xC3, 0x9C, 0x00}, + {0xC3, 0x9D, 0x00}, + {0xC5, 0xA2, 0x00}, + {0xC3, 0x9F, 0x00}, + {0xC5, 0x95, 0x00}, + {0xC3, 0xA1, 0x00}, + {0xC3, 0xA2, 0x00}, + {0xC4, 0x83, 0x00}, + {0xC3, 0xA4, 0x00}, + {0xC4, 0xBA, 0x00}, + {0xC4, 0x87, 0x00}, + {0xC3, 0xA7, 0x00}, + {0xC4, 0x8D, 0x00}, + {0xC3, 0xA9, 0x00}, + {0xC4, 0x99, 0x00}, + {0xC3, 0xAB, 0x00}, + {0xC4, 0x9B, 0x00}, + {0xC3, 0xAD, 0x00}, + {0xC3, 0xAE, 0x00}, + {0xC4, 0x8F, 0x00}, + {0xC4, 0x91, 0x00}, + {0xC5, 0x84, 0x00}, + {0xC5, 0x88, 0x00}, + {0xC3, 0xB3, 0x00}, + {0xC3, 0xB4, 0x00}, + {0xC5, 0x91, 0x00}, + {0xC3, 0xB6, 0x00}, + {0xC3, 0xB7, 0x00}, + {0xC5, 0x99, 0x00}, + {0xC5, 0xAF, 0x00}, + {0xC3, 0xBA, 0x00}, + {0xC5, 0xB1, 0x00}, + {0xC3, 0xBC, 0x00}, + {0xC3, 0xBD, 0x00}, + {0xC5, 0xA3, 0x00}, + {0xCB, 0x99, 0x00}, +}; + +/* Cyrillic script */ +static const CPLCodePageConvTable CPL_CP1251_to_UTF8 = { + {0xD0, 0x82, 0x00}, + {0xD0, 0x83, 0x00}, + {0xE2, 0x80, 0x9A}, + {0xD1, 0x93, 0x00}, + {0xE2, 0x80, 0x9E}, + {0xE2, 0x80, 0xA6}, + {0xE2, 0x80, 0xA0}, + {0xE2, 0x80, 0xA1}, + {0xE2, 0x82, 0xAC}, + {0xE2, 0x80, 0xB0}, + {0xD0, 0x89, 0x00}, + {0xE2, 0x80, 0xB9}, + {0xD0, 0x8A, 0x00}, + {0xD0, 0x8C, 0x00}, + {0xD0, 0x8B, 0x00}, + {0xD0, 0x8F, 0x00}, + {0xD1, 0x92, 0x00}, + {0xE2, 0x80, 0x98}, + {0xE2, 0x80, 0x99}, + {0xE2, 0x80, 0x9C}, + {0xE2, 0x80, 0x9D}, + {0xE2, 0x80, 0xA2}, + {0xE2, 0x80, 0x93}, + {0xE2, 0x80, 0x94}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x84, 0xA2}, + {0xD1, 0x99, 0x00}, + {0xE2, 0x80, 0xBA}, + {0xD1, 0x9A, 0x00}, + {0xD1, 0x9C, 0x00}, + {0xD1, 0x9B, 0x00}, + {0xD1, 0x9F, 0x00}, + {0xC2, 0xA0, 0x00}, + {0xD0, 0x8E, 0x00}, + {0xD1, 0x9E, 0x00}, + {0xD0, 0x88, 0x00}, + {0xC2, 0xA4, 0x00}, + {0xD2, 0x90, 0x00}, + {0xC2, 0xA6, 0x00}, + {0xC2, 0xA7, 
0x00}, + {0xD0, 0x81, 0x00}, + {0xC2, 0xA9, 0x00}, + {0xD0, 0x84, 0x00}, + {0xC2, 0xAB, 0x00}, + {0xC2, 0xAC, 0x00}, + {0xC2, 0xAD, 0x00}, + {0xC2, 0xAE, 0x00}, + {0xD0, 0x87, 0x00}, + {0xC2, 0xB0, 0x00}, + {0xC2, 0xB1, 0x00}, + {0xD0, 0x86, 0x00}, + {0xD1, 0x96, 0x00}, + {0xD2, 0x91, 0x00}, + {0xC2, 0xB5, 0x00}, + {0xC2, 0xB6, 0x00}, + {0xC2, 0xB7, 0x00}, + {0xD1, 0x91, 0x00}, + {0xE2, 0x84, 0x96}, + {0xD1, 0x94, 0x00}, + {0xC2, 0xBB, 0x00}, + {0xD1, 0x98, 0x00}, + {0xD0, 0x85, 0x00}, + {0xD1, 0x95, 0x00}, + {0xD1, 0x97, 0x00}, + {0xD0, 0x90, 0x00}, + {0xD0, 0x91, 0x00}, + {0xD0, 0x92, 0x00}, + {0xD0, 0x93, 0x00}, + {0xD0, 0x94, 0x00}, + {0xD0, 0x95, 0x00}, + {0xD0, 0x96, 0x00}, + {0xD0, 0x97, 0x00}, + {0xD0, 0x98, 0x00}, + {0xD0, 0x99, 0x00}, + {0xD0, 0x9A, 0x00}, + {0xD0, 0x9B, 0x00}, + {0xD0, 0x9C, 0x00}, + {0xD0, 0x9D, 0x00}, + {0xD0, 0x9E, 0x00}, + {0xD0, 0x9F, 0x00}, + {0xD0, 0xA0, 0x00}, + {0xD0, 0xA1, 0x00}, + {0xD0, 0xA2, 0x00}, + {0xD0, 0xA3, 0x00}, + {0xD0, 0xA4, 0x00}, + {0xD0, 0xA5, 0x00}, + {0xD0, 0xA6, 0x00}, + {0xD0, 0xA7, 0x00}, + {0xD0, 0xA8, 0x00}, + {0xD0, 0xA9, 0x00}, + {0xD0, 0xAA, 0x00}, + {0xD0, 0xAB, 0x00}, + {0xD0, 0xAC, 0x00}, + {0xD0, 0xAD, 0x00}, + {0xD0, 0xAE, 0x00}, + {0xD0, 0xAF, 0x00}, + {0xD0, 0xB0, 0x00}, + {0xD0, 0xB1, 0x00}, + {0xD0, 0xB2, 0x00}, + {0xD0, 0xB3, 0x00}, + {0xD0, 0xB4, 0x00}, + {0xD0, 0xB5, 0x00}, + {0xD0, 0xB6, 0x00}, + {0xD0, 0xB7, 0x00}, + {0xD0, 0xB8, 0x00}, + {0xD0, 0xB9, 0x00}, + {0xD0, 0xBA, 0x00}, + {0xD0, 0xBB, 0x00}, + {0xD0, 0xBC, 0x00}, + {0xD0, 0xBD, 0x00}, + {0xD0, 0xBE, 0x00}, + {0xD0, 0xBF, 0x00}, + {0xD1, 0x80, 0x00}, + {0xD1, 0x81, 0x00}, + {0xD1, 0x82, 0x00}, + {0xD1, 0x83, 0x00}, + {0xD1, 0x84, 0x00}, + {0xD1, 0x85, 0x00}, + {0xD1, 0x86, 0x00}, + {0xD1, 0x87, 0x00}, + {0xD1, 0x88, 0x00}, + {0xD1, 0x89, 0x00}, + {0xD1, 0x8A, 0x00}, + {0xD1, 0x8B, 0x00}, + {0xD1, 0x8C, 0x00}, + {0xD1, 0x8D, 0x00}, + {0xD1, 0x8E, 0x00}, + {0xD1, 0x8F, 0x00}, +}; + +/* Legacy Windows single-byte character set used 
in a lot of countries */ +static const CPLCodePageConvTable CPL_CP1252_to_UTF8 = { + {0xE2, 0x82, 0xAC}, + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0x9A}, + {0xC6, 0x92, 0x00}, + {0xE2, 0x80, 0x9E}, + {0xE2, 0x80, 0xA6}, + {0xE2, 0x80, 0xA0}, + {0xE2, 0x80, 0xA1}, + {0xCB, 0x86, 0x00}, + {0xE2, 0x80, 0xB0}, + {0xC5, 0xA0, 0x00}, + {0xE2, 0x80, 0xB9}, + {0xC5, 0x92, 0x00}, + {0, 0, 0}, /* invalid */ + {0xC5, 0xBD, 0x00}, + {0, 0, 0}, /* invalid */ + {0, 0, 0}, /* invalid */ + {0xE2, 0x80, 0x98}, + {0xE2, 0x80, 0x99}, + {0xE2, 0x80, 0x9C}, + {0xE2, 0x80, 0x9D}, + {0xE2, 0x80, 0xA2}, + {0xE2, 0x80, 0x93}, + {0xE2, 0x80, 0x94}, + {0xCB, 0x9C, 0x00}, + {0xE2, 0x84, 0xA2}, + {0xC5, 0xA1, 0x00}, + {0xE2, 0x80, 0xBA}, + {0xC5, 0x93, 0x00}, + {0, 0, 0}, /* invalid */ + {0xC5, 0xBE, 0x00}, + {0xC5, 0xB8, 0x00}, + {0xC2, 0xA0, 0x00}, + {0xC2, 0xA1, 0x00}, + {0xC2, 0xA2, 0x00}, + {0xC2, 0xA3, 0x00}, + {0xC2, 0xA4, 0x00}, + {0xC2, 0xA5, 0x00}, + {0xC2, 0xA6, 0x00}, + {0xC2, 0xA7, 0x00}, + {0xC2, 0xA8, 0x00}, + {0xC2, 0xA9, 0x00}, + {0xC2, 0xAA, 0x00}, + {0xC2, 0xAB, 0x00}, + {0xC2, 0xAC, 0x00}, + {0xC2, 0xAD, 0x00}, + {0xC2, 0xAE, 0x00}, + {0xC2, 0xAF, 0x00}, + {0xC2, 0xB0, 0x00}, + {0xC2, 0xB1, 0x00}, + {0xC2, 0xB2, 0x00}, + {0xC2, 0xB3, 0x00}, + {0xC2, 0xB4, 0x00}, + {0xC2, 0xB5, 0x00}, + {0xC2, 0xB6, 0x00}, + {0xC2, 0xB7, 0x00}, + {0xC2, 0xB8, 0x00}, + {0xC2, 0xB9, 0x00}, + {0xC2, 0xBA, 0x00}, + {0xC2, 0xBB, 0x00}, + {0xC2, 0xBC, 0x00}, + {0xC2, 0xBD, 0x00}, + {0xC2, 0xBE, 0x00}, + {0xC2, 0xBF, 0x00}, + {0xC3, 0x80, 0x00}, + {0xC3, 0x81, 0x00}, + {0xC3, 0x82, 0x00}, + {0xC3, 0x83, 0x00}, + {0xC3, 0x84, 0x00}, + {0xC3, 0x85, 0x00}, + {0xC3, 0x86, 0x00}, + {0xC3, 0x87, 0x00}, + {0xC3, 0x88, 0x00}, + {0xC3, 0x89, 0x00}, + {0xC3, 0x8A, 0x00}, + {0xC3, 0x8B, 0x00}, + {0xC3, 0x8C, 0x00}, + {0xC3, 0x8D, 0x00}, + {0xC3, 0x8E, 0x00}, + {0xC3, 0x8F, 0x00}, + {0xC3, 0x90, 0x00}, + {0xC3, 0x91, 0x00}, + {0xC3, 0x92, 0x00}, + {0xC3, 0x93, 0x00}, + {0xC3, 0x94, 0x00}, + {0xC3, 0x95, 0x00}, 
+ {0xC3, 0x96, 0x00}, + {0xC3, 0x97, 0x00}, + {0xC3, 0x98, 0x00}, + {0xC3, 0x99, 0x00}, + {0xC3, 0x9A, 0x00}, + {0xC3, 0x9B, 0x00}, + {0xC3, 0x9C, 0x00}, + {0xC3, 0x9D, 0x00}, + {0xC3, 0x9E, 0x00}, + {0xC3, 0x9F, 0x00}, + {0xC3, 0xA0, 0x00}, + {0xC3, 0xA1, 0x00}, + {0xC3, 0xA2, 0x00}, + {0xC3, 0xA3, 0x00}, + {0xC3, 0xA4, 0x00}, + {0xC3, 0xA5, 0x00}, + {0xC3, 0xA6, 0x00}, + {0xC3, 0xA7, 0x00}, + {0xC3, 0xA8, 0x00}, + {0xC3, 0xA9, 0x00}, + {0xC3, 0xAA, 0x00}, + {0xC3, 0xAB, 0x00}, + {0xC3, 0xAC, 0x00}, + {0xC3, 0xAD, 0x00}, + {0xC3, 0xAE, 0x00}, + {0xC3, 0xAF, 0x00}, + {0xC3, 0xB0, 0x00}, + {0xC3, 0xB1, 0x00}, + {0xC3, 0xB2, 0x00}, + {0xC3, 0xB3, 0x00}, + {0xC3, 0xB4, 0x00}, + {0xC3, 0xB5, 0x00}, + {0xC3, 0xB6, 0x00}, + {0xC3, 0xB7, 0x00}, + {0xC3, 0xB8, 0x00}, + {0xC3, 0xB9, 0x00}, + {0xC3, 0xBA, 0x00}, + {0xC3, 0xBB, 0x00}, + {0xC3, 0xBC, 0x00}, + {0xC3, 0xBD, 0x00}, + {0xC3, 0xBE, 0x00}, + {0xC3, 0xBF, 0x00}, +}; + +/* Central Europe languages */ +static const CPLCodePageConvTable CPL_ISO_8859_2_to_UTF8 = { + {0xC2, 0x80, 0x00}, + {0xC2, 0x81, 0x00}, + {0xC2, 0x82, 0x00}, + {0xC2, 0x83, 0x00}, + {0xC2, 0x84, 0x00}, + {0xC2, 0x85, 0x00}, + {0xC2, 0x86, 0x00}, + {0xC2, 0x87, 0x00}, + {0xC2, 0x88, 0x00}, + {0xC2, 0x89, 0x00}, + {0xC2, 0x8A, 0x00}, + {0xC2, 0x8B, 0x00}, + {0xC2, 0x8C, 0x00}, + {0xC2, 0x8D, 0x00}, + {0xC2, 0x8E, 0x00}, + {0xC2, 0x8F, 0x00}, + {0xC2, 0x90, 0x00}, + {0xC2, 0x91, 0x00}, + {0xC2, 0x92, 0x00}, + {0xC2, 0x93, 0x00}, + {0xC2, 0x94, 0x00}, + {0xC2, 0x95, 0x00}, + {0xC2, 0x96, 0x00}, + {0xC2, 0x97, 0x00}, + {0xC2, 0x98, 0x00}, + {0xC2, 0x99, 0x00}, + {0xC2, 0x9A, 0x00}, + {0xC2, 0x9B, 0x00}, + {0xC2, 0x9C, 0x00}, + {0xC2, 0x9D, 0x00}, + {0xC2, 0x9E, 0x00}, + {0xC2, 0x9F, 0x00}, + {0xC2, 0xA0, 0x00}, + {0xC4, 0x84, 0x00}, + {0xCB, 0x98, 0x00}, + {0xC5, 0x81, 0x00}, + {0xC2, 0xA4, 0x00}, + {0xC4, 0xBD, 0x00}, + {0xC5, 0x9A, 0x00}, + {0xC2, 0xA7, 0x00}, + {0xC2, 0xA8, 0x00}, + {0xC5, 0xA0, 0x00}, + {0xC5, 0x9E, 0x00}, + {0xC5, 0xA4, 0x00}, + 
{0xC5, 0xB9, 0x00}, + {0xC2, 0xAD, 0x00}, + {0xC5, 0xBD, 0x00}, + {0xC5, 0xBB, 0x00}, + {0xC2, 0xB0, 0x00}, + {0xC4, 0x85, 0x00}, + {0xCB, 0x9B, 0x00}, + {0xC5, 0x82, 0x00}, + {0xC2, 0xB4, 0x00}, + {0xC4, 0xBE, 0x00}, + {0xC5, 0x9B, 0x00}, + {0xCB, 0x87, 0x00}, + {0xC2, 0xB8, 0x00}, + {0xC5, 0xA1, 0x00}, + {0xC5, 0x9F, 0x00}, + {0xC5, 0xA5, 0x00}, + {0xC5, 0xBA, 0x00}, + {0xCB, 0x9D, 0x00}, + {0xC5, 0xBE, 0x00}, + {0xC5, 0xBC, 0x00}, + {0xC5, 0x94, 0x00}, + {0xC3, 0x81, 0x00}, + {0xC3, 0x82, 0x00}, + {0xC4, 0x82, 0x00}, + {0xC3, 0x84, 0x00}, + {0xC4, 0xB9, 0x00}, + {0xC4, 0x86, 0x00}, + {0xC3, 0x87, 0x00}, + {0xC4, 0x8C, 0x00}, + {0xC3, 0x89, 0x00}, + {0xC4, 0x98, 0x00}, + {0xC3, 0x8B, 0x00}, + {0xC4, 0x9A, 0x00}, + {0xC3, 0x8D, 0x00}, + {0xC3, 0x8E, 0x00}, + {0xC4, 0x8E, 0x00}, + {0xC4, 0x90, 0x00}, + {0xC5, 0x83, 0x00}, + {0xC5, 0x87, 0x00}, + {0xC3, 0x93, 0x00}, + {0xC3, 0x94, 0x00}, + {0xC5, 0x90, 0x00}, + {0xC3, 0x96, 0x00}, + {0xC3, 0x97, 0x00}, + {0xC5, 0x98, 0x00}, + {0xC5, 0xAE, 0x00}, + {0xC3, 0x9A, 0x00}, + {0xC5, 0xB0, 0x00}, + {0xC3, 0x9C, 0x00}, + {0xC3, 0x9D, 0x00}, + {0xC5, 0xA2, 0x00}, + {0xC3, 0x9F, 0x00}, + {0xC5, 0x95, 0x00}, + {0xC3, 0xA1, 0x00}, + {0xC3, 0xA2, 0x00}, + {0xC4, 0x83, 0x00}, + {0xC3, 0xA4, 0x00}, + {0xC4, 0xBA, 0x00}, + {0xC4, 0x87, 0x00}, + {0xC3, 0xA7, 0x00}, + {0xC4, 0x8D, 0x00}, + {0xC3, 0xA9, 0x00}, + {0xC4, 0x99, 0x00}, + {0xC3, 0xAB, 0x00}, + {0xC4, 0x9B, 0x00}, + {0xC3, 0xAD, 0x00}, + {0xC3, 0xAE, 0x00}, + {0xC4, 0x8F, 0x00}, + {0xC4, 0x91, 0x00}, + {0xC5, 0x84, 0x00}, + {0xC5, 0x88, 0x00}, + {0xC3, 0xB3, 0x00}, + {0xC3, 0xB4, 0x00}, + {0xC5, 0x91, 0x00}, + {0xC3, 0xB6, 0x00}, + {0xC3, 0xB7, 0x00}, + {0xC5, 0x99, 0x00}, + {0xC5, 0xAF, 0x00}, + {0xC3, 0xBA, 0x00}, + {0xC5, 0xB1, 0x00}, + {0xC3, 0xBC, 0x00}, + {0xC3, 0xBD, 0x00}, + {0xC5, 0xA3, 0x00}, + {0xCB, 0x99, 0x00}, +}; + +/* New Western Europe */ +static const CPLCodePageConvTable CPL_ISO_8859_15_to_UTF8 = { + {0xC2, 0x80, 0x00}, + {0xC2, 0x81, 0x00}, + {0xC2, 0x82, 
0x00}, + {0xC2, 0x83, 0x00}, + {0xC2, 0x84, 0x00}, + {0xC2, 0x85, 0x00}, + {0xC2, 0x86, 0x00}, + {0xC2, 0x87, 0x00}, + {0xC2, 0x88, 0x00}, + {0xC2, 0x89, 0x00}, + {0xC2, 0x8A, 0x00}, + {0xC2, 0x8B, 0x00}, + {0xC2, 0x8C, 0x00}, + {0xC2, 0x8D, 0x00}, + {0xC2, 0x8E, 0x00}, + {0xC2, 0x8F, 0x00}, + {0xC2, 0x90, 0x00}, + {0xC2, 0x91, 0x00}, + {0xC2, 0x92, 0x00}, + {0xC2, 0x93, 0x00}, + {0xC2, 0x94, 0x00}, + {0xC2, 0x95, 0x00}, + {0xC2, 0x96, 0x00}, + {0xC2, 0x97, 0x00}, + {0xC2, 0x98, 0x00}, + {0xC2, 0x99, 0x00}, + {0xC2, 0x9A, 0x00}, + {0xC2, 0x9B, 0x00}, + {0xC2, 0x9C, 0x00}, + {0xC2, 0x9D, 0x00}, + {0xC2, 0x9E, 0x00}, + {0xC2, 0x9F, 0x00}, + {0xC2, 0xA0, 0x00}, + {0xC2, 0xA1, 0x00}, + {0xC2, 0xA2, 0x00}, + {0xC2, 0xA3, 0x00}, + {0xE2, 0x82, 0xAC}, + {0xC2, 0xA5, 0x00}, + {0xC5, 0xA0, 0x00}, + {0xC2, 0xA7, 0x00}, + {0xC5, 0xA1, 0x00}, + {0xC2, 0xA9, 0x00}, + {0xC2, 0xAA, 0x00}, + {0xC2, 0xAB, 0x00}, + {0xC2, 0xAC, 0x00}, + {0xC2, 0xAD, 0x00}, + {0xC2, 0xAE, 0x00}, + {0xC2, 0xAF, 0x00}, + {0xC2, 0xB0, 0x00}, + {0xC2, 0xB1, 0x00}, + {0xC2, 0xB2, 0x00}, + {0xC2, 0xB3, 0x00}, + {0xC5, 0xBD, 0x00}, + {0xC2, 0xB5, 0x00}, + {0xC2, 0xB6, 0x00}, + {0xC2, 0xB7, 0x00}, + {0xC5, 0xBE, 0x00}, + {0xC2, 0xB9, 0x00}, + {0xC2, 0xBA, 0x00}, + {0xC2, 0xBB, 0x00}, + {0xC5, 0x92, 0x00}, + {0xC5, 0x93, 0x00}, + {0xC5, 0xB8, 0x00}, + {0xC2, 0xBF, 0x00}, + {0xC3, 0x80, 0x00}, + {0xC3, 0x81, 0x00}, + {0xC3, 0x82, 0x00}, + {0xC3, 0x83, 0x00}, + {0xC3, 0x84, 0x00}, + {0xC3, 0x85, 0x00}, + {0xC3, 0x86, 0x00}, + {0xC3, 0x87, 0x00}, + {0xC3, 0x88, 0x00}, + {0xC3, 0x89, 0x00}, + {0xC3, 0x8A, 0x00}, + {0xC3, 0x8B, 0x00}, + {0xC3, 0x8C, 0x00}, + {0xC3, 0x8D, 0x00}, + {0xC3, 0x8E, 0x00}, + {0xC3, 0x8F, 0x00}, + {0xC3, 0x90, 0x00}, + {0xC3, 0x91, 0x00}, + {0xC3, 0x92, 0x00}, + {0xC3, 0x93, 0x00}, + {0xC3, 0x94, 0x00}, + {0xC3, 0x95, 0x00}, + {0xC3, 0x96, 0x00}, + {0xC3, 0x97, 0x00}, + {0xC3, 0x98, 0x00}, + {0xC3, 0x99, 0x00}, + {0xC3, 0x9A, 0x00}, + {0xC3, 0x9B, 0x00}, + {0xC3, 0x9C, 0x00}, + {0xC3, 
0x9D, 0x00}, + {0xC3, 0x9E, 0x00}, + {0xC3, 0x9F, 0x00}, + {0xC3, 0xA0, 0x00}, + {0xC3, 0xA1, 0x00}, + {0xC3, 0xA2, 0x00}, + {0xC3, 0xA3, 0x00}, + {0xC3, 0xA4, 0x00}, + {0xC3, 0xA5, 0x00}, + {0xC3, 0xA6, 0x00}, + {0xC3, 0xA7, 0x00}, + {0xC3, 0xA8, 0x00}, + {0xC3, 0xA9, 0x00}, + {0xC3, 0xAA, 0x00}, + {0xC3, 0xAB, 0x00}, + {0xC3, 0xAC, 0x00}, + {0xC3, 0xAD, 0x00}, + {0xC3, 0xAE, 0x00}, + {0xC3, 0xAF, 0x00}, + {0xC3, 0xB0, 0x00}, + {0xC3, 0xB1, 0x00}, + {0xC3, 0xB2, 0x00}, + {0xC3, 0xB3, 0x00}, + {0xC3, 0xB4, 0x00}, + {0xC3, 0xB5, 0x00}, + {0xC3, 0xB6, 0x00}, + {0xC3, 0xB7, 0x00}, + {0xC3, 0xB8, 0x00}, + {0xC3, 0xB9, 0x00}, + {0xC3, 0xBA, 0x00}, + {0xC3, 0xBB, 0x00}, + {0xC3, 0xBC, 0x00}, + {0xC3, 0xBD, 0x00}, + {0xC3, 0xBE, 0x00}, + {0xC3, 0xBF, 0x00}, +}; + + +const CPLCodePageConvTable* CPLGetConversionTableToUTF8(const char* pszEncoding) +{ + if (EQUAL(pszEncoding, "CP437")) + return &CPL_CP437_to_UTF8; + if (EQUAL(pszEncoding, "CP1250")) + return &CPL_CP1250_to_UTF8; + if (EQUAL(pszEncoding, "CP1251")) + return &CPL_CP1251_to_UTF8; + if (EQUAL(pszEncoding, "CP1252")) + return &CPL_CP1252_to_UTF8; + if (EQUAL(pszEncoding, "ISO-8859-2")) + return &CPL_ISO_8859_2_to_UTF8; + if (EQUAL(pszEncoding, "ISO-8859-15")) + return &CPL_ISO_8859_15_to_UTF8; + return CPL_NULLPTR; +} + +/* clang-format on */ diff --git a/port/cpl_character_sets.h b/port/cpl_character_sets.h new file mode 100644 index 000000000000..358887e8166e --- /dev/null +++ b/port/cpl_character_sets.h @@ -0,0 +1,8 @@ +/* This file has been generated by generate_character_set_conv_tables.c */ +/* DO NOT EDIT !*/ + +/* clang-format off */ +typedef unsigned char CPLCodePageConvTable[128][3]; + +const CPLCodePageConvTable* CPLGetConversionTableToUTF8(const char* pszEncoding); +/* clang-format on */ diff --git a/port/cpl_recode.cpp b/port/cpl_recode.cpp index e22f5d6e095e..f4bb24f63a79 100644 --- a/port/cpl_recode.cpp +++ b/port/cpl_recode.cpp @@ -29,6 +29,7 @@ #include <cstring> #include "cpl_conv.h" +#include 
"cpl_character_sets.h" #include "utf8.h" @@ -91,27 +92,11 @@ char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding, EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1))) return CPLStrdup(pszSource); - /* -------------------------------------------------------------------- */ - /* For ZIP file handling */ - /* (CP437 might be missing even on some iconv, like on Mac) */ - /* -------------------------------------------------------------------- */ - if (EQUAL(pszSrcEncoding, "CP437") && - EQUAL(pszDstEncoding, CPL_ENC_UTF8)) // + // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables + if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) && + CPLGetConversionTableToUTF8(pszSrcEncoding)) { - bool bIsAllPrintableASCII = true; - const size_t nCharCount = strlen(pszSource); - for (size_t i = 0; i < nCharCount; i++) - { - if (pszSource[i] < 32 || pszSource[i] > 126) - { - bIsAllPrintableASCII = false; - break; - } - } - if (bIsAllPrintableASCII) - { - return CPLStrdup(pszSource); - } + return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding); } #ifdef CPL_RECODE_ICONV diff --git a/port/cpl_recode_stub.cpp b/port/cpl_recode_stub.cpp index 62e7d5cbf037..e44da896c0cb 100644 --- a/port/cpl_recode_stub.cpp +++ b/port/cpl_recode_stub.cpp @@ -36,6 +36,7 @@ #include "cpl_conv.h" #include "cpl_error.h" +#include "cpl_character_sets.c" static unsigned utf8decode(const char *p, const char *end, int *len); static unsigned utf8towc(const char *src, unsigned srclen, wchar_t *dst, @@ -161,39 +162,118 @@ char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding, return pszResult; } + // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables + if (EQUAL(pszDstEncoding, CPL_ENC_UTF8)) + { + const auto pConvTable = CPLGetConversionTableToUTF8(pszSrcEncoding); + if (pConvTable) + { + const auto convTable = *pConvTable; + const size_t nCharCount = strlen(pszSource); + char *pszResult = + static_cast<char *>(CPLCalloc(1, nCharCount * 3 + 1)); + size_t iDst = 0; + unsigned char *pabyResult = + 
reinterpret_cast<unsigned char *>(pszResult); + for (size_t i = 0; i < nCharCount; ++i) + { + const unsigned char nChar = + static_cast<unsigned char>(pszSource[i]); + if (nChar <= 127) + { + pszResult[iDst] = pszSource[i]; + ++iDst; + } + else + { + const unsigned char nShiftedChar = nChar - 128; + if (convTable[nShiftedChar][0]) + { + pabyResult[iDst] = convTable[nShiftedChar][0]; + ++iDst; + CPLAssert(convTable[nShiftedChar][1]); + pabyResult[iDst] = convTable[nShiftedChar][1]; + ++iDst; + if (convTable[nShiftedChar][2]) + { + pabyResult[iDst] = convTable[nShiftedChar][2]; + ++iDst; + } + } + else + { + // Skip the invalid sequence in the input string. + if (!bHaveWarned2) + { + bHaveWarned2 = true; + CPLError(CE_Warning, CPLE_AppDefined, + "One or several characters couldn't be " + "converted correctly from %s to %s. " + "This warning will not be emitted anymore", + pszSrcEncoding, pszDstEncoding); + } + } + } + } + + pszResult[iDst] = 0; + return pszResult; + } + } + #ifdef _WIN32 + const auto MapEncodingToWindowsCodePage = [](const char *pszEncoding) + { + // Cf https://learn.microsoft.com/fr-fr/windows/win32/intl/code-page-identifiers + if (STARTS_WITH(pszEncoding, "CP")) + { + const int nCode = atoi(pszEncoding + strlen("CP")); + if (nCode > 0) + return nCode; + else if (EQUAL(pszEncoding, "CP_OEMCP")) + return CP_OEMCP; + else if (EQUAL(pszEncoding, "CP_ACP")) + return CP_ACP; + } + else if (STARTS_WITH(pszEncoding, "WINDOWS-")) + { + const int nCode = atoi(pszEncoding + strlen("WINDOWS-")); + if (nCode > 0) + return nCode; + } + else if (STARTS_WITH(pszEncoding, "ISO-8859-")) + { + const int nCode = atoi(pszEncoding + strlen("ISO-8859-")); + if ((nCode >= 1 && nCode <= 9) || nCode == 13 || nCode == 15) + return 28590 + nCode; + } + + // Return a negative value, since CP_ACP = 0 + return -1; + }; + /* ---------------------------------------------------------------------*/ - /* CPXXX to UTF8 */ + /* XXX to UTF8 */ /* ---------------------------------------------------------------------*/ 
- if (STARTS_WITH(pszSrcEncoding, "CP") && - strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) + if (strcmp(pszDstEncoding, CPL_ENC_UTF8) == 0) { - int nCode = atoi(pszSrcEncoding + 2); - if (nCode > 0) + const int nCode = MapEncodingToWindowsCodePage(pszSrcEncoding); + if (nCode >= 0) { return CPLWin32Recode(pszSource, nCode, CP_UTF8); } - else if (EQUAL(pszSrcEncoding, "CP_OEMCP")) - return CPLWin32Recode(pszSource, CP_OEMCP, CP_UTF8); - else if (EQUAL(pszSrcEncoding, "CP_ACP")) - return CPLWin32Recode(pszSource, CP_ACP, CP_UTF8); } /* ---------------------------------------------------------------------*/ - /* UTF8 to CPXXX */ + /* UTF8 to XXX */ /* ---------------------------------------------------------------------*/ - if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 && - STARTS_WITH(pszDstEncoding, "CP")) + if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0) { - int nCode = atoi(pszDstEncoding + 2); - if (nCode > 0) + const int nCode = MapEncodingToWindowsCodePage(pszDstEncoding); + if (nCode >= 0) { return CPLWin32Recode(pszSource, CP_UTF8, nCode); } - else if (EQUAL(pszDstEncoding, "CP_OEMCP")) - return CPLWin32Recode(pszSource, CP_UTF8, CP_OEMCP); - else if (EQUAL(pszDstEncoding, "CP_ACP")) - return CPLWin32Recode(pszSource, CP_UTF8, CP_ACP); } #endif @@ -220,30 +300,6 @@ char *CPLRecodeStub(const char *pszSource, const char *pszSrcEncoding, return pszResult; } - /* -------------------------------------------------------------------- */ - /* UTF-8 to anything else is treated as UTF-8 to ISO-8859-1 */ - /* with a warning. 
*/ - /* -------------------------------------------------------------------- */ - if (strcmp(pszSrcEncoding, CPL_ENC_UTF8) == 0 && - strcmp(pszDstEncoding, CPL_ENC_ISO8859_1) == 0) - { - int nCharCount = static_cast<int>(strlen(pszSource)); - char *pszResult = static_cast<char *>(CPLCalloc(1, nCharCount + 1)); - - if (!bHaveWarned2) - { - bHaveWarned2 = true; - CPLError(CE_Warning, CPLE_AppDefined, - "Recode from UTF-8 to %s not supported, " - "treated as UTF-8 to ISO-8859-1.", - pszDstEncoding); - } - - utf8toa(pszSource, nCharCount, pszResult, nCharCount + 1); - - return pszResult; - } - /* -------------------------------------------------------------------- */ /* Everything else is treated as a no-op with a warning. */ /* -------------------------------------------------------------------- */