[SPARK-48682][SQL][FOLLOW-UP] Changed initCap behaviour with UTF8_BIN…

…ARY collation ### What changes were proposed in this pull request? This PR changes the way that Spark does initCap with respect to UTF8_BINARY collation. In this PR, initCap titlecases the first character of every word, and lowercases every other character. Words are separated only by ASCII space. Special care is taken when lowercasing Σ, to take into account whether it is at the end of the word (with respect to case-ignorable characters) and should be lowercased into ς, or otherwise into σ (this already works correctly with the current implementation because lowercasing a whole string handles this, but in this PR it is handled manually because the lowercase function is no longer used). The key difference between outputs that this PR introduces is: | input | current_initCap(input) | new_initCap(input) | |----------|----------|----------| | İo | İo (I\u0307o) | İo | | ß ﬁ ﬃ ﬀ ﬆ | ß ﬁ ﬃ ﬀ ﬆ | Ss Fi Ffi Ff St | These are just some examples; many more mappings are actually affected. More details about the key changes are in the next section. This behaviour is put under the ICU_CASE_MAPPINGS_ENABLED flag in SQLConf, which is true by default. ### Why are the changes needed? The previous implementation first lowercases the complete string, and then titlecases the first character of every word[1]. When titlecasing the first character of every word, it maps a single codepoint to a single codepoint[2]. This leads to the following behaviour with respect to [1]: | input | initCap(input) | |----------|----------| | İo | İo (I\u0307o) | In summary, when the lowercase of the first character in a word (for example "İ") maps onto more than 1 character (for example "i\u0307"), we only consider the first character ("i" in "i\u0307") of that lowercased letter ("İ") for titlecasing instead of the complete character, because we titlecase only the first character in a word after we completely lowercase it.
The behaviour that [2] produces is: | input | initCap(input) | |----------|----------| | ß ﬁ ﬃ ﬀ ﬆ | ß ﬁ ﬃ ﬀ ﬆ | While the expected output would probably be: | input | initCap(input) | |----------|----------| | ß ﬁ ﬃ ﬀ ﬆ | Ss Fi Ffi Ff St | which clearly maps titlecase of each of those characters into more than one character, which is not handled because of [2]. Again, these are just examples and not an exhaustive list of all the mappings that have been changed. ### Does this PR introduce _any_ user-facing change? Yes, InitCap expression will now return different results for: - One-to-many case mapping (e.g. Turkish dotted I, ß, ﬁ) ### How was this patch tested? Tests in CollationSupportSuite. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#47771 from viktorluc-db/initCap. Authored-by: viktorluc-db <[email protected]> Signed-off-by: Max Gekk <[email protected]>
mrk-andreev · Sep 1, 2024 · fb8d01a · fb8d01a
1 parent c58148d
commit fb8d01a
Show file tree

Hide file tree

Showing 6 changed files with 337 additions and 48 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.catalyst.util;
 
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.RuleBasedCollator;
@@ -48,6 +49,16 @@ public class CollationAwareUTF8String {
    */
   private static final int MATCH_NOT_FOUND = -1;
 
+  /**
+   * `COMBINED_ASCII_SMALL_I_COMBINING_DOT` is an internal marker for the two-code-point lowercase
+   * form of Latin capital letter I with dot above (U+0130): ASCII lowercase letter i (U+0069)
+   * followed by the combining dot above (U+0307). The marker packs both code points into one int
+   * (U+0069 in the upper 16 bits, U+0307 in the lower 16 bits), giving 0x00690307. That value is
+   * above the maximum Unicode code point (U+10FFFF), so it is not a valid code point itself and
+   * cannot be confused with a real character; code that receives it has to expand it back into
+   * the two code points.
+   */
+  private static final int COMBINED_ASCII_SMALL_I_COMBINING_DOT =
+    SpecialCodePointConstants.ASCII_SMALL_I << 16 | SpecialCodePointConstants.COMBINING_DOT;
+
   /**
    * Returns whether the target string starts with the specified prefix, starting from the
    * specified position (0-based index referring to character position in UTF8String), with respect
@@ -105,9 +116,9 @@ private static int lowercaseMatchLengthFrom(
       } else {
         // Use buffered lowercase code point iteration to handle one-to-many case mappings.
         targetCodePoint = getLowercaseCodePoint(targetIterator.next());
-        if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
-          targetCodePoint = CODE_POINT_LOWERCASE_I;
-          codePointBuffer = CODE_POINT_COMBINING_DOT;
+        if (targetCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
+          targetCodePoint = SpecialCodePointConstants.ASCII_SMALL_I;
+          codePointBuffer = SpecialCodePointConstants.COMBINING_DOT;
         }
         ++matchLength;
       }
@@ -207,9 +218,9 @@ private static int lowercaseMatchLengthUntil(
       } else {
         // Use buffered lowercase code point iteration to handle one-to-many case mappings.
         targetCodePoint = getLowercaseCodePoint(targetIterator.next());
-        if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
-          targetCodePoint = CODE_POINT_COMBINING_DOT;
-          codePointBuffer = CODE_POINT_LOWERCASE_I;
+        if (targetCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
+          targetCodePoint = SpecialCodePointConstants.COMBINING_DOT;
+          codePointBuffer = SpecialCodePointConstants.ASCII_SMALL_I;
         }
         ++matchLength;
       }
@@ -461,44 +472,32 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col
    */
   private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) {
     int lowercaseCodePoint = getLowercaseCodePoint(codePoint);
-    if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
+    if (lowercaseCodePoint == COMBINED_ASCII_SMALL_I_COMBINING_DOT) {
       // Latin capital letter I with dot above is mapped to 2 lowercase characters.
-      sb.appendCodePoint(0x0069);
-      sb.appendCodePoint(0x0307);
+      sb.appendCodePoint(SpecialCodePointConstants.ASCII_SMALL_I);
+      sb.appendCodePoint(SpecialCodePointConstants.COMBINING_DOT);
     } else {
       // All other characters should follow context-unaware ICU single-code point case mapping.
       sb.appendCodePoint(lowercaseCodePoint);
     }
   }
 
-  /**
-   * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal representation of the combined lowercase
-   * code point for ASCII lowercase letter i with an additional combining dot character (U+0307).
-   * This integer value is not a valid code point itself, but rather an artificial code point
-   * marker used to represent the two lowercase characters that are the result of converting the
-   * uppercase Turkish dotted letter I with a combining dot character (U+0130) to lowercase.
-   */
-  private static final int CODE_POINT_LOWERCASE_I = 0x69;
-  private static final int CODE_POINT_COMBINING_DOT = 0x307;
-  private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT =
-    CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT;
-
   /**
    * Returns the lowercase version of the provided code point, with special handling for
    * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
    * context-insensitive case mappings (i.e. characters that map to different characters based on
    * the position in the string relative to other characters in lowercase).
    */
   private static int getLowercaseCodePoint(final int codePoint) {
-    if (codePoint == 0x0130) {
+    if (codePoint == SpecialCodePointConstants.CAPITAL_I_WITH_DOT_ABOVE) {
       // Latin capital letter I with dot above is mapped to 2 lowercase characters.
-      return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+      return COMBINED_ASCII_SMALL_I_COMBINING_DOT;
     }
-    else if (codePoint == 0x03C2) {
+    else if (codePoint == SpecialCodePointConstants.GREEK_FINAL_SIGMA) {
       // Greek final and non-final letter sigma should be mapped the same. This is achieved by
       // mapping Greek small final sigma (U+03C2) to Greek small non-final sigma (U+03C3). Capital
       // letter sigma (U+03A3) is mapped to small non-final sigma (U+03C3) in the `else` branch.
-      return 0x03C3;
+      return SpecialCodePointConstants.GREEK_SMALL_SIGMA;
     }
     else {
       // All other characters should follow context-unaware ICU single-code point case mapping.
@@ -550,6 +549,152 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
       BreakIterator.getWordInstance(locale)));
   }
 
+  /**
+   * Lookup table from a code point to its title-case string, containing only the code points
+   * whose title-case form is longer than one code point. It exists as a performance speedup:
+   * 'UCharacter.toTitleCase(int)' is the fast single-code-point mapping but cannot express a
+   * one-to-many result, while 'UCharacter.toTitleCase(String, BreakIterator)' is correct but
+   * returns a 'String' for every call. With the one-to-many cases precalculated here, the method
+   * 'appendCodepointToTitleCase' can use the fast mapping for everything else.
+   * NOTE(review): the original comment counted 48 such code points; the exact number presumably
+   * depends on the ICU version in use.
+   */
+  private static final HashMap<Integer, String> codepointOneToManyTitleCaseLookupTable =
+    buildOneToManyTitleCaseLookupTable();
+
+  /**
+   * Builds 'codepointOneToManyTitleCaseLookupTable' by title-casing every code point from
+   * 'Character.MIN_CODE_POINT' to 'Character.MAX_CODE_POINT' as a one-character string and
+   * keeping the results that contain more than one code point. Runs once, during class
+   * initialization. A plain 'HashMap' is returned instead of using double-brace initialization,
+   * which would create an anonymous 'HashMap' subclass just to populate the map.
+   */
+  private static HashMap<Integer, String> buildOneToManyTitleCaseLookupTable() {
+    HashMap<Integer, String> lookupTable = new HashMap<>();
+    // Reused for every code point to avoid allocating a new builder per iteration.
+    StringBuilder sb = new StringBuilder();
+    for (int i = Character.MIN_CODE_POINT; i <= Character.MAX_CODE_POINT; ++i) {
+      sb.appendCodePoint(i);
+      String titleCase = UCharacter.toTitleCase(sb.toString(), null);
+      if (titleCase.codePointCount(0, titleCase.length()) > 1) {
+        lookupTable.put(i, titleCase);
+      }
+      sb.setLength(0);
+    }
+    return lookupTable;
+  }
+
+  /**
+   * Title-cases a string using ICU case mappings. The string is walked one code point at a time:
+   * the first code point of the string and every code point that directly follows an ASCII space
+   * (U+0020) is title-cased, and every other code point is lowercased. ASCII space is the only
+   * word separator.
+   * <p>
+   * Lowercasing capital Greek letter sigma ('Σ') is handled separately, taking into account if it
+   * should become a small final Greek sigma ('ς') or a small non-final Greek sigma ('σ'): the
+   * final form is used only when the sigma is preceded by a cased letter and not followed by one,
+   * with case-ignorable characters skipped on both sides.
+   *
+   * @param source UTF8String to be title cased; it is passed through 'makeValid()' first
+   * @return title cased source
+   */
+  public static UTF8String toTitleCaseICU(UTF8String source) {
+    // In the default UTF8String implementation, `toLowerCase` method implicitly does UTF8String
+    // validation (replacing invalid UTF-8 byte sequences with Unicode replacement character
+    // U+FFFD), but now we have to do the validation manually.
+    source = source.makeValid();
+
+    // 'sb' accumulates the title cased output, one code point at a time.
+    UTF8StringBuilder sb = new UTF8StringBuilder();
+
+    // 'isNewWord' is true at the start of the string and right after an ASCII space.
+    boolean isNewWord = true;
+    // Tracks whether the closest preceding character that is not case-ignorable is a cased
+    // letter. This is used when lowercasing capital Greek letter sigma ('Σ'), to figure out if it
+    // should be lowercased into σ or ς.
+    boolean precededByCasedLetter = false;
+
+    // 'offset' is a byte offset in source's byte array pointing to the beginning of the character
+    // that we need to process next.
+    int offset = 0;
+    int len = source.numBytes();
+
+    while (offset < len) {
+      // We will actually call 'codePointFrom()' 2 times for each character in the worst case (once
+      // here, and once in 'followedByCasedLetter'). Example of a string where we call it 2 times
+      // for almost every character is 'ΣΣΣΣΣ' (a string consisting only of Greek capital sigma)
+      // and 'Σ`````' (a string consisting of a Greek capital sigma, followed by case-ignorable
+      // characters).
+      int codepoint = source.codePointFrom(offset);
+      // Appending the correctly cased character onto 'sb', using the state of the previous step.
+      appendTitleCasedCodepoint(sb, codepoint, isNewWord, precededByCasedLetter, source, offset);
+      // Updating the state for the next character. Case-ignorable characters leave
+      // 'precededByCasedLetter' unchanged so that they are skipped in the final-sigma check.
+      isNewWord = (codepoint == SpecialCodePointConstants.ASCII_SPACE);
+      if (!UCharacter.hasBinaryProperty(codepoint, UProperty.CASE_IGNORABLE)) {
+        precededByCasedLetter = UCharacter.hasBinaryProperty(codepoint, UProperty.CASED);
+      }
+      offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+    }
+    return sb.build();
+  }
+
+  /**
+   * Appends the correctly cased form of 'codepoint' to 'sb': its title-case form when the code
+   * point starts a word, and its lowercase form otherwise.
+   *
+   * @param sb builder that receives the converted code point(s)
+   * @param codepoint code point to convert
+   * @param isAfterAsciiSpace true if 'codepoint' has to be title-cased instead of lowercased
+   * @param precededByCasedLetter passed on when lowercasing capital Greek letter sigma ('Σ')
+   * @param source string that 'codepoint' was read from; passed on when lowercasing 'Σ'
+   * @param offset byte offset of 'codepoint' in 'source'; passed on when lowercasing 'Σ'
+   */
+  private static void appendTitleCasedCodepoint(
+      UTF8StringBuilder sb,
+      int codepoint,
+      boolean isAfterAsciiSpace,
+      boolean precededByCasedLetter,
+      UTF8String source,
+      int offset) {
+    if (isAfterAsciiSpace) {
+      // Beginning of a new word: title-case, regardless of which character it is.
+      appendCodepointToTitleCase(sb, codepoint);
+    } else if (codepoint == SpecialCodePointConstants.GREEK_CAPITAL_SIGMA) {
+      // Capital Greek letter sigma ('Σ') lowercases to 'ς' or 'σ' depending on its neighbours.
+      appendLowerCasedGreekCapitalSigma(sb, precededByCasedLetter, source, offset);
+    } else if (codepoint == SpecialCodePointConstants.CAPITAL_I_WITH_DOT_ABOVE) {
+      // 'CAPITAL_I_WITH_DOT_ABOVE' lowercases to two code points, appended one after the other.
+      sb.appendCodePoint(SpecialCodePointConstants.ASCII_SMALL_I);
+      sb.appendCodePoint(SpecialCodePointConstants.COMBINING_DOT);
+    } else {
+      // Every remaining character uses the single-code-point ICU lowercase mapping.
+      sb.appendCodePoint(UCharacter.toLowerCase(codepoint));
+    }
+  }
+
+  /**
+   * Appends the lowercase form of a capital Greek letter sigma ('Σ') located at byte 'offset' of
+   * 'source'. Small final sigma ('ς') is appended when the sigma is not followed by a cased
+   * letter and is preceded by one; small non-final sigma ('σ') is appended in every other case.
+   */
+  private static void appendLowerCasedGreekCapitalSigma(
+      UTF8StringBuilder sb,
+      boolean precededByCasedLetter,
+      UTF8String source,
+      int offset) {
+    boolean isFinalSigma = !followedByCasedLetter(source, offset) && precededByCasedLetter;
+    if (isFinalSigma) {
+      sb.appendCodePoint(SpecialCodePointConstants.GREEK_FINAL_SIGMA);
+    } else {
+      sb.appendCodePoint(SpecialCodePointConstants.GREEK_SMALL_SIGMA);
+    }
+  }
+
+  /**
+   * Checks if the character beginning at 'offset'(in 'sources' byte array) is followed by a cased
+   * letter.
+   */
+  private static boolean followedByCasedLetter(UTF8String source, int offset) {
+    // Moving the offset one character forward, so we could start the linear search from there.
+    offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+    int len = source.numBytes();
+
+    while (offset < len) {
+      int codepoint = source.codePointFrom(offset);
+
+      if (UCharacter.hasBinaryProperty(codepoint, UProperty.CASE_IGNORABLE)) {
+        offset += UTF8String.numBytesForFirstByte(source.getByte(offset));
+        continue;
+      }
+      return UCharacter.hasBinaryProperty(codepoint, UProperty.CASED);
+    }
+    return false;
+  }
+
+  /**
+   * Appends the title-case form of a single code point to a 'UTF8StringBuilder'. A code point
+   * found in 'codepointOneToManyTitleCaseLookupTable' is appended as the string stored there;
+   * any other code point is appended as the result of 'UCharacter.toTitleCase(int)'.
+   */
+  private static void appendCodepointToTitleCase(UTF8StringBuilder sb, int codepoint) {
+    String oneToManyTitleCase = codepointOneToManyTitleCaseLookupTable.get(codepoint);
+    if (oneToManyTitleCase != null) {
+      sb.append(oneToManyTitleCase);
+      return;
+    }
+    sb.appendCodePoint(UCharacter.toTitleCase(codepoint));
+  }
+
   /*
    * Returns the position of the first occurrence of the match string in the set string,
    * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
@@ -843,11 +988,11 @@ public static UTF8String lowercaseTranslate(final UTF8String input,
       }
       // Special handling for letter i (U+0069) followed by a combining dot (U+0307). By ensuring
       // that `ASCII_SMALL_I` is buffered, we guarantee finding a max-length match.
-      if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
-          codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+      if (lowercaseDict.containsKey(COMBINED_ASCII_SMALL_I_COMBINING_DOT)
+          && codePoint == SpecialCodePointConstants.ASCII_SMALL_I && inputIter.hasNext()) {
         int nextCodePoint = inputIter.next();
-        if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
-          codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+        if (nextCodePoint == SpecialCodePointConstants.COMBINING_DOT) {
+          codePoint = COMBINED_ASCII_SMALL_I_COMBINING_DOT;
         } else {
           codePointBuffer = nextCodePoint;
         }
@@ -1007,11 +1152,11 @@ public static UTF8String lowercaseTrimLeft(
         codePoint = getLowercaseCodePoint(srcIter.next());
       }
       // Special handling for Turkish dotted uppercase letter I.
-      if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() &&
-          trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+      if (codePoint == SpecialCodePointConstants.ASCII_SMALL_I && srcIter.hasNext() &&
+          trimChars.contains(COMBINED_ASCII_SMALL_I_COMBINING_DOT)) {
         codePointBuffer = codePoint;
         codePoint = getLowercaseCodePoint(srcIter.next());
-        if (codePoint == CODE_POINT_COMBINING_DOT) {
+        if (codePoint == SpecialCodePointConstants.COMBINING_DOT) {
           searchIndex += 2;
           codePointBuffer = -1;
         } else if (trimChars.contains(codePointBuffer)) {
@@ -1125,11 +1270,11 @@ public static UTF8String lowercaseTrimRight(
         codePoint = getLowercaseCodePoint(srcIter.next());
       }
       // Special handling for Turkish dotted uppercase letter I.
-      if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() &&
-          trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+      if (codePoint == SpecialCodePointConstants.COMBINING_DOT && srcIter.hasNext() &&
+          trimChars.contains(COMBINED_ASCII_SMALL_I_COMBINING_DOT)) {
         codePointBuffer = codePoint;
         codePoint = getLowercaseCodePoint(srcIter.next());
-        if (codePoint == CODE_POINT_LOWERCASE_I) {
+        if (codePoint == SpecialCodePointConstants.ASCII_SMALL_I) {
           searchIndex -= 2;
           codePointBuffer = -1;
         } else if (trimChars.contains(codePointBuffer)) {

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -283,7 +283,7 @@ public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase().toTitleCase();
     }
     public static UTF8String execBinaryICU(final UTF8String v) {
-      return CollationAwareUTF8String.toLowerCase(v).toTitleCaseICU();
+      return CollationAwareUTF8String.toTitleCaseICU(v);
     }
     public static UTF8String execLowercase(final UTF8String v) {
       return CollationAwareUTF8String.toTitleCase(v);

diff --git a/...on/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/SpecialCodePointConstants.java b/...on/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/SpecialCodePointConstants.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util;
+
+/**
+ * 'SpecialCodePointConstants' is introduced in order to keep the codepoints used in
+ * 'CollationAwareUTF8String' in one place.
+ */
+public class SpecialCodePointConstants {
+
+    public static final int COMBINING_DOT = 0x0307;
+    public static final int ASCII_SMALL_I = 0x0069;
+    public static final int ASCII_SPACE = 0x0020;
+    public static final int GREEK_CAPITAL_SIGMA = 0x03A3;
+    public static final int GREEK_SMALL_SIGMA = 0x03C3;
+    public static final int GREEK_FINAL_SIGMA = 0x03C2;
+    public static final int CAPITAL_I_WITH_DOT_ABOVE = 0x0130;
+}
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java
@@ -96,4 +96,33 @@ public void appendBytes(Object base, long offset, int length) {
   public UTF8String build() {
     return UTF8String.fromBytes(buffer, 0, totalSize());
   }
+
+  /**
+   * Appends the UTF-8 encoding (1 to 4 bytes) of a single code point to this builder.
+   *
+   * @param codePoint code point to append, in the range [0, 0x10FFFF]
+   * @throws IllegalArgumentException if 'codePoint' is negative or greater than 0x10FFFF
+   */
+  public void appendCodePoint(int codePoint) {
+    // Reject out-of-range values before touching the buffer. Without the lower-bound check a
+    // negative value would satisfy 'codePoint <= 0x7F' and be written as a single truncated byte.
+    if (codePoint < 0 || codePoint > 0x10FFFF) {
+      throw new IllegalArgumentException("Invalid Unicode codePoint: " + codePoint);
+    }
+    // In every branch 'grow(n)' is called before the n bytes are written (presumably to make
+    // room for them), and 'cursor' is advanced by n afterwards.
+    if (codePoint <= 0x7F) {
+      // 1 byte: 0xxxxxxx
+      grow(1);
+      buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) codePoint;
+      ++cursor;
+    } else if (codePoint <= 0x7FF) {
+      // 2 bytes: 110xxxxx 10xxxxxx
+      grow(2);
+      buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xC0 | (codePoint >> 6));
+      buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+      cursor += 2;
+    } else if (codePoint <= 0xFFFF) {
+      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      grow(3);
+      buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xE0 | (codePoint >> 12));
+      buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
+      buffer[cursor + 2 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+      cursor += 3;
+    } else {
+      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (the upper bound was checked above)
+      grow(4);
+      buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = (byte) (0xF0 | (codePoint >> 18));
+      buffer[cursor + 1 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
+      buffer[cursor + 2 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
+      buffer[cursor + 3 - Platform.BYTE_ARRAY_OFFSET] = (byte) (0x80 | (codePoint & 0x3F));
+      cursor += 4;
+    }
+  }
+
 }