Skip to content

Commit aba7ba8

Browse files
Kevin Wilfongfacebook-github-bot
Kevin Wilfong
authored andcommitted
Fix inconsistencies between Velox and Presto Java when formatting time zone short/long names (facebookincubator#11447)
Summary: Pull Request resolved: facebookincubator#11447 While running some more queries in Presto on Velox and Presto on Java, I noticed some cases where format_datetime would produce in consistent results for the time zone short name ('z', 'zz', or 'zzz') and the time zone long name ('zzzz' or more). It seems that Joda does not follow the conventions used by the IANA Time Zone Database or the ICU library (CLDR), but rather seems to rely on Java's DateFormatSymbols's getZoneStrings method. This seems to match CLDR except in some cases (which is why I didn't initially notice). To ensure the behavior of Presto on Velox matches that of the Java implementation, I've written a small Java program GenTimeZoneNames.java to generate a C++ file TimeZoneNames.cpp with a mapping from the time zone ID to the abbreviations and long names for that time zone (in standard time and in daylight savings time). This diff updates TimeZoneMap's getShortName and getLongName functions to use the mapping in this generated file to resolve the time zone name. This fixes format_datetime so it's consistent. This also allowed me to remove the dependency on ICU, so I undid all the changes to build files that were necessary to make that work. Reviewed By: kgpai Differential Revision: D65507863 fbshipit-source-id: f8aa23693705a5ef0630cdd5c6c96950eaa85ecc
1 parent c3023b6 commit aba7ba8

File tree

7 files changed

+1897
-65
lines changed

7 files changed

+1897
-65
lines changed

velox/functions/lib/CMakeLists.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@ velox_link_libraries(velox_functions_util velox_vector velox_common_base)
2424
velox_add_library(velox_functions_lib_date_time_formatter DateTimeFormatter.cpp
2525
DateTimeFormatterBuilder.cpp)
2626

27-
velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz
28-
ICU::i18n ICU::uc)
27+
velox_link_libraries(velox_functions_lib_date_time_formatter velox_type_tz)
2928

3029
velox_add_library(
3130
velox_functions_lib

velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp

+18-2
Original file line numberDiff line numberDiff line change
@@ -3566,18 +3566,34 @@ TEST_F(DateTimeFunctionsTest, formatDateTime) {
35663566

35673567
// Test a long abbreviation.
35683568
setQueryTimeZone("Asia/Colombo");
3569-
EXPECT_EQ("+0530", formatDatetime(parseTimestamp("1970-10-01"), "z"));
3569+
EXPECT_EQ("IST", formatDatetime(parseTimestamp("1970-10-01"), "z"));
35703570
EXPECT_EQ(
35713571
"India Standard Time",
35723572
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));
35733573

35743574
// Test a long long name.
35753575
setQueryTimeZone("Australia/Eucla");
3576-
EXPECT_EQ("+0845", formatDatetime(parseTimestamp("1970-10-01"), "z"));
3576+
EXPECT_EQ("ACWST", formatDatetime(parseTimestamp("1970-10-01"), "z"));
35773577
EXPECT_EQ(
35783578
"Australian Central Western Standard Time",
35793579
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));
35803580

3581+
// Test a time zone that doesn't follow the standard abbrevation in the IANA
3582+
// Time Zone Database, i.e. it relies on our map in TimeZoneNames.cpp.
3583+
setQueryTimeZone("Asia/Dubai");
3584+
// According to the IANA time zone database the abbreviation should be +04.
3585+
EXPECT_EQ("GST", formatDatetime(parseTimestamp("1970-10-01"), "z"));
3586+
EXPECT_EQ(
3587+
"Gulf Standard Time",
3588+
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));
3589+
3590+
// Test UTC specifically (because it's so common).
3591+
setQueryTimeZone("UTC");
3592+
EXPECT_EQ("UTC", formatDatetime(parseTimestamp("1970-10-01"), "z"));
3593+
EXPECT_EQ(
3594+
"Coordinated Universal Time",
3595+
formatDatetime(parseTimestamp("1970-10-01"), "zzzz"));
3596+
35813597
// Test a time zone name that is linked to another (that gets replaced when
35823598
// converted to a string).
35833599
setQueryTimeZone("US/Pacific");

velox/type/tz/CMakeLists.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@ velox_add_library(
1919
TimeZoneMap.h
2020
TimeZoneDatabase.cpp
2121
TimeZoneLinks.cpp
22-
TimeZoneMap.cpp)
22+
TimeZoneMap.cpp
23+
TimeZoneNames.h
24+
TimeZoneNames.cpp)
2325

2426
velox_link_libraries(
2527
velox_type_tz
2628
velox_exception
2729
velox_external_date
2830
Boost::regex
2931
fmt::fmt
30-
Folly::folly
31-
ICU::i18n
32-
ICU::uc)
32+
Folly::folly)

velox/type/tz/GenTimeZoneNames.java

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package velox.type.tz;
17+
18+
import java.text.DateFormatSymbols;
19+
import java.util.Locale;
20+
21+
public class GenTimeZoneNames {
22+
public static void main(String[] args) {
23+
System.out.println("/*");
24+
System.out.println(" * Copyright (c) Facebook, Inc. and its affiliates.");
25+
System.out.println(" *");
26+
System.out.println(" * Licensed under the Apache License, Version 2.0 (the \"License\");");
27+
System.out.println(" * you may not use this file except in compliance with the License.");
28+
System.out.println(" * You may obtain a copy of the License at");
29+
System.out.println(" *");
30+
System.out.println(" * http://www.apache.org/licenses/LICENSE-2.0");
31+
System.out.println(" *");
32+
System.out.println(" * Unless required by applicable law or agreed to in writing, software");
33+
System.out.println(" * distributed under the License is distributed on an \"AS IS\" BASIS,");
34+
System.out.println(" * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.");
35+
System.out.println(" * See the License for the specific language governing permissions and");
36+
System.out.println(" * limitations under the License.");
37+
System.out.println(" */");
38+
System.out.println("");
39+
System.out.println("// This file is generated. Do not modify it manually. To re-generate it, run:");
40+
System.out.println("//");
41+
System.out.println("// javac velox/type/tz/GenTimeZoneNames.java");
42+
System.out.println("// java velox/type/tz/GenTimeZoneNames > velox/type/tz/TimeZoneNames.cpp");
43+
System.out.println("//");
44+
System.out.println("// Use the version of Java with which you're running Presto to generate");
45+
System.out.println("// compatible names. This version was generated using openjdk version ");
46+
System.out.println("// \"1.8.0_362\"");
47+
System.out.println("// @generated");
48+
System.out.println("");
49+
System.out.println("#include \"velox/type/tz/TimeZoneNames.h\"");
50+
System.out.println("");
51+
System.out.println("#include <folly/Portability.h>");
52+
System.out.println("");
53+
System.out.println("// This file just contains a single function that returns an, admittedly");
54+
System.out.println("// large, hard coded map.");
55+
System.out.println("FOLLY_CLANG_DISABLE_WARNING(\"-Wignored-optimization-argument\");");
56+
System.out.println("");
57+
System.out.println("namespace facebook::velox::tz {");
58+
System.out.println("");
59+
System.out.println("const std::unordered_map<std::string, TimeZoneNames>& getTimeZoneNames() {");
60+
System.out.println(" static auto* timeZoneNames = new std::unordered_map<");
61+
System.out.println(" std::string,");
62+
System.out.println(" TimeZoneNames>([] {");
63+
System.out.println(" // Work around clang compiler bug causing multi-hour compilation");
64+
System.out.println(" // with -fsanitize=fuzzer");
65+
System.out.println(" // https://github.com/llvm/llvm-project/issues/75666");
66+
System.out.println(" return std::unordered_map<std::string, TimeZoneNames>{");
67+
68+
String[][] zoneStringsEn = DateFormatSymbols.getInstance(Locale.ENGLISH).getZoneStrings();
69+
for (String[] zoneStrings : zoneStringsEn) {
70+
System.out.println(String.format(" {\"%s\", {\"%s\", \"%s\", \"%s\", \"%s\"}},", zoneStrings[0], zoneStrings[1], zoneStrings[2], zoneStrings[3], zoneStrings[4]));
71+
}
72+
73+
System.out.println(" };");
74+
System.out.println(" }());");
75+
System.out.println(" return *timeZoneNames;");
76+
System.out.println("}");
77+
System.out.println("");
78+
System.out.println("} // namespace facebook::velox::tz");
79+
}
80+
}

velox/type/tz/TimeZoneMap.cpp

+40-57
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,10 @@
2121
#include <folly/container/F14Map.h>
2222
#include <folly/container/F14Set.h>
2323

24-
#include <unicode/locid.h>
25-
#include <unicode/timezone.h>
26-
#include <unicode/unistr.h>
27-
28-
// The ICU libraries define TRUE/FALSE macros which frequently conflict with
29-
// other libraries that use these as enum/variable names.
30-
#undef TRUE
31-
#undef FALSE
32-
3324
#include "velox/common/base/Exceptions.h"
3425
#include "velox/common/testutil/TestValue.h"
3526
#include "velox/external/date/tz.h"
27+
#include "velox/type/tz/TimeZoneNames.h"
3628

3729
using facebook::velox::common::testutil::TestValue;
3830

@@ -311,6 +303,43 @@ TDuration toLocalImpl(
311303
}
312304
return date::zoned_time{tz, timePoint}.get_local_time().time_since_epoch();
313305
}
306+
307+
template <bool isLongName>
308+
std::string getName(
309+
TimeZone::milliseconds timestamp,
310+
TimeZone::TChoose choose,
311+
const date::time_zone* tz,
312+
const std::string& timeZoneName) {
313+
validateRange(date::sys_time<TimeZone::milliseconds>(timestamp));
314+
315+
// Time zone offsets only have one name.
316+
if (tz == nullptr) {
317+
return timeZoneName;
318+
}
319+
320+
static const auto& timeZoneNames = getTimeZoneNames();
321+
auto it = timeZoneNames.find(timeZoneName);
322+
323+
VELOX_CHECK(
324+
it != timeZoneNames.end(),
325+
"Unable to find short name for time zone: {}",
326+
timeZoneName);
327+
328+
// According to the documentation this is how to determine if DST applies to
329+
// a given timestamp in a given time zone.
330+
// https://howardhinnant.github.io/date/tz.html#sys_info
331+
date::local_time<TimeZone::milliseconds> timePoint{timestamp};
332+
bool isDst = getZonedTime(tz, timePoint, choose).get_info().save !=
333+
std::chrono::minutes(0);
334+
335+
if constexpr (isLongName) {
336+
return isDst ? it->second.daylightTimeLongName
337+
: it->second.standardTimeLongName;
338+
} else {
339+
return isDst ? it->second.daylightTimeAbbreviation
340+
: it->second.standardTimeAbbreviation;
341+
}
342+
}
314343
} // namespace
315344

316345
void validateRange(time_point<std::chrono::seconds> timePoint) {
@@ -419,58 +448,12 @@ TimeZone::milliseconds TimeZone::to_local(
419448
std::string TimeZone::getShortName(
420449
TimeZone::milliseconds timestamp,
421450
TimeZone::TChoose choose) const {
422-
date::local_time<milliseconds> timePoint{timestamp};
423-
validateRange(date::sys_time<milliseconds>(timestamp));
424-
425-
// Time zone offsets only have one name (no abbreviations).
426-
if (tz_ == nullptr) {
427-
return timeZoneName_;
428-
}
429-
430-
return getZonedTime(tz_, timePoint, choose).get_info().abbrev;
451+
return getName<false>(timestamp, choose, tz_, timeZoneName_);
431452
}
432453

433454
std::string TimeZone::getLongName(
434455
TimeZone::milliseconds timestamp,
435456
TimeZone::TChoose choose) const {
436-
static const icu::Locale locale("en", "US");
437-
438-
validateRange(date::sys_time<milliseconds>(timestamp));
439-
440-
// Time zone offsets only have one name.
441-
if (tz_ == nullptr) {
442-
return timeZoneName_;
443-
}
444-
445-
// Special case for UTC. ICU uses "GMT" for some reason which is an
446-
// abbreviation.
447-
if (timeZoneID_ == 0) {
448-
return "Coordinated Universal Time";
449-
}
450-
451-
// Get the ICU TimeZone by name
452-
std::unique_ptr<icu::TimeZone> tz(icu::TimeZone::createTimeZone(
453-
icu::UnicodeString(timeZoneName_.data(), timeZoneName_.length())));
454-
VELOX_USER_CHECK_NOT_NULL(tz);
455-
456-
// According to the documentation this is how to determine if DST applies to
457-
// a given timestamp in a given time zone.
458-
// https://howardhinnant.github.io/date/tz.html#sys_info
459-
date::local_time<milliseconds> timePoint{timestamp};
460-
bool isDst = getZonedTime(tz_, timePoint, choose).get_info().save !=
461-
std::chrono::minutes(0);
462-
463-
// Construct the long name for the time zone.
464-
// Note that ICU does not have DST information for many time zones prior to
465-
// 1970, so it's important to specify it explicitly.
466-
icu::UnicodeString longName;
467-
tz->getDisplayName(
468-
isDst, icu::TimeZone::EDisplayType::LONG, locale, longName);
469-
470-
// Convert the UnicodeString back to a string and write it out
471-
std::string longNameStr;
472-
longName.toUTF8String(longNameStr);
473-
474-
return longNameStr;
457+
return getName<true>(timestamp, choose, tz_, timeZoneName_);
475458
}
476459
} // namespace facebook::velox::tz

0 commit comments

Comments
 (0)