Skip to content

Commit 1e0649f

Browse files
authored
Improvements to Table.union (#9968)
- Closes #9952
1 parent 517299b commit 1e0649f

File tree

17 files changed

+774
-517
lines changed

17 files changed

+774
-517
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,7 @@
666666
- [Added `Text.cleanse` `Column.Text_Cleanse` and `Table.Text_Cleanse`][9879]
667667
- [Added ability to save an existing Postgres connection as a Data Link in Enso
668668
Cloud.][9957]
669+
- [Improved `Table.union`.][9968]
669670

670671
[debug-shortcuts]:
671672
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -978,6 +979,7 @@
978979
[9873]: https://github.com/enso-org/enso/pull/9873
979980
[9879]: https://github.com/enso-org/enso/pull/9879
980981
[9957]: https://github.com/enso-org/enso/pull/9957
982+
[9968]: https://github.com/enso-org/enso/pull/9968
981983

982984
#### Enso Compiler
983985

distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso

+95-92
Large diffs are not rendered by default.

distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso

+9-2
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,18 @@ type SQL_Error
3535
Convert the SQL error to a textual representation.
3636
to_text : Text
3737
to_text self =
38-
query = if self.related_query.is_nothing.not then " [Query was: " + self.related_query.to_display_text + "]" else ""
38+
query = if self.related_query.is_nothing then "" else
39+
query_text = self.related_query.to_text
40+
## Our generated queries tend to be very long, so to still be readable,
41+
we don't shorten them too much. We impose an upper limit to avoid unbounded error message size.
42+
max_length = 1000
43+
shortened_query_text = if query_text.length <= max_length then query_text else
44+
query_text.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + query_text.take (Index_Sub_Range.Last (max_length.div 2))
45+
" [Query was: " + shortened_query_text + "]"
3946
message = self.java_exception.getMessage
4047
max_length = 300
4148
short_message = if message.length < max_length then message else
42-
message.take (Index_Sub_Range.First max_length/2) + " (...) " + message.take (Index_Sub_Range.Last max_length/2)
49+
message.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + message.take (Index_Sub_Range.Last (max_length.div 2))
4350
"There was an SQL error: " + short_message + "." + query
4451

4552
## PRIVATE

distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso

+1-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ type Postgres_Dialect
140140

141141
## PRIVATE
142142
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
143-
make_cast self column target_type infer_result_type_from_database_callback =
143+
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
144144
mapping = self.get_type_mapping
145145
source_type = mapping.sql_type_to_value_type column.sql_type_reference.get
146146
target_value_type = mapping.sql_type_to_value_type target_type

distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso

+1-1
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ type SQLite_Dialect
138138

139139
## PRIVATE
140140
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
141-
make_cast self column target_type infer_result_type_from_database_callback =
141+
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
142142
_ = [infer_result_type_from_database_callback]
143143
mapping = self.get_type_mapping
144144
target_value_type = mapping.sql_type_to_value_type target_type

distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from Standard.Base import Vector, Text
2+
from Standard.Base.Metadata import make_single_choice, Widget
3+
4+
## Specifies which columns to keep in a union operation.
5+
type Columns_To_Keep
6+
## All columns are kept.
7+
8+
If a column is present only in some of the tables, it is padded with
9+
`Nothing` for tables where it is missing.
10+
In_Any
11+
12+
## Only columns that are present in all tables are kept.
13+
14+
If there are columns that are only present in some of the tables,
15+
a problem is reported.
16+
In_All
17+
18+
## Specific list of column names to keep.
19+
20+
If a table does not have a column that is specified in the list, it is
21+
padded with `Nothing` and a problem is reported.
22+
In_List (column_names : Vector Text)
23+
24+
## PRIVATE
25+
Same as `In_Any`, but it will warn about columns that are not present in
26+
all tables.
27+
In_Any_Warn_On_Missing
28+
29+
## PRIVATE
30+
The default widget for `Columns_To_Keep`.
31+
It does not display the internal `In_Any_Warn_On_Missing` variant, since
32+
that variant is only meant to be used as the default value.
33+
default_widget -> Widget =
34+
make_single_choice <|
35+
["In_Any", "In_All", "In_List"].map c-> [c, ".."+c]

distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso

+46-8
Original file line numberDiff line numberDiff line change
@@ -494,26 +494,42 @@ type Column_Type_Mismatch
494494

495495
type No_Common_Type
496496
## PRIVATE
497-
An error indicating that no common type could be found.
497+
An error indicating that no common type could be found, and the operation
498+
could not be performed.
498499

499500
Arguments:
500501
- types: The types for which unification was attempted.
501502
- related_column_name: The name of the resulting column that was being
502503
unified, if applicable.
503504
Error (types : Vector Value_Type) (related_column_name : Nothing|Text)
504505

506+
## PRIVATE
507+
A warning indicating that no common type could be found, so the operation
508+
had to fall back to converting all values to text.
509+
Warning_Convert_To_Text (types : Vector Value_Type) (related_column_name:Text)
510+
505511
## PRIVATE
506512

507513
Create a human-readable version of the error.
508514
to_display_text : Text
509515
to_display_text self =
510516
types = self.types.map .to_display_text . join ", "
511517
prefix = "No common type was found for types: "+types
512-
infix = case self.related_column_name of
513-
column_name : Text -> " when unifying column ["+column_name+"]."
514-
_ -> "."
515-
suffix = " If you want to allow mixed types, please cast one of the columns to `Mixed` beforehand."
516-
prefix + infix + suffix
518+
location = case self.related_column_name of
519+
column_name : Text -> " when unifying column ["+column_name+"]"
520+
_ -> ""
521+
suffix_type = case self of
522+
No_Common_Type.Error _ _ -> "."
523+
No_Common_Type.Warning_Convert_To_Text _ _ -> ", so the values were converted to text."
524+
suffix_mixed = " If you want to have mixed types instead, please cast one of the columns to `Mixed` beforehand."
525+
prefix + location + suffix_type + suffix_mixed
526+
527+
## PRIVATE
528+
to_text self -> Text =
529+
ctor = case self of
530+
No_Common_Type.Error _ _ -> "Error"
531+
No_Common_Type.Warning_Convert_To_Text _ _ -> "Warning_Convert_To_Text"
532+
"No_Common_Type."+ctor+" "+self.types.to_text+" "+self.related_column_name.to_text
517533

518534
type Unmatched_Columns
519535
## PRIVATE
@@ -637,9 +653,11 @@ type Conversion_Failure
637653

638654
type Loss_Of_Integer_Precision
639655
## PRIVATE
640-
Indicates that an automatic conversion of an integer column to a decimal
656+
Indicates that an automatic conversion of an Integer column to a Float
641657
column is losing precision because some of the large integers cannot be
642-
exactly represented by the `double` type.
658+
exactly represented by the floating-point type.
659+
660+
Currently, this warning is only reported in-memory.
643661
Warning (affected_rows_count : Integer) (example_value : Integer) (example_value_converted : Float)
644662

645663
## PRIVATE
@@ -834,3 +852,23 @@ type Nothing_Value_In_Filter_Condition
834852
to_display_text : Text
835853
to_display_text self =
836854
"Using `Nothing` as an argument to a `"+self.filter_condition.to_text+"` cannot match anything."
855+
856+
## Indicates that different Date_Time (with or without timezone) or Date types
857+
are mixed in the result, causing implicit coercions.
858+
859+
This is a warning, because using the `00:00` time and default time-zone may
860+
not always be the expected choice, so the user should be aware of this.
861+
type Mixing_Date_Time_Types
862+
## PRIVATE
863+
Date_To_Date_Time (related_column_name : Text | Nothing)
864+
865+
## PRIVATE
866+
Implicit_Time_Zone (related_column_name : Text | Nothing)
867+
868+
to_display_text self -> Text =
869+
location = if self.related_column_name.is_nothing then "" else " (in column ["+self.related_column_name+"])"
870+
case self of
871+
Mixing_Date_Time_Types.Date_To_Date_Time _ ->
872+
"Mixing Date and Date_Time values"+location+": the Date values have been automatically converted to Date_Time by adding a time of 00:00 in the default time-zone."
873+
Mixing_Date_Time_Types.Implicit_Time_Zone _ ->
874+
"Mixing Date_Time values with and without timezone"+location+". A default timezone has been assumed where it was missing."

distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso

+4-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ from Standard.Base.Data.Filter_Condition import sql_like_to_regex
1212
from Standard.Base.Metadata.Choice import Option
1313
from Standard.Base.Metadata.Widget import Multiple_Choice, Single_Choice
1414

15+
import project.Columns_To_Keep.Columns_To_Keep
1516
import project.Excel.Excel_Range.Excel_Range
1617
import project.Headers.Headers
1718
import project.Internal.Excel_Reader
@@ -331,15 +332,15 @@ type Excel_Workbook
331332
tables = sheet_names.map on_problems=on_problems address-> self.read address headers on_problems=on_problems
332333
case return of
333334
Return_As.Table_Of_Tables -> Table.new [["Sheet Name", sheet_names], ["Table", tables]]
334-
Return_As.Merged_Table match ->
335+
Return_As.Merged_Table columns_to_keep match ->
335336
first_tbl = tables.find t-> t != Nothing
336337
if first_tbl == Nothing then Error.throw (Illegal_Argument.Error "No valid sheets found.") else
337338
unique = first_tbl.column_naming_helper.create_unique_name_strategy
338339
tables.each tbl-> if tbl != Nothing then unique.mark_used tbl.column_names
339340
new_column_name = unique.make_unique "Sheet Name"
340341

341342
with_names = tables.zip sheet_names tbl->name-> if tbl == Nothing then Nothing else tbl.set name new_column_name . reorder_columns [new_column_name]
342-
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) match keep_unmatched_columns=True
343+
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) columns_to_keep=columns_to_keep match_columns=match
343344

344345
problem_builder = Problem_Builder.new
345346
problem_builder.report_unique_name_strategy unique
@@ -359,4 +360,4 @@ type Return_As
359360
Table_Of_Tables
360361

361362
## All sheets are merged into a single table. A union operation is performed.
362-
Merged_Table match:Match_Columns=Match_Columns.By_Name
363+
Merged_Table (columns_to_keep : Columns_To_Keep = Columns_To_Keep.In_Any) (match : Match_Columns = Match_Columns.By_Name)

distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso

+32-23
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import project.Expression.Expression
1010
import project.Internal.Column_Naming_Helper.Column_Naming_Helper
1111
import project.Internal.Problem_Builder.Problem_Builder
1212
import project.Internal.Value_Type_Helpers
13+
import project.Match_Columns.Column_Set
1314
import project.Position.Position
1415
import project.Set_Mode.Set_Mode
1516
import project.Sort_Column.Sort_Column
@@ -530,30 +531,38 @@ is_column obj =
530531
## PRIVATE
531532
A helper method that resolves what should be the result type of a particular
532533
column set based on the union settings.
533-
unify_result_type_for_union column_set all_tables allow_type_widening problem_builder =
534+
unify_result_type_for_union (column_set : Column_Set) (all_tables : Vector) (problem_builder : Problem_Builder) -> Union_Result_Type =
534535
columns = column_set.resolve_columns all_tables
535-
case allow_type_widening of
536-
True ->
537-
types = columns.filter Filter_Condition.Not_Nothing . map .value_type
538-
common_type = Value_Type_Helpers.find_common_type types strict=True
539-
if common_type.is_nothing then
540-
problem_builder.report_other_warning (No_Common_Type.Error types related_column_name=column_set.name)
541-
common_type
542-
False ->
543-
is_not_nothing c = case c of
544-
Nothing -> False
545-
_ -> True
546-
first_column = columns.find is_not_nothing
547-
first_type = first_column.value_type
548-
if first_type == Value_Type.Mixed then Value_Type.Mixed else
549-
first_wrong_column = columns.find if_missing=Nothing col->
550-
is_not_nothing col && col.value_type != first_type
551-
case first_wrong_column of
552-
Nothing -> first_type
553-
_ ->
554-
got_type = first_wrong_column.value_type
555-
problem_builder.report_other_warning (Column_Type_Mismatch.Error column_set.name first_type got_type)
556-
Nothing
536+
. filter Filter_Condition.Not_Nothing
537+
types = columns.map .value_type
538+
539+
if types.is_empty then Union_Result_Type.No_Types_To_Unify else
540+
## First we check if we can find a generic common type.
541+
This includes widening numeric column sizes, or converting Integer to Float.
542+
common_type = Value_Type_Helpers.find_common_type types strict=True
543+
if common_type.is_nothing.not then Union_Result_Type.Common_Type common_type else
544+
## Union has less strict requirements than other operations relying on `find_common_type`,
545+
so if the common type was not found, we still check some fallbacks.
546+
common_numeric_boolean = Value_Type_Helpers.find_common_numeric_boolean_type types
547+
if common_numeric_boolean.is_nothing.not then Union_Result_Type.Common_Type common_numeric_boolean else
548+
common_date_type = Value_Type_Helpers.find_common_date_types types column_set.name problem_builder
549+
if common_date_type.is_nothing.not then Union_Result_Type.Common_Type common_date_type else
550+
# Lastly, we fall back to text, reporting a warning.
551+
problem_builder.report_other_warning (No_Common_Type.Warning_Convert_To_Text types column_set.name)
552+
Union_Result_Type.Fallback_To_Text
553+
554+
## PRIVATE
555+
type Union_Result_Type
556+
## PRIVATE
557+
Common_Type (value_type : Value_Type)
558+
559+
## PRIVATE
560+
Fallback_To_Text
561+
562+
## PRIVATE
563+
This case is returned if the requested column was missing from _all_ tables,
564+
so there were no types to unify. An all-null column should be created.
565+
No_Types_To_Unify
557566

558567
## PRIVATE
559568
Replace a set of columns in the table with a new set of columns. The old

0 commit comments

Comments
 (0)