1+ import builtins
12import typing as t
23
34from attr import Factory
@@ -13,6 +14,7 @@ class Treatment(Dumpable):
1314 convert_list : t .List [str ] = Factory (list )
1415 convert_string : t .List [str ] = Factory (list )
1516 convert_dict : t .List [t .Dict [str , str ]] = Factory (list )
17+ normalize_complex_lists : bool = False
1618 prune_invalid_date : t .List [str ] = Factory (list )
1719
1820 def apply (self , data : DictOrList ) -> DictOrList :
@@ -28,7 +30,7 @@ def apply_record(self, data: Record) -> Record:
2830 local_ignores = []
2931 if self .ignore_complex_lists :
3032 for k , v in data .items ():
31- if isinstance ( v , list ) and v and isinstance ( v [ 0 ], dict ):
33+ if self . is_list_of_dicts ( v ):
3234 # Skip ignoring special-encoded items.
3335 if v [0 ] and list (v [0 ].keys ())[0 ].startswith ("$" ):
3436 continue
@@ -39,6 +41,12 @@ def apply_record(self, data: Record) -> Record:
3941 if ignore_name in data :
4042 del data [ignore_name ]
4143
44+ # Apply normalization for lists of objects.
45+ if self .normalize_complex_lists :
46+ for _ , v in data .items ():
47+ if self .is_list_of_dicts (v ):
48+ ListOfVaryingObjectsNormalizer (v ).apply ()
49+
4250 # Converge certain items to `list` even when defined differently.
4351 for to_list_name in self .convert_list :
4452 if to_list_name in data and not isinstance (data [to_list_name ], list ):
@@ -66,3 +74,58 @@ def apply_record(self, data: Record) -> Record:
6674 del data [key ]
6775
6876 return data
77+
@staticmethod
def is_list_of_dicts(v: t.Any) -> bool:
    """
    Tell whether `v` is a non-empty list whose first element is a dictionary.

    Only the first element is inspected; any later elements are not checked.
    """
    # Anything that is not a list, or is an empty list, can not qualify.
    if not isinstance(v, list) or not v:
        return False
    return isinstance(v[0], dict)
81+
82+
@define
class NormalizerRule:
    """
    Describe how a single field is normalized.

    A rule pairs the name of a field with the callable used to convert
    the values stored under that name.
    """

    # Key of the field the rule applies to.
    name: str
    # Callable invoked with the field's current value; its result replaces the value.
    converter: t.Callable
91+
92+
@define
class ListOfVaryingObjectsNormalizer:
    """
    Normalize a list of objects whose same-named fields carry varying types.

    CrateDB can not store lists of varying objects, so the values of every
    field that occurs with more than one type are converged, in place, to a
    common type.
    """

    # The list of objects (dictionaries) to normalize. It is mutated in place.
    data: Collection

    def apply(self):
        """Collect per-field type statistics, derive rules, and apply them in place."""
        self.apply_rules(self.get_rules(self.type_stats()))

    def apply_rules(self, rules: "t.List[NormalizerRule]") -> None:
        """
        Run each rule's converter over the matching field of every item.

        Elements that are not dictionaries are left untouched. `None` values
        are preserved as well: converting them would either raise
        (`float(None)`) or turn a missing value into the text "None".
        """
        for item in self.data:
            if not isinstance(item, dict):
                continue
            for rule in rules:
                name = rule.name
                # `get` covers both "field absent" and "field is None".
                if item.get(name) is not None:
                    item[name] = rule.converter(item[name])

    def get_rules(self, statistics) -> "t.List[NormalizerRule]":
        """
        Derive one rule per field that occurs with more than one distinct type.

        `statistics` maps a field name to the type names observed for it, one
        entry per occurrence, so duplicates are collapsed before counting.
        """
        return [
            NormalizerRule(name=name, converter=self.get_best_converter(types))
            for name, types in statistics.items()
            if len(set(types)) > 1
        ]

    def type_stats(self) -> t.Dict[str, t.List[str]]:
        """
        Return, per field name, the type names of all values found in the data.

        One type name is recorded per occurrence, in iteration order. Elements
        that are not dictionaries are skipped.
        """
        types: t.Dict[str, t.List[str]] = {}
        for item in self.data:
            if not isinstance(item, dict):
                continue
            for key, value in item.items():
                types.setdefault(key, []).append(type(value).__name__)
        return types

    @staticmethod
    def get_best_converter(types: t.List[str]) -> t.Callable:
        """
        Select the converter for a field, given the type names observed for it.

        Any string occurrence converges the field to `str`; a mix of `int` and
        `float` converges to `float`; everything else is passed through as-is.
        """
        if "str" in types:
            return builtins.str
        if "float" in types and "int" in types:
            return builtins.float
        return lambda x: x
0 commit comments