@@ -15,6 +15,8 @@ class FieldStriperCompiler<TClass> {
15
15
16
16
private readonly ParquetSchema _schema ;
17
17
private readonly DataField _df ;
18
// Level-presence flags, set once in the constructor from _df:
//   _hasRls is true when _df.MaxRepetitionLevel > 0,
//   _hasDls is true when _df.MaxDefinitionLevel > 0.
// The expression-building methods consult them to decide whether to emit a
// LevelsAddMethod call on _rlsVar / _dlsVar or an Expression.Empty() no-op.
+ private readonly bool _hasRls ;
19
+ private readonly bool _hasDls ;
18
20
19
21
// input parameters
20
22
private readonly ParameterExpression _dfParam = Expression . Parameter ( typeof ( DataField ) , "df" ) ;
@@ -34,10 +36,14 @@ class FieldStriperCompiler<TClass> {
34
36
// currently iterated class element
35
37
private readonly ParameterExpression _classElementVar = Expression . Variable ( typeof ( TClass ) , "curr" ) ;
36
38
39
// A null constant converted to List<int>, so it can be assigned to a List<int>-typed
// expression variable. Compile() assigns it to _dlsVar / _rlsVar in place of a new list
// when _hasDls / _hasRls is false.
+ private static readonly Expression NullListOfInt = Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) ;
40
+
37
41
public FieldStriperCompiler ( ParquetSchema schema , DataField df ) {
38
42
39
43
_schema = schema ;
40
44
_df = df ;
45
+ _hasRls = _df . MaxRepetitionLevel > 0 ;
46
+ _hasDls = _df . MaxDefinitionLevel > 0 ;
41
47
42
48
//
43
49
_valuesListType = typeof ( List < > ) . MakeGenericType ( df . ClrType ) ;
@@ -82,38 +88,40 @@ private Expression WriteValue(ParameterExpression valueVar,
82
88
83
89
// only need RL and DL-1
84
90
Expression . Block (
85
- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl - 1 ) ) ,
86
- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ,
91
+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl - 1 ) ) : Expression . Empty ( ) ,
92
+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ,
87
93
88
94
// everything, but value must be non-null
89
95
Expression . Block (
90
96
Expression . Call ( _valuesVar , _valuesListAddMethod , getNonNullValue ) ,
91
- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
92
- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ) ;
97
+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
98
+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ) ;
93
99
94
100
} else {
95
101
// required atomics are simple - add value, RL and DL as is
96
102
return Expression . Block (
97
103
Expression . Call ( _valuesVar , _valuesListAddMethod , valueVar ) ,
98
- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
99
- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
104
+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
105
+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
100
106
}
101
107
}
102
108
103
109
// non-atomics still need RL and DL dumped
104
110
return Expression . Block (
105
- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
106
- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
111
+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
112
+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
107
113
108
114
}
109
115
110
116
/// <summary>
/// Builds the expression block that records levels only (no call on the values list is emitted here):
/// adds the constant <paramref name="dl"/> to <c>_dlsVar</c> and <paramref name="currentRlVar"/>
/// to <c>_rlsVar</c> through <c>LevelsAddMethod</c>.
/// </summary>
/// <param name="dl">Definition level, embedded into the tree as a constant.</param>
/// <param name="currentRlVar">Expression whose value is added to <c>_rlsVar</c>.</param>
/// <returns>A block containing the two level-write expressions (either may be a no-op in the "+" version).</returns>
private Expression WriteMissingValue ( int dl , Expression currentRlVar ) {
111
117
return Expression . Block (
112
- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
113
- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
118
// Replacement lines: each Add call is emitted only when the matching flag is set; otherwise
// Expression.Empty() stands in as a no-op. Compile() assigns a null List<int> to the
// corresponding variable when the flag is false, so no Add call is emitted for it.
+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
119
+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
114
120
}
115
121
116
- private Expression WhileBody ( Expression element , bool isAtomic , int dl , ParameterExpression currentRlVar , ParameterExpression seenFieldsVar , Field field , int rlDepth , Type elementType , List < string > path ) {
122
+ private Expression WhileBody ( Expression element , bool isAtomic , int dl , ParameterExpression currentRlVar ,
123
+ ParameterExpression seenFieldsVar , Field field , int rlDepth , Type elementType , List < string > path ) {
124
+
117
125
string suffix = field . Path . ToString ( ) . Replace ( "." , "_" ) ;
118
126
ParameterExpression chRepetitionLevelVar = Expression . Variable ( typeof ( int ) , $ "chRepetitionLevel_{ suffix } ") ;
119
127
ParameterExpression valueVar = Expression . Variable ( elementType , $ "value_{ suffix } ") ;
@@ -127,13 +135,15 @@ private Expression WhileBody(Expression element, bool isAtomic, int dl, Paramete
127
135
// L9-13
128
136
Expression . IfThenElse (
129
137
// if seenFields.Contains(field.Path)
130
- Expression . Call ( seenFieldsVar , typeof ( HashSet < string > ) . GetMethod ( "Contains" ) ! , Expression . Constant ( field . Path . ToString ( ) ) ) ,
138
+ //Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Contains")!, Expression.Constant(field.Path.ToString())),
139
+ Expression . IsTrue ( seenFieldsVar ) ,
131
140
132
141
// chRepetitionLevelVar = treeDepth
133
142
Expression . Assign ( chRepetitionLevelVar , Expression . Constant ( rlDepth ) ) ,
134
143
135
144
// seenFields.Add(field.Path)
136
- Expression . Call ( seenFieldsVar , typeof ( HashSet < string > ) . GetMethod ( "Add" ) ! , Expression . Constant ( field . Path . ToString ( ) ) )
145
+ //Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Add")!, Expression.Constant(field.Path.ToString()))
146
+ Expression . Assign ( seenFieldsVar , Expression . Constant ( true ) )
137
147
) ,
138
148
139
149
// L14-
@@ -195,13 +205,14 @@ private Expression DissectRecord(
195
205
Expression levelProperty = Expression . Property ( rootVar , levelPropertyName ) ;
196
206
Type levelPropertyType = rootType . GetProperty ( levelPropertyName ) ! . PropertyType ;
197
207
ParameterExpression seenFieldsVar = Expression . Variable ( typeof ( HashSet < string > ) , $ "seenFieldsVar_{ levelPropertyName } ") ;
208
+ ParameterExpression seenVar = Expression . Variable ( typeof ( bool ) , $ "seen_{ levelPropertyName } ") ;
198
209
199
210
Expression extraBody ;
200
211
if ( isRepeated ) {
201
212
Type elementType = ExtractElementTypeFromEnumerableType ( levelPropertyType ) ;
202
213
Expression collection = levelProperty ;
203
214
ParameterExpression element = Expression . Variable ( elementType , "element" ) ;
204
- Expression elementProcessor = WhileBody ( element , isAtomic , dl , currentRlVar , seenFieldsVar , field , rlDepth , elementType , path ) ;
215
+ Expression elementProcessor = WhileBody ( element , isAtomic , dl , currentRlVar , seenVar , field , rlDepth , elementType , path ) ;
205
216
extraBody = elementProcessor . Loop ( collection , elementType , element ) ;
206
217
207
218
// todo: if levelProperty (collection) is null, we need extra iteration with null value (which rep and def level?)
@@ -212,12 +223,12 @@ private Expression DissectRecord(
212
223
extraBody ) ;
213
224
} else {
214
225
Expression element = levelProperty ;
215
- extraBody = WhileBody ( element , isAtomic , dl , currentRlVar , seenFieldsVar , field , rlDepth , levelPropertyType , path ) ;
226
+ extraBody = WhileBody ( element , isAtomic , dl , currentRlVar , seenVar , field , rlDepth , levelPropertyType , path ) ;
216
227
}
217
228
218
229
return Expression . Block (
219
- new [ ] { seenFieldsVar } ,
220
- Expression . Assign ( seenFieldsVar , Expression . New ( typeof ( HashSet < string > ) ) ) ,
230
+ new [ ] { seenVar } ,
231
+ Expression . Assign ( seenVar , Expression . Constant ( false ) ) ,
221
232
extraBody ) ;
222
233
}
223
234
@@ -236,16 +247,16 @@ public FieldStriper<TClass> Compile() {
236
247
// init 3 building blocks
237
248
Expression . Block (
238
249
Expression . Assign ( _valuesVar , Expression . New ( _valuesListType ) ) ,
239
- Expression . Assign ( _dlsVar , Expression . New ( typeof ( List < int > ) ) ) ,
240
- Expression . Assign ( _rlsVar , Expression . New ( typeof ( List < int > ) ) ) ) ,
250
+ Expression . Assign ( _dlsVar , _hasDls ? Expression . New ( typeof ( List < int > ) ) : NullListOfInt ) ,
251
+ Expression . Assign ( _rlsVar , _hasRls ? Expression . New ( typeof ( List < int > ) ) : NullListOfInt ) ) ,
241
252
242
253
iterationLoop ,
243
254
244
255
// result: use triple to construct ShreddedColumn and return (last element in the block)
245
256
Expression . New ( ShreddedColumnConstructor ,
246
257
Expression . Call ( _valuesVar , _valuesListType . GetMethod ( "ToArray" ) ! ) ,
247
- _df . MaxDefinitionLevel == 0 ? Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) : _dlsVar ,
248
- _df . MaxRepetitionLevel == 0 ? Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) : _rlsVar )
258
+ _dlsVar ,
259
+ _rlsVar )
249
260
) ;
250
261
251
262
Func < DataField , IEnumerable < TClass > , ShreddedColumn > lambda = Expression
0 commit comments