Skip to content

Commit fe69a55

Browse files
authored
Class Serialisation Performance Maintenance (#284)
1 parent 1a5dde2 commit fe69a55

File tree

8 files changed

+153
-28
lines changed

8 files changed

+153
-28
lines changed

.github/workflows/full.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: 'Full Workflow'
22

33
env:
4-
VERSION: 4.6.1
4+
VERSION: 4.6.2
55
ASM_VERSION: 4.0.0
66

77
on:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
using BenchmarkDotNet.Attributes;
7+
using Microsoft.Win32.SafeHandles;
8+
using Parquet.Serialization;
9+
10+
namespace Parquet.PerfRunner.Benchmarks {
11+
12+
/// <summary>
/// Row type used as the element of the benchmark data set: a timestamp, an optional
/// event name and a numeric meter value.
/// </summary>
class Record {
13+
public DateTime Timestamp { get; set; }
14+
public string? EventName { get; set; }
15+
public double MeterValue { get; set; }
16+
}
17+
18+
19+
/// <summary>
/// Benchmarks that serialise and deserialise a list of <see cref="Record"/> with both
/// ParquetConvert (the "_Legacy" methods) and ParquetSerializer.
/// </summary>
[ShortRunJob]
20+
[MeanColumn]
21+
[MemoryDiagnoser]
22+
[MarkdownExporter]
23+
public class Classes {
24+
// rows generated once in SetUp and passed to both Serialise benchmarks
private List<Record>? _testData;
25+
// written by ParquetSerializer in SetUp; both Deserialise benchmarks read from this same stream
private MemoryStream _ms = new MemoryStream();
26+
27+
/// <summary>
/// Generates 1,000 records (alternating "on"/"off" event names) and serialises them
/// into <c>_ms</c> with ParquetSerializer.
/// </summary>
[GlobalSetup]
28+
public async Task SetUp() {
29+
_testData = Enumerable.Range(0, 1_000).Select(i => new Record {
30+
Timestamp = DateTime.UtcNow.AddSeconds(i),
31+
EventName = i % 2 == 0 ? "on" : "off",
32+
MeterValue = i
33+
}).ToList();
34+
35+
await ParquetSerializer.SerializeAsync(_testData, _ms);
36+
}
37+
38+
39+
/// <summary>
/// Baseline: writes <c>_testData</c> to a fresh MemoryStream via ParquetConvert.
/// </summary>
[Benchmark(Baseline = true)]
40+
public async Task Serialise_Legacy() {
41+
using var ms = new MemoryStream();
42+
await ParquetConvert.SerializeAsync(_testData, ms);
43+
}
44+
45+
/// <summary>
/// Reads the records back from the shared stream via ParquetConvert.
/// </summary>
[Benchmark]
46+
public async Task Deserialise_Legacy() {
47+
// rewind the shared stream before reading
_ms.Position = 0;
48+
await ParquetConvert.DeserializeAsync<Record>(_ms);
49+
}
50+
51+
/// <summary>
/// Writes <c>_testData</c> to a fresh MemoryStream via ParquetSerializer.
/// </summary>
[Benchmark]
52+
public async Task Serialise() {
53+
using var ms = new MemoryStream();
54+
await ParquetSerializer.SerializeAsync(_testData, ms);
55+
}
56+
57+
/// <summary>
/// Reads the records back from the shared stream via ParquetSerializer.
/// </summary>
[Benchmark]
58+
public async Task Deserialise() {
59+
// rewind the shared stream before reading
_ms.Position = 0;
60+
await ParquetSerializer.DeserializeAsync<Record>(_ms);
61+
}
62+
}
63+
}

src/Parquet.PerfRunner/Program.cs

+7-1
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,14 @@
1111
case "progression":
1212
VersionedBenchmark.Run();
1313
break;
14+
case "classes":
15+
BenchmarkRunner.Run<Classes>();
16+
break;
1417
}
1518
} else {
1619
//new VsParquetSharp().Main();
17-
await new DataTypes().NullableInts();
20+
//await new DataTypes().NullableInts();
21+
var c = new Classes();
22+
// Both calls return Task: await them so setup completes before serialisation starts
// and any exception is observed instead of being dropped with the unawaited task.
await c.SetUp();
23+
await c.Serialise();
1824
}

src/Parquet.Test/Serialisation/ParquetSerializerTest.cs

+25
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,31 @@ public async Task Struct_WithNullProps_Serde() {
118118
Assert.Equivalent(data2, data);
119119
}
120120

121+
// Known failure: reported as skipped rather than left as an undiscovered public method.
[Fact(Skip = "serialiser puts (null, 0) for Address.City, but should put (null, 1)")]
122+
public async Task Struct_With_NestedNulls_Serde() {
123+
124+
var data = new List<AddressBookEntry> {
125+
new AddressBookEntry {
126+
FirstName = "Joe",
127+
LastName = "Bloggs",
128+
Address = new Address() {
129+
City = null,
130+
Country = null
131+
}
132+
}
133+
};
134+
135+
// serialiser puts (null, 0) for Address.City, but should put (null, 1)
136+
137+
using var ms = new MemoryStream();
138+
await ParquetSerializer.SerializeAsync(data, ms);
139+
140+
ms.Position = 0;
141+
IList<AddressBookEntry> data2 = await ParquetSerializer.DeserializeAsync<AddressBookEntry>(ms);
142+
143+
XAssert.JsonEquivalent(data, data2);
144+
}
145+
121146
[Fact]
122147
public async Task List_Structs_Serde() {
123148
var data = Enumerable.Range(0, 1_000).Select(i => new MovementHistory {

src/Parquet/Globals.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public static class Globals {
1717
/// </summary>
1818
public static readonly string GithubSha = "${GITHUB_SHA}";
1919

20-
internal const string DataTypeEnumObsolete = "Please resort to using System.Type overloads. Will be removed in v6.";
21-
internal const string ParquetConvertObsolete = "ParquetConvert was an experimental project and is not obsolete. Consider switching to ParquetSerializer which supports all data types, including nested ones, and is just superior. ParquetConvert will be removed in v6.";
20+
internal const string DataTypeEnumObsolete = "Please resort to using System.Type overloads. Will be removed in v5.";
21+
internal const string ParquetConvertObsolete = "ParquetConvert was an experimental project and is now obsolete. Consider switching to ParquetSerializer which supports all data types, including nested ones, and is just superior. ParquetConvert will be removed in v5.";
2222
}
2323
}

src/Parquet/Serialization/Dremel/FieldAssemblerCompiler.cs

+3-1
Original file line numberDiff line numberDiff line change
@@ -144,13 +144,15 @@ private static void Discover(Field field, out bool isRepeated) {
144144
(field.SchemaType == SchemaType.Data && field is DataField rdf && rdf.IsArray);
145145
}
146146

147+
#if DEBUG
147148
private static void InjectLevelDebug(string levelPropertyName,
148149
object value, int dataIdx,
149150
int dl, int rl,
150151
int dlDepth, int rlDepth,
151152
int[] rsm) {
152153
Console.WriteLine("debug");
153154
}
155+
#endif
154156

155157
/// <summary>
156158
/// Transitions RSM for current RL iteration
@@ -259,7 +261,7 @@ private Expression InjectLevel(Expression rootVar, Type rootType, Field[] levelF
259261
} else {
260262
if(isAtomic) {
261263

262-
// C#: dlDepth <= _dlVar?
264+
// C#: dlDepth == _dlVar?
263265
iteration =
264266
Expression.IfThen(
265267
Expression.Equal(Expression.Constant(dlDepth), _dlVar),

src/Parquet/Serialization/Dremel/FieldStriperCompiler.cs

+32-21
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class FieldStriperCompiler<TClass> {
1515

1616
private readonly ParquetSchema _schema;
1717
private readonly DataField _df;
18+
private readonly bool _hasRls;
19+
private readonly bool _hasDls;
1820

1921
// input parameters
2022
private readonly ParameterExpression _dfParam = Expression.Parameter(typeof(DataField), "df");
@@ -34,10 +36,14 @@ class FieldStriperCompiler<TClass> {
3436
// currently iterated class element
3537
private readonly ParameterExpression _classElementVar = Expression.Variable(typeof(TClass), "curr");
3638

39+
private static readonly Expression NullListOfInt = Expression.Convert(Expression.Constant(null), typeof(List<int>));
40+
3741
public FieldStriperCompiler(ParquetSchema schema, DataField df) {
3842

3943
_schema = schema;
4044
_df = df;
45+
_hasRls = _df.MaxRepetitionLevel > 0;
46+
_hasDls = _df.MaxDefinitionLevel > 0;
4147

4248
//
4349
_valuesListType = typeof(List<>).MakeGenericType(df.ClrType);
@@ -82,38 +88,40 @@ private Expression WriteValue(ParameterExpression valueVar,
8288

8389
// only need RL and DL-1
8490
Expression.Block(
85-
Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl - 1)),
86-
Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)),
91+
_hasDls ? Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl - 1)) : Expression.Empty(),
92+
_hasRls ? Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar) : Expression.Empty()),
8793

8894
// everything, but value must be non-null
8995
Expression.Block(
9096
Expression.Call(_valuesVar, _valuesListAddMethod, getNonNullValue),
91-
Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)),
92-
Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)));
97+
_hasDls ? Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)) : Expression.Empty(),
98+
_hasRls ? Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar) : Expression.Empty()));
9399

94100
} else {
95101
// required atomics are simple - add value, RL and DL as is
96102
return Expression.Block(
97103
Expression.Call(_valuesVar, _valuesListAddMethod, valueVar),
98-
Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)),
99-
Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar));
104+
_hasDls ? Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)) : Expression.Empty(),
105+
_hasRls ? Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar) : Expression.Empty());
100106
}
101107
}
102108

103109
// non-atomics still need RL and DL dumped
104110
return Expression.Block(
105-
Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)),
106-
Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar));
111+
_hasDls ? Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)) : Expression.Empty(),
112+
_hasRls ? Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar) : Expression.Empty());
107113

108114
}
109115

110116
private Expression WriteMissingValue(int dl, Expression currentRlVar) {
111117
return Expression.Block(
112-
Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)),
113-
Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar));
118+
_hasDls ? Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)) : Expression.Empty(),
119+
_hasRls ? Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar) : Expression.Empty());
114120
}
115121

116-
private Expression WhileBody(Expression element, bool isAtomic, int dl, ParameterExpression currentRlVar, ParameterExpression seenFieldsVar, Field field, int rlDepth, Type elementType, List<string> path) {
122+
private Expression WhileBody(Expression element, bool isAtomic, int dl, ParameterExpression currentRlVar,
123+
ParameterExpression seenFieldsVar, Field field, int rlDepth, Type elementType, List<string> path) {
124+
117125
string suffix = field.Path.ToString().Replace(".", "_");
118126
ParameterExpression chRepetitionLevelVar = Expression.Variable(typeof(int), $"chRepetitionLevel_{suffix}");
119127
ParameterExpression valueVar = Expression.Variable(elementType, $"value_{suffix}");
@@ -127,13 +135,15 @@ private Expression WhileBody(Expression element, bool isAtomic, int dl, Paramete
127135
// L9-13
128136
Expression.IfThenElse(
129137
// if seenFields.Contains(field.Path)
130-
Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Contains")!, Expression.Constant(field.Path.ToString())),
138+
//Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Contains")!, Expression.Constant(field.Path.ToString())),
139+
Expression.IsTrue(seenFieldsVar),
131140

132141
// chRepetitionLevelVar = treeDepth
133142
Expression.Assign(chRepetitionLevelVar, Expression.Constant(rlDepth)),
134143

135144
// seenFields.Add(field.Path)
136-
Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Add")!, Expression.Constant(field.Path.ToString()))
145+
//Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Add")!, Expression.Constant(field.Path.ToString()))
146+
Expression.Assign(seenFieldsVar, Expression.Constant(true))
137147
),
138148

139149
// L14-
@@ -195,13 +205,14 @@ private Expression DissectRecord(
195205
Expression levelProperty = Expression.Property(rootVar, levelPropertyName);
196206
Type levelPropertyType = rootType.GetProperty(levelPropertyName)!.PropertyType;
197207
ParameterExpression seenFieldsVar = Expression.Variable(typeof(HashSet<string>), $"seenFieldsVar_{levelPropertyName}");
208+
ParameterExpression seenVar = Expression.Variable(typeof(bool), $"seen_{levelPropertyName}");
198209

199210
Expression extraBody;
200211
if(isRepeated) {
201212
Type elementType = ExtractElementTypeFromEnumerableType(levelPropertyType);
202213
Expression collection = levelProperty;
203214
ParameterExpression element = Expression.Variable(elementType, "element");
204-
Expression elementProcessor = WhileBody(element, isAtomic, dl, currentRlVar, seenFieldsVar, field, rlDepth, elementType, path);
215+
Expression elementProcessor = WhileBody(element, isAtomic, dl, currentRlVar, seenVar, field, rlDepth, elementType, path);
205216
extraBody = elementProcessor.Loop(collection, elementType, element);
206217

207218
// todo: if levelProperty (collection) is null, we need extra iteration with null value (which rep and def level?)
@@ -212,12 +223,12 @@ private Expression DissectRecord(
212223
extraBody);
213224
} else {
214225
Expression element = levelProperty;
215-
extraBody = WhileBody(element, isAtomic, dl, currentRlVar, seenFieldsVar, field, rlDepth, levelPropertyType, path);
226+
extraBody = WhileBody(element, isAtomic, dl, currentRlVar, seenVar, field, rlDepth, levelPropertyType, path);
216227
}
217228

218229
return Expression.Block(
219-
new[] { seenFieldsVar },
220-
Expression.Assign(seenFieldsVar, Expression.New(typeof(HashSet<string>))),
230+
new[] { seenVar },
231+
Expression.Assign(seenVar, Expression.Constant(false)),
221232
extraBody);
222233
}
223234

@@ -236,16 +247,16 @@ public FieldStriper<TClass> Compile() {
236247
// init 3 building blocks
237248
Expression.Block(
238249
Expression.Assign(_valuesVar, Expression.New(_valuesListType)),
239-
Expression.Assign(_dlsVar, Expression.New(typeof(List<int>))),
240-
Expression.Assign(_rlsVar, Expression.New(typeof(List<int>)))),
250+
Expression.Assign(_dlsVar, _hasDls ? Expression.New(typeof(List<int>)) : NullListOfInt),
251+
Expression.Assign(_rlsVar, _hasRls ? Expression.New(typeof(List<int>)) : NullListOfInt)),
241252

242253
iterationLoop,
243254

244255
// result: use triple to construct ShreddedColumn and return (last element in the block)
245256
Expression.New(ShreddedColumnConstructor,
246257
Expression.Call(_valuesVar, _valuesListType.GetMethod("ToArray")!),
247-
_df.MaxDefinitionLevel == 0 ? Expression.Convert(Expression.Constant(null), typeof(List<int>)) : _dlsVar,
248-
_df.MaxRepetitionLevel == 0 ? Expression.Convert(Expression.Constant(null), typeof(List<int>)) : _rlsVar)
258+
_dlsVar,
259+
_rlsVar)
249260
);
250261

251262
Func<DataField, IEnumerable<TClass>, ShreddedColumn> lambda = Expression

src/Parquet/Serialization/ParquetSerializer.cs

+20-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ namespace Parquet.Serialization {
1717
/// </summary>
1818
public static class ParquetSerializer {
1919

20+
// Striper<T> instances keyed by typeof(T), reused across SerializeAsync calls.
// SerializeAsync is a public static entry point, so the cache must tolerate concurrent
// readers and writers; a racing miss may build a striper twice, which is harmless.
private static readonly System.Collections.Concurrent.ConcurrentDictionary<Type, object> _typeToStriper = new();
21+
// Assembler<T> instances keyed by typeof(T), reused across DeserializeAsync calls.
// DeserializeAsync is a public static entry point, so the cache must tolerate concurrent
// readers and writers; a racing miss may build an assembler twice, which is harmless.
private static readonly System.Collections.Concurrent.ConcurrentDictionary<Type, object> _typeToAssembler = new();
22+
2023
/// <summary>
2124
/// Serialize
2225
/// </summary>
@@ -31,7 +34,14 @@ public static async Task<ParquetSchema> SerializeAsync<T>(IEnumerable<T> objectI
3134
ParquetSerializerOptions? options = null,
3235
CancellationToken cancellationToken = default) {
3336

34-
Striper<T> striper = new Striper<T>(typeof(T).GetParquetSchema(false));
37+
Striper<T> striper;
38+
39+
if(_typeToStriper.TryGetValue(typeof(T), out object? boxedStriper)) {
40+
striper = (Striper<T>)boxedStriper;
41+
} else {
42+
striper = new Striper<T>(typeof(T).GetParquetSchema(false));
43+
_typeToStriper[typeof(T)] = striper;
44+
}
3545

3646
bool append = options != null && options.Append;
3747
using(ParquetWriter writer = await ParquetWriter.CreateAsync(striper.Schema, destination, null, append, cancellationToken)) {
@@ -86,7 +96,15 @@ public static async Task<IList<T>> DeserializeAsync<T>(Stream source,
8696
CancellationToken cancellationToken = default)
8797
where T : new() {
8898

89-
Assembler<T> asm = new Assembler<T>(typeof(T).GetParquetSchema(true));
99+
Assembler<T> asm;
100+
101+
if(_typeToAssembler.TryGetValue(typeof(T), out object? boxedAssembler)) {
102+
asm = (Assembler<T>)boxedAssembler;
103+
} else {
104+
asm = new Assembler<T>(typeof(T).GetParquetSchema(true));
105+
_typeToAssembler[typeof(T)] = asm;
106+
}
107+
90108
var result = new List<T>();
91109

92110
using ParquetReader reader = await ParquetReader.CreateAsync(source, new ParquetOptions { UnpackDefinitions = false });

0 commit comments

Comments
 (0)