Skip to content

Commit ebda0f3

Browse files
authored
fix(parquet/metadata): fix default unsigned int statistics (#210)
### Rationale for this change fixes #209 ### What changes are included in this PR? Fixing `UpdateSpaced` statistics methods to properly check unsigned values to ensure proper handling of default min/max values. ### Are these changes tested? Yes, unit test was added to confirm. ### Are there any user-facing changes? None, other than fixing the reported bug.
1 parent 90f5515 commit ebda0f3

File tree

3 files changed

+90
-39
lines changed

3 files changed

+90
-39
lines changed

parquet/metadata/statistics_test.go

+30
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/apache/arrow-go/v18/parquet/metadata"
2929
"github.com/apache/arrow-go/v18/parquet/schema"
3030
"github.com/stretchr/testify/assert"
31+
"github.com/stretchr/testify/require"
3132
)
3233

3334
// NOTE(zeroshade): tests will be added and updated after merging the "file" package
@@ -260,3 +261,32 @@ func TestBooleanStatisticsEncoding(t *testing.T) {
260261
assert.Equal(t, []byte{1}, maxEnc)
261262
assert.Equal(t, []byte{0}, minEnc)
262263
}
264+
265+
func TestUnsignedDefaultStats(t *testing.T) {
266+
// testing issue github.com/apache/arrow-go/issues/209
267+
// stats for unsigned columns should not have invalid min values
268+
{
269+
n, err := schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Optional,
270+
schema.NewIntLogicalType(16, false), parquet.Types.Int32, 4, -1)
271+
require.NoError(t, err)
272+
descr := schema.NewColumn(n, 1, 0)
273+
s := metadata.NewStatistics(descr, nil).(*metadata.Int32Statistics)
274+
s.UpdateSpaced([]int32{0}, []byte{1}, 0, 0)
275+
276+
minv, maxv := s.Min(), s.Max()
277+
assert.Zero(t, minv)
278+
assert.Zero(t, maxv)
279+
}
280+
{
281+
n, err := schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Optional,
282+
schema.NewIntLogicalType(64, false), parquet.Types.Int64, 4, -1)
283+
require.NoError(t, err)
284+
descr := schema.NewColumn(n, 1, 0)
285+
s := metadata.NewStatistics(descr, nil).(*metadata.Int64Statistics)
286+
s.UpdateSpaced([]int64{0}, []byte{1}, 0, 0)
287+
288+
minv, maxv := s.Min(), s.Max()
289+
assert.Zero(t, minv)
290+
assert.Zero(t, maxv)
291+
}
292+
}

parquet/metadata/statistics_types.gen.go

+40-26
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

parquet/metadata/statistics_types.gen.go.tmpl

+20-13
Original file line numberDiff line numberDiff line change
@@ -248,13 +248,26 @@ func (s *{{.Name}}Statistics) getMinMaxSpaced(values []{{.name}}, validBits []by
248248
max = s.defaultMax()
249249

250250
{{- if or (eq .name "int32") (eq .name "int64")}}
251-
var fn func([]{{.name}}) ({{.name}}, {{.name}})
251+
var fn func([]{{.name}})
252252
if s.order == schema.SortSIGNED {
253-
fn = shared_utils.GetMinMax{{.Name}}
253+
fn = func(v []{{.name}}) {
254+
localMin, localMax := shared_utils.GetMinMax{{.Name}}(v)
255+
if min > localMin {
256+
min = localMin
257+
}
258+
if max < localMax {
259+
max = localMax
260+
}
261+
}
254262
} else {
255-
fn = func(v []{{.name}}) ({{.name}}, {{.name}}) {
256-
umin, umax := shared_utils.GetMinMaxU{{.name}}(arrow.U{{.name}}Traits.CastFromBytes(arrow.{{.Name}}Traits.CastToBytes(values)))
257-
return {{.name}}(umin), {{.name}}(umax)
263+
fn = func(v []{{.name}}) {
264+
umin, umax := shared_utils.GetMinMaxU{{.name}}(arrow.U{{.name}}Traits.CastFromBytes(arrow.{{.Name}}Traits.CastToBytes(v)))
265+
if u{{.name}}(min) > umin {
266+
min = {{.name}}(umin)
267+
}
268+
if u{{.name}}(max) < umax {
269+
max = {{.name}}(umax)
270+
}
258271
}
259272
}
260273
{{- end}}
@@ -271,13 +284,7 @@ func (s *{{.Name}}Statistics) getMinMaxSpaced(values []{{.name}}, validBits []by
271284
break
272285
}
273286
{{- if or (eq .name "int32") (eq .name "int64")}}
274-
localMin, localMax := fn(values[int(run.Pos):int(run.Pos+run.Length)])
275-
if min > localMin {
276-
min = localMin
277-
}
278-
if max < localMax {
279-
max = localMax
280-
}
287+
fn(values[int(run.Pos):int(run.Pos+run.Length)])
281288
{{- else}}
282289
for _, v := range values[int(run.Pos):int(run.Pos+run.Length)] {
283290
{{- if or (eq .name "float32") (eq .name "float64") (eq .Name "Float16") }}
@@ -385,7 +392,7 @@ func (s *{{.Name}}Statistics) UpdateFromArrow(values arrow.Array, updateCounts b
385392
)
386393

387394
for i := 0; i < arr.Len(); i++ {
388-
nextOffset := arr.ValueOffset64(i + 1)
395+
nextOffset := curOffset + int64(arr.ValueLen(i))
389396
v := data[curOffset:nextOffset]
390397
curOffset = nextOffset
391398

0 commit comments

Comments
 (0)