Skip to content

Commit f90780d

Browse files
committed
Apply projection to Statistics in FilterExec
1 parent 4c484a6 commit f90780d

File tree

3 files changed

+74
-1
lines changed

3 files changed

+74
-1
lines changed

datafusion/common/src/stats.rs

+20
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,26 @@ impl Statistics {
258258
self
259259
}
260260

261+
/// Project the statistics to the given column indices.
262+
///
263+
/// For example, if we had statistics for columns `{"a", "b", "c"}`,
264+
/// projecting to `vec![2, 1]` would return statistics for columns `{"c",
265+
/// "b"}`.
266+
pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
267+
let Some(projection) = projection else {
268+
return self;
269+
};
270+
271+
// todo: it would be nice to avoid cloning column statistics if
272+
// possible (e.g. if the projection did not contain duplicates)
273+
self.column_statistics = projection
274+
.iter()
275+
.map(|&i| self.column_statistics[i].clone())
276+
.collect();
277+
278+
self
279+
}
280+
261281
/// Calculates the statistics after `fetch` and `skip` operations apply.
262282
/// Here, `self` denotes per-partition statistics. Use the `n_partitions`
263283
/// parameter to compute global statistics in a multi-partition setting.

datafusion/physical-plan/src/filter.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,12 @@ impl ExecutionPlan for FilterExec {
370370
/// The output statistics of a filtering operation can be estimated if the
371371
/// predicate's selectivity value can be determined for the incoming data.
372372
fn statistics(&self) -> Result<Statistics> {
373-
Self::statistics_helper(&self.input, self.predicate(), self.default_selectivity)
373+
let stats = Self::statistics_helper(
374+
&self.input,
375+
self.predicate(),
376+
self.default_selectivity,
377+
)?;
378+
Ok(stats.project(self.projection.as_ref()))
374379
}
375380
}
376381

datafusion/sqllogictest/test_files/parquet.slt

+48
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,51 @@ DROP TABLE list_columns;
348348
# Clean up
349349
statement ok
350350
DROP TABLE listing_table;
351+
352+
## Tests for https://github.com/apache/datafusion/issues/13186
353+
statement ok
354+
create table cpu (time timestamp, usage_idle float, usage_user float, cpu int);
355+
356+
statement ok
357+
insert into cpu values ('1970-01-01 00:00:00', 1.0, 2.0, 3);
358+
359+
# must put it into a parquet file to get statistics
360+
statement ok
361+
copy (select * from cpu) to 'test_files/scratch/parquet/cpu.parquet';
362+
363+
# Run queries against parquet files
364+
statement ok
365+
create external table cpu_parquet
366+
stored as parquet
367+
location 'test_files/scratch/parquet/cpu.parquet';
368+
369+
# Double filtering
370+
#
371+
# Expect 1 row for both queries
372+
query PI
373+
select time, rn
374+
from (
375+
select time, row_number() OVER (ORDER BY usage_idle, time) as rn
376+
from cpu
377+
where cpu = 3
378+
) where rn > 0;
379+
----
380+
1970-01-01T00:00:00 1
381+
382+
query PI
383+
select time, rn
384+
from (
385+
select time, row_number() OVER (ORDER BY usage_idle, time) as rn
386+
from cpu_parquet
387+
where cpu = 3
388+
) where rn > 0;
389+
----
390+
1970-01-01T00:00:00 1
391+
392+
393+
# Clean up
394+
statement ok
395+
drop table cpu;
396+
397+
statement ok
398+
drop table cpu_parquet;

0 commit comments

Comments
 (0)