Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
34963ed
Implement morsel-driven execution for ParquetExec
google-labs-jules[bot] Feb 22, 2026
a67f9ac
Proto
Dandandan Feb 22, 2026
d0da5da
Proto
Dandandan Feb 22, 2026
32eec3c
Fmt
Dandandan Feb 22, 2026
5dc895c
Merge remote-tracking branch 'upstream/main' into parquet-morsel-driv…
Dandandan Feb 22, 2026
cc73788
Proto
Dandandan Feb 22, 2026
d517b5d
Fix
Dandandan Feb 22, 2026
de1606d
Fix
Dandandan Feb 22, 2026
950f6db
Clippy
Dandandan Feb 22, 2026
7f57317
Refactor
Dandandan Feb 23, 2026
fd6d7fd
WIP
Dandandan Feb 23, 2026
37126bf
WIP
Dandandan Feb 23, 2026
2d3c33e
WIP
Dandandan Feb 23, 2026
98f0ea9
WIP
Dandandan Feb 23, 2026
a389b02
WIP
Dandandan Feb 23, 2026
4065448
Update
Dandandan Feb 23, 2026
415315d
Update
Dandandan Feb 23, 2026
13b4977
Config
Dandandan Feb 23, 2026
a30c3f8
Test
Dandandan Feb 23, 2026
8b32ca8
Refactor
Dandandan Feb 23, 2026
876c296
Update test
Dandandan Feb 23, 2026
d2df36b
Update test
Dandandan Feb 23, 2026
869b7d3
Autofix
Dandandan Feb 23, 2026
67ea9ab
Prune files
Dandandan Feb 23, 2026
e845675
Update test
Dandandan Feb 23, 2026
6885981
Update test
Dandandan Feb 23, 2026
3384b8f
Update morsel_driven
Dandandan Feb 23, 2026
211d4fc
Update morsel_driven
Dandandan Feb 23, 2026
2db61f1
fmt
Dandandan Feb 23, 2026
c859d6a
move pruning
Dandandan Feb 23, 2026
24b95fb
Revert "move pruning"
Dandandan Feb 24, 2026
80fa1ec
Reapply "move pruning"
Dandandan Feb 24, 2026
1dcd401
Autofix
Dandandan Feb 24, 2026
04b08a6
Autofix
Dandandan Feb 24, 2026
9799b96
Autofix
Dandandan Feb 24, 2026
de29e40
Autofix
Dandandan Feb 24, 2026
aa27a43
Clippy
Dandandan Feb 24, 2026
9a4aa84
Undo submodule
Dandandan Feb 24, 2026
692bff6
Also change open to be consistent
Dandandan Feb 24, 2026
9a9cf0b
Move page index back to morselize
Dandandan Feb 24, 2026
f79fe63
Move page index back to morselize
Dandandan Feb 25, 2026
976d8dc
Add back lost optimizations
Dandandan Feb 25, 2026
f937f98
Tweak
Dandandan Feb 25, 2026
1bc9375
Merge
Dandandan Feb 25, 2026
e0e8520
Autofix
Dandandan Feb 25, 2026
25b044b
Fmt
Dandandan Feb 25, 2026
b872660
Merge
Dandandan Feb 27, 2026
eb7dfa3
Use builder API
Dandandan Feb 27, 2026
e9d2391
Tryout different approach
Dandandan Feb 27, 2026
73267e5
Tryout different approach
Dandandan Feb 27, 2026
5901ac5
Tryout different approach
Dandandan Feb 27, 2026
10a120c
wip
Dandandan Feb 27, 2026
82312b6
Disable morsel-driven execution in partition_statistics tests
Dandandan Feb 27, 2026
db78f83
wip
Dandandan Feb 27, 2026
513374b
wip
Dandandan Feb 27, 2026
1862d44
Fix
Dandandan Feb 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ async fn search_accounts(
) -> Result<()> {
// create local execution context
let ctx = SessionContext::new();

// create logical plan composed of a single TableScan
let logical_plan = LogicalPlanBuilder::scan_with_filters(
"accounts",
Expand Down
4 changes: 4 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,10 @@ config_namespace! {
/// (reading) Use any available bloom filters when reading parquet files
pub bloom_filter_on_read: bool, default = true

/// (reading) If true, the parquet reader will share work between partitions
/// using morsel-driven execution. This can help mitigate data skew.
pub allow_morsel_driven: bool, default = true

/// (reading) The maximum predicate cache size, in bytes. When
/// `pushdown_filters` is enabled, sets the maximum memory used to cache
/// the results of predicate evaluation between filter evaluation and
Expand Down
3 changes: 3 additions & 0 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ impl ParquetOptions {
binary_as_string: _, // not used for writer props
coerce_int96: _, // not used for writer props
skip_arrow_metadata: _,
allow_morsel_driven: _,
max_predicate_cache_size: _,
} = self;

Expand Down Expand Up @@ -460,6 +461,7 @@ mod tests {
skip_arrow_metadata: defaults.skip_arrow_metadata,
coerce_int96: None,
max_predicate_cache_size: defaults.max_predicate_cache_size,
allow_morsel_driven: defaults.allow_morsel_driven,
}
}

Expand Down Expand Up @@ -573,6 +575,7 @@ mod tests {
schema_force_view_types: global_options_defaults.schema_force_view_types,
binary_as_string: global_options_defaults.binary_as_string,
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,
allow_morsel_driven: global_options_defaults.allow_morsel_driven,
coerce_int96: None,
},
column_specific_options,
Expand Down
5 changes: 4 additions & 1 deletion datafusion/core/src/datasource/physical_plan/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@ mod tests {
use datafusion_common::config::TableParquetOptions;
use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
use datafusion_common::{Result, ScalarValue, assert_contains};
use datafusion_common_runtime::SpawnedTask;
use datafusion_datasource::file_format::FileFormat;
use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
use datafusion_datasource::file_scan_config::{
FileScanConfig, FileScanConfigBuilder,
};
use datafusion_datasource::source::DataSourceExec;

use datafusion_datasource::file::FileSource;
Expand Down
52 changes: 51 additions & 1 deletion datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,58 @@ impl RunQueryResult {
format!("{}", pretty_format_batches(&self.result).unwrap())
}

/// Extract ORDER BY column names from the query.
/// The query format is always:
/// `SELECT * FROM test_table ORDER BY <col> <dir> <nulls>, ... LIMIT <n>`
fn sort_columns(&self) -> Vec<String> {
    // Everything after "ORDER BY" and before the trailing " LIMIT" is the
    // comma-separated sort-key list; panics (like the original unwraps) if
    // the query does not match the documented shape.
    let (_, after_order_by) = self.query.split_once("ORDER BY").unwrap();
    let (order_exprs, _) = after_order_by.rsplit_once(" LIMIT").unwrap();
    order_exprs
        .trim()
        .split(',')
        // Each entry is "<col> <dir> <nulls>"; keep only the column name.
        .map(|expr| expr.split_whitespace().next().unwrap().to_string())
        .collect()
}

/// Project `batches` to only include the named columns.
fn project_columns(batches: &[RecordBatch], cols: &[String]) -> Vec<RecordBatch> {
    batches
        .iter()
        .map(|batch| {
            let schema = batch.schema();
            // Resolve each requested column once, silently skipping names
            // missing from this batch's schema, and build the projected
            // fields and arrays in a single pass.
            let (fields, columns): (Vec<_>, Vec<_>) = cols
                .iter()
                .filter_map(|name| schema.index_of(name).ok())
                .map(|idx| (schema.field(idx).clone(), Arc::clone(batch.column(idx))))
                .unzip();
            let projected_schema = Arc::new(Schema::new(fields));
            RecordBatch::try_new(projected_schema, columns).unwrap()
        })
        .collect()
}

/// Returns true when the query result is acceptable: either it matches the
/// expected output exactly, or it differs only in which tied rows were
/// returned (the ORDER BY key values still match).
fn is_ok(&self) -> bool {
    // Fast path: full formatted results are identical.
    if self.expected_formatted() == self.result_formatted() {
        return true;
    }
    // If the full results differ, compare only the ORDER BY column values.
    //
    // For queries with ORDER BY <col> LIMIT k, multiple rows may tie on the
    // sort key (e.g. two rows with id=27 for ORDER BY id DESC LIMIT 1).
    // SQL permits returning any of the tied rows, so with vs without dynamic
    // filter pushdown may legitimately return different tied rows.
    //
    // The dynamic filter must not change the *sort-key values* of the top-k
    // result. We verify correctness by projecting both results down to only
    // the ORDER BY columns and comparing those.
    let sort_cols = self.sort_columns();
    let expected_keys = Self::project_columns(&self.expected, &sort_cols);
    let result_keys = Self::project_columns(&self.result, &sort_cols);
    format!("{}", pretty_format_batches(&expected_keys).unwrap())
        == format!("{}", pretty_format_batches(&result_keys).unwrap())
}
}

Expand Down
6 changes: 5 additions & 1 deletion datafusion/core/tests/parquet/row_group_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,11 @@ async fn prune_disabled() {
.await;
println!("{}", output.description());

// This should not prune any
// Row group stats pruning is disabled, so 0 row groups are pruned by statistics.
// Bloom filter runs next and matches all 4 row groups (bloom filters don't help
// for range/inequality predicates like `nanos < threshold`). Page index pruning
// runs afterwards and can produce row-level selections, but those don't affect
// the bloom filter matched count. The query result is still correct.
assert_eq!(output.predicate_evaluation_errors(), Some(0));
assert_eq!(output.row_groups_matched(), Some(4));
assert_eq!(output.row_groups_pruned(), Some(0));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ mod test {
target_partition: Option<usize>,
) -> Arc<dyn ExecutionPlan> {
let mut session_config = SessionConfig::new().with_collect_statistics(true);
session_config
.options_mut()
.execution
.parquet
.allow_morsel_driven = false;
if let Some(partition) = target_partition {
session_config = session_config.with_target_partitions(partition);
}
Expand Down
93 changes: 86 additions & 7 deletions datafusion/datasource-parquet/src/file_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ use std::{fmt, vec};

use arrow::array::RecordBatch;
use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit};
use datafusion_datasource::TableSchema;
use datafusion_datasource::file_compression_type::FileCompressionType;
use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
use datafusion_datasource::write::{
ObjectWriterBuilder, SharedBuffer, get_writer_schema,
};
use datafusion_datasource::{PartitionedFile, TableSchema};

use datafusion_datasource::file_format::{FileFormat, FileFormatFactory};
use datafusion_datasource::write::demux::DemuxedStreamReceiver;
Expand All @@ -49,12 +49,14 @@ use datafusion_common::{HashMap, Statistics};
use datafusion_common_runtime::{JoinSet, SpawnedTask};
use datafusion_datasource::display::FileGroupDisplay;
use datafusion_datasource::file::FileSource;
use datafusion_datasource::file_groups::FileGroup;
use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
use datafusion_datasource::sink::{DataSink, DataSinkExec};
use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation};
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use datafusion_expr::dml::InsertOp;
use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
use datafusion_physical_plan::merge_partitions::MergePartitionsExec;
use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
use datafusion_session::Session;

Expand Down Expand Up @@ -522,8 +524,10 @@ impl FileFormat for ParquetFormat {
let store = state
.runtime_env()
.object_store(conf.object_store_url.clone())?;
let cached_parquet_read_factory =
Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
let cached_parquet_read_factory = Arc::new(CachedParquetFileReaderFactory::new(
Arc::clone(&store),
metadata_cache,
));
source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);

if let Some(metadata_size_hint) = metadata_size_hint {
Expand All @@ -532,10 +536,85 @@ impl FileFormat for ParquetFormat {

source = self.set_source_encryption_factory(source, state)?;

let conf = FileScanConfigBuilder::from(conf)
.with_source(Arc::new(source))
.build();
Ok(DataSourceExec::from_data_source(conf))
let use_merge_partitions = self.options.global.allow_morsel_driven;

if use_merge_partitions {
// Morsel-driven execution: create one partition per file for the
// plan tree, then use a ParquetMorselizer that lazily expands
// files into row-group-level work items for fine-grained
// work-stealing via MergePartitionsExec.
let target_partitions = state.config_options().execution.target_partitions;

// Collect all files before conf is consumed.
let all_files: Vec<PartitionedFile> = conf
.file_groups
.iter()
.flat_map(|group| group.iter().cloned())
.collect();

// Get the reader factory (for metadata reads in the morselizer).
// It was set above as CachedParquetFileReaderFactory.
let reader_factory: Arc<dyn crate::ParquetFileReaderFactory> = source
.parquet_file_reader_factory()
.cloned()
.expect("reader factory was set above");

// Set batch size on source before creating the opener (normally
// done by FileScanConfig::open_file_reader, but we create the
// opener directly here for the morselizer).
source.batch_size = Some(
conf.batch_size
.unwrap_or(state.config_options().execution.batch_size),
);

// Create the opener for row-group morsel execution.
let opener = source.create_file_opener(Arc::clone(&store), &conf, 0)?;

let output_schema = conf.projected_schema()?;
let metrics = source.metrics().clone();

// Build the DataSourceExec (one partition per file, for plan tree).
let one_per_file: Vec<FileGroup> = conf
.file_groups
.iter()
.flat_map(|group| {
group.iter().map(|file| FileGroup::new(vec![file.clone()]))
})
.collect();

let conf = FileScanConfigBuilder::from(conf)
.with_source(Arc::new(source))
.with_file_groups(one_per_file)
.build();
let data_source = DataSourceExec::from_data_source(conf);
let input_partitions = data_source
.properties()
.output_partitioning()
.partition_count();

if input_partitions > target_partitions {
let morselizer = Arc::new(crate::morselizer::ParquetMorselizer::new(
all_files,
opener,
reader_factory,
metrics,
metadata_size_hint,
output_schema,
));
Ok(Arc::new(MergePartitionsExec::new_with_morselizer(
data_source,
target_partitions,
morselizer,
)))
} else {
Ok(data_source)
}
} else {
let conf = FileScanConfigBuilder::from(conf)
.with_source(Arc::new(source))
.build();
Ok(DataSourceExec::from_data_source(conf))
}
}

async fn create_writer_physical_plan(
Expand Down
1 change: 1 addition & 0 deletions datafusion/datasource-parquet/src/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pub mod access_plan;
pub mod file_format;
pub mod metadata;
mod metrics;
pub(crate) mod morselizer;
mod opener;
mod page_filter;
mod reader;
Expand Down
Loading
Loading