Skip to content

Commit 432c277

Browse files
andygrove and claude committed
fix: fall back scan when plan uses input_file_name expressions
CometScanExec does not populate InputFileBlockHolder (the thread-local that Spark's FileScanRDD sets), so input_file_name(), input_file_block_start(), and input_file_block_length() return empty or default values when Comet replaces the scan. Detect these expressions in the plan and fall back to Spark's FileSourceScanExec. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 188cd86 commit 432c277

1 file changed

Lines changed: 18 additions & 1 deletion

File tree

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import scala.jdk.CollectionConverters._
2828
import org.apache.hadoop.conf.Configuration
2929
import org.apache.spark.internal.Logging
3030
import org.apache.spark.sql.SparkSession
31-
import org.apache.spark.sql.catalyst.expressions.{Attribute, DynamicPruningExpression, Expression, GenericInternalRow, PlanExpression}
31+
import org.apache.spark.sql.catalyst.expressions.{Attribute, DynamicPruningExpression, Expression, GenericInternalRow, InputFileBlockLength, InputFileBlockStart, InputFileName, PlanExpression}
3232
import org.apache.spark.sql.catalyst.rules.Rule
3333
import org.apache.spark.sql.catalyst.util.{sideBySide, ArrayBasedMapData, GenericArrayData, MetadataColumnHelper}
3434
import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns.getExistenceDefaultValues
@@ -90,6 +90,17 @@ case class CometScanRule(session: SparkSession)
9090
})
9191
}
9292

93+
// Detect whether any node in the plan references the thread-local
// InputFileBlockHolder via input_file_name(), input_file_block_start(),
// or input_file_block_length(). Spark's FileScanRDD populates that
// thread-local; Comet scans do not, so these expressions would yield
// empty/default values if Comet replaced the scan.
lazy val usesInputFileBlock: Boolean = plan.exists { node =>
  node.expressions.exists(_.exists {
    case _: InputFileName | _: InputFileBlockStart | _: InputFileBlockLength => true
    case _ => false
  })
}
103+
93104
def isIcebergMetadataTable(scanExec: BatchScanExec): Boolean = {
94105
// List of Iceberg metadata tables:
95106
// https://iceberg.apache.org/docs/latest/spark-queries/#inspecting-tables
@@ -117,6 +128,12 @@ case class CometScanRule(session: SparkSession)
117128
case scan if hasMetadataCol(scan) =>
118129
withInfo(scan, "Metadata column is not supported")
119130

131+
case scan if usesInputFileBlock =>
132+
withInfo(
133+
scan,
134+
"Comet scans are not compatible with input_file_name, " +
135+
"input_file_block_start, or input_file_block_length")
136+
120137
// data source V1
121138
case scanExec: FileSourceScanExec =>
122139
transformV1Scan(scanExec)

0 commit comments

Comments
 (0)