Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ parquet = { version = "59.0.0", default-features = false, features = [
] }
pbjson = { version = "0.9.0" }
pbjson-types = "0.9"
percent-encoding = "2.3"
pin-project = "1"
# Should match arrow-flight's version of prost.
prost = "0.14.1"
Expand Down
1 change: 1 addition & 0 deletions datafusion/catalog-listing/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ futures = { workspace = true }
itertools = { workspace = true }
log = { workspace = true }
object_store = { workspace = true }
percent-encoding = { workspace = true }

[dev-dependencies]
chrono = { workspace = true }
Expand Down
78 changes: 69 additions & 9 deletions datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

//! Helper functions for the table implementation

use std::borrow::Cow;
use std::sync::Arc;

use datafusion_catalog::Session;
Expand All @@ -43,6 +44,7 @@ use datafusion_expr::{Expr, Volatility};
use datafusion_physical_expr::create_physical_expr;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use percent_encoding::percent_decode_str;

/// Check whether the given expression can be resolved using only the columns `col_names`.
/// This means that if this function returns true:
Expand Down Expand Up @@ -343,7 +345,7 @@ fn try_into_partitioned_file(
.into_iter()
.zip(partition_cols)
.map(|(parsed, (_, datatype))| {
ScalarValue::try_from_string(parsed.to_string(), datatype)
ScalarValue::try_from_string(parsed.into_owned(), datatype)
})
.collect::<Result<Vec<_>>>()?;

Expand Down Expand Up @@ -435,12 +437,15 @@ fn object_meta_to_partitioned_file(
}

/// Extract the partition values for the given `file_path` (in the given `table_path`)
/// associated to the partitions defined by `table_partition_cols`
/// associated to the partitions defined by `table_partition_cols`.
///
/// Partition values are percent-decoded to match Hive-style object-store paths
/// that encode special characters in path segments.
pub fn parse_partitions_for_path<'a, I>(
table_path: &ListingTableUrl,
file_path: &'a Path,
table_partition_cols: I,
) -> Option<Vec<&'a str>>
) -> Option<Vec<Cow<'a, str>>>
where
I: IntoIterator<Item = &'a str>,
{
Expand All @@ -449,7 +454,12 @@ where
let mut part_values = vec![];
for (part, expected_partition) in subpath.zip(table_partition_cols) {
match part.split_once('=') {
Some((name, val)) if name == expected_partition => part_values.push(val),
Some((name, val)) if name == expected_partition => {
let decoded = percent_decode_str(val)
.decode_utf8()
.unwrap_or(Cow::Borrowed(val));
part_values.push(decoded);
}
_ => {
debug!(
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
Expand Down Expand Up @@ -525,7 +535,7 @@ mod tests {
#[test]
fn test_parse_partitions_for_path() {
assert_eq!(
Some(vec![]),
Some(vec![] as Vec<Cow<'_, str>>),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/file.csv"),
Expand All @@ -549,15 +559,39 @@ mod tests {
)
);
assert_eq!(
Some(vec!["v1"]),
Some(vec![Cow::Borrowed("v1")]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
vec!["mypartition"]
)
);
assert_eq!(
Some(vec!["v1"]),
Some(vec![Cow::<str>::Owned("v/1".to_string())]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/mypartition=v%2F1/file.csv").unwrap(),
vec!["mypartition"]
)
);
assert_eq!(
Some(vec![Cow::<str>::Owned("John Doe".to_string())]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/name=John%20Doe/file.csv").unwrap(),
vec!["name"]
)
);
assert_eq!(
Some(vec![Cow::Borrowed("%FF")]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/mypartition=%FF/file.csv").unwrap(),
vec!["mypartition"]
)
);
assert_eq!(
Some(vec![Cow::Borrowed("v1")]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable/").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
Expand All @@ -574,15 +608,15 @@ mod tests {
)
);
assert_eq!(
Some(vec!["v1", "v2"]),
Some(vec![Cow::Borrowed("v1"), Cow::Borrowed("v2")]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
vec!["mypartition", "otherpartition"]
)
);
assert_eq!(
Some(vec!["v1"]),
Some(vec![Cow::Borrowed("v1")]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
Expand Down Expand Up @@ -614,6 +648,32 @@ mod tests {
);
}

/// Checks that a percent-encoded partition segment in an object's location
/// (`category=Electronics%2FComputers`) surfaces as the decoded partition
/// value (`Electronics/Computers`) on the resulting partitioned file.
#[test]
fn test_try_into_partitioned_file_decodes_partition_value() {
    // A single Utf8 partition column named `category`.
    let partition_cols = vec![("category".to_string(), DataType::Utf8)];
    let table_path = ListingTableUrl::parse("file:///bucket/mytable").unwrap();

    // The location is built with `Path::parse`, and the partition segment
    // carries an encoded `/` (`%2F`).
    let location =
        Path::parse("bucket/mytable/category=Electronics%2FComputers/data.parquet")
            .unwrap();
    let meta = ObjectMeta {
        location,
        last_modified: chrono::Utc::now(),
        size: 100,
        e_tag: None,
        version: None,
    };

    // The file lives under the table path and matches the partition column,
    // so it must not be skipped.
    let outcome = try_into_partitioned_file(meta, &partition_cols, &table_path).unwrap();
    assert!(outcome.is_some());
    let partitioned = outcome.unwrap();

    // Exactly one partition value, holding the decoded string.
    let expected = ScalarValue::Utf8(Some("Electronics/Computers".to_string()));
    assert_eq!(partitioned.partition_values.len(), 1);
    assert_eq!(partitioned.partition_values[0], expected);
}

#[test]
fn test_try_into_partitioned_file_root_file_skipped() {
// File in root directory (not inside any partition path) should be
Expand Down