Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions python/pyarrow/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,32 @@ def __getattr__(name):
)


def _is_likely_uri(path):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mention on the issue:

Working on a fix that ports the C++ IsLikelyUri() heuristic to Python and uses it in _resolve_filesystem_and_path()

Could you check if it's possible to expose IsLikelyUri through Python so ultimately the logic is only implemented once and is always doing the same?

"""
Check if a string looks like a URI (has a valid scheme followed by ':').

This is a Python port of the C++ ``IsLikelyUri()`` heuristic in
``cpp/src/arrow/filesystem/path_util.cc``. It is intentionally
conservative: single-letter schemes are excluded (could be a Windows
drive letter) and the scheme must conform to RFC 3986.
"""
if not path or path[0] == '/':
return False
colon_pos = path.find(':')
if colon_pos < 0:
return False
# One-letter URI schemes don't officially exist; may be a Windows drive
if colon_pos < 2:
return False
# The largest IANA-registered scheme is 36 characters
if colon_pos > 36:
return False
scheme = path[:colon_pos]
if not scheme[0].isalpha():
return False
return all(c.isalnum() or c in ('+', '-', '.') for c in scheme[1:])


def _ensure_filesystem(filesystem, *, use_mmap=False):
if isinstance(filesystem, FileSystem):
return filesystem
Expand Down Expand Up @@ -174,13 +200,17 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False):
filesystem, path = FileSystem.from_uri(path)
except ValueError as e:
msg = str(e)
if "empty scheme" in msg or "Cannot parse URI" in msg:
# neither an URI nor a locally existing path, so assume that
# local path was given and propagate a nicer file not found
# error instead of a more confusing scheme parsing error
if "empty scheme" in msg:
# No scheme at all — treat as a local path and propagate
# a nicer "file not found" error later.
pass
elif "Cannot parse URI" in msg and not _is_likely_uri(path):
# Path doesn't look like a URI (no valid scheme prefix),
# so treat it as a local path rather than surfacing a
# confusing URI-parsing error.
pass
else:
raise e
raise
else:
path = filesystem.normalize_path(path)

Expand Down
61 changes: 61 additions & 0 deletions python/pyarrow/tests/test_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,67 @@ def test_filesystem_from_path_object(path):
assert path == p.resolve().absolute().as_posix()


def test_is_likely_uri():
"""Unit tests for the _is_likely_uri() heuristic."""
from pyarrow.fs import _is_likely_uri

# Valid URI schemes
assert _is_likely_uri("s3://bucket/key")
assert _is_likely_uri("gs://bucket/key")
assert _is_likely_uri("hdfs://host/path")
assert _is_likely_uri("file:///local/path")
assert _is_likely_uri("abfss://container@account/path")
assert _is_likely_uri("grpc+https://host:443")

# Not URIs — local paths, Windows drives, empty, etc.
assert not _is_likely_uri("")
assert not _is_likely_uri("/absolute/path")
assert not _is_likely_uri("relative/path")
assert not _is_likely_uri("C:\\Users\\foo") # single-letter → drive
assert not _is_likely_uri("C:/Users/foo")
assert not _is_likely_uri("3bucket://key") # scheme starts with digit
assert not _is_likely_uri("-scheme://key") # scheme starts with dash


def test_resolve_filesystem_and_path_uri_with_spaces():
"""
URIs with a recognised scheme but un-encoded spaces must raise
ValueError — NOT silently fall back to LocalFileSystem.
(GH-41365)
"""
from pyarrow.fs import _resolve_filesystem_and_path

# S3 URI with spaces should raise, not return LocalFileSystem
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path("s3://bucket/path with space/file.parquet")

# GCS URI with spaces should also raise
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path("gs://bucket/path with space/file.csv")

# abfss URI with spaces
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path(
"abfss://container@account/dir with space/file"
)


def test_resolve_filesystem_and_path_local_with_spaces():
"""
Local paths (no scheme) with spaces should still resolve to
LocalFileSystem — they must NOT be confused with malformed URIs.
"""
from pyarrow.fs import _resolve_filesystem_and_path

# Absolute local path with spaces → LocalFileSystem
fs, path = _resolve_filesystem_and_path("/tmp/path with spaces/data")
assert isinstance(fs, LocalFileSystem)

# Non-existent absolute path → still LocalFileSystem
fs, path = _resolve_filesystem_and_path("/nonexistent/path")
assert isinstance(fs, LocalFileSystem)


@pytest.mark.s3
def test_filesystem_from_uri_s3(s3_server):
from pyarrow.fs import S3FileSystem
Expand Down
Loading