Skip to content
70 changes: 40 additions & 30 deletions tensorrt_llm/_torch/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ def _unified_kv_pool_includes_mamba(
return not (use_split_pool or use_spec)


def _is_lock_infra_error(exc: BaseException) -> bool:
"""Whether exc indicates broken lock infrastructure (not mere contention)."""
# filelock.Timeout is contention, not broken infra, so it is not eligible.
if isinstance(exc, filelock.Timeout):
return False
if isinstance(exc, PermissionError):
return True
if isinstance(exc, OSError):
return exc.errno in (errno.EACCES, errno.EPERM, errno.ENOLCK,
errno.ESTALE)
return False


@contextlib.contextmanager
def config_file_lock(timeout: int = 10):
"""
Expand All @@ -77,47 +90,44 @@ def config_file_lock(timeout: int = 10):
Args:
timeout: Maximum time to wait for lock acquisition in seconds
"""
# Use a single global lock file in HF cache directory
# This serializes all model loading operations to prevent race conditions
# Use a single global lock file in HF cache directory to serialize loads.
lock_path = Path(HF_MODULES_CACHE) / "_remote_code.lock"

# Create and acquire the lock
lock = filelock.FileLock(str(lock_path), timeout=timeout)

# Guard only acquisition so caller-body exceptions propagate (single-yield).
try:
with lock:
yield
except (PermissionError, OSError, filelock.Timeout) as e:
# Fallback to tempdir when primary lock path is unusable (e.g.,
# NFS locking failures like ENOLCK/ESTALE, permission issues,
# or lock acquisition timeouts)
if isinstance(e,
OSError) and e.errno not in (errno.EACCES, errno.EPERM,
errno.ENOLCK, errno.ESTALE):
lock.acquire(timeout=timeout)
except filelock.Timeout:
# Contention, not broken infra: a tempdir lock can't serialize against
# the holder, so degrade to no lock instead of crashing the process.
logger.warning(
f"could not acquire config lock within {timeout}s, proceeding without lock"
)
yield
except (PermissionError, OSError) as e:
# Broken lock infra (perms / NFS ENOLCK/ESTALE): retry on a tempdir lock.
if not _is_lock_infra_error(e):
raise
tmp_dir = Path(tempfile.gettempdir())
tmp_dir.mkdir(parents=True, exist_ok=True)
tmp_lock_path = tmp_dir / "_remote_code.lock"
tmp_lock = filelock.FileLock(str(tmp_lock_path), timeout=timeout)
tmp_lock = filelock.FileLock(str(tmp_dir / "_remote_code.lock"),
timeout=timeout)
try:
with tmp_lock:
yield
except filelock.Timeout:
tmp_lock.acquire(timeout=timeout)
except (PermissionError, OSError, filelock.Timeout):
logger.warning(
f"failed to acquire tempdir config lock within {timeout} seconds, proceeding without lock"
)
# proceed without lock
"tempdir config lock unavailable, proceeding without lock")
yield
except (PermissionError, OSError) as e:
if isinstance(
e, OSError) and e.errno not in (errno.EACCES, errno.EPERM,
errno.ENOLCK, errno.ESTALE):
raise
logger.warning(
f"tempdir config lock unavailable due to OS/permission issue: {e}, proceeding without lock"
)
# proceed without lock
else:
try:
yield
finally:
tmp_lock.release()
else:
try:
yield
finally:
lock.release()


@dataclass(kw_only=True)
Expand Down
Loading