diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index fa448d8876c..b0328160d98 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -66,6 +66,19 @@ def _unified_kv_pool_includes_mamba( return not (use_split_pool or use_spec) +def _is_lock_infra_error(exc: BaseException) -> bool: + """Whether exc indicates broken lock infrastructure (not mere contention).""" + # filelock.Timeout is contention, not broken infra, so it is not eligible. + if isinstance(exc, filelock.Timeout): + return False + if isinstance(exc, PermissionError): + return True + if isinstance(exc, OSError): + return exc.errno in (errno.EACCES, errno.EPERM, errno.ENOLCK, + errno.ESTALE) + return False + + @contextlib.contextmanager def config_file_lock(timeout: int = 10): """ @@ -77,47 +90,44 @@ def config_file_lock(timeout: int = 10): Args: timeout: Maximum time to wait for lock acquisition in seconds """ - # Use a single global lock file in HF cache directory - # This serializes all model loading operations to prevent race conditions + # Use a single global lock file in HF cache directory to serialize loads. lock_path = Path(HF_MODULES_CACHE) / "_remote_code.lock" - - # Create and acquire the lock lock = filelock.FileLock(str(lock_path), timeout=timeout) + # Guard only acquisition so caller-body exceptions propagate (single-yield). try: - with lock: - yield - except (PermissionError, OSError, filelock.Timeout) as e: - # Fallback to tempdir when primary lock path is unusable (e.g., - # NFS locking failures like ENOLCK/ESTALE, permission issues, - # or lock acquisition timeouts) - if isinstance(e, - OSError) and e.errno not in (errno.EACCES, errno.EPERM, - errno.ENOLCK, errno.ESTALE): + lock.acquire(timeout=timeout) + except filelock.Timeout: + # Contention, not broken infra: a tempdir lock can't serialize against + # the holder, so degrade to no lock instead of crashing the process. + logger.warning( + f"could not acquire config lock within {timeout}s, proceeding without lock" + ) + yield + except (PermissionError, OSError) as e: + # Broken lock infra (perms / NFS ENOLCK/ESTALE): retry on a tempdir lock. + if not _is_lock_infra_error(e): raise tmp_dir = Path(tempfile.gettempdir()) tmp_dir.mkdir(parents=True, exist_ok=True) - tmp_lock_path = tmp_dir / "_remote_code.lock" - tmp_lock = filelock.FileLock(str(tmp_lock_path), timeout=timeout) + tmp_lock = filelock.FileLock(str(tmp_dir / "_remote_code.lock"), + timeout=timeout) try: - with tmp_lock: - yield - except filelock.Timeout: + tmp_lock.acquire(timeout=timeout) + except (PermissionError, OSError, filelock.Timeout): logger.warning( - f"failed to acquire tempdir config lock within {timeout} seconds, proceeding without lock" - ) - # proceed without lock + "tempdir config lock unavailable, proceeding without lock") yield - except (PermissionError, OSError) as e: - if isinstance( - e, OSError) and e.errno not in (errno.EACCES, errno.EPERM, - errno.ENOLCK, errno.ESTALE): - raise - logger.warning( - f"tempdir config lock unavailable due to OS/permission issue: {e}, proceeding without lock" - ) - # proceed without lock + else: + try: + yield + finally: + tmp_lock.release() + else: + try: yield + finally: + lock.release() @dataclass(kw_only=True)