Skip to content

Commit 5c56800

Browse files
author
Roja Reddy Sareddy
committed
Fix: Bug fixes for s3 path validation, mlflow app creation
1 parent b3707fe commit 5c56800

File tree

7 files changed

+555
-697
lines changed

7 files changed

+555
-697
lines changed

sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,19 @@ def _create_mlflow_app(sagemaker_session) -> Optional[MlflowApp]:
198198
if new_app.status in ["Created", "Updated"]:
199199
return new_app
200200
elif new_app.status in ["Failed", "Stopped"]:
201-
raise RuntimeError(f"MLflow app creation failed with status: {new_app.status}")
201+
# Get detailed error from MLflow app
202+
error_msg = f"MLflow app creation failed with status: {new_app.status}"
203+
if hasattr(new_app, 'failure_reason') and new_app.failure_reason:
204+
error_msg += f". Reason: {new_app.failure_reason}"
205+
raise RuntimeError(error_msg)
202206
time.sleep(poll_interval)
203207

204-
raise RuntimeError(f"MLflow app creation timed out after {max_wait_time} seconds")
208+
# Timeout case - get current status and any error details
209+
new_app.refresh()
210+
error_msg = f"MLflow app creation failed. Current status: {new_app.status}"
211+
if hasattr(new_app, 'failure_reason') and new_app.failure_reason:
212+
error_msg += f". Reason: {new_app.failure_reason}"
213+
raise RuntimeError(error_msg)
205214

206215
except Exception as e:
207216
logger.error("Failed to create MLflow app: %s", e)
@@ -693,7 +702,7 @@ def _validate_eula_for_gated_model(model, accept_eula, is_gated_model):
693702

694703

695704
def _validate_s3_path_exists(s3_path: str, sagemaker_session):
696-
"""Validate if S3 path exists and is accessible."""
705+
"""Validate S3 path and create bucket/prefix if they don't exist."""
697706
if not s3_path.startswith("s3://"):
698707
raise ValueError(f"Invalid S3 path format: {s3_path}")
699708

@@ -705,19 +714,34 @@ def _validate_s3_path_exists(s3_path: str, sagemaker_session):
705714
s3_client = sagemaker_session.boto_session.client('s3')
706715

707716
try:
708-
# Check if bucket exists and is accessible
709-
s3_client.head_bucket(Bucket=bucket_name)
717+
# Check if bucket exists, create if it doesn't
718+
try:
719+
s3_client.head_bucket(Bucket=bucket_name)
720+
except Exception as e:
721+
if "NoSuchBucket" in str(e) or "Not Found" in str(e):
722+
# Create bucket
723+
region = sagemaker_session.boto_region_name
724+
if region == 'us-east-1':
725+
s3_client.create_bucket(Bucket=bucket_name)
726+
else:
727+
s3_client.create_bucket(
728+
Bucket=bucket_name,
729+
CreateBucketConfiguration={'LocationConstraint': region}
730+
)
731+
else:
732+
raise
710733

711-
# If prefix is provided, check if it exists
734+
# If prefix is provided, check if it exists, create if it doesn't
712735
if prefix:
713736
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=1)
714737
if 'Contents' not in response:
715-
raise ValueError(f"S3 prefix '{prefix}' does not exist in bucket '{bucket_name}'")
738+
# Create the prefix by putting an empty object
739+
if not prefix.endswith('/'):
740+
prefix += '/'
741+
s3_client.put_object(Bucket=bucket_name, Key=prefix, Body=b'')
716742

717743
except Exception as e:
718-
if "NoSuchBucket" in str(e):
719-
raise ValueError(f"S3 bucket '{bucket_name}' does not exist or is not accessible")
720-
raise ValueError(f"Failed to validate S3 path '{s3_path}': {str(e)}")
744+
raise ValueError(f"Failed to validate/create S3 path '{s3_path}': {str(e)}")
721745

722746

723747
def _validate_hyperparameter_values(hyperparameters: dict):

sagemaker-train/tests/unit/train/common_utils/test_finetune_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -537,16 +537,16 @@ def test__validate_s3_path_exists_with_prefix_exists(self, mock_boto_client):
537537

538538
@patch('boto3.client')
539539
def test__validate_s3_path_exists_with_prefix_not_exists(self, mock_boto_client):
540-
"""Test S3 path validation raises error when prefix doesn't exist"""
540+
"""Test S3 path validation creates prefix when it doesn't exist"""
541541
mock_session = Mock()
542542
mock_s3_client = Mock()
543543
mock_session.boto_session.client.return_value = mock_s3_client
544544
mock_s3_client.list_objects_v2.return_value = {} # No contents
545545

546-
with pytest.raises(ValueError, match="Failed to validate S3 path 's3://test-bucket/prefix': S3 prefix 'prefix' does not exist in bucket 'test-bucket'"):
547-
_validate_s3_path_exists("s3://test-bucket/prefix", mock_session)
546+
_validate_s3_path_exists("s3://test-bucket/prefix", mock_session)
548547

549548
mock_s3_client.head_bucket.assert_called_once_with(Bucket="test-bucket")
550549
mock_s3_client.list_objects_v2.assert_called_once_with(Bucket="test-bucket", Prefix="prefix", MaxKeys=1)
550+
mock_s3_client.put_object.assert_called_once_with(Bucket="test-bucket", Key="prefix/", Body=b'')
551551

552552

v3-examples/model-customization-examples/dpo-trainer-e2e.ipynb

Lines changed: 0 additions & 240 deletions
This file was deleted.

0 commit comments

Comments
 (0)