Merged
Changes from 17 commits (of 26 total)

Commits
bdc0d26
implemented Source unit for S3.
mustansir14 Jun 26, 2025
21d9334
use bucket as source unit
mustansir14 Jul 7, 2025
a25afff
remove code duplication, reuse from Chunks
mustansir14 Nov 19, 2025
ea21d02
remove unnecessary change
mustansir14 Nov 19, 2025
9915187
remove unused functions
mustansir14 Nov 19, 2025
c32e12c
revisit tests
mustansir14 Nov 19, 2025
5161090
revert unnecessary change
mustansir14 Nov 19, 2025
ef324d1
change SourceUnitKind to s3_bucket
mustansir14 Nov 20, 2025
84e0cda
handle nil objectCount inside scanBucket
mustansir14 Nov 20, 2025
b5a66d5
handle nil objectCount outside loop
mustansir14 Nov 20, 2025
966007f
add bucket to resume log
mustansir14 Nov 20, 2025
50e5a90
Merge branch 'main' into INS-104-Support-units-in-S3-source
amanfcp Nov 20, 2025
0faa70e
add bucket and role to error log, remove enumerating log
mustansir14 Nov 21, 2025
474172c
Merge branch 'INS-104-Support-units-in-S3-source' of mustansir:mustan…
mustansir14 Nov 21, 2025
10f91ff
implement sub unit resumption
mustansir14 Nov 24, 2025
6bfbc14
add comment to checkpointer for unit scans
mustansir14 Nov 24, 2025
1863659
Merge branch 'main' into INS-104-Support-units-in-S3-source
mustansir14 Nov 25, 2025
5ca4151
implement SourceUnitUnmarshaller on source with the new S3SourceUnit,…
mustansir14 Nov 26, 2025
45a133b
Merge branch 'main' into INS-104-Support-units-in-S3-source
mustansir14 Dec 1, 2025
549e6be
add role to SourceUnitID
mustansir14 Dec 2, 2025
b5cb928
Merge branch 'INS-104-Support-units-in-S3-source' of mustansir:mustan…
mustansir14 Dec 2, 2025
1cee9af
Revert "add role to SourceUnitID"
mustansir14 Dec 2, 2025
6f06776
add role to source unit ID, keep track of resumption using source uni…
mustansir14 Dec 3, 2025
85e681b
Merge branch 'main' into INS-104-Support-units-in-S3-source
mustansir14 Dec 3, 2025
53a91c8
rename bucket -> unitID in UnmarshalSourceUnit
mustansir14 Dec 4, 2025
3e8e6b9
Merge branch 'main' into INS-104-Support-units-in-S3-source
mustansir14 Dec 4, 2025
20 changes: 20 additions & 0 deletions pkg/sources/s3/checkpointer.go
@@ -33,6 +33,10 @@ import (
// resuming from the correct bucket. The scan will continue from the last checkpointed object
// in that bucket.
//
// Unit scans are also supported. The encoded resume info in this case tracks the last processed object
// for each bucket (unit) separately by using the SetEncodedResumeInfoFor method on Progress. To use the
// checkpointer for unit scans, call SetIsUnitScan(true) before starting the scan.
//
// For example, if scanning is interrupted after processing 1500 objects across 2 pages:
// Page 1 (objects 0-999): Fully processed, checkpoint saved at object 999
// Page 2 (objects 1000-1999): Partially processed through 1600, but only consecutive through 1499
@@ -56,6 +60,8 @@ type Checkpointer struct {
// progress holds the scan's overall progress state and enables persistence.
// The EncodedResumeInfo field stores the JSON-encoded ResumeInfo checkpoint.
progress *sources.Progress // Reference to source's Progress

isUnitScan bool // Indicates if scanning is done in unit scan mode
}

const defaultMaxObjectsPerPage = 1000
@@ -199,6 +205,12 @@ func (p *Checkpointer) advanceLowestIncompleteIdx() {
// updateCheckpoint persists the current resumption state.
// Must be called with lock held.
func (p *Checkpointer) updateCheckpoint(bucket string, lastKey string) error {
if p.isUnitScan {
// track sub-unit resumption state
p.progress.SetEncodedResumeInfoFor(bucket, lastKey)
return nil
}

encoded, err := json.Marshal(&ResumeInfo{CurrentBucket: bucket, StartAfter: lastKey})
if err != nil {
return fmt.Errorf("failed to encode resume info: %w", err)
@@ -212,3 +224,11 @@ func (p *Checkpointer) updateCheckpoint(bucket string, lastKey string) error {
)
return nil
}

// SetIsUnitScan sets whether the checkpointer is operating in unit scan mode.
func (p *Checkpointer) SetIsUnitScan(isUnitScan bool) {
p.mu.Lock()
defer p.mu.Unlock()

p.isUnitScan = isUnitScan
}
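
An aside on how the new unit-scan mode is driven: the sketch below strings together the pieces added in this file (SetIsUnitScan and the per-bucket branch in updateCheckpoint) with the Progress helpers used elsewhere in the PR. It is illustrative only; the bucket and key values are placeholders, and the imports are assumed to match this package's test file.

```go
// Sketch only: per-unit (per-bucket) checkpointing, mirroring the test below.
ctx := context.Background()
progress := new(sources.Progress)
cp := NewCheckpointer(ctx, progress)
cp.SetIsUnitScan(true) // store resume info per bucket rather than one global checkpoint

key := "key-0"
contents := []s3types.Object{{Key: &key}}

// After an object on a page has been fully processed:
if err := cp.UpdateObjectCompletion(ctx, 0, "example-bucket", contents); err != nil {
	// log and continue; a failed checkpoint update should not abort the scan
}

// In unit-scan mode, Progress.EncodedResumeInfo holds a JSON object keyed by
// bucket, e.g. {"example-bucket": "key-0"}, so the last processed key can be
// read back when the same bucket (unit) is scanned again:
startAfter := progress.GetEncodedResumeInfoFor("example-bucket")
_ = startAfter
```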
30 changes: 30 additions & 0 deletions pkg/sources/s3/checkpointer_test.go
@@ -258,6 +258,36 @@ func TestCheckpointerUpdate(t *testing.T) {
}
}

func TestCheckpointerUpdateUnitScan(t *testing.T) {
ctx := context.Background()
progress := new(sources.Progress)
tracker := NewCheckpointer(ctx, progress)
tracker.SetIsUnitScan(true)

page := &s3.ListObjectsV2Output{
Contents: make([]s3types.Object, 3),
}
for i := range 3 {
key := fmt.Sprintf("key-%d", i)
page.Contents[i] = s3types.Object{Key: &key}
}

// Complete first object.
err := tracker.UpdateObjectCompletion(ctx, 0, "test-bucket", page.Contents)
assert.NoError(t, err, "Unexpected error updating progress")

var info map[string]string
err = json.Unmarshal([]byte(progress.EncodedResumeInfo), &info)
assert.NoError(t, err, "Failed to decode resume info")

var gotBucket, gotStartAfter string
for k, v := range info {
gotBucket = k
gotStartAfter = v
}
assert.Equal(t, "test-bucket", gotBucket, "Incorrect bucket")
assert.Equal(t, "key-0", gotStartAfter, "Incorrect resume point")
}

func TestComplete(t *testing.T) {
tests := []struct {
name string
208 changes: 149 additions & 59 deletions pkg/sources/s3/s3.go
@@ -62,6 +62,7 @@ type Source struct {
var _ sources.Source = (*Source)(nil)
var _ sources.SourceUnitUnmarshaller = (*Source)(nil)
var _ sources.Validator = (*Source)(nil)
var _ sources.SourceUnitEnumChunker = (*Source)(nil)

// Type returns the type of source
func (s *Source) Type() sourcespb.SourceType { return SourceType }
@@ -294,7 +295,7 @@ func (s *Source) scanBuckets(
if role != "" {
ctx = context.WithValue(ctx, "role", role)
}
var objectCount uint64
var totalObjectCount uint64

pos := determineResumePosition(ctx, s.checkpointer, bucketsToScan)
switch {
@@ -316,16 +317,7 @@

bucketsToScanCount := len(bucketsToScan)
for bucketIdx := pos.index; bucketIdx < bucketsToScanCount; bucketIdx++ {
s.metricsCollector.RecordBucketForRole(role)
bucket := bucketsToScan[bucketIdx]
ctx := context.WithValue(ctx, "bucket", bucket)

if common.IsDone(ctx) {
ctx.Logger().Error(ctx.Err(), "context done, while scanning bucket")
return
}

ctx.Logger().V(3).Info("Scanning bucket")

s.SetProgressComplete(
bucketIdx,
@@ -334,63 +326,94 @@
s.Progress.EncodedResumeInfo,
)

regionalClient, err := s.getRegionalClientForBucket(ctx, client, role, bucket)
if err != nil {
ctx.Logger().Error(err, "could not get regional client for bucket")
continue
}

errorCount := sync.Map{}

input := &s3.ListObjectsV2Input{Bucket: &bucket}
var startAfter *string
if bucket == pos.bucket && pos.startAfter != "" {
input.StartAfter = &pos.startAfter
startAfter = &pos.startAfter
ctx.Logger().V(3).Info(
"Resuming bucket scan",
"start_after", pos.startAfter,
"bucket", bucket,
)
}

pageNumber := 1
paginator := s3.NewListObjectsV2Paginator(regionalClient, input)
for paginator.HasMorePages() {
output, err := paginator.NextPage(ctx)
if err != nil {
if role == "" {
ctx.Logger().Error(err, "could not list objects in bucket")
} else {
// Our documentation blesses specifying a role to assume without specifying buckets to scan, which will
// often cause this to happen a lot (because in that case the scanner tries to scan every bucket in the
// account, but the role probably doesn't have access to all of them). This makes it expected behavior
// and therefore not an error.
ctx.Logger().V(3).Info("could not list objects in bucket", "err", err)
}
break
}
pageMetadata := pageMetadata{
bucket: bucket,
pageNumber: pageNumber,
client: regionalClient,
page: output,
}
processingState := processingState{
errorCount: &errorCount,
objectCount: &objectCount,
}
s.pageChunker(ctx, pageMetadata, processingState, chunksChan)

pageNumber++
}
objectCount := s.scanBucket(ctx, client, role, bucket, sources.ChanReporter{Ch: chunksChan}, startAfter)
totalObjectCount += objectCount
}

s.SetProgressComplete(
len(bucketsToScan),
len(bucketsToScan),
fmt.Sprintf("Completed scanning source %s. %d objects scanned.", s.name, objectCount),
fmt.Sprintf("Completed scanning source %s. %d objects scanned.", s.name, totalObjectCount),
"",
)
}

func (s *Source) scanBucket(
ctx context.Context,
client *s3.Client,
role string,
bucket string,
reporter sources.ChunkReporter,
startAfter *string,
) uint64 {
s.metricsCollector.RecordBucketForRole(role)

ctx = context.WithValue(ctx, "bucket", bucket)

if common.IsDone(ctx) {
ctx.Logger().Error(ctx.Err(), "context done, while scanning bucket")
return 0
}

ctx.Logger().V(3).Info("Scanning bucket")

regionalClient, err := s.getRegionalClientForBucket(ctx, client, role, bucket)
if err != nil {
ctx.Logger().Error(err, "could not get regional client for bucket")
return 0
}

errorCount := sync.Map{}

input := &s3.ListObjectsV2Input{Bucket: &bucket}
if startAfter != nil {
input.StartAfter = startAfter
}

pageNumber := 1
paginator := s3.NewListObjectsV2Paginator(regionalClient, input)
var objectCount uint64
for paginator.HasMorePages() {
output, err := paginator.NextPage(ctx)
if err != nil {
if role == "" {
ctx.Logger().Error(err, "could not list objects in bucket")
} else {
// Our documentation blesses specifying a role to assume without specifying buckets to scan, which will
// often cause this to happen a lot (because in that case the scanner tries to scan every bucket in the
// account, but the role probably doesn't have access to all of them). This makes it expected behavior
// and therefore not an error.
ctx.Logger().V(3).Info("could not list objects in bucket", "err", err)
}
break
}
pageMetadata := pageMetadata{
bucket: bucket,
pageNumber: pageNumber,
client: regionalClient,
page: output,
}
processingState := processingState{
errorCount: &errorCount,
objectCount: &objectCount,
}
s.pageChunker(ctx, pageMetadata, processingState, reporter)

pageNumber++
}
return objectCount
}

// Chunks emits chunks of bytes over a channel.
func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error {
visitor := func(c context.Context, defaultRegionClient *s3.Client, roleArn string, buckets []string) error {
@@ -429,14 +452,12 @@ func (s *Source) pageChunker(
ctx context.Context,
metadata pageMetadata,
state processingState,
chunksChan chan *sources.Chunk,
reporter sources.ChunkReporter,
) {
s.checkpointer.Reset() // Reset the checkpointer for each PAGE
ctx = context.WithValues(ctx, "bucket", metadata.bucket, "page_number", metadata.pageNumber)

for objIdx, obj := range metadata.page.Contents {
ctx = context.WithValues(ctx, "key", *obj.Key, "size", *obj.Size)

if common.IsDone(ctx) {
return
}
@@ -572,12 +593,11 @@
Verify: s.verify,
}

if err := handlers.HandleFile(ctx, res.Body, chunkSkel, sources.ChanReporter{Ch: chunksChan}); err != nil {
if err := handlers.HandleFile(ctx, res.Body, chunkSkel, reporter); err != nil {
ctx.Logger().Error(err, "error handling file")
s.metricsCollector.RecordObjectError(metadata.bucket)
return nil
}

atomic.AddUint64(state.objectCount, 1)
ctx.Logger().V(5).Info("S3 object scanned.", "object_count", state.objectCount)
nErr, ok = state.errorCount.Load(prefix)
Expand All @@ -587,17 +607,14 @@ func (s *Source) pageChunker(
if nErr.(int) > 0 {
state.errorCount.Store(prefix, 0)
}

// Update progress after successful processing.
if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.page.Contents); err != nil {
ctx.Logger().Error(err, "could not update progress for scanned object")
}
s.metricsCollector.RecordObjectScanned(metadata.bucket, float64(*obj.Size))

return nil
})
}

_ = s.jobPool.Wait()
}

@@ -681,3 +698,76 @@ func (s *Source) visitRoles(
func makeS3Link(bucket, region, key string) string {
return fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", bucket, region, key)
}

type S3SourceUnit struct {
Contributor:

You've defined this unit type, but you haven't modified the source to actually use it. The source type still embeds CommonSourceUnitUnmarshaller, so it will still unmarshal source units to CommonSourceUnit instead of your new type. You'll need to define custom unmarshalling logic. (The git source has an example of custom unmarshalling logic you can look at.)

Also, I recommend putting the unit struct and related code in a separate file, because we do that for several other sources, and I think it makes things more readable.

Contributor Author:

Oh, thanks for pointing this out. I wasn't aware of this. I'll make the changes.

Bucket string
Role string
}

func (s S3SourceUnit) SourceUnitID() (string, sources.SourceUnitKind) {
// The ID is the bucket name, and the kind is "s3_bucket".
return s.Bucket, "s3_bucket"
}
Contributor:

@mcastorina I forget - is it a problem if SourceUnitID can't be used to round-trip a unit? (In this case, we lose the Role field.)

Contributor Author:

I'll wait for @mcastorina's answer before making changes here, but here's what the description comment says for SourceUnitID():

// SourceUnitID uniquely identifies a source unit. It does not need to
// be human readable or two-way, however, it should be canonical and
// stable across runs.

The bucket name is a globally unique value, so with that aspect we should be good.

Contributor:

Oh, good catch. I guess the round-trip-ability happens in the source manager somewhere? (@mcastorina?)

Contributor:

We take the full unit object and JSON marshal it, so the fields need to be public. Idk if I documented that anywhere though, but that's why a source needs to implement unmarshalling but not marshalling.

Contributor Author:

All the fields are public, so we're good there. But based on our discussion in the thread below regarding having the role in resumption info, it seems like a good idea to have the role in the SourceUnitID as well. I'll add it.

Contributor Author:

I added it and realized it might not be best to add this yet as this also affects sub-unit resumption because the resumption info is supposed to be saved against the SourceUnitID, and our current checkpointer only works with buckets, not roles. I have reverted the changes and will wait for your responses to decide if we want to go with roles being part of resumption info or not.


func (s S3SourceUnit) Display() string {
return s.Bucket
}

var _ sources.SourceUnit = S3SourceUnit{}
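
Note for readers: the custom unmarshalling requested in the first thread above is not in this 17-commit snapshot (it lands in later commits, per the commit list: "implement SourceUnitUnmarshaller on source with the new S3SourceUnit" and "rename bucket -> unitID in UnmarshalSourceUnit"). The sketch below shows one way such an unmarshaller could look, assuming the unit is JSON-marshalled by the source manager as @mcastorina describes and that encoding/json is imported; it is not the merged implementation.

```go
// Illustrative sketch only, not the merged code. Assumes the unit arrives
// JSON-encoded, with a bare bucket name accepted as a fallback.
func (s *Source) UnmarshalSourceUnit(data []byte) (sources.SourceUnit, error) {
	var unit S3SourceUnit
	if err := json.Unmarshal(data, &unit); err == nil && unit.Bucket != "" {
		return unit, nil
	}
	// Fall back to treating the payload as a plain bucket name.
	return S3SourceUnit{Bucket: string(data)}, nil
}
```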

// Enumerate implements the SourceUnitEnumerator interface. This implementation visits
// each configured role and reports each S3 bucket as a source unit.
func (s *Source) Enumerate(ctx context.Context, reporter sources.UnitReporter) error {
visitor := func(c context.Context, defaultRegionClient *s3.Client, roleArn string, buckets []string) error {
for _, bucket := range buckets {
if common.IsDone(ctx) {
return ctx.Err()
}

unit := S3SourceUnit{
Bucket: bucket,
Role: roleArn,
}

if err := reporter.UnitOk(ctx, unit); err != nil {
return err
}
}
return nil
}

return s.visitRoles(ctx, visitor)
}

// ChunkUnit implements the SourceUnitChunker interface. This implementation scans
// the given S3 bucket source unit and emits chunks for each object found.
// It supports sub-unit resumption by utilizing the checkpointer to track progress.
func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporter sources.ChunkReporter) error {
s3unit, ok := unit.(S3SourceUnit)
if !ok {
return fmt.Errorf("expected S3SourceUnit, got %T", unit)
}
bucket := s3unit.Bucket

defaultClient, err := s.newClient(ctx, defaultAWSRegion, s3unit.Role)
if err != nil {
return fmt.Errorf("could not create s3 client for bucket %s and role %s: %w", bucket, s3unit.Role, err)
}

s.checkpointer.SetIsUnitScan(true)

var startAfterPtr *string
startAfter := s.Progress.GetEncodedResumeInfoFor(bucket)
Contributor:

I'm concerned about the way this gets resume info using only a bucket name, but then scans that bucket also using the role stored in the unit. It seems like the information used to retrieve resumption information should be the same information that's used to scan using the retrieved information, but that's not how you've implemented this.

I can't think of any concrete, immediate problems this would cause, but that doesn't mean there aren't any - and my bigger concern is that this will impede maintainability. What do you think?

Contributor Author (@mustansir14, Dec 2, 2025):

Thanks for raising this. I had an internal discussion with @amanfcp on this, and he raised a great point that there could be a case where two roles have access to the same bucket but have different object-level access. I researched this, and it turns out to be true. So it seems like a good idea to store resume info for a particular bucket AND a particular role.

My only concern here is that legacy scan resumption does not have this: resume info there is stored using only the bucket, and this particular case seems applicable there as well.

I might be totally off here and there might be some place where we are already handling this particular case, so please correct me if I am wrong.

Contributor:

Yep, this looks like an oversight in the existing implementation. It's been around for a while, so I don't think we need to urgently fix it (and I wouldn't fix it in this PR), but please add a TODO somewhere flagging the problem. We can clean it up as a later step.

Contributor Author:

I've included the role in the SourceUnitID now, and resumption info is being tracked using the SourceUnitID, so resumption info is now stored against both the role and the bucket.

if startAfter != "" {
ctx.Logger().V(3).Info(
"Resuming bucket scan",
"start_after", startAfter,
"bucket", bucket,
)
startAfterPtr = &startAfter
}
defer s.Progress.ClearEncodedResumeInfoFor(bucket)
s.scanBucket(ctx, defaultClient, s3unit.Role, bucket, reporter, startAfterPtr)
return nil
}
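
Following up on the resume-info thread above: per the review discussion and the later commits, the role is folded into SourceUnitID and sub-unit resumption is keyed on the unit ID rather than the bucket alone, but that is not part of this snapshot. A sketch of the idea, using a hypothetical helper name, plus the TODO the reviewer asked for:

```go
// Sketch only; unitResumeKey is a hypothetical helper, not code from this PR.
// Once the role is folded into SourceUnitID (as in the later commits), keying
// resume info on the unit ID stores a separate checkpoint per (role, bucket),
// so two roles with different object-level access do not share one.
func unitResumeKey(unit S3SourceUnit) string {
	id, _ := unit.SourceUnitID()
	return id
}

// TODO (flagged in review): legacy, non-unit scans still key resume info by
// bucket only, so the same role/bucket ambiguity exists there as well.
```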