Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion specs/deepwork/jobs/JOBS-REQ-004-quality-review-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,10 @@ The quality review system evaluates step outputs against defined quality criteri
### JOBS-REQ-004.8: Timeout Computation

1. The base timeout MUST be 240 seconds (4 minutes).
2. For file counts of 5 or fewer, the timeout MUST be 240 seconds.
2. For file counts of 5 or fewer (with small content), the timeout MUST be 240 seconds.
3. For file counts beyond 5, the timeout MUST increase by 30 seconds per additional file.
4. For large content sizes, the timeout MUST increase by 30 seconds for every full 5,000 characters beyond the first 5,000 characters of the review payload.
5. The final timeout MUST be the maximum of the file-count-based and content-size-based calculations.

### JOBS-REQ-004.9: Result Parsing

Expand Down
33 changes: 21 additions & 12 deletions src/deepwork/jobs/mcp/quality_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,23 +451,32 @@ async def build_review_instructions_file(
return "\n".join(parts)

@staticmethod
def compute_timeout(file_count: int) -> int:
"""Compute dynamic timeout based on number of files.

Base timeout is 240 seconds (4 minutes). For every file beyond
the first 5, add 30 seconds. Examples:
- 3 files -> 240s
- 5 files -> 240s
- 10 files -> 240 + 30*5 = 390s (6.5 min)
- 20 files -> 240 + 30*15 = 690s (11.5 min)
def compute_timeout(file_count: int, total_chars: int = 0) -> int:
"""Compute dynamic timeout based on number of files and total content size.

Base timeout is 240 seconds (4 minutes). For every file beyond the first 5,
add 30 seconds. Additionally, for large content sizes, add extra time to
allow the reviewer to process large files. The final timeout is the maximum
of both calculations. Examples:
- 3 files, small content -> 240s
- 5 files, small content -> 240s
- 10 files, small content -> 240 + 30*5 = 390s (6.5 min)
- 20 files, small content -> 240 + 30*15 = 690s (11.5 min)
- 1 file, 25,000 chars -> max(240, 240 + 30*4) = 360s (6 min)
- 1 file, 50,000 chars -> max(240, 240 + 30*9) = 510s (8.5 min)

Args:
file_count: Total number of files being reviewed
total_chars: Total character count of the review payload (file contents
plus formatting). Defaults to 0 (size-based factor is ignored).

Returns:
Timeout in seconds
"""
return 240 + 30 * max(0, file_count - 5)
count_based = 240 + 30 * max(0, file_count - 5)
# Add 30 seconds for every 5,000 chars beyond the first 5,000
size_based = 240 + 30 * max(0, total_chars // 5000 - 1)
return max(count_based, size_based)

async def evaluate(
self,
Expand Down Expand Up @@ -513,9 +522,9 @@ async def evaluate(
)
payload = await self._build_payload(outputs, project_root, notes=notes)

# Dynamic timeout: more files = more time for the reviewer
# Dynamic timeout: more files/content = more time for the reviewer
file_count = len(self._flatten_output_paths(outputs))
timeout = self.compute_timeout(file_count)
timeout = self.compute_timeout(file_count, total_chars=len(payload))

from deepwork.jobs.mcp.claude_cli import ClaudeCLIError

Expand Down
11 changes: 10 additions & 1 deletion src/deepwork/standard_jobs/deepwork_jobs/steps/define.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,16 @@ reviews:
- Steps producing multiple files where each file needs individual review

**Quality review timeout considerations:**
Each individual quality review call has a 120-second timeout. For `run_each: <output_name>` with `files`-type outputs, each file gets its own separate review call — so having many files does NOT cause timeout accumulation. Timeout risk is only for individual reviews that are complex, such as:
Each individual quality review call starts with a 240-second base timeout. The timeout scales
automatically based on the number of files and the total size of the review payload:
- Each file beyond the first 5 adds 30 seconds
- Every full 5,000 characters of content beyond the first 5,000 adds 30 seconds
- The final timeout is the maximum of the file-count-based and content-size-based calculations

For `run_each: <output_name>` with `files`-type outputs, each file gets its own separate review
call — so having many files does NOT cause timeout accumulation per review.

Timeout risk is highest for individual reviews with very large content, such as:
- Reviewing a single very large file (500+ lines) with many criteria
- Review criteria that require cross-referencing large amounts of context
For these cases:
Expand Down
54 changes: 51 additions & 3 deletions tests/unit/jobs/mcp/test_quality_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,7 @@ class TestComputeTimeout:
# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.1, JOBS-REQ-004.8.2).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_base_timeout_for_few_files(self) -> None:
    """Test that <=5 files with small content gives base 240s (4 min) timeout."""
    # With total_chars defaulted to 0, only the file-count factor applies,
    # and counts of 5 or fewer never exceed the base.
    assert QualityGate.compute_timeout(0) == 240
    assert QualityGate.compute_timeout(1) == 240
    assert QualityGate.compute_timeout(5) == 240
Expand All @@ -706,6 +706,32 @@ def test_timeout_increases_after_five(self) -> None:
assert QualityGate.compute_timeout(10) == 390 # 240 + 5*30
assert QualityGate.compute_timeout(20) == 690 # 240 + 15*30

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.4).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_timeout_increases_with_large_content(self) -> None:
    """Test that large total_chars adds 30s per 5,000 chars beyond first 5,000."""
    # Only complete 5,000-char chunks past the first one count (floor division).
    # <=5,000 chars: no size-based increase
    assert QualityGate.compute_timeout(1, total_chars=0) == 240
    assert QualityGate.compute_timeout(1, total_chars=5000) == 240
    # 10,000 chars: 240 + 30*1 = 270
    assert QualityGate.compute_timeout(1, total_chars=10000) == 270
    # 25,000 chars: 240 + 30*4 = 360
    assert QualityGate.compute_timeout(1, total_chars=25000) == 360
    # 50,000 chars: 240 + 30*9 = 510
    assert QualityGate.compute_timeout(1, total_chars=50000) == 510

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.5).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_timeout_uses_maximum_of_count_and_size(self) -> None:
    """Test that the final timeout is the max of count-based and size-based."""
    # The two factors never stack; the larger single estimate is taken.
    # Many files, small content: count-based wins
    assert QualityGate.compute_timeout(20, total_chars=1000) == 690  # 240+15*30 vs 240
    # Few files, large content: size-based wins
    assert QualityGate.compute_timeout(1, total_chars=50000) == 510  # 240 vs 240+30*9
    # Both large: maximum wins
    # 10 files -> count=390; 25,000 chars -> size=360: count wins
    assert QualityGate.compute_timeout(10, total_chars=25000) == 390


class TestDynamicTimeout:
"""Tests that evaluate passes dynamic timeout to CLI."""
Expand All @@ -725,7 +751,7 @@ async def test_timeout_passed_to_cli(self, mock_cli: ClaudeCLI, project_root: Pa
)

call_kwargs = mock_cli.run.call_args.kwargs
# 1 file -> timeout = 240
# 1 file, small content -> timeout = 240
assert call_kwargs["timeout"] == 240

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.6.3, JOBS-REQ-004.8.3).
Expand All @@ -746,9 +772,31 @@ async def test_timeout_scales_with_file_count(
)

call_kwargs = mock_cli.run.call_args.kwargs
# 10 files -> 240 + 5*30 = 390
# 10 files, small content -> 240 + 5*30 = 390
assert call_kwargs["timeout"] == 390

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.4, JOBS-REQ-004.8.5).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
async def test_timeout_scales_with_large_file_content(
    self, mock_cli: ClaudeCLI, project_root: Path
) -> None:
    """Test that timeout increases for a single large file (large content)."""
    gate = QualityGate(cli=mock_cli)

    # Write a file with ~25,000 chars of content (simulating a large job.yml)
    large_content = "x" * 25000
    (project_root / "large_job.yml").write_text(large_content)

    await gate.evaluate(
        quality_criteria={"Valid": "Is it valid?"},
        outputs={"job_yml": "large_job.yml"},
        project_root=project_root,
    )

    call_kwargs = mock_cli.run.call_args.kwargs
    # 1 file, ~25,000+ chars in payload -> size-based = 240 + 30*4 = 360
    # >= (not ==) because payload formatting overhead may push the total
    # past the next 5,000-char boundary, raising the timeout further.
    assert call_kwargs["timeout"] >= 360


class TestMockQualityGate:
"""Tests for MockQualityGate class."""
Expand Down