Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion specs/deepwork/jobs/JOBS-REQ-004-quality-review-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,10 @@ The quality review system evaluates step outputs against defined quality criteri
### JOBS-REQ-004.8: Timeout Computation

1. The base timeout MUST be 240 seconds (4 minutes).
2. For file counts of 5 or fewer, the timeout MUST be 240 seconds.
2. For file counts of 5 or fewer (with small content), the timeout MUST be 240 seconds.
3. For file counts beyond 5, the timeout MUST increase by 30 seconds per additional file.
4. For large content sizes, the timeout MUST increase by 30 seconds for every full 5,000 characters beyond the first 5,000 characters of the review payload.
5. The final timeout MUST be the maximum of the file-count-based and content-size-based calculations.

### JOBS-REQ-004.9: Result Parsing

Expand Down
33 changes: 21 additions & 12 deletions src/deepwork/jobs/mcp/quality_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,23 +451,32 @@ async def build_review_instructions_file(
return "\n".join(parts)

@staticmethod
def compute_timeout(file_count: int) -> int:
"""Compute dynamic timeout based on number of files.

Base timeout is 240 seconds (4 minutes). For every file beyond
the first 5, add 30 seconds. Examples:
- 3 files -> 240s
- 5 files -> 240s
- 10 files -> 240 + 30*5 = 390s (6.5 min)
- 20 files -> 240 + 30*15 = 690s (11.5 min)
def compute_timeout(file_count: int, total_chars: int = 0) -> int:
"""Compute dynamic timeout based on number of files and total content size.

Base timeout is 240 seconds (4 minutes). For every file beyond the first 5,
add 30 seconds. Additionally, for large content sizes, add extra time to
allow the reviewer to process large files. The final timeout is the maximum
of both calculations. Examples:
- 3 files, small content -> 240s
- 5 files, small content -> 240s
- 10 files, small content -> 240 + 30*5 = 390s (6.5 min)
- 20 files, small content -> 240 + 30*15 = 690s (11.5 min)
- 1 file, 25,000 chars -> max(240, 240 + 30*4) = 360s (6 min)
- 1 file, 50,000 chars -> max(240, 240 + 30*9) = 510s (8.5 min)

Args:
file_count: Total number of files being reviewed
total_chars: Total character count of the review payload (file contents
plus formatting). Defaults to 0 (size-based factor is ignored).

Returns:
Timeout in seconds
"""
return 240 + 30 * max(0, file_count - 5)
count_based = 240 + 30 * max(0, file_count - 5)
# Add 30 seconds for every 5,000 chars beyond the first 5,000
size_based = 240 + 30 * max(0, total_chars // 5000 - 1)
return max(count_based, size_based)

async def evaluate(
self,
Expand Down Expand Up @@ -513,9 +522,9 @@ async def evaluate(
)
payload = await self._build_payload(outputs, project_root, notes=notes)

# Dynamic timeout: more files = more time for the reviewer
# Dynamic timeout: more files/content = more time for the reviewer
file_count = len(self._flatten_output_paths(outputs))
timeout = self.compute_timeout(file_count)
timeout = self.compute_timeout(file_count, total_chars=len(payload))

from deepwork.jobs.mcp.claude_cli import ClaudeCLIError

Expand Down
11 changes: 10 additions & 1 deletion src/deepwork/standard_jobs/deepwork_jobs/steps/define.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,16 @@ reviews:
- Steps producing multiple files where each file needs individual review

**Quality review timeout considerations:**
Each individual quality review call has a 120-second timeout. For `run_each: <output_name>` with `files`-type outputs, each file gets its own separate review call — so having many files does NOT cause timeout accumulation. Timeout risk is only for individual reviews that are complex, such as:
Each individual quality review call starts with a 240-second base timeout. The timeout scales
automatically based on the number of files and the total size of the review payload:
- Each file beyond the first 5 adds 30 seconds
- Every full 5,000 characters of content beyond the first 5,000 adds 30 seconds
- The final timeout is the maximum of the file-count-based and content-size-based calculations

For `run_each: <output_name>` with `files`-type outputs, each file gets its own separate review
call — so having many files does NOT cause timeout accumulation per review.

Timeout risk is highest for individual reviews with very large content, such as:
- Reviewing a single very large file (500+ lines) with many criteria
- Review criteria that require cross-referencing large amounts of context
For these cases:
Expand Down
54 changes: 51 additions & 3 deletions tests/unit/jobs/mcp/test_quality_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,7 @@ class TestComputeTimeout:
# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.1, JOBS-REQ-004.8.2).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_base_timeout_for_few_files(self) -> None:
    """Test that <=5 files with small content gives base 240s (4 min) timeout."""
    # With total_chars defaulted to 0, only the file-count factor applies,
    # and counts of 5 or fewer never exceed the base.
    assert QualityGate.compute_timeout(0) == 240
    assert QualityGate.compute_timeout(1) == 240
    assert QualityGate.compute_timeout(5) == 240
Expand All @@ -706,6 +706,32 @@ def test_timeout_increases_after_five(self) -> None:
assert QualityGate.compute_timeout(10) == 390 # 240 + 5*30
assert QualityGate.compute_timeout(20) == 690 # 240 + 15*30

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.4).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_timeout_increases_with_large_content(self) -> None:
    """Test that large total_chars adds 30s per 5,000 chars beyond first 5,000."""
    # Only complete 5,000-char chunks past the first one count (floor division).
    # <=5,000 chars: no size-based increase
    assert QualityGate.compute_timeout(1, total_chars=0) == 240
    assert QualityGate.compute_timeout(1, total_chars=5000) == 240
    # 10,000 chars: 240 + 30*1 = 270
    assert QualityGate.compute_timeout(1, total_chars=10000) == 270
    # 25,000 chars: 240 + 30*4 = 360
    assert QualityGate.compute_timeout(1, total_chars=25000) == 360
    # 50,000 chars: 240 + 30*9 = 510
    assert QualityGate.compute_timeout(1, total_chars=50000) == 510

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.5).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
def test_timeout_uses_maximum_of_count_and_size(self) -> None:
    """Test that the final timeout is the max of count-based and size-based."""
    # The two factors never stack; the larger single estimate is taken.
    # Many files, small content: count-based wins
    assert QualityGate.compute_timeout(20, total_chars=1000) == 690  # 240+15*30 vs 240
    # Few files, large content: size-based wins
    assert QualityGate.compute_timeout(1, total_chars=50000) == 510  # 240 vs 240+30*9
    # Both large: maximum wins
    # 10 files -> count=390; 25,000 chars -> size=360: count wins
    assert QualityGate.compute_timeout(10, total_chars=25000) == 390


class TestDynamicTimeout:
"""Tests that evaluate passes dynamic timeout to CLI."""
Expand All @@ -725,7 +751,7 @@ async def test_timeout_passed_to_cli(self, mock_cli: ClaudeCLI, project_root: Pa
)

call_kwargs = mock_cli.run.call_args.kwargs
# 1 file -> timeout = 240
# 1 file, small content -> timeout = 240
assert call_kwargs["timeout"] == 240

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.6.3, JOBS-REQ-004.8.3).
Expand All @@ -746,9 +772,31 @@ async def test_timeout_scales_with_file_count(
)

call_kwargs = mock_cli.run.call_args.kwargs
# 10 files -> 240 + 5*30 = 390
# 10 files, small content -> 240 + 5*30 = 390
assert call_kwargs["timeout"] == 390

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-004.8.4, JOBS-REQ-004.8.5).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
async def test_timeout_scales_with_large_file_content(
    self, mock_cli: ClaudeCLI, project_root: Path
) -> None:
    """Test that timeout increases for a single large file (large content)."""
    gate = QualityGate(cli=mock_cli)

    # Write a file with ~25,000 chars of content (simulating a large job.yml)
    large_content = "x" * 25000
    (project_root / "large_job.yml").write_text(large_content)

    await gate.evaluate(
        quality_criteria={"Valid": "Is it valid?"},
        outputs={"job_yml": "large_job.yml"},
        project_root=project_root,
    )

    call_kwargs = mock_cli.run.call_args.kwargs
    # 1 file, ~25,000+ chars in payload -> size-based = 240 + 30*4 = 360
    # >= (not ==) because payload formatting overhead may push the total
    # past the next 5,000-char boundary, raising the timeout further.
    assert call_kwargs["timeout"] >= 360


class TestMockQualityGate:
"""Tests for MockQualityGate class."""
Expand Down