
Commit 230b8ae

MAINT: Unit test and pre-commit speedup (#1227)
1 parent a1a709b commit 230b8ae

15 files changed: +668 −117 lines

.github/workflows/build_and_test.yml

Lines changed: 8 additions & 0 deletions
@@ -64,12 +64,16 @@ jobs:
 
       - name: Run pre-commit incrementally (on PR)
         if: github.event_name == 'pull_request'
+        env:
+          RUN_LONG_PRECOMMIT: true
         run: |
           git fetch origin main
           pre-commit run --from-ref origin/main --to-ref HEAD
 
       - name: Run pre-commit fully (on main)
         if: github.ref == 'refs/heads/main'
+        env:
+          RUN_LONG_PRECOMMIT: true
         run: |
           pre-commit run --all-files
 
@@ -117,12 +121,16 @@ jobs:
 
       - name: Run pre-commit incrementally (on PR)
         if: github.event_name == 'pull_request'
+        env:
+          RUN_LONG_PRECOMMIT: true
         run: |
           git fetch origin main
           pre-commit run --from-ref origin/main --to-ref HEAD
 
       - name: Run pre-commit fully (on main)
         if: github.ref == 'refs/heads/main'
+        env:
+          RUN_LONG_PRECOMMIT: true
         run: |
           pre-commit run --all-files

.pre-commit-config.yaml

Lines changed: 8 additions & 1 deletion
@@ -86,9 +86,16 @@ repos:
 
   - repo: local
     hooks:
+      - id: validate-jupyter-book
+        name: Validate Jupyter Book Structure
+        entry: python ./build_scripts/validate_jupyter_book.py
+        language: python
+        files: ^(doc/.*\.(py|ipynb|md|rst)|doc/_toc\.yml)$
+        pass_filenames: false
+        additional_dependencies: ['pyyaml']
       - id: website
         name: Jupyter Book Build Check
-        entry: jb build -W -q ./doc
+        entry: python ./build_scripts/conditional_jb_build.py
         language: system
         types: [python]
         pass_filenames: false
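
Note: the validate_jupyter_book.py entry point referenced above is added elsewhere in this commit and not shown in this hunk. As a rough, hypothetical sketch of the kind of fast structural check such a hook could perform (assuming the declared pyyaml dependency is used to walk doc/_toc.yml and confirm each referenced page exists; the actual script may differ):

# Hypothetical sketch only -- the real build_scripts/validate_jupyter_book.py is not shown here.
import sys
from pathlib import Path

import yaml  # available via additional_dependencies: ['pyyaml']

DOC_ROOT = Path("doc")

def referenced_files(toc):
    """Yield every 'file' entry from a Jupyter Book _toc.yml structure."""
    if isinstance(toc, dict):
        if "file" in toc:
            yield toc["file"]
        for value in toc.values():
            yield from referenced_files(value)
    elif isinstance(toc, list):
        for item in toc:
            yield from referenced_files(item)

def main() -> int:
    toc = yaml.safe_load((DOC_ROOT / "_toc.yml").read_text())
    missing = []
    for entry in referenced_files(toc):
        # _toc.yml entries omit the extension; accept any of the source formats.
        if not any((DOC_ROOT / f"{entry}{ext}").exists() for ext in (".md", ".ipynb", ".rst", ".py")):
            missing.append(entry)
    if missing:
        print("Missing files referenced in doc/_toc.yml:")
        for entry in missing:
            print(f"  - {entry}")
        return 1
    print("Jupyter Book structure looks consistent.")
    return 0

if __name__ == "__main__":
    sys.exit(main())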

Makefile

Lines changed: 4 additions & 3 deletions
@@ -19,14 +19,15 @@ docs-build:
 	jb build -W -v ./doc
 	python ./build_scripts/generate_rss.py
 
+# Because of import time, "auto" seemed to actually go slower than just using 4 processes
 unit-test:
-	$(CMD) pytest --cov=$(PYMODULE) $(UNIT_TESTS)
+	$(CMD) pytest -n 4 --dist=loadfile --cov=$(PYMODULE) $(UNIT_TESTS)
 
 unit-test-cov-html:
-	$(CMD) pytest --cov=$(PYMODULE) $(UNIT_TESTS) --cov-report html
+	$(CMD) pytest -n 4 --dist=loadfile --cov=$(PYMODULE) $(UNIT_TESTS) --cov-report html
 
 unit-test-cov-xml:
-	$(CMD) pytest --cov=$(PYMODULE) $(UNIT_TESTS) --cov-report xml --junitxml=junit/test-results.xml --doctest-modules
+	$(CMD) pytest -n 4 --dist=loadfile --cov=$(PYMODULE) $(UNIT_TESTS) --cov-report xml --junitxml=junit/test-results.xml --doctest-modules
 
 integration-test:
 	$(CMD) pytest $(INTEGRATION_TESTS) --cov=$(PYMODULE) $(INTEGRATION_TESTS) --cov-report xml --junitxml=junit/test-results.xml --doctest-modules
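
Note: the -n 4 --dist=loadfile flags come from pytest-xdist: four worker processes, with every test from a given file assigned to the same worker. The Makefile comment explains why a fixed worker count beat -n auto here; the snippet below is an illustrative example (not from this repository) of why loadfile grouping also helps when test modules share expensive module-scoped fixtures:

# Illustration only: with the default --dist=load, tests from this file could land on
# different workers and each worker would build its own copy of the fixture.
# With --dist=loadfile, the whole file runs on one worker, so setup happens once.
import time

import pytest

@pytest.fixture(scope="module")
def expensive_resource():
    time.sleep(2)  # stand-in for slow setup (model load, service start, ...)
    return {"ready": True}

def test_uses_resource_a(expensive_resource):
    assert expensive_resource["ready"]

def test_uses_resource_b(expensive_resource):
    assert expensive_resource["ready"]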

build_scripts/check_links.py

Lines changed: 72 additions & 19 deletions
@@ -118,35 +118,88 @@ def check_url(url, retries=2, delay=2):
     return url, False
 
 
-def check_links_in_file(file_path):
-    urls = extract_urls(file_path)
-    resolved_urls = [resolve_relative_url(file_path, url) for url in urls]
-    broken_urls = []
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = {executor.submit(check_url, url): url for url in resolved_urls}
+def extract_all_urls_from_files(files):
+    """
+    Extract all URLs from all files, returning a dict of {file_path: [urls]}.
+    """
+    file_urls = {}
+    skipped_files = ["doc/blog/"]
+
+    for file_path in files:
+        if any(file_path.startswith(skipped) for skipped in skipped_files):
+            continue
+        urls = extract_urls(file_path)
+        resolved_urls = [resolve_relative_url(file_path, url) for url in urls]
+        if resolved_urls:
+            file_urls[file_path] = resolved_urls
+
+    return file_urls
+
+
+def check_all_links_parallel(file_urls, max_workers=20):
+    """
+    Check all URLs across all files in parallel with a shared thread pool.
+
+    Args:
+        file_urls: Dict of {file_path: [urls]}
+        max_workers: Max concurrent HTTP requests across ALL files
+
+    Returns:
+        Dict of {file_path: [broken_urls]}
+    """
+    all_broken_urls = {}
+
+    # Create a mapping of url -> file_path for tracking which file each URL came from
+    url_to_files = {}
+    for file_path, urls in file_urls.items():
+        for url in urls:
+            if url not in url_to_files:
+                url_to_files[url] = []
+            url_to_files[url].append(file_path)
+
+    # Check all unique URLs in parallel
+    url_results = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(check_url, url): url for url in url_to_files.keys()}
         for future in as_completed(futures):
-            url, is_valid = future.result()
-            if not is_valid:
-                broken_urls.append(url)
-    return broken_urls
+            url = futures[future]
+            _, is_valid = future.result()
+            url_results[url] = is_valid
+
+    # Map broken URLs back to their files
+    for url, is_valid in url_results.items():
+        if not is_valid:
+            for file_path in url_to_files[url]:
+                if file_path not in all_broken_urls:
+                    all_broken_urls[file_path] = []
+                all_broken_urls[file_path].append(url)
+
+    return all_broken_urls
 
 
 if __name__ == "__main__":
     files = sys.argv[1:]
-    all_broken_urls = {}
-    skipped_files = ["doc/blog/"]
-    for file_path in files:
-        if any(file_path.startswith(skipped) for skipped in skipped_files):
-            continue
-        print(f"Checking links in {file_path}")
-        broken_urls = check_links_in_file(file_path)
-        if broken_urls:
-            all_broken_urls[file_path] = broken_urls
+
+    print(f"Extracting URLs from {len(files)} file(s)...")
+    file_urls = extract_all_urls_from_files(files)
+
+    if not file_urls:
+        print("No URLs found to check.")
+        sys.exit(0)
+
+    total_urls = sum(len(urls) for urls in file_urls.values())
+    unique_urls = len(set(url for urls in file_urls.values() for url in urls))
+    print(f"Checking {unique_urls} unique URL(s) across {len(file_urls)} file(s) (total: {total_urls})...")
+
+    all_broken_urls = check_all_links_parallel(file_urls, max_workers=30)
+
     if all_broken_urls:
+        print("\n" + "=" * 80)
         for file_path, urls in all_broken_urls.items():
             print(f"Broken links in {file_path}:")
             for url in urls:
                 print(f" - {url}")
+        print("=" * 80)
         sys.exit(1)
     else:
         print("No broken links found.")
build_scripts/conditional_jb_build.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""
+Conditional Jupyter Book build wrapper for pre-commit.
+
+This script checks the RUN_LONG_PRECOMMIT environment variable:
+- If set to "true", runs the full `jb build -W -q ./doc` command
+- Otherwise, exits successfully (fast validation script runs instead)
+
+This allows CI/pipeline to run full builds while local development uses fast validation.
+"""
+
+import os
+import subprocess
+import sys
+
+
+def main():
+    run_long = os.environ.get("RUN_LONG_PRECOMMIT", "").lower() == "true"
+
+    if run_long:
+        print("RUN_LONG_PRECOMMIT=true: Running full Jupyter Book build...")
+        # Run jb build with the same flags as before
+        result = subprocess.run(
+            ["jb", "build", "-W", "-q", "./doc"], cwd=os.path.dirname(os.path.dirname(__file__))  # Repository root
+        )
+        return result.returncode
+    else:
+        print("RUN_LONG_PRECOMMIT not set: Skipping full Jupyter Book build (fast validation runs instead)")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
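
Note: with this wrapper in place, the website hook is effectively a no-op unless RUN_LONG_PRECOMMIT is exported, which the CI workflow above now does. A hypothetical way to reproduce the CI behaviour from a local checkout (not documented in this commit) is to set the variable for a single pre-commit invocation:

# Hypothetical local invocation (not part of this commit): force the full Jupyter Book build once.
import os
import subprocess

env = {**os.environ, "RUN_LONG_PRECOMMIT": "true"}
subprocess.run(["pre-commit", "run", "website", "--all-files"], env=env, check=False)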
