@@ -118,35 +118,88 @@ def check_url(url, retries=2, delay=2):
     return url, False
 
 
-def check_links_in_file(file_path):
-    urls = extract_urls(file_path)
-    resolved_urls = [resolve_relative_url(file_path, url) for url in urls]
-    broken_urls = []
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = {executor.submit(check_url, url): url for url in resolved_urls}
+def extract_all_urls_from_files(files):
+    """
+    Extract all URLs from all files, returning a dict of {file_path: [urls]}.
+    """
+    file_urls = {}
+    skipped_files = ["doc/blog/"]
+
+    for file_path in files:
+        if any(file_path.startswith(skipped) for skipped in skipped_files):
+            continue
+        urls = extract_urls(file_path)
+        resolved_urls = [resolve_relative_url(file_path, url) for url in urls]
+        if resolved_urls:
+            file_urls[file_path] = resolved_urls
+
+    return file_urls
+
+
+def check_all_links_parallel(file_urls, max_workers=20):
+    """
+    Check all URLs across all files in parallel with a shared thread pool.
+
+    Args:
+        file_urls: Dict of {file_path: [urls]}
+        max_workers: Max concurrent HTTP requests across ALL files
+
+    Returns:
+        Dict of {file_path: [broken_urls]}
+    """
+    all_broken_urls = {}
+
+    # Create a mapping of url -> file_path for tracking which file each URL came from
+    url_to_files = {}
+    for file_path, urls in file_urls.items():
+        for url in urls:
+            if url not in url_to_files:
+                url_to_files[url] = []
+            url_to_files[url].append(file_path)
+
+    # Check all unique URLs in parallel
+    url_results = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(check_url, url): url for url in url_to_files.keys()}
         for future in as_completed(futures):
-            url, is_valid = future.result()
-            if not is_valid:
-                broken_urls.append(url)
-    return broken_urls
+            url = futures[future]
+            _, is_valid = future.result()
+            url_results[url] = is_valid
+
+    # Map broken URLs back to their files
+    for url, is_valid in url_results.items():
+        if not is_valid:
+            for file_path in url_to_files[url]:
+                if file_path not in all_broken_urls:
+                    all_broken_urls[file_path] = []
+                all_broken_urls[file_path].append(url)
+
+    return all_broken_urls
 
 
 if __name__ == "__main__":
     files = sys.argv[1:]
-    all_broken_urls = {}
-    skipped_files = ["doc/blog/"]
-    for file_path in files:
-        if any(file_path.startswith(skipped) for skipped in skipped_files):
-            continue
-        print(f"Checking links in {file_path}")
-        broken_urls = check_links_in_file(file_path)
-        if broken_urls:
-            all_broken_urls[file_path] = broken_urls
+
+    print(f"Extracting URLs from {len(files)} file(s)...")
+    file_urls = extract_all_urls_from_files(files)
+
+    if not file_urls:
+        print("No URLs found to check.")
+        sys.exit(0)
+
+    total_urls = sum(len(urls) for urls in file_urls.values())
+    unique_urls = len(set(url for urls in file_urls.values() for url in urls))
+    print(f"Checking {unique_urls} unique URL(s) across {len(file_urls)} file(s) (total: {total_urls})...")
+
+    all_broken_urls = check_all_links_parallel(file_urls, max_workers=30)
+
     if all_broken_urls:
+        print("\n" + "=" * 80)
         for file_path, urls in all_broken_urls.items():
             print(f"Broken links in {file_path}:")
             for url in urls:
                 print(f" - {url}")
+        print("=" * 80)
         sys.exit(1)
     else:
         print("No broken links found.")