@@ -878,79 +878,29 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
878878 }
879879 proxy_settings = {"server" : self .config .proxy } if self .config .proxy else None
880880
881- blocked_extensions = [
881+ # Define resource categories
882+ css_extensions = ["css" , "less" , "scss" , "sass" ]
883+ static_extensions = [
882884 # Images
883- "jpg" ,
884- "jpeg" ,
885- "png" ,
886- "gif" ,
887- "webp" ,
888- "svg" ,
889- "ico" ,
890- "bmp" ,
891- "tiff" ,
892- "psd" ,
885+ "jpg" , "jpeg" , "png" , "gif" , "webp" , "svg" , "ico" , "bmp" , "tiff" ,
893886 # Fonts
894- "woff" ,
895- "woff2" ,
896- "ttf" ,
897- "otf" ,
898- "eot" ,
899- # Styles
900- "css" , "less" , "scss" , "sass" ,
887+ "woff" , "woff2" , "ttf" , "otf" , "eot" ,
901888 # Media
902- "mp4" ,
903- "webm" ,
904- "ogg" ,
905- "avi" ,
906- "mov" ,
907- "wmv" ,
908- "flv" ,
909- "m4v" ,
910- "mp3" ,
911- "wav" ,
912- "aac" ,
913- "m4a" ,
914- "opus" ,
915- "flac" ,
916- # Documents
917- "pdf" ,
918- "doc" ,
919- "docx" ,
920- "xls" ,
921- "xlsx" ,
922- "ppt" ,
923- "pptx" ,
924- # Archives
925- "zip" ,
926- "rar" ,
927- "7z" ,
928- "tar" ,
929- "gz" ,
930- # Scripts and data
931- "xml" ,
932- "swf" ,
933- "wasm" ,
889+ "mp4" , "webm" , "ogg" , "mp3" , "wav" , "aac" , "flac" ,
890+ # Documents & Archives
891+ "pdf" , "doc" , "docx" , "xls" , "xlsx" , "zip" , "rar" , "7z" , "tar" , "gz" ,
892+ # Other
893+ "xml" , "swf" , "wasm"
934894 ]
935895
936- # Ad and Tracker patterns
896+ # Ad and Tracker patterns (Top 20 curated from uBlock sources for performance)
937897 ad_tracker_patterns = [
938- "**/google-analytics.com/**" ,
939- "**/googletagmanager.com/**" ,
940- "**/googlesyndication.com/**" ,
941- "**/doubleclick.net/**" ,
942- "**/adservice.google.com/**" ,
943- "**/adsystem.com/**" ,
944- "**/adzerk.net/**" ,
945- "**/adnxs.com/**" ,
946- "**/ads.linkedin.com/**" ,
947- "**/facebook.net/**" ,
948- "**/analytics.twitter.com/**" ,
949- "**/t.co/**" ,
950- "**/hotjar.com/**" ,
951- "**/clarity.ms/**" ,
952- "**/scorecardresearch.com/**" ,
953- "**/pixel.wp.com/**" ,
898+ "**/google-analytics.com/**" , "**/googletagmanager.com/**" , "**/googlesyndication.com/**" ,
899+ "**/doubleclick.net/**" , "**/adservice.google.com/**" , "**/adsystem.com/**" ,
900+ "**/adzerk.net/**" , "**/adnxs.com/**" , "**/ads.linkedin.com/**" , "**/facebook.net/**" ,
901+ "**/analytics.twitter.com/**" , "**/t.co/**" , "**/ads-twitter.com/**" ,
902+ "**/hotjar.com/**" , "**/clarity.ms/**" , "**/scorecardresearch.com/**" , "**/pixel.wp.com/**" ,
903+ "**/amazon-adsystem.com/**" , "**/mixpanel.com/**" , "**/segment.com/**"
954904 ]
955905
956906 # Common context settings
@@ -1006,10 +956,15 @@ async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None
1006956 # Create and return the context with all settings
1007957 context = await self .browser .new_context (** context_settings )
1008958
1009- # Apply resource filtering based on config
1010- if self .config .avoid_css or self .config .text_mode :
1011- # Create and apply route patterns for each extension
1012- for ext in blocked_extensions :
959+ # Apply resource filtering based on config (Dynamic addition)
960+ to_block = []
961+ if self .config .avoid_css :
962+ to_block += css_extensions
963+ if self .config .text_mode :
964+ to_block += static_extensions
965+
966+ if to_block :
967+ for ext in to_block :
1013968 await context .route (f"**/*.{ ext } " , lambda route : route .abort ())
1014969
1015970 if self .config .avoid_ads :
0 commit comments