@@ -705,6 +705,48 @@ def build_detection_from_node(
705705 # as javadoc
706706 (r'^@[Cc]opyrights?:?$' , 'COPY' ),
707707
708+ ############################################################################
709+ # URLS and emails - moved here to prevent (c) in URLs from being matched as copyright
710+ ############################################################################
711+
712+ # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
713+ (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$' , 'EMAIL_START' ),
714+ (r'^[a-zA-Z\.]{2,5}>$' , 'EMAIL_END' ),
715+
716+ # a .sh shell script is NOT an email.
717+ (r'^.*\.sh\.?$' , 'JUNK' ),
718+ # email optionally in parens or brackets with some trailing punct. Note the @ or "at "
719+ (r'^(?:[A-Za-z])*[<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[>\)\.\,]*$' , 'EMAIL' ),
720+
721+ # mailto URLs
722+ (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}' , 'EMAIL' ),
723+
724+ (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$' , 'EMAIL' ),
725+
726+ # URLS such as <(http://fedorahosted.org/lohit)> or ()
727+ (r'[<\(]https?:.*[>\)]' , 'URL' ),
728+ # URLs such as ibm.com without a scheme; the host class allows alphanumerics, "-", "." and "_" (no backslash)
729+ (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$' , 'URL2' ),
730+ # TODO: add more extensions: there are so many TLDs these days!
731+ # URL wrapped in () or <>; same host character class as the scheme-less URL2 pattern
732+ (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$' , 'URL' ),
733+ (r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$' , 'URL' ),
734+ # derived from regex in cluecode.finder
735+ (r'<?a?.(href)?('
736+ r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
737+ r'|(?:www|ftp)\.[^\s<>\[\]]+'
738+ r')\.?>?' , 'URL' ),
739+ # bare scheme://host URL, optionally wrapped in ( ) or < >; "\." must stay a single-escaped literal dot between host labels
740+ (r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$' , 'URL' ),
741+
742+ # URLS with trailing/ such as http://fedorahosted.org/lohit/
743+ # URLS with leading( such as (http://qbnz.com/highlighter/
744+ (r'\(?https?:.*/' , 'URL' ),
745+
746+ ############################################################################
747+ # Back to COPYRIGHT patterns
748+ ############################################################################
749+
708750 (r'^\(C\)\,?$' , 'COPY' ),
709751 (r'^\(c\)\,?$' , 'COPY' ),
710752
@@ -2258,44 +2300,6 @@ def build_detection_from_node(
22582300 # this was capturing AbCdEf or a bare comma.
22592301 (r'^([A-Z][a-z0-9]+){1,2}\.?,?$' , 'NNP' ),
22602302
2261- ############################################################################
2262- # URLS and emails
2263- ############################################################################
2264-
2265- # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
2266- (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$' , 'EMAIL_START' ),
2267- (r'^[a-zA-Z\.]{2,5}>$' , 'EMAIL_END' ),
2268-
2269- # a .sh shell scripts is NOT an email.
2270- (r'^.*\.sh\.?$' , 'JUNK' ),
2271- # email eventually in parens or brackets with some trailing punct. Note the @ or "at "
2272- (r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$' , 'EMAIL' ),
2273-
2274- # mailto URLs
2275- (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}' , 'EMAIL' ),
2276-
2277- (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$' , 'EMAIL' ),
2278-
2279- # URLS such as <(http://fedorahosted.org/lohit)> or ()
2280- (r'[<\(]https?:.*[>\)]' , 'URL' ),
2281- # URLS such as ibm.com without a scheme
2282- (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$' , 'URL2' ),
2283- # TODO: add more extensions: there are so many TLDs these days!
2284- # URL wrapped in () or <>
2285- (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$' , 'URL' ),
2286- (r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$' , 'URL' ),
2287- # derived from regex in cluecode.finder
2288- (r'<?a?.(href)?.('
2289- r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
2290- r'|(?:www|ftp)\.[^\s<>\[\]"]+'
2291- r')\.?>?' , 'URL' ),
2292-
2293- (r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$' , 'URL' ),
2294-
2295- # URLS with trailing/ such as http://fedorahosted.org/lohit/
2296- # URLS with leading( such as (http://qbnz.com/highlighter/
2297- (r'\(?https?:.*/' , 'URL' ),
2298-
22992303 ############################################################################
23002304 # Misc
23012305 ############################################################################
@@ -3783,6 +3787,10 @@ def refine_names(s, prefixes):
37833787 r'^\(c\) \(c\) B$' ,
37843788 r'^\(c\) group$' ,
37853789 r'^\(c\) \(c\) A$' ,
3790+ # URLs with (c) in path or query - these are false positives
3791+ r'.*https?://' , # contains http:// or https://
3792+ r'.*/.*\(c\)' , # has path-like structure with (c)
3793+ r'\(c\).*https?://' , # (c) followed by URL
37863794]
37873795
37883796# a collection of junk matcher callables
0 commit comments