Skip to content

Commit 48b5dd8

Browse files
MacMac
authored and committed
Fix copyright detection for URLs containing (c) symbol
Fixes #4724

URLs containing (c) in their path or query parameters were incorrectly detected as copyright statements. For example: http://example.com/path/(c)/test

This fix addresses the issue by:
1. Reordering URL/email patterns to appear before (C) and (c) copyright patterns in the lexer, ensuring URL tokens are matched as URLs first
2. Adding junk copyright patterns to filter out false positives from URL fragments containing (c)

The tokenizer splits URLs on = and ; characters, which can cause (c) to appear as a separate token. By prioritizing URL pattern matching and filtering URL-like detections, we prevent these false positives.

Tested with the original urls.10K file from the issue - now shows 0 false positives (previously had 2).
1 parent 022ddc8 commit 48b5dd8

2 files changed

Lines changed: 49 additions & 38 deletions

File tree

src/cluecode/copyrights.py

Lines changed: 46 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -705,6 +705,48 @@ def build_detection_from_node(
705705
# as javadoc
706706
(r'^@[Cc]opyrights?:?$', 'COPY'),
707707

708+
############################################################################
709+
# URLS and emails - moved here to prevent (c) in URLs from being matched as copyright
710+
############################################################################
711+
712+
# email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
713+
(r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
714+
(r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
715+
716+
# a .sh shell scripts is NOT an email.
717+
(r'^.*\.sh\.?$', 'JUNK'),
718+
# email eventually in parens or brackets with some trailing punct. Note the @ or "at "
719+
(r'^(?:[A-Za-z])*[<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[>\)\.\,]*$', 'EMAIL'),
720+
721+
# mailto URLs
722+
(r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),
723+
724+
(r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),
725+
726+
# URLS such as <(http://fedorahosted.org/lohit)> or ()
727+
(r'[<\(]https?:.*[>\)]', 'URL'),
728+
# URLS such as ibm.com without a scheme
729+
(r'\s?[a-z0-9A-Z\-\.\\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
730+
# TODO: add more extensions: there are so many TLDs these days!
731+
# URL wrapped in () or <>
732+
(r'[\(<]+\s?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
733+
(r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
734+
# derived from regex in cluecode.finder
735+
(r'<?a?.(href)?('
736+
r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
737+
r'|(?:www|ftp)\.[^\s<>\[\]]+'
738+
r')\.?>?', 'URL'),
739+
740+
(r'^\(?<?https?://[a-zA-Z0-9_\-]+(\\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),
741+
742+
# URLS with trailing/ such as http://fedorahosted.org/lohit/
743+
# URLS with leading( such as (http://qbnz.com/highlighter/
744+
(r'\(?https?:.*/', 'URL'),
745+
746+
############################################################################
747+
# Back to COPYRIGHT patterns
748+
############################################################################
749+
708750
(r'^\(C\)\,?$', 'COPY'),
709751
(r'^\(c\)\,?$', 'COPY'),
710752

@@ -2258,44 +2300,6 @@ def build_detection_from_node(
22582300
# this was capturing AbCdEf or a bare comma.
22592301
(r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'),
22602302

2261-
############################################################################
2262-
# URLS and emails
2263-
############################################################################
2264-
2265-
# email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
2266-
(r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
2267-
(r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
2268-
2269-
# a .sh shell scripts is NOT an email.
2270-
(r'^.*\.sh\.?$', 'JUNK'),
2271-
# email eventually in parens or brackets with some trailing punct. Note the @ or "at "
2272-
(r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'),
2273-
2274-
# mailto URLs
2275-
(r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),
2276-
2277-
(r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),
2278-
2279-
# URLS such as <(http://fedorahosted.org/lohit)> or ()
2280-
(r'[<\(]https?:.*[>\)]', 'URL'),
2281-
# URLS such as ibm.com without a scheme
2282-
(r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
2283-
# TODO: add more extensions: there are so many TLDs these days!
2284-
# URL wrapped in () or <>
2285-
(r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
2286-
(r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
2287-
# derived from regex in cluecode.finder
2288-
(r'<?a?.(href)?.('
2289-
r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
2290-
r'|(?:www|ftp)\.[^\s<>\[\]"]+'
2291-
r')\.?>?', 'URL'),
2292-
2293-
(r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),
2294-
2295-
# URLS with trailing/ such as http://fedorahosted.org/lohit/
2296-
# URLS with leading( such as (http://qbnz.com/highlighter/
2297-
(r'\(?https?:.*/', 'URL'),
2298-
22992303
############################################################################
23002304
# Misc
23012305
############################################################################
@@ -3783,6 +3787,10 @@ def refine_names(s, prefixes):
37833787
r'^\(c\) \(c\) B$',
37843788
r'^\(c\) group$',
37853789
r'^\(c\) \(c\) A$',
3790+
# URLs with (c) in path or query - these are false positives
3791+
r'.*https?://', # contains http:// or https://
3792+
r'.*/.*\(c\)', # has path-like structure with (c)
3793+
r'\(c\).*https?://', # (c) followed by URL
37863794
]
37873795

37883796
# a collection of junk junk matcher callables
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
http://biblio.cesga.es:81/search*gag/aXove,+Xosé/axove+xose/7,-1,0,B/frameset&F=axuntanza&1,,3
2+
http://example.com/path/(c)/test
3+
http://test.org/query?param=(c)&other=value

0 commit comments

Comments
 (0)