Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Escape dots in patterns #338

Merged
merged 3 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 24 additions & 24 deletions crawler-user-agents.json
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@
}
,
{
"pattern": "grub.org",
"pattern": "grub\\.org",
"instances": [
"Mozilla/4.0 (compatible; grub-client-0.3.0; Crawl your own stuff with http://grub.org)",
"Mozilla/4.0 (compatible; grub-client-1.0.4; Crawl your own stuff with http://grub.org)",
Expand Down Expand Up @@ -855,7 +855,7 @@
}
,
{
"pattern": "Mail.RU_Bot",
"pattern": "Mail\\.RU_Bot",
"addition_date": "2011/04/27",
"instances": [
"Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)",
Expand Down Expand Up @@ -914,7 +914,7 @@
}
,
{
"pattern": "europarchive.org",
"pattern": "europarchive\\.org",
"addition_date": "2011/06/21",
"url": "",
"instances": [
Expand All @@ -923,7 +923,7 @@
}
,
{
"pattern": "NerdByNature.Bot",
"pattern": "NerdByNature\\.Bot",
"addition_date": "2011/07/12",
"url": "http://www.nerdbynature.net/bot",
"instances": [
Expand Down Expand Up @@ -1299,7 +1299,7 @@
}
,
{
"pattern": "web-archive-net.com.bot",
"pattern": "web-archive-net\\.com\\.bot",
"instances": []
}
,
Expand Down Expand Up @@ -1359,13 +1359,13 @@
}
,
{
"pattern": "ip-web-crawler.com",
"pattern": "ip-web-crawler\\.com",
"addition_date": "2013/03/22",
"instances": []
}
,
{
"pattern": "siteexplorer.info",
"pattern": "siteexplorer\\.info",
"addition_date": "2013/05/01",
"instances": [
"Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)",
Expand Down Expand Up @@ -1493,7 +1493,7 @@
}
,
{
"pattern": "g00g1e.net",
"pattern": "g00g1e\\.net",
"addition_date": "2014/04/01",
"url": "http://www.g00g1e.net/",
"instances": []
Expand Down Expand Up @@ -1584,7 +1584,7 @@
}
,
{
"pattern": "bnf.fr_bot",
"pattern": "bnf\\.fr_bot",
"addition_date": "2014/11/18",
"url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
"instances": [
Expand Down Expand Up @@ -1715,7 +1715,7 @@
}
,
{
"pattern": "archive.org_bot",
"pattern": "archive\\.org_bot",
"url": "http://www.archive.org/details/archive.org_bot",
"depends_on": ["heritrix"],
"instances": [
Expand Down Expand Up @@ -1895,7 +1895,7 @@
}
,
{
"pattern": "collection@infegy.com",
"pattern": "collection@infegy\\.com",
"url": "http://infegy.com/",
"instances": [
"Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 collection@infegy.com"
Expand Down Expand Up @@ -2179,7 +2179,7 @@
}
,
{
"pattern": "pinterest.com.bot",
"pattern": "pinterest\\.com\\/bot",
"addition_date": "2017/03/03",
"instances": [
"Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)",
Expand Down Expand Up @@ -2805,7 +2805,7 @@
}
,
{
"pattern": "Traackr.com",
"pattern": "Traackr\\.com",
"addition_date": "2017/11/02",
"url": "Traackr.com",
"instances": [
Expand Down Expand Up @@ -2941,7 +2941,7 @@
}
,
{
"pattern": "filterdb.iss.net\\/crawler",
"pattern": "filterdb\\.iss\\.net\\/crawler",
"addition_date": "2018/03/16",
"instances": [
"Mozilla/5.0 (compatible; oBot/2.3.1; +http://filterdb.iss.net/crawler/)"
Expand Down Expand Up @@ -3210,7 +3210,7 @@
}
,
{
"pattern": "Bot.AraTurka.com",
"pattern": "Bot\\.AraTurka\\.com",
"addition_date": "2018/06/27",
"instances": [
"Bot.AraTurka.com/0.0.1"
Expand All @@ -3219,7 +3219,7 @@
}
,
{
"pattern": "bot-pge.chlooe.com",
"pattern": "bot-pge\\.chlooe\\.com",
"addition_date": "2018/06/27",
"instances": [
"bot-pge.chlooe.com/1.0.0 (+http://www.chlooe.com/)"
Expand Down Expand Up @@ -3397,7 +3397,7 @@
}
,
{
"pattern": "Siteimprove.com",
"pattern": "Siteimprove\\.com",
"addition_date": "2018/06/22",
"instances": [
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com",
Expand Down Expand Up @@ -3506,7 +3506,7 @@
}
,
{
"pattern": "PR-CY.RU",
"pattern": "PR-CY\\.RU",
"addition_date": "2018/08/30",
"instances": [
"Mozilla/5.0 (compatible; PR-CY.RU; + https://a.pr-cy.ru)"
Expand Down Expand Up @@ -3827,7 +3827,7 @@
]
},
{
"pattern": "Dataprovider.com",
"pattern": "Dataprovider\\.com",
"addition_date": "2018/11/24",
"instances": [
"Mozilla/5.0 (compatible; Dataprovider.com)"
Expand All @@ -3843,7 +3843,7 @@
"url": "http://www.grouphigh.com/"
},
{
"pattern": "theoldreader.com",
"pattern": "theoldreader\\.com",
"addition_date": "2018/12/02",
"instances": [
"Mozilla/5.0 (compatible; theoldreader.com)"
Expand Down Expand Up @@ -3879,7 +3879,7 @@
}
,
{
"pattern": "2ip.ru",
"pattern": "2ip\\.ru",
"addition_date": "2019/02/12",
"instances": [
"2ip.ru CMS Detector (https://2ip.ru/cms/)"
Expand Down Expand Up @@ -5000,7 +5000,7 @@
"url": "https://metrics-tools.de/robot.html"
},
{
"pattern": "hyscore.io",
"pattern": "hyscore\\.io",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1. 4 (compatible; HyScore/1.0; +https://hyscore.io/crawler/)"
Expand Down Expand Up @@ -5104,7 +5104,7 @@
"url": "https://torus.company/bot.html"
},
{
"pattern": "sempi.tech",
"pattern": "sempi\\.tech",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (compatible; Semanticbot/1.0; +http://sempi.tech/bot.html)"
Expand Down Expand Up @@ -5160,7 +5160,7 @@
"url": "https://opengraphcheck.com"
},
{
"pattern": "developers.google.com\\/\\+\\/web\\/snippet",
"pattern": "developers\\.google\\.com\\/\\+\\/web\\/snippet",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)",
Expand Down
6 changes: 6 additions & 0 deletions validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ def main():
if re.search('[^\\\\]/', pattern):
raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern))

# check that no pattern contains unescaped dot .
for entry in json_data:
pattern = entry['pattern']
if re.search('[^\\\\]\\.', pattern):
raise ValueError('Pattern {!r} has an unescaped dot character'.format(pattern))

# check that we match the given instances
num_instances = 0
for entry in json_data:
Expand Down