diff --git a/__init__.py b/__init__.py
index 6ffe061..6a7a7d0 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,22 +1,30 @@
-import crawleruseragents
 import re
 import json
-from importlib import resources
-
+from pathlib import Path
+
+
 def load_json():
-    return json.loads(resources.read_text(crawleruseragents,"crawler-user-agents.json"))
+    """Return the parsed crawler-user-agents.json located next to this module."""
+    cwd = Path(__file__).parent
+    user_agents_file_path = cwd / "crawler-user-agents.json"
+    # resources.read_text() decoded as UTF-8 by default; keep that explicit so
+    # the result does not depend on the platform's locale encoding.
+    with user_agents_file_path.open(encoding="utf-8") as patterns_file:
+        return json.load(patterns_file)
 
-DATA = load_json()
 
-def is_crawler(s):
-    # print(s)
-    for i in DATA:
-        test=re.search(i["pattern"],s,re.IGNORECASE)
-        if test:
-            return True
-    return False
+CRAWLER_USER_AGENTS_DATA = load_json()
 
-def is_crawler2(s):
-    regexp = re.compile("|".join([i["pattern"] for i in DATA]))
-    return regexp.search(s) != None
 
+def is_crawler(user_agent: str) -> bool:
+    """Return True if user_agent matches any crawler pattern (case-insensitive)."""
+    for crawler_user_agent in CRAWLER_USER_AGENTS_DATA:
+        if re.search(crawler_user_agent["pattern"], user_agent, re.IGNORECASE):
+            return True
+    return False
+
+
+def is_crawler2(s):
+    """Like is_crawler, but tests one combined regex and passes no IGNORECASE flag."""
+    regexp = re.compile("|".join([i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]))
+    return regexp.search(s) is not None
diff --git a/crawler-user-agents.json b/crawler-user-agents.json
index b8fda48..9b76a83 100644
--- a/crawler-user-agents.json
+++ b/crawler-user-agents.json
@@ -12,8 +12,7 @@
       "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
       "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36"
     ]
-  }
-  ,
+  },
   {
     "pattern": "Googlebot-Mobile",
     "instances": [