import json
import re
from functools import cached_property
from pathlib import Path

# NOTE(review): the patch shows load_json() and
# `CRAWLER_USER_AGENTS_DATA = load_json()` only as partially visible, unchanged
# context, so they are not reproduced here. `json` and `Path` are kept because
# the visible part of load_json() uses Path -- confirm against the full module.


class CrawlerPatterns:
    """Lazily compiled regular expressions built from CRAWLER_USER_AGENTS_DATA.

    Each public attribute is a ``cached_property``: nothing is compiled until
    the attribute is first read, and the compiled result is then stored on the
    instance and reused. CRAWLER_USER_AGENTS_DATA is therefore read at first
    access, not at construction time; later changes to it are not picked up by
    an attribute that has already been computed.
    """

    @staticmethod
    def _flags(case_sensitive: bool) -> int:
        """Return the ``re`` flags for the requested matching mode."""
        return 0 if case_sensitive else re.IGNORECASE

    def _compile_combined(self, case_sensitive: bool) -> re.Pattern:
        """Compile a single alternation of every crawler pattern."""
        return re.compile(
            "|".join(entry["pattern"] for entry in CRAWLER_USER_AGENTS_DATA),
            self._flags(case_sensitive),
        )

    def _compile_individual(self, case_sensitive: bool) -> list[re.Pattern]:
        """Compile each crawler pattern on its own, in data order."""
        flags = self._flags(case_sensitive)
        return [
            re.compile(entry["pattern"], flags)
            for entry in CRAWLER_USER_AGENTS_DATA
        ]

    @cached_property
    def case_insensitive(self) -> re.Pattern:
        """All patterns joined with ``|``, compiled with ``re.IGNORECASE``."""
        return self._compile_combined(case_sensitive=False)

    @cached_property
    def case_sensitive(self) -> re.Pattern:
        """All patterns joined with ``|``, compiled without flags."""
        return self._compile_combined(case_sensitive=True)

    @cached_property
    def individual_case_insensitive(self) -> list[re.Pattern]:
        """One ``re.IGNORECASE`` regex per data entry, index-aligned."""
        return self._compile_individual(case_sensitive=False)

    @cached_property
    def individual_case_sensitive(self) -> list[re.Pattern]:
        """One flag-less regex per data entry, index-aligned."""
        return self._compile_individual(case_sensitive=True)

    def combined(self, case_sensitive: bool = True) -> re.Pattern:
        """Return the combined regex for the requested matching mode."""
        if case_sensitive:
            return self.case_sensitive
        return self.case_insensitive

    def individual(self, case_sensitive: bool = True) -> list[re.Pattern]:
        """Return the per-entry regexes for the requested matching mode."""
        if case_sensitive:
            return self.individual_case_sensitive
        return self.individual_case_insensitive


CRAWLER_PATTERNS = CrawlerPatterns()


def is_crawler(user_agent: str, case_sensitive: bool = True) -> bool:
    """Return True if the given User-Agent matches a known crawler.

    Args:
        user_agent: The User-Agent string to test.
        case_sensitive: When False, patterns are matched with
            ``re.IGNORECASE``. Defaults to True.
    """
    return bool(CRAWLER_PATTERNS.combined(case_sensitive).search(user_agent))


def matching_crawlers(user_agent: str, case_sensitive: bool = True) -> list[int]:
    """
    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
    matching the given User-Agent.

    Args:
        user_agent: The User-Agent string to test.
        case_sensitive: When False, patterns are matched with
            ``re.IGNORECASE``. Defaults to True.
    """
    # One search against the combined regex rejects non-crawlers without
    # walking every individual pattern.
    if not is_crawler(user_agent, case_sensitive):
        return []
    return [
        num
        for num, pattern in enumerate(CRAWLER_PATTERNS.individual(case_sensitive))
        if pattern.search(user_agent)
    ]