feat: make case sensitivity optional #382

Merged
33 changes: 25 additions & 8 deletions __init__.py
@@ -1,8 +1,25 @@
import re
import json
+from functools import cached_property
from pathlib import Path


+class CrawlerPatterns:
+    def __init__(self):
+        pass

+    @cached_property
+    def case_insensitive(self):
+        return re.compile(
+            "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA),
+            re.IGNORECASE
+        )

+    @cached_property
+    def case_sensitive(self):
+        return re.compile("|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA))


def load_json():
    cwd = Path(__file__).parent
    user_agents_file_path = cwd / "crawler-user-agents.json"
@@ -11,24 +28,24 @@ def load_json():


CRAWLER_USER_AGENTS_DATA = load_json()
-CRAWLER_USER_AGENTS_REGEXP = re.compile(
-    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
-)
+CRAWLER_PATTERNS = CrawlerPatterns()


-def is_crawler(user_agent: str) -> bool:
+def is_crawler(user_agent: str, case_sensitive: bool = True) -> bool:
    """Return True if the given User-Agent matches a known crawler."""
-    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))
+    if case_sensitive:
+        return bool(re.search(CRAWLER_PATTERNS.case_sensitive, user_agent))
+    return bool(re.search(CRAWLER_PATTERNS.case_insensitive, user_agent))


-def matching_crawlers(user_agent: str) -> list[int]:
+def matching_crawlers(user_agent: str, case_sensitive: bool = True) -> list[int]:
    """
    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
    matching the given User-Agent.
    """
    result = []
-    if is_crawler(user_agent):
+    if is_crawler(user_agent, case_sensitive):
        for num, crawler_user_agent in enumerate(CRAWLER_USER_AGENTS_DATA):
-            if re.search(crawler_user_agent["pattern"], user_agent):
+            if re.search(crawler_user_agent["pattern"], user_agent, 0 if case_sensitive else re.IGNORECASE):
                result.append(num)
    return result
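
For context, a minimal usage sketch of the new flag, reusing the User-Agent string from the test harness (the sketch itself is not part of the diff):

    from crawleruseragents import is_crawler, matching_crawlers

    # Illustrative User-Agent: a lowercase "googlebot" token that no pattern
    # matches case-sensitively, as the tests below confirm.
    ua = "test googlebot/2.0 test"

    print(is_crawler(ua))                               # False: default stays case-sensitive
    print(is_crawler(ua, case_sensitive=False))         # True: opt-in case-insensitive matching
    print(matching_crawlers(ua, case_sensitive=False))  # indices into CRAWLER_USER_AGENTS_DATA

Passing case_sensitive=False switches to the pattern compiled with re.IGNORECASE, while existing callers keep the stricter default behaviour.
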
24 changes: 19 additions & 5 deletions test_harness.py
@@ -5,28 +5,42 @@
$ pytest test_harness.py

"""
-from crawleruseragents import is_crawler, matching_crawlers

+from crawleruseragents import is_crawler, matching_crawlers

def test_match():
    assert is_crawler("test Googlebot/2.0 test") is True


def test_nomatch():
    assert is_crawler("!!!!!!!!!!!!") is False


-def test_case():
+def test_case_sensitive():
    assert is_crawler("test googlebot/2.0 test") is False


-def test_matching_crawlers_match():
+def test_case_insensitive():
+    assert is_crawler("test googlebot/2.0 test", case_sensitive=False) is True


+def test_matching_crawlers_match_case_sensitive():
    result = matching_crawlers("test Googlebot/2.0 test")
    assert isinstance(result, list)
    assert len(result) > 0
    assert all(isinstance(val, int) for val in result)


+def test_matching_crawlers_match_case_insensitive():
+    result = matching_crawlers("test googlebot/2.0 test", False)
+    assert isinstance(result, list)
+    assert len(result) > 0
+    assert all(isinstance(val, int) for val in result)

+def test_matching_crawlers_match_lower_case_agent():
+    result = matching_crawlers("test googlebot/2.0 test")
+    assert isinstance(result, list)
+    assert len(result) == 0


def test_matching_crawlers_nomatch():
    result = matching_crawlers("!!!!!!!!!!!!")
    assert isinstance(result, list)
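
The new tests spell out each case-sensitivity combination as its own function. A hedged sketch of equivalent coverage using pytest.mark.parametrize (hypothetical, not part of this PR):

    import pytest
    from crawleruseragents import is_crawler


    @pytest.mark.parametrize(
        "user_agent, case_sensitive, expected",
        [
            ("test Googlebot/2.0 test", True, True),   # exact-case match
            ("test googlebot/2.0 test", True, False),  # default stays strict
            ("test googlebot/2.0 test", False, True),  # opt-in insensitivity
        ],
    )
    def test_is_crawler_case_handling(user_agent, case_sensitive, expected):
        assert is_crawler(user_agent, case_sensitive=case_sensitive) is expected

Either style runs under the invocation shown in the module docstring, e.g. pytest test_harness.py.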