Skip to content

Commit

Permalink
Merge pull request #73 from SheezZarR/master
Browse files: browse the repository at this point in the history.
fix: explicit fingerprinter param in custom dupefilter
jasonbosco authored Dec 9, 2024
2 parents 7a34f9c + 3b8f652 commit f07889e
Showing 3 changed files with 409 additions and 326 deletions.
26 changes: 13 additions & 13 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -4,23 +4,23 @@ verify_ssl = true
name = "pypi"

[packages]
Scrapy = ">=2.2.1"
pyperclip = ">=1.5.27"
python-dotenv = ">=0.7.1"
ratelimit = ">=1.4.1"
selenium = ">=4.4.3"
slacker = ">=0.9.60"
tldextract = ">=2.1.0"
pytest = ">=3.10.0"
requests-iap = ">=0.2.0"
python-helpscout-v2 = ">=1.0.1"
algoliasearch = ">=2.0,<3.0"
Scrapy = "==2.12.0"
pyperclip = "==1.9.0"
python-dotenv = "==1.0.1"
ratelimit = "==2.2.1"
selenium = "==4.27.0"
slacker = "==0.14.0"
tldextract = "==5.1.3"
pytest = "==8.3.3"
requests-iap = "==0.2.0"
python-helpscout-v2 = "==2.0.0"
algoliasearch = "==2.6.3"
typesense = "==0.10.0"
python-keycloak-client = "==0.2.3"
webdriver-manager = ">=4.0.2"
webdriver-manager = "==4.0.2"

[dev-packages]
pylint = ">=2.3.1"
pylint = "==3.2.6"

[requires]
python_version = "3.10"
701 changes: 391 additions & 310 deletions Pipfile.lock

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions scraper/src/custom_dupefilter.py
Original file line number Diff line number Diff line change
@@ -56,15 +56,15 @@ def custom_request_fingerprint(self, request, include_headers=None,
cache[include_headers] = fp.hexdigest()
return cache[include_headers]

def __init__(self, path=None, debug=False, use_anchors=False, fingerprinter=None):
    """Initialise the duplicate filter.

    Args:
        path: Forwarded unchanged to the parent initialiser.
        debug: Forwarded unchanged to the parent initialiser.
        use_anchors: Boolean config flag stored on the instance as
            ``self.use_anchors``.
        fingerprinter: Forwarded unchanged to the parent initialiser.
            Defaults to ``None``.
    """
    # All parent arguments are passed by keyword so the fingerprinter is
    # handed over explicitly rather than relying on positional order.
    super(CustomDupeFilter, self).__init__(path=path, debug=debug, fingerprinter=fingerprinter)
    # Spread config bool
    self.use_anchors = use_anchors
    # This set will not be scheme agnostic
    self.fingerprints_with_scheme = set()

# Overridden method in order to add the use_anchors attribute
@classmethod
def from_settings(cls, settings, fingerprinter=None):
    """Build the filter from a settings object.

    Args:
        settings: Object exposing ``getbool``; ``DUPEFILTER_DEBUG`` and
            ``DUPEFILTER_USE_ANCHORS`` are read from it, and it is passed
            to ``job_dir`` to obtain the path argument.
        fingerprinter: Optional fingerprinter handed on to the constructor.
            Defaults to ``None``.

    Returns:
        A new instance of ``cls``.
    """
    debug = settings.getbool('DUPEFILTER_DEBUG')
    use_anchors = settings.getbool('DUPEFILTER_USE_ANCHORS')
    # Forward the fingerprinter by keyword; without this the value accepted
    # in the signature would never reach the constructor.
    return cls(job_dir(settings), debug, use_anchors, fingerprinter=fingerprinter)
@@ -97,6 +97,8 @@ def request_seen(self, request):
self.register_fingerprint(fp)
self.fingerprints_with_scheme.add(fp_with_scheme)

return False

def register_fingerprint(self, fp):
self.fingerprints.add(fp)
if self.file:

0 comments on commit f07889e

Please sign in to comment.