From 4446baae33b8654ec505df06cf95528eff5ccaf1 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:16:31 -0800 Subject: [PATCH 1/7] Use cached responses if revalidation errors out. --- scrapy/downloadermiddlewares/httpcache.py | 16 ++++++++++++++++ scrapy/extensions/httpcache.py | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/scrapy/downloadermiddlewares/httpcache.py b/scrapy/downloadermiddlewares/httpcache.py index bd112c48dd6..521327bfeea 100644 --- a/scrapy/downloadermiddlewares/httpcache.py +++ b/scrapy/downloadermiddlewares/httpcache.py @@ -1,11 +1,21 @@ from email.utils import formatdate +from twisted.internet import defer +from twisted.internet.error import TimeoutError, DNSLookupError, \ + ConnectionRefusedError, ConnectionDone, ConnectError, \ + ConnectionLost, TCPTimedOutError from scrapy import signals from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.utils.misc import load_object +from scrapy.xlib.tx import ResponseFailed class HttpCacheMiddleware(object): + DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError, + ConnectionRefusedError, ConnectionDone, ConnectError, + ConnectionLost, TCPTimedOutError, ResponseFailed, + IOError) + def __init__(self, settings, stats): if not settings.getbool('HTTPCACHE_ENABLED'): raise NotConfigured @@ -84,6 +94,12 @@ def process_response(self, request, response, spider): self._cache_response(spider, response, request, cachedresponse) return response + def process_exception(self, request, exception, spider): + cachedresponse = request.meta.pop('cached_response', None) + if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS): + self.stats.inc_value('httpcache/errorrecovery', spider=spider) + return cachedresponse + def _cache_response(self, spider, response, request, cachedresponse): if self.policy.should_cache_response(response, request): self.stats.inc_value('httpcache/store', spider=spider) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 3173656fe38..8011581acf4 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -99,6 +99,14 @@ def is_cached_response_fresh(self, cachedresponse, request): return False def is_cached_response_valid(self, cachedresponse, response, request): + # Use the cached response if the new response is a server error, + # as long as the old response didn't specify must-revalidate. + if response.status >= 500: + cc = self._parse_cachecontrol(cachedresponse) + if 'must-revalidate' not in cc: + return True + + # Use the cached response if the server says it hasn't changed. return response.status == 304 def _set_conditional_validators(self, request, cachedresponse): From dd3a46295c069561b0c278a8af0db784b57a6416 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:21:45 -0800 Subject: [PATCH 2/7] Support "Cache-Control: max-stale" in requests. This allows spiders to be configured with the full RFC2616 cache policy, but avoid revalidation on a request-by-request basis, while remaining conformant with the HTTP spec. 
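For illustration, a minimal sketch of a spider using the new directive; the spider name and URL below are placeholders, not part of the patch:

    import scrapy

    class MaxStaleSpider(scrapy.Spider):
        # Hypothetical spider: the max-stale request header is the point.
        name = 'maxstale-example'

        def start_requests(self):
            # Accept a cached copy up to 600 seconds past its expiration,
            # skipping the revalidation round-trip for this request only.
            yield scrapy.Request(
                'http://example.com/',
                headers={'Cache-Control': 'max-stale=600'})

        def parse(self, response):
            # Responses served from the cache carry the 'cached' flag.
            self.log('cached: %s' % ('cached' in response.flags))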
--- scrapy/extensions/httpcache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 8011581acf4..665ad3439f8 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -94,6 +94,25 @@ def is_cached_response_fresh(self, cachedresponse, request): currentage = self._compute_current_age(cachedresponse, request, now) if currentage < freshnesslifetime: return True + + if 'max-stale' in ccreq and 'must-revalidate' not in cc: + # From RFC2616: "Indicates that the client is willing to + # accept a response that has exceeded its expiration time. + # If max-stale is assigned a value, then the client is + # willing to accept a response that has exceeded its + # expiration time by no more than the specified number of + # seconds. If no value is assigned to max-stale, then the + # client is willing to accept a stale response of any age." + staleage = ccreq['max-stale'] + if staleage is None: + return True + + try: + if currentage < freshnesslifetime + max(0, int(staleage)): + return True + except ValueError: + pass + # Cached response is stale, try to set validators if any self._set_conditional_validators(request, cachedresponse) return False From e23a38133726b716f5931e59e163cfe70169d17c Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:43:16 -0800 Subject: [PATCH 3/7] Let spiders ignore bogus Cache-Control headers. Sites often set "no-store", "no-cache", "must-revalidate", etc., but get upset at the traffic a spider can generate if it respects those directives. Allow the spider's author to selectively ignore Cache-Control directives that are known to be unimportant for the sites being crawled. We assume that the spider will not issue Cache-Control directives in requests unless it actually needs them, so directives in requests are not filtered. 
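A configuration sketch for the setting this patch introduces; the directive list is illustrative, for a crawl where those directives are known to be noise:

    # settings.py (illustrative values)
    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    # Parse responses as if the server had never sent these directives:
    HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache', 'no-store']

Cache-Control headers on requests still take effect as usual, per the note above.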
--- scrapy/extensions/httpcache.py | 9 +++++++-- scrapy/settings/default_settings.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 665ad3439f8..c0efb899674 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -7,7 +7,7 @@ from weakref import WeakKeyDictionary from email.utils import mktime_tz, parsedate_tz from w3lib.http import headers_raw_to_dict, headers_dict_to_raw -from scrapy.http import Headers +from scrapy.http import Headers, Response from scrapy.responsetypes import responsetypes from scrapy.utils.request import request_fingerprint from scrapy.utils.project import data_path @@ -39,12 +39,17 @@ class RFC2616Policy(object): def __init__(self, settings): self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES') + self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS') self._cc_parsed = WeakKeyDictionary() def _parse_cachecontrol(self, r): if r not in self._cc_parsed: cch = r.headers.get('Cache-Control', '') - self._cc_parsed[r] = parse_cachecontrol(cch) + parsed = parse_cachecontrol(cch) + if isinstance(r, Response): + for key in self.ignore_response_cache_controls: + parsed.pop(key, None) + self._cc_parsed[r] = parsed return self._cc_parsed[r] def should_cache_request(self, request): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9debaabc30f..bd1bb0936b8 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -155,6 +155,7 @@ HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] +HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = [] HTTPCACHE_DBM_MODULE = 'anydbm' HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False From c3b2cabf6c6600a5a2c6bbef2035ac7616ef6a06 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 20:04:36 -0800 Subject: [PATCH 4/7] Allow setting RFC2616Policy to cache unconditionally. A spider may wish to have all responses available in the cache, for future use with "Cache-Control: max-stale", for instance. The DummyPolicy caches all responses but never revalidates them, and sometimes a more nuanced policy is desirable. This setting still respects "Cache-Control: no-store" directives in responses. If you don't want that, filter "no-store" out of the Cache-Control headers in responses you feed to the cache middleware. 
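A sketch of the header filtering suggested above, as a hypothetical downloader middleware (the class name is made up; written against the Python 2 string headers this series targets):

    class StripNoStoreMiddleware(object):
        # Hypothetical sketch: drop 'no-store' from response Cache-Control
        # headers so HTTPCACHE_ALWAYS_STORE can store those responses too.

        def process_response(self, request, response, spider):
            cc = response.headers.get('Cache-Control', '')
            if 'no-store' in cc:
                kept = [d.strip() for d in cc.split(',')
                        if d.strip() != 'no-store']
                if kept:
                    response.headers['Cache-Control'] = ', '.join(kept)
                else:
                    del response.headers['Cache-Control']
            return response

For its process_response to run before the cache middleware's, register it in DOWNLOADER_MIDDLEWARES with an order greater than HttpCacheMiddleware's default of 900. Listing 'no-store' in HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS (previous patch) should have the same effect, since the policy reads response directives through that filter.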
--- scrapy/extensions/httpcache.py | 4 ++++ scrapy/settings/default_settings.py | 1 + 2 files changed, 5 insertions(+) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index c0efb899674..4276ec9286a 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -38,6 +38,7 @@ class RFC2616Policy(object): MAXAGE = 3600 * 24 * 365 # one year def __init__(self, settings): + self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE') self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES') self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS') self._cc_parsed = WeakKeyDictionary() @@ -73,6 +74,9 @@ def should_cache_response(self, response, request): # Never cache 304 (Not Modified) responses elif response.status == 304: return False + # Cache unconditionally if configured to do so + elif self.always_store: + return True # Any hint on response expiration is good elif 'max-age' in cc or 'Expires' in response.headers: return True diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index bd1bb0936b8..5f9f4b98ef2 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -153,6 +153,7 @@ HTTPCACHE_IGNORE_MISSING = False HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' HTTPCACHE_EXPIRATION_SECS = 0 +HTTPCACHE_ALWAYS_STORE = False HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = [] From 19915504422ccf6735b11f176ae8170c43562c29 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Mon, 29 Dec 2014 14:06:04 -0800 Subject: [PATCH 5/7] Allow client to bound max-age for revalidation. Unlike specifying "Cache-Control: no-cache", if the request specifies "max-age=0", then the cached validators will be used if possible to avoid re-fetching unchanged pages. That said, it's still useful to be able to specify "no-cache" on the request, in cases where the origin server may have changed page contents without changing validators. 
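To make the distinction concrete, the two request styles side by side (the URL is a placeholder):

    from scrapy import Request

    # Revalidate a stale cached copy cheaply: the policy attaches the cached
    # validators (If-None-Match / If-Modified-Since) when they are present.
    req_revalidate = Request('http://example.com/',
                             headers={'Cache-Control': 'max-age=0'})

    # Re-fetch unconditionally, ignoring the cached copy and its validators;
    # useful when the origin changes content without changing validators.
    req_refetch = Request('http://example.com/',
                          headers={'Cache-Control': 'no-cache'})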
---
 scrapy/extensions/httpcache.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py
index 4276ec9286a..f33fcf8196b 100644
--- a/scrapy/extensions/httpcache.py
+++ b/scrapy/extensions/httpcache.py
@@ -101,6 +101,11 @@ def is_cached_response_fresh(self, cachedresponse, request):
         now = time()
         freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
         currentage = self._compute_current_age(cachedresponse, request, now)
+
+        reqmaxage = self._get_max_age(ccreq)
+        if reqmaxage is not None:
+            freshnesslifetime = min(freshnesslifetime, reqmaxage)
+
         if currentage < freshnesslifetime:
             return True

@@ -144,15 +149,19 @@ def _set_conditional_validators(self, request, cachedresponse):
         if 'ETag' in cachedresponse.headers:
             request.headers['If-None-Match'] = cachedresponse.headers['ETag']

+    def _get_max_age(self, cc):
+        try:
+            return max(0, int(cc['max-age']))
+        except (KeyError, ValueError):
+            return None
+
     def _compute_freshness_lifetime(self, response, request, now):
         # Reference nsHttpResponseHead::ComputeFreshnessLifetime
         # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410
         cc = self._parse_cachecontrol(response)
-        if 'max-age' in cc:
-            try:
-                return max(0, int(cc['max-age']))
-            except ValueError:
-                pass
+        maxage = self._get_max_age(cc)
+        if maxage is not None:
+            return maxage

         # Parse date header or synthesize it if none exists
         date = rfc1123_to_epoch(response.headers.get('Date')) or now

From bb3ebf13f97a06e8fce0e3dd6a734ab2f4a91fbd Mon Sep 17 00:00:00 2001
From: Marven Sanchez
Date: Mon, 1 Jun 2015 18:20:12 +0800
Subject: [PATCH 6/7] Add tests for RFC2616 policy enhancements

Add `scrapy/downloadermiddlewares/httpcache.py` to `tests/py3-ignores.txt`.

---
 tests/py3-ignores.txt | 1 +
 tests/test_downloadermiddleware_httpcache.py | 85 ++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
index 3d87bcb9aee..141c44909a1 100644
--- a/tests/py3-ignores.txt
+++ b/tests/py3-ignores.txt
@@ -97,6 +97,7 @@ scrapy/contrib/downloadermiddleware/retry.py
 scrapy/contrib/downloadermiddleware/httpproxy.py
 scrapy/contrib/downloadermiddleware/cookies.py
 scrapy/downloadermiddlewares/retry.py
+scrapy/downloadermiddlewares/httpcache.py
 scrapy/downloadermiddlewares/httpproxy.py
 scrapy/downloadermiddlewares/cookies.py
 scrapy/contrib/statsmailer.py

diff --git a/tests/test_downloadermiddleware_httpcache.py b/tests/test_downloadermiddleware_httpcache.py
index ac954cc1523..6c95e7b3a73 100644
--- a/tests/test_downloadermiddleware_httpcache.py
+++ b/tests/test_downloadermiddleware_httpcache.py
@@ -343,6 +343,25 @@ def test_response_cacheability(self):
                     self.assertFalse(resc)
                     assert 'cached' not in res2.flags

+        # cache unconditionally unless response contains no-store or is a 304
+        with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
+            for idx, (_, status, headers) in enumerate(responses):
+                shouldcache = 'no-store' not in headers.get('Cache-Control', '') and status != 304
+                req0 = Request('http://example2-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                res1 = self._process_requestresponse(mw, req0, res0)
+                res304 = res0.replace(status=304)
+                res2 = self._process_requestresponse(mw, req0, res304 if shouldcache else res0)
+                self.assertEqualResponse(res1, res0)
+                self.assertEqualResponse(res2, res0)
+                resc = mw.storage.retrieve_response(self.spider, req0)
+                if shouldcache:
self.assertEqualResponse(resc, res1) + assert 'cached' in res2.flags and res2.status != 304 + else: + self.assertFalse(resc) + assert 'cached' not in res2.flags + def test_cached_and_fresh(self): sampledata = [ (200, {'Date': self.yesterday, 'Expires': self.tomorrow}), @@ -381,6 +400,13 @@ def test_cached_and_fresh(self): res2 = self._process_requestresponse(mw, req0, None) self.assertEqualResponse(res1, res2) assert 'cached' in res2.flags + # validate cached response if request max-age set as 0 + req1 = req0.replace(headers={'Cache-Control': 'max-age=0'}) + res304 = res0.replace(status=304) + assert mw.process_request(req1, self.spider) is None + res3 = self._process_requestresponse(mw, req1, res304) + self.assertEqualResponse(res1, res3) + assert 'cached' in res3.flags def test_cached_and_stale(self): sampledata = [ @@ -395,6 +421,9 @@ def test_cached_and_stale(self): (200, {'Cache-Control': 'no-cache'}), (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}), (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}), ] with self._middleware() as mw: for idx, (status, headers) in enumerate(sampledata): @@ -410,6 +439,7 @@ def test_cached_and_stale(self): res2 = self._process_requestresponse(mw, req0, res0b) self.assertEqualResponse(res2, res0b) assert 'cached' not in res2.flags + cc = headers.get('Cache-Control', '') # Previous response expired too, subsequent request to same # resource must revalidate and succeed on 304 if validators # are present @@ -418,7 +448,62 @@ def test_cached_and_stale(self): res3 = self._process_requestresponse(mw, req0, res0c) self.assertEqualResponse(res3, res0b) assert 'cached' in res3.flags + # get cached response on server errors unless must-revalidate + # in cached response + res0d = res0b.replace(status=500) + res4 = self._process_requestresponse(mw, req0, res0d) + if 'must-revalidate' in cc: + assert 'cached' not in res4.flags + self.assertEqualResponse(res4, res0d) + else: + assert 'cached' in res4.flags + self.assertEqualResponse(res4, res0b) + # Requests with max-stale can fetch expired cached responses + # unless cached response has must-revalidate + req1 = req0.replace(headers={'Cache-Control': 'max-stale'}) + res5 = self._process_requestresponse(mw, req1, res0b) + self.assertEqualResponse(res5, res0b) + if 'no-cache' in cc or 'must-revalidate' in cc: + assert 'cached' not in res5.flags + else: + assert 'cached' in res5.flags + def test_process_exception(self): + with self._middleware() as mw: + res0 = Response(self.request.url, headers={'Expires': self.yesterday}) + req0 = Request(self.request.url) + self._process_requestresponse(mw, req0, res0) + for e in mw.DOWNLOAD_EXCEPTIONS: + # Simulate encountering an error on download attempts + assert mw.process_request(req0, self.spider) is None + res1 = mw.process_exception(req0, e('foo'), self.spider) + # Use cached response as recovery + assert 'cached' in res1.flags + self.assertEqualResponse(res0, res1) + # Do not use cached response for unhandled exceptions + mw.process_request(req0, self.spider) + assert mw.process_exception(req0, Exception('foo'), self.spider) is None + + def test_ignore_response_cache_controls(self): + sampledata = [ + (200, {'Date': self.yesterday, 'Expires': self.tomorrow}), + (200, {'Date': 
self.yesterday, 'Cache-Control': 'no-store,max-age=86405'}),
+            (200, {'Age': '299', 'Cache-Control': 'max-age=300,no-cache'}),
+            (300, {'Cache-Control': 'no-cache'}),
+            (200, {'Expires': self.tomorrow, 'Cache-Control': 'no-store'}),
+        ]
+        with self._middleware(HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS=['no-cache', 'no-store']) as mw:
+            for idx, (status, headers) in enumerate(sampledata):
+                req0 = Request('http://example-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                # cache fresh response
+                res1 = self._process_requestresponse(mw, req0, res0)
+                self.assertEqualResponse(res1, res0)
+                assert 'cached' not in res1.flags
+                # return fresh cached response without network interaction
+                res2 = self._process_requestresponse(mw, req0, None)
+                self.assertEqualResponse(res1, res2)
+                assert 'cached' in res2.flags

 if __name__ == '__main__':
     unittest.main()

From 8771d1f79bccd8163e08185e591fd702e9f0b715 Mon Sep 17 00:00:00 2001
From: Marven Sanchez
Date: Mon, 1 Jun 2015 18:20:59 +0800
Subject: [PATCH 7/7] Update HTTPCache middleware docs

---
 docs/topics/downloader-middleware.rst | 51 +++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 5cb6c98240b..38533c47a20 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -373,6 +373,18 @@ what is implemented:
 * Revalidate stale responses based on `Last-Modified` response header
 * Revalidate stale responses based on `ETag` response header
 * Set `Date` header for any received response missing it
+* Support `max-stale` cache-control directive in requests
+
+  This allows spiders to be configured with the full RFC2616 cache policy
+  while avoiding revalidation on a request-by-request basis, remaining
+  conformant with the HTTP spec.
+
+  Example:
+
+  Add `Cache-Control: max-stale=600` to Request headers to accept responses that
+  have exceeded their expiration time by no more than 600 seconds.
+
+  See also: RFC2616, 14.9.3

 what is missing:

@@ -575,6 +587,45 @@ Default: ``False``
 If enabled, will compress all cached data with gzip.
 This setting is specific to the Filesystem backend.

+.. setting:: HTTPCACHE_ALWAYS_STORE
+
+HTTPCACHE_ALWAYS_STORE
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``False``
+
+If enabled, will cache pages unconditionally.
+
+A spider may wish to have all responses available in the cache, for
+future use with `Cache-Control: max-stale`, for instance. The
+DummyPolicy caches all responses but never revalidates them, and
+sometimes a more nuanced policy is desirable.
+
+This setting still respects `Cache-Control: no-store` directives in responses.
+If you don't want that, filter `no-store` out of the Cache-Control headers in
+responses you feed to the cache middleware.
+
+.. setting:: HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS
+
+HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``[]``
+
+List of Cache-Control directives in responses to be ignored.
+
+Sites often set "no-store", "no-cache", "must-revalidate", etc., but get
+upset at the traffic a spider can generate if it respects those
+directives. This setting lets you selectively ignore Cache-Control
+directives that are known to be unimportant for the sites being crawled.
+
+We assume that the spider will not issue Cache-Control directives
+in requests unless it actually needs them, so directives in requests are
+not filtered.
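Putting the series together, a sketch of settings for an aggressively caching spider; the values are illustrative, not recommendations:

    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    # Store every allowed response, so later requests carrying
    # `Cache-Control: max-stale` can be served from the cache:
    HTTPCACHE_ALWAYS_STORE = True
    # Directives known to be noise on the sites being crawled:
    HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache']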
HttpCompressionMiddleware -------------------------