From 4446baae33b8654ec505df06cf95528eff5ccaf1 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:16:31 -0800 Subject: [PATCH 1/7] Use cached responses if revalidation errors out. --- scrapy/downloadermiddlewares/httpcache.py | 16 ++++++++++++++++ scrapy/extensions/httpcache.py | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/scrapy/downloadermiddlewares/httpcache.py b/scrapy/downloadermiddlewares/httpcache.py index bd112c48dd6..521327bfeea 100644 --- a/scrapy/downloadermiddlewares/httpcache.py +++ b/scrapy/downloadermiddlewares/httpcache.py @@ -1,11 +1,21 @@ from email.utils import formatdate +from twisted.internet import defer +from twisted.internet.error import TimeoutError, DNSLookupError, \ + ConnectionRefusedError, ConnectionDone, ConnectError, \ + ConnectionLost, TCPTimedOutError from scrapy import signals from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.utils.misc import load_object +from scrapy.xlib.tx import ResponseFailed class HttpCacheMiddleware(object): + DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError, + ConnectionRefusedError, ConnectionDone, ConnectError, + ConnectionLost, TCPTimedOutError, ResponseFailed, + IOError) + def __init__(self, settings, stats): if not settings.getbool('HTTPCACHE_ENABLED'): raise NotConfigured @@ -84,6 +94,12 @@ def process_response(self, request, response, spider): self._cache_response(spider, response, request, cachedresponse) return response + def process_exception(self, request, exception, spider): + cachedresponse = request.meta.pop('cached_response', None) + if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS): + self.stats.inc_value('httpcache/errorrecovery', spider=spider) + return cachedresponse + def _cache_response(self, spider, response, request, cachedresponse): if self.policy.should_cache_response(response, request): self.stats.inc_value('httpcache/store', spider=spider) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 3173656fe38..8011581acf4 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -99,6 +99,14 @@ def is_cached_response_fresh(self, cachedresponse, request): return False def is_cached_response_valid(self, cachedresponse, response, request): + # Use the cached response if the new response is a server error, + # as long as the old response didn't specify must-revalidate. + if response.status >= 500: + cc = self._parse_cachecontrol(cachedresponse) + if 'must-revalidate' not in cc: + return True + + # Use the cached response if the server says it hasn't changed. return response.status == 304 def _set_conditional_validators(self, request, cachedresponse): From dd3a46295c069561b0c278a8af0db784b57a6416 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:21:45 -0800 Subject: [PATCH 2/7] Support "Cache-Control: max-stale" in requests. This allows spiders to be configured with the full RFC2616 cache policy, but avoid revalidation on a request-by-request basis, while remaining conformant with the HTTP spec. 
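For illustration, a minimal sketch of a spider using the new directive; the spider name and URL below are placeholders, not part of the patch:

    import scrapy

    class MaxStaleSpider(scrapy.Spider):
        # Hypothetical spider: the max-stale request header is the point.
        name = 'maxstale-example'

        def start_requests(self):
            # Accept a cached copy up to 600 seconds past its expiration,
            # skipping the revalidation round-trip for this request only.
            yield scrapy.Request(
                'http://example.com/',
                headers={'Cache-Control': 'max-stale=600'})

        def parse(self, response):
            # Responses served from the cache carry the 'cached' flag.
            self.log('cached: %s' % ('cached' in response.flags))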
--- scrapy/extensions/httpcache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 8011581acf4..665ad3439f8 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -94,6 +94,25 @@ def is_cached_response_fresh(self, cachedresponse, request): currentage = self._compute_current_age(cachedresponse, request, now) if currentage < freshnesslifetime: return True + + if 'max-stale' in ccreq and 'must-revalidate' not in cc: + # From RFC2616: "Indicates that the client is willing to + # accept a response that has exceeded its expiration time. + # If max-stale is assigned a value, then the client is + # willing to accept a response that has exceeded its + # expiration time by no more than the specified number of + # seconds. If no value is assigned to max-stale, then the + # client is willing to accept a stale response of any age." + staleage = ccreq['max-stale'] + if staleage is None: + return True + + try: + if currentage < freshnesslifetime + max(0, int(staleage)): + return True + except ValueError: + pass + # Cached response is stale, try to set validators if any self._set_conditional_validators(request, cachedresponse) return False From e23a38133726b716f5931e59e163cfe70169d17c Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 19:43:16 -0800 Subject: [PATCH 3/7] Let spiders ignore bogus Cache-Control headers. Sites often set "no-store", "no-cache", "must-revalidate", etc., but get upset at the traffic a spider can generate if it respects those directives. Allow the spider's author to selectively ignore Cache-Control directives that are known to be unimportant for the sites being crawled. We assume that the spider will not issue Cache-Control directives in requests unless it actually needs them, so directives in requests are not filtered. 
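A configuration sketch for the setting this patch introduces; the directive list is illustrative, for a crawl where those directives are known to be noise:

    # settings.py (illustrative values)
    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    # Parse responses as if the server had never sent these directives:
    HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache', 'no-store']

Cache-Control headers on requests still take effect as usual, per the note above.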
--- scrapy/extensions/httpcache.py | 9 +++++++-- scrapy/settings/default_settings.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 665ad3439f8..c0efb899674 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -7,7 +7,7 @@ from weakref import WeakKeyDictionary from email.utils import mktime_tz, parsedate_tz from w3lib.http import headers_raw_to_dict, headers_dict_to_raw -from scrapy.http import Headers +from scrapy.http import Headers, Response from scrapy.responsetypes import responsetypes from scrapy.utils.request import request_fingerprint from scrapy.utils.project import data_path @@ -39,12 +39,17 @@ class RFC2616Policy(object): def __init__(self, settings): self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES') + self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS') self._cc_parsed = WeakKeyDictionary() def _parse_cachecontrol(self, r): if r not in self._cc_parsed: cch = r.headers.get('Cache-Control', '') - self._cc_parsed[r] = parse_cachecontrol(cch) + parsed = parse_cachecontrol(cch) + if isinstance(r, Response): + for key in self.ignore_response_cache_controls: + parsed.pop(key, None) + self._cc_parsed[r] = parsed return self._cc_parsed[r] def should_cache_request(self, request): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9debaabc30f..bd1bb0936b8 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -155,6 +155,7 @@ HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] +HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = [] HTTPCACHE_DBM_MODULE = 'anydbm' HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False From c3b2cabf6c6600a5a2c6bbef2035ac7616ef6a06 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Sun, 28 Dec 2014 20:04:36 -0800 Subject: [PATCH 4/7] Allow setting RFC2616Policy to cache unconditionally. A spider may wish to have all responses available in the cache, for future use with "Cache-Control: max-stale", for instance. The DummyPolicy caches all responses but never revalidates them, and sometimes a more nuanced policy is desirable. This setting still respects "Cache-Control: no-store" directives in responses. If you don't want that, filter "no-store" out of the Cache-Control headers in responses you feed to the cache middleware. 
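A sketch of the header filtering suggested above, as a hypothetical downloader middleware (the class name is made up; written against the Python 2 string headers this series targets):

    class StripNoStoreMiddleware(object):
        # Hypothetical sketch: drop 'no-store' from response Cache-Control
        # headers so HTTPCACHE_ALWAYS_STORE can store those responses too.

        def process_response(self, request, response, spider):
            cc = response.headers.get('Cache-Control', '')
            if 'no-store' in cc:
                kept = [d.strip() for d in cc.split(',')
                        if d.strip() != 'no-store']
                if kept:
                    response.headers['Cache-Control'] = ', '.join(kept)
                else:
                    del response.headers['Cache-Control']
            return response

For its process_response to run before the cache middleware's, register it in DOWNLOADER_MIDDLEWARES with an order greater than HttpCacheMiddleware's default of 900. Listing 'no-store' in HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS (previous patch) should have the same effect, since the policy reads response directives through that filter.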
--- scrapy/extensions/httpcache.py | 4 ++++ scrapy/settings/default_settings.py | 1 + 2 files changed, 5 insertions(+) diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index c0efb899674..4276ec9286a 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -38,6 +38,7 @@ class RFC2616Policy(object): MAXAGE = 3600 * 24 * 365 # one year def __init__(self, settings): + self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE') self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES') self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS') self._cc_parsed = WeakKeyDictionary() @@ -73,6 +74,9 @@ def should_cache_response(self, response, request): # Never cache 304 (Not Modified) responses elif response.status == 304: return False + # Cache unconditionally if configured to do so + elif self.always_store: + return True # Any hint on response expiration is good elif 'max-age' in cc or 'Expires' in response.headers: return True diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index bd1bb0936b8..5f9f4b98ef2 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -153,6 +153,7 @@ HTTPCACHE_IGNORE_MISSING = False HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' HTTPCACHE_EXPIRATION_SECS = 0 +HTTPCACHE_ALWAYS_STORE = False HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = [] From 19915504422ccf6735b11f176ae8170c43562c29 Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Mon, 29 Dec 2014 14:06:04 -0800 Subject: [PATCH 5/7] Allow client to bound max-age for revalidation. Unlike specifying "Cache-Control: no-cache", if the request specifies "max-age=0", then the cached validators will be used if possible to avoid re-fetching unchanged pages. That said, it's still useful to be able to specify "no-cache" on the request, in cases where the origin server may have changed page contents without changing validators. 
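To make the distinction concrete, the two request styles side by side (the URL is a placeholder):

    from scrapy import Request

    # Revalidate a stale cached copy cheaply: the policy attaches the cached
    # validators (If-None-Match / If-Modified-Since) when they are present.
    req_revalidate = Request('http://example.com/',
                             headers={'Cache-Control': 'max-age=0'})

    # Re-fetch unconditionally, ignoring the cached copy and its validators;
    # useful when the origin changes content without changing validators.
    req_refetch = Request('http://example.com/',
                          headers={'Cache-Control': 'no-cache'})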
---
 scrapy/extensions/httpcache.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py
index 4276ec9286a..f33fcf8196b 100644
--- a/scrapy/extensions/httpcache.py
+++ b/scrapy/extensions/httpcache.py
@@ -101,6 +101,11 @@ def is_cached_response_fresh(self, cachedresponse, request):
         now = time()
         freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
         currentage = self._compute_current_age(cachedresponse, request, now)
+
+        reqmaxage = self._get_max_age(ccreq)
+        if reqmaxage is not None:
+            freshnesslifetime = min(freshnesslifetime, reqmaxage)
+
         if currentage < freshnesslifetime:
             return True

@@ -144,15 +149,19 @@ def _set_conditional_validators(self, request, cachedresponse):
         if 'ETag' in cachedresponse.headers:
             request.headers['If-None-Match'] = cachedresponse.headers['ETag']

+    def _get_max_age(self, cc):
+        try:
+            return max(0, int(cc['max-age']))
+        except (KeyError, ValueError):
+            return None
+
     def _compute_freshness_lifetime(self, response, request, now):
         # Reference nsHttpResponseHead::ComputeFreshnessLifetime
         # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410
         cc = self._parse_cachecontrol(response)
-        if 'max-age' in cc:
-            try:
-                return max(0, int(cc['max-age']))
-            except ValueError:
-                pass
+        maxage = self._get_max_age(cc)
+        if maxage is not None:
+            return maxage

         # Parse date header or synthesize it if none exists
         date = rfc1123_to_epoch(response.headers.get('Date')) or now

From bb3ebf13f97a06e8fce0e3dd6a734ab2f4a91fbd Mon Sep 17 00:00:00 2001
From: Marven Sanchez
Date: Mon, 1 Jun 2015 18:20:12 +0800
Subject: [PATCH 6/7] Add tests for RFC2616 policy enhancements

Add `scrapy/downloadermiddlewares/httpcache.py` to `tests/py3-ignores.txt`.

---
 tests/py3-ignores.txt | 1 +
 tests/test_downloadermiddleware_httpcache.py | 85 ++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
index 3d87bcb9aee..141c44909a1 100644
--- a/tests/py3-ignores.txt
+++ b/tests/py3-ignores.txt
@@ -97,6 +97,7 @@ scrapy/contrib/downloadermiddleware/retry.py
 scrapy/contrib/downloadermiddleware/httpproxy.py
 scrapy/contrib/downloadermiddleware/cookies.py
 scrapy/downloadermiddlewares/retry.py
+scrapy/downloadermiddlewares/httpcache.py
 scrapy/downloadermiddlewares/httpproxy.py
 scrapy/downloadermiddlewares/cookies.py
 scrapy/contrib/statsmailer.py

diff --git a/tests/test_downloadermiddleware_httpcache.py b/tests/test_downloadermiddleware_httpcache.py
index ac954cc1523..6c95e7b3a73 100644
--- a/tests/test_downloadermiddleware_httpcache.py
+++ b/tests/test_downloadermiddleware_httpcache.py
@@ -343,6 +343,25 @@ def test_response_cacheability(self):
                     self.assertFalse(resc)
                     assert 'cached' not in res2.flags

+        # cache unconditionally unless response contains no-store or is a 304
+        with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
+            for idx, (_, status, headers) in enumerate(responses):
+                shouldcache = 'no-store' not in headers.get('Cache-Control', '') and status != 304
+                req0 = Request('http://example2-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                res1 = self._process_requestresponse(mw, req0, res0)
+                res304 = res0.replace(status=304)
+                res2 = self._process_requestresponse(mw, req0, res304 if shouldcache else res0)
+                self.assertEqualResponse(res1, res0)
+                self.assertEqualResponse(res2, res0)
+                resc = mw.storage.retrieve_response(self.spider, req0)
+                if shouldcache:
self.assertEqualResponse(resc, res1) + assert 'cached' in res2.flags and res2.status != 304 + else: + self.assertFalse(resc) + assert 'cached' not in res2.flags + def test_cached_and_fresh(self): sampledata = [ (200, {'Date': self.yesterday, 'Expires': self.tomorrow}), @@ -381,6 +400,13 @@ def test_cached_and_fresh(self): res2 = self._process_requestresponse(mw, req0, None) self.assertEqualResponse(res1, res2) assert 'cached' in res2.flags + # validate cached response if request max-age set as 0 + req1 = req0.replace(headers={'Cache-Control': 'max-age=0'}) + res304 = res0.replace(status=304) + assert mw.process_request(req1, self.spider) is None + res3 = self._process_requestresponse(mw, req1, res304) + self.assertEqualResponse(res1, res3) + assert 'cached' in res3.flags def test_cached_and_stale(self): sampledata = [ @@ -395,6 +421,9 @@ def test_cached_and_stale(self): (200, {'Cache-Control': 'no-cache'}), (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}), (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}), + (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}), ] with self._middleware() as mw: for idx, (status, headers) in enumerate(sampledata): @@ -410,6 +439,7 @@ def test_cached_and_stale(self): res2 = self._process_requestresponse(mw, req0, res0b) self.assertEqualResponse(res2, res0b) assert 'cached' not in res2.flags + cc = headers.get('Cache-Control', '') # Previous response expired too, subsequent request to same # resource must revalidate and succeed on 304 if validators # are present @@ -418,7 +448,62 @@ def test_cached_and_stale(self): res3 = self._process_requestresponse(mw, req0, res0c) self.assertEqualResponse(res3, res0b) assert 'cached' in res3.flags + # get cached response on server errors unless must-revalidate + # in cached response + res0d = res0b.replace(status=500) + res4 = self._process_requestresponse(mw, req0, res0d) + if 'must-revalidate' in cc: + assert 'cached' not in res4.flags + self.assertEqualResponse(res4, res0d) + else: + assert 'cached' in res4.flags + self.assertEqualResponse(res4, res0b) + # Requests with max-stale can fetch expired cached responses + # unless cached response has must-revalidate + req1 = req0.replace(headers={'Cache-Control': 'max-stale'}) + res5 = self._process_requestresponse(mw, req1, res0b) + self.assertEqualResponse(res5, res0b) + if 'no-cache' in cc or 'must-revalidate' in cc: + assert 'cached' not in res5.flags + else: + assert 'cached' in res5.flags + def test_process_exception(self): + with self._middleware() as mw: + res0 = Response(self.request.url, headers={'Expires': self.yesterday}) + req0 = Request(self.request.url) + self._process_requestresponse(mw, req0, res0) + for e in mw.DOWNLOAD_EXCEPTIONS: + # Simulate encountering an error on download attempts + assert mw.process_request(req0, self.spider) is None + res1 = mw.process_exception(req0, e('foo'), self.spider) + # Use cached response as recovery + assert 'cached' in res1.flags + self.assertEqualResponse(res0, res1) + # Do not use cached response for unhandled exceptions + mw.process_request(req0, self.spider) + assert mw.process_exception(req0, Exception('foo'), self.spider) is None + + def test_ignore_response_cache_controls(self): + sampledata = [ + (200, {'Date': self.yesterday, 'Expires': self.tomorrow}), + (200, {'Date': 
self.yesterday, 'Cache-Control': 'no-store,max-age=86405'}),
+            (200, {'Age': '299', 'Cache-Control': 'max-age=300,no-cache'}),
+            (300, {'Cache-Control': 'no-cache'}),
+            (200, {'Expires': self.tomorrow, 'Cache-Control': 'no-store'}),
+        ]
+        with self._middleware(HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS=['no-cache', 'no-store']) as mw:
+            for idx, (status, headers) in enumerate(sampledata):
+                req0 = Request('http://example-%d.com' % idx)
+                res0 = Response(req0.url, status=status, headers=headers)
+                # cache fresh response
+                res1 = self._process_requestresponse(mw, req0, res0)
+                self.assertEqualResponse(res1, res0)
+                assert 'cached' not in res1.flags
+                # return fresh cached response without network interaction
+                res2 = self._process_requestresponse(mw, req0, None)
+                self.assertEqualResponse(res1, res2)
+                assert 'cached' in res2.flags

 if __name__ == '__main__':
     unittest.main()

From 8771d1f79bccd8163e08185e591fd702e9f0b715 Mon Sep 17 00:00:00 2001
From: Marven Sanchez
Date: Mon, 1 Jun 2015 18:20:59 +0800
Subject: [PATCH 7/7] Update HTTPCache middleware docs

---
 docs/topics/downloader-middleware.rst | 51 +++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 5cb6c98240b..38533c47a20 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -373,6 +373,18 @@ what is implemented:
 * Revalidate stale responses based on `Last-Modified` response header
 * Revalidate stale responses based on `ETag` response header
 * Set `Date` header for any received response missing it
+* Support `max-stale` cache-control directive in requests
+
+  This allows spiders to be configured with the full RFC2616 cache policy
+  while avoiding revalidation on a request-by-request basis, remaining
+  conformant with the HTTP spec.
+
+  Example:
+
+  Add `Cache-Control: max-stale=600` to Request headers to accept responses that
+  have exceeded their expiration time by no more than 600 seconds.
+
+  See also: RFC2616, 14.9.3

 what is missing:

@@ -575,6 +587,45 @@ Default: ``False``
 If enabled, will compress all cached data with gzip.
 This setting is specific to the Filesystem backend.

+.. setting:: HTTPCACHE_ALWAYS_STORE
+
+HTTPCACHE_ALWAYS_STORE
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``False``
+
+If enabled, will cache pages unconditionally.
+
+A spider may wish to have all responses available in the cache, for
+future use with `Cache-Control: max-stale`, for instance. The
+DummyPolicy caches all responses but never revalidates them, and
+sometimes a more nuanced policy is desirable.
+
+This setting still respects `Cache-Control: no-store` directives in responses.
+If you don't want that, filter `no-store` out of the Cache-Control headers in
+responses you feed to the cache middleware.
+
+.. setting:: HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS
+
+HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.25
+
+Default: ``[]``
+
+List of Cache-Control directives in responses to be ignored.
+
+Sites often set "no-store", "no-cache", "must-revalidate", etc., but get
+upset at the traffic a spider can generate if it respects those
+directives. This setting lets you selectively ignore Cache-Control
+directives that are known to be unimportant for the sites being crawled.
+
+We assume that the spider will not issue Cache-Control directives
+in requests unless it actually needs them, so directives in requests are
+not filtered.
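Putting the series together, a sketch of settings for an aggressively caching spider; the values are illustrative, not recommendations:

    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    # Store every allowed response, so later requests carrying
    # `Cache-Control: max-stale` can be served from the cache:
    HTTPCACHE_ALWAYS_STORE = True
    # Directives known to be noise on the sites being crawled:
    HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache']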
HttpCompressionMiddleware -------------------------