Merge pull request scrapy#1151 from marven/cache-control
[MRG+1] RFC2616 policy enhancements + tests
curita committed Jul 11, 2015
2 parents 6ac971e + 8771d1f commit d706310
Showing 6 changed files with 207 additions and 7 deletions.
51 changes: 51 additions & 0 deletions docs/topics/downloader-middleware.rst
@@ -373,6 +373,18 @@ what is implemented:
* Revalidate stale responses based on `Last-Modified` response header
* Revalidate stale responses based on `ETag` response header
* Set `Date` header for any received response missing it
* Support `max-stale` cache-control directive in requests

This allows spiders to be configured with the full RFC2616 cache policy
while avoiding revalidation on a request-by-request basis, yet remaining
conformant with the HTTP spec.

Example:

Add `Cache-Control: max-stale=600` to Request headers to accept responses that
have exceeded their expiration time by no more than 600 seconds.
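
For instance, a request built with this header might look as follows (the
URL is illustrative)::

    from scrapy import Request

    req = Request('http://www.example.com/page.html',
                  headers={'Cache-Control': 'max-stale=600'})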

See also: RFC2616, 14.9.3

what is missing:

@@ -575,6 +587,45 @@ Default: ``False``
If enabled, will compress all cached data with gzip.
This setting is specific to the Filesystem backend.

.. setting:: HTTPCACHE_ALWAYS_STORE

HTTPCACHE_ALWAYS_STORE
^^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.25

Default: ``False``

If enabled, will cache pages unconditionally.

A spider may wish to have all responses available in the cache, for
future use with `Cache-Control: max-stale`, for instance. The
DummyPolicy caches all responses but never revalidates them, and
sometimes a more nuanced policy is desirable.

This setting still respects `Cache-Control: no-store` directives in responses.
If you don't want that, filter `no-store` out of the Cache-Control headers in
responses you feed to the cache middleware.
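
For instance, a minimal settings sketch that combines this option with the
RFC2616 policy (module paths as used elsewhere in this change)::

    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    HTTPCACHE_ALWAYS_STORE = True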

.. setting:: HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS

HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.25

Default: ``[]``

List of Cache-Control directives in responses to be ignored.

Sites often set "no-store", "no-cache", "must-revalidate", etc., but get
upset at the traffic a spider can generate if it respects those
directives. This setting lets you selectively ignore Cache-Control directives
that are known to be unimportant for the sites being crawled.

We assume that the spider will not issue Cache-Control directives
in requests unless it actually needs them, so directives in requests are
not filtered.
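
For example, to ignore both of the directives exercised by the tests below,
a project's settings might include::

    HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache', 'no-store']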

HttpCompressionMiddleware
-------------------------
16 changes: 16 additions & 0 deletions scrapy/downloadermiddlewares/httpcache.py
@@ -1,11 +1,21 @@
from email.utils import formatdate
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
    ConnectionRefusedError, ConnectionDone, ConnectError, \
    ConnectionLost, TCPTimedOutError
from scrapy import signals
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.utils.misc import load_object
from scrapy.xlib.tx import ResponseFailed


class HttpCacheMiddleware(object):

    DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError)

    def __init__(self, settings, stats):
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
@@ -84,6 +94,12 @@ def process_response(self, request, response, spider):
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(self, request, exception, spider):
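        # Recover from known download errors by serving the cached copy
        # that was stashed in request.meta earlier in the request cycle;
        # each recovery is counted in the httpcache/errorrecovery stat.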
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
            self.stats.inc_value('httpcache/errorrecovery', spider=spider)
            return cachedresponse

    def _cache_response(self, spider, response, request, cachedresponse):
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
59 changes: 52 additions & 7 deletions scrapy/extensions/httpcache.py
@@ -7,7 +7,7 @@
from weakref import WeakKeyDictionary
from email.utils import mktime_tz, parsedate_tz
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
from scrapy.http import Headers
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.project import data_path
@@ -38,13 +38,19 @@ class RFC2616Policy(object):
    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_response_cache_controls = settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get('Cache-Control', '')
            self._cc_parsed[r] = parse_cachecontrol(cch)
            parsed = parse_cachecontrol(cch)
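            # Drop the configured-to-be-ignored directives, but only from
            # responses; directives in requests are never filtered.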
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
@@ -68,6 +74,9 @@ def should_cache_response(self, response, request):
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif 'max-age' in cc or 'Expires' in response.headers:
            return True
@@ -92,13 +101,45 @@ def is_cached_response_fresh(self, cachedresponse, request):
        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if 'max-stale' in ccreq and 'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq['max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass
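            # Illustrative arithmetic: with freshnesslifetime=300 and
            # currentage=700, a request sent with 'Cache-Control:
            # max-stale=600' still gets the cached copy, since 700 < 300 + 600.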

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if 'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
@@ -108,15 +149,19 @@ def _set_conditional_validators(self, request, cachedresponse):
        if 'ETag' in cachedresponse.headers:
            request.headers['If-None-Match'] = cachedresponse.headers['ETag']

    def _get_max_age(self, cc):
        try:
            return max(0, int(cc['max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410
        cc = self._parse_cachecontrol(response)
        if 'max-age' in cc:
            try:
                return max(0, int(cc['max-age']))
            except ValueError:
                pass
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get('Date')) or now
2 changes: 2 additions & 0 deletions scrapy/settings/default_settings.py
@@ -159,8 +159,10 @@
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_ALWAYS_STORE = False
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
HTTPCACHE_DBM_MODULE = 'anydbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False
1 change: 1 addition & 0 deletions tests/py3-ignores.txt
@@ -89,6 +89,7 @@ scrapy/linkextractors/sgml.py
scrapy/linkextractors/regex.py
scrapy/linkextractors/htmlparser.py
scrapy/downloadermiddlewares/retry.py
scrapy/downloadermiddlewares/httpcache.py
scrapy/downloadermiddlewares/httpproxy.py
scrapy/downloadermiddlewares/cookies.py
scrapy/extensions/statsmailer.py
85 changes: 85 additions & 0 deletions tests/test_downloadermiddleware_httpcache.py
@@ -343,6 +343,25 @@ def test_response_cacheability(self):
                    self.assertFalse(resc)
                    assert 'cached' not in res2.flags

        # cache unconditionally unless response contains no-store or is a 304
        with self._middleware(HTTPCACHE_ALWAYS_STORE=True) as mw:
            for idx, (_, status, headers) in enumerate(responses):
                shouldcache = 'no-store' not in headers.get('Cache-Control', '') and status != 304
                req0 = Request('http://example2-%d.com' % idx)
                res0 = Response(req0.url, status=status, headers=headers)
                res1 = self._process_requestresponse(mw, req0, res0)
                res304 = res0.replace(status=304)
                res2 = self._process_requestresponse(mw, req0, res304 if shouldcache else res0)
                self.assertEqualResponse(res1, res0)
                self.assertEqualResponse(res2, res0)
                resc = mw.storage.retrieve_response(self.spider, req0)
                if shouldcache:
                    self.assertEqualResponse(resc, res1)
                    assert 'cached' in res2.flags and res2.status != 304
                else:
                    self.assertFalse(resc)
                    assert 'cached' not in res2.flags

    def test_cached_and_fresh(self):
        sampledata = [
            (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
@@ -381,6 +400,13 @@ def test_cached_and_fresh(self):
                res2 = self._process_requestresponse(mw, req0, None)
                self.assertEqualResponse(res1, res2)
                assert 'cached' in res2.flags
                # validate the cached response if the request sets max-age=0
                req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
                res304 = res0.replace(status=304)
                assert mw.process_request(req1, self.spider) is None
                res3 = self._process_requestresponse(mw, req1, res304)
                self.assertEqualResponse(res1, res3)
                assert 'cached' in res3.flags

    def test_cached_and_stale(self):
        sampledata = [
@@ -395,6 +421,9 @@ def test_cached_and_stale(self):
            (200, {'Cache-Control': 'no-cache'}),
            (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
            (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
            (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}),
            (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
            (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
        ]
        with self._middleware() as mw:
            for idx, (status, headers) in enumerate(sampledata):
@@ -410,6 +439,7 @@ def test_cached_and_stale(self):
                res2 = self._process_requestresponse(mw, req0, res0b)
                self.assertEqualResponse(res2, res0b)
                assert 'cached' not in res2.flags
                cc = headers.get('Cache-Control', '')
                # Previous response expired too, subsequent request to same
                # resource must revalidate and succeed on 304 if validators
                # are present
@@ -418,7 +448,62 @@ def test_cached_and_stale(self):
                res3 = self._process_requestresponse(mw, req0, res0c)
                self.assertEqualResponse(res3, res0b)
                assert 'cached' in res3.flags
                # get cached response on server errors unless must-revalidate
                # in cached response
                res0d = res0b.replace(status=500)
                res4 = self._process_requestresponse(mw, req0, res0d)
                if 'must-revalidate' in cc:
                    assert 'cached' not in res4.flags
                    self.assertEqualResponse(res4, res0d)
                else:
                    assert 'cached' in res4.flags
                    self.assertEqualResponse(res4, res0b)
                # Requests with max-stale can fetch expired cached responses
                # unless cached response has must-revalidate
                req1 = req0.replace(headers={'Cache-Control': 'max-stale'})
                res5 = self._process_requestresponse(mw, req1, res0b)
                self.assertEqualResponse(res5, res0b)
                if 'no-cache' in cc or 'must-revalidate' in cc:
                    assert 'cached' not in res5.flags
                else:
                    assert 'cached' in res5.flags

    def test_process_exception(self):
        with self._middleware() as mw:
            res0 = Response(self.request.url, headers={'Expires': self.yesterday})
            req0 = Request(self.request.url)
            self._process_requestresponse(mw, req0, res0)
            for e in mw.DOWNLOAD_EXCEPTIONS:
                # Simulate encountering an error on download attempts
                assert mw.process_request(req0, self.spider) is None
                res1 = mw.process_exception(req0, e('foo'), self.spider)
                # Use cached response as recovery
                assert 'cached' in res1.flags
                self.assertEqualResponse(res0, res1)
            # Do not use cached response for unhandled exceptions
            mw.process_request(req0, self.spider)
            assert mw.process_exception(req0, Exception('foo'), self.spider) is None

    def test_ignore_response_cache_controls(self):
        sampledata = [
            (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
            (200, {'Date': self.yesterday, 'Cache-Control': 'no-store,max-age=86405'}),
            (200, {'Age': '299', 'Cache-Control': 'max-age=300,no-cache'}),
            (300, {'Cache-Control': 'no-cache'}),
            (200, {'Expires': self.tomorrow, 'Cache-Control': 'no-store'}),
        ]
        with self._middleware(HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS=['no-cache', 'no-store']) as mw:
            for idx, (status, headers) in enumerate(sampledata):
                req0 = Request('http://example-%d.com' % idx)
                res0 = Response(req0.url, status=status, headers=headers)
                # cache fresh response
                res1 = self._process_requestresponse(mw, req0, res0)
                self.assertEqualResponse(res1, res0)
                assert 'cached' not in res1.flags
                # return fresh cached response without network interaction
                res2 = self._process_requestresponse(mw, req0, None)
                self.assertEqualResponse(res1, res2)
                assert 'cached' in res2.flags

if __name__ == '__main__':
    unittest.main()
