From 0ea5bea44bcbc4c1457415575086059608c465eb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Feb 2019 15:44:51 -0800 Subject: [PATCH 1/3] brotli: if the brotli module can not be loaded, print warning and also remove 'br' from any Accept-Encoding to avoid recording with brotli, addresses #434 --- pywb/rewrite/rewriteinputreq.py | 12 ++++++++++++ tests/test_record_replay.py | 25 ++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 21e7f94eb..322664436 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -5,6 +5,13 @@ from six.moves.urllib.parse import urlsplit import re +try: # pragma: no cover + import brotli + has_brotli = True +except: # pragma: no cover + has_brotli = False + print('Warning: brotli module could not be loaded, will not be able to replay brotli-encoded content') + #============================================================================= class RewriteInputRequest(DirectWSGIInputRequest): @@ -79,6 +86,11 @@ def get_req_headers(self): if self.splits: value = self.splits.scheme + elif not has_brotli and name == 'HTTP_ACCEPT_ENCODING' and 'br' in value: + # if brotli not available, remove brotli encoded content + name = 'Accept-Encoding' + value = ','.join([enc for enc in value.split(',') if enc.strip() != 'br']) + elif name.startswith('HTTP_'): name = name[5:].title().replace('_', '-') diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index d6a3f5838..314cd2360 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -3,10 +3,15 @@ from pywb.manager.autoindex import AutoIndexer from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH +from warcio import ArchiveIterator + import os import time import json +from mock import patch +import pytest + # 
============================================================================ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): @@ -153,6 +158,20 @@ def test_init_and_rec(self): assert names[0].startswith('pywb-rec-test-') assert names[0].endswith('.warcgz') + TestRecordCustomConfig.warc_name = os.path.join(dir_name, names[0]) + + @patch('pywb.rewrite.rewriteinputreq.has_brotli', False) + def test_no_brotli(self): + res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D', + headers={'Accept-Encoding': 'gzip, deflate, br'}) + assert '"C": "D"' in res.text + + with open(self.warc_name, 'rb') as fh: + for record in ArchiveIterator(fh): + last_record = record + + assert record.http_headers['Accept-Encoding'] == 'gzip, deflate' + # ============================================================================ class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): @@ -174,17 +193,17 @@ def setup_class(cls): } super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom) manager(['init', 'test-new']) - + def test_skip_existing(self): dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') assert os.path.isdir(dir_name) res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1') assert res.text != '' - + res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1') assert 'Example Domain' in res.text assert os.listdir(dir_name) == [] - + def test_record_new(self): dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') assert os.path.isdir(dir_name) From 952d6d12da243d8296f6bacc58c982a1d41c7799 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Feb 2019 15:55:14 -0800 Subject: [PATCH 2/3] fix comment --- pywb/rewrite/rewriteinputreq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 322664436..6b5ec8b70 100644 --- 
a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -87,7 +87,8 @@ def get_req_headers(self): value = self.splits.scheme elif not has_brotli and name == 'HTTP_ACCEPT_ENCODING' and 'br' in value: - # if brotli not available, remove brotli encoded content + # if brotli not available, remove 'br' from accept-encoding to avoid + # capturing brotli-encoded content name = 'Accept-Encoding' value = ','.join([enc for enc in value.split(',') if enc.strip() != 'br']) From f2df7ede649c0fb27ab142a76a2bebd38e28379c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Feb 2019 17:17:15 -0800 Subject: [PATCH 3/3] style: add Exception --- pywb/rewrite/rewriteinputreq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 6b5ec8b70..11d12d92f 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -5,10 +5,11 @@ from six.moves.urllib.parse import urlsplit import re + try: # pragma: no cover import brotli has_brotli = True -except: # pragma: no cover +except Exception: # pragma: no cover has_brotli = False print('Warning: brotli module could not be loaded, will not be able to replay brotli-encoded content')