From 0ea5bea44bcbc4c1457415575086059608c465eb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Feb 2019 15:44:51 -0800 Subject: [PATCH] brotli: if the brotli module can not be loaded, print warning and also remove 'br' from any Accept-Encoding to avoid recording with brotli, addresses #434 --- pywb/rewrite/rewriteinputreq.py | 12 ++++++++++++ tests/test_record_replay.py | 25 ++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 21e7f94eb..322664436 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -5,6 +5,13 @@ from six.moves.urllib.parse import urlsplit import re +try: # pragma: no cover + import brotli + has_brotli = True +except: # pragma: no cover + has_brotli = False + print('Warning: brotli module could not be loaded, will not be able to replay brotli-encoded content') + #============================================================================= class RewriteInputRequest(DirectWSGIInputRequest): @@ -79,6 +86,11 @@ def get_req_headers(self): if self.splits: value = self.splits.scheme + elif not has_brotli and name == 'HTTP_ACCEPT_ENCODING' and 'br' in value: + # if brotli not available, remove brotli encoded content + name = 'Accept-Encoding' + value = ','.join([enc for enc in value.split(',') if enc.strip() != 'br']) + elif name.startswith('HTTP_'): name = name[5:].title().replace('_', '-') diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index d6a3f5838..314cd2360 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -3,10 +3,15 @@ from pywb.manager.autoindex import AutoIndexer from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH +from warcio import ArchiveIterator + import os import time import json +from mock import patch +import pytest + # ============================================================================ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): @@ -153,6 +158,20 @@ def test_init_and_rec(self): assert names[0].startswith('pywb-rec-test-') assert names[0].endswith('.warcgz') + TestRecordCustomConfig.warc_name = os.path.join(dir_name, names[0]) + + @patch('pywb.rewrite.rewriteinputreq.has_brotli', False) + def test_no_brotli(self): + res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D', + headers={'Accept-Encoding': 'gzip, deflate, br'}) + assert '"C": "D"' in res.text + + with open(self.warc_name, 'rb') as fh: + for record in ArchiveIterator(fh): + last_record = record + + assert record.http_headers['Accept-Encoding'] == 'gzip, deflate' + # ============================================================================ class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): @@ -174,17 +193,17 @@ def setup_class(cls): } super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom) manager(['init', 'test-new']) - + def test_skip_existing(self): dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') assert os.path.isdir(dir_name) res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1') assert res.text != '' - + res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1') assert 'Example Domain' in res.text assert os.listdir(dir_name) == [] - + def test_record_new(self): dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') assert os.path.isdir(dir_name)