Skip to content

Commit

Permalink
Brotli: Don't accept brotli if library can't be loaded. (#444)
Browse files Browse the repository at this point in the history
* brotli: if the brotli module cannot be loaded, print a warning
and also remove `br` from any Accept-Encoding header to avoid recording brotli-encoded content; addresses #434
  • Loading branch information
ikreymer authored Feb 20, 2019
1 parent 000ed89 commit 32c1e6c
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 3 deletions.
14 changes: 14 additions & 0 deletions pywb/rewrite/rewriteinputreq.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
import re


# Module-level flag recording whether the optional ``brotli`` module could be
# imported. It starts out True and is flipped only when the import fails.
has_brotli = True
try:  # pragma: no cover
    import brotli
except Exception:  # pragma: no cover
    # Any exception raised during the import (not only ImportError) marks
    # brotli as unavailable and emits the warning below.
    has_brotli = False
    print('Warning: brotli module could not be loaded, will not be able to replay brotli-encoded content')


#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
Expand Down Expand Up @@ -79,6 +87,12 @@ def get_req_headers(self):
if self.splits:
value = self.splits.scheme

elif not has_brotli and name == 'HTTP_ACCEPT_ENCODING' and 'br' in value:
# if brotli is not available, remove 'br' from Accept-Encoding to avoid
# capturing brotli-encoded content
name = 'Accept-Encoding'
value = ','.join([enc for enc in value.split(',') if enc.strip() != 'br'])

elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')

Expand Down
25 changes: 22 additions & 3 deletions tests/test_record_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
from pywb.manager.autoindex import AutoIndexer
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH

from warcio import ArchiveIterator

import os
import time
import json

from mock import patch
import pytest


# ============================================================================
class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
Expand Down Expand Up @@ -153,6 +158,20 @@ def test_init_and_rec(self):
assert names[0].startswith('pywb-rec-test-')
assert names[0].endswith('.warcgz')

TestRecordCustomConfig.warc_name = os.path.join(dir_name, names[0])

@patch('pywb.rewrite.rewriteinputreq.has_brotli', False)
def test_no_brotli(self):
    """Record a request advertising ``br`` while brotli is patched as unavailable.

    With ``has_brotli`` patched to False, the last record written to the
    WARC file must carry an Accept-Encoding header without ``br``.
    """
    res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?C=D',
                           headers={'Accept-Encoding': 'gzip, deflate, br'})
    assert '"C": "D"' in res.text

    # Walk the whole WARC, keeping only the final record. Start from None so
    # an empty file produces a clear assertion failure instead of a NameError.
    last_record = None
    with open(self.warc_name, 'rb') as fh:
        for record in ArchiveIterator(fh):
            last_record = record

    assert last_record is not None, 'no records found in ' + self.warc_name
    assert last_record.http_headers['Accept-Encoding'] == 'gzip, deflate'


# ============================================================================
class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
Expand All @@ -174,17 +193,17 @@ def setup_class(cls):
}
super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom)
manager(['init', 'test-new'])

def test_skip_existing(self):
    """Request a URL the fallback CDX already lists and check nothing is archived.

    The fallback CDX query must return a non-empty body, the record-mode
    request must serve the page, and the collection's archive directory
    must still be empty afterwards.
    """
    archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
    assert os.path.isdir(archive_dir)

    cdx_res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1')
    assert cdx_res.text != ''

    page_res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1')
    assert 'Example Domain' in page_res.text

    # No WARC file should have been written for this request.
    assert os.listdir(archive_dir) == []

def test_record_new(self):
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
assert os.path.isdir(dir_name)
Expand Down

0 comments on commit 32c1e6c

Please sign in to comment.