Skip to content

Commit

Permalink
fix(security): Harden lxml parsing of remote feeds
Browse files Browse the repository at this point in the history
Bandit only pointed it out and [wasn't particularly
helpful](PyCQA/bandit#767 (comment)).
  • Loading branch information
rpatterson committed Feb 22, 2023
1 parent 0787d5c commit ef2b366
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 17 deletions.
10 changes: 7 additions & 3 deletions src/feedarchiver/feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pathlib
import logging

from lxml import etree
from lxml import etree # nosec B410
from requests_toolbelt.downloadutils import stream

from . import utils
Expand Down Expand Up @@ -333,9 +333,10 @@ def load_remote_tree(self, remote_response):
Also do any pre-processing needed to start updating the archive.
"""
logger.debug("Parsing remote XML: %r", self.url)
remote_root = etree.fromstring(
remote_root = etree.fromstring( # nosec: B320
remote_response.content,
base_url=remote_response.url,
parser=utils.XML_PARSER,
)
return etree.ElementTree(remote_root)

Expand Down Expand Up @@ -379,7 +380,10 @@ def load_archive_tree(self):
# there are errors parsing it, then treat it as if it's the first time
# archiving this feed.
try:
archive_tree = etree.parse(feed_archive_opened)
archive_tree = etree.parse( # nosec B320
feed_archive_opened,
parser=utils.XML_PARSER,
)
except SyntaxError:
logger.exception(
"Unhandled exception parsing archive feed: %r",
Expand Down
2 changes: 1 addition & 1 deletion src/feedarchiver/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import typing
import logging

from lxml import etree
from lxml import etree # nosec: B410

logger = logging.getLogger(__name__)

Expand Down
4 changes: 2 additions & 2 deletions src/feedarchiver/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import email.utils
import unittest

from lxml import etree
from lxml import etree # nosec: B410

import requests_mock

Expand Down Expand Up @@ -216,7 +216,7 @@ def get_feed_items(feed_path):
Map item ID to item element for all the items in the given feed XML.
"""
with feed_path.open() as feed_opened:
tree = etree.parse(feed_opened)
tree = etree.parse(feed_opened, parser=utils.XML_PARSER) # nosec: B320
channel = tree.getroot().find("channel")
return {
item_elem.find("guid").text: item_elem for item_elem in channel.iter("item")
Expand Down
8 changes: 6 additions & 2 deletions src/feedarchiver/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import pathlib
import urllib

from lxml import etree
from lxml import etree # nosec: B410
import requests_mock

from .. import utils
from .. import tests


Expand Down Expand Up @@ -171,7 +172,10 @@ def test_downloads(self): # pylint: disable=too-many-locals
)

# Assert archive URLs updated
archive_tree = etree.parse(self.archive_feed.path.open())
archive_tree = etree.parse( # nosec: B320
self.archive_feed.path.open(),
parser=utils.XML_PARSER,
)
feed_link_split = urllib.parse.urlsplit(
archive_tree.find("channel").find("link").text,
)
Expand Down
43 changes: 34 additions & 9 deletions src/feedarchiver/tests/test_feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import datetime
import logging

from lxml import etree
from lxml import etree # nosec: B410

from .. import utils
from .. import formats
from .. import feed
from .. import tests
Expand Down Expand Up @@ -69,13 +70,19 @@ def test_feed_configs_requested(self):
"Archive of feed XML does not exist after updating",
)
with feed_path.open() as remote_opened:
remote_tree = etree.parse(remote_opened)
remote_tree = etree.parse( # nosec: B320
remote_opened,
parser=utils.XML_PARSER,
)
remote_items = remote_tree.find("channel").iterchildren("item")
remote_item_ids = [
remote_item.find("guid").text for remote_item in remote_items
]
with self.feed_path.open() as archive_opened:
archive_tree = etree.parse(archive_opened)
archive_tree = etree.parse( # nosec: B320
archive_opened,
parser=utils.XML_PARSER,
)
archive_items = archive_tree.find("channel").iterchildren("item")
archive_item_ids = [
archive_item.find("guid").text for archive_item in archive_items
Expand Down Expand Up @@ -108,13 +115,19 @@ def test_feed_unchanged(self):
"Wrong number of original feed URL requests",
)
with feed_path.open() as remote_opened:
remote_tree = etree.parse(remote_opened)
remote_tree = etree.parse( # nosec: B320
remote_opened,
parser=utils.XML_PARSER,
)
remote_items = remote_tree.find("channel").iterchildren("item")
remote_item_ids = [
remote_item.find("guid").text for remote_item in remote_items
]
with self.feed_path.open() as archive_opened:
archive_tree = etree.parse(archive_opened)
archive_tree = etree.parse( # nosec: B320
archive_opened,
parser=utils.XML_PARSER,
)
archive_items = archive_tree.find("channel").iterchildren("item")
archive_item_ids = [
archive_item.find("guid").text for archive_item in archive_items
Expand Down Expand Up @@ -220,9 +233,15 @@ def test_feed_reordered_item(self):
)
reordered_item_feed_path, _ = reordered_item_request_mocks[self.feed_url]

remote_tree = etree.parse(reordered_item_feed_path.open())
remote_tree = etree.parse( # nosec: B320
reordered_item_feed_path.open(),
parser=utils.XML_PARSER,
)
remote_items = remote_tree.find("channel").findall("item")
archive_tree = etree.parse(self.feed_path.open())
archive_tree = etree.parse( # nosec: B320
self.feed_path.open(),
parser=utils.XML_PARSER,
)
archive_items = archive_tree.find("channel").findall("item")
self.assertEqual(
archive_items[0].find("guid").text,
Expand All @@ -249,7 +268,10 @@ def test_feed_empty(self):
archive_feed=self.archive_feed,
remote_mock="empty",
)
archive_tree = etree.parse(self.feed_path.open())
archive_tree = etree.parse( # nosec: B320
self.feed_path.open(),
parser=utils.XML_PARSER,
)
archive_items = archive_tree.find("channel").findall("item")
self.assertEqual(
archive_items,
Expand All @@ -269,7 +291,10 @@ def test_feed_formats(self):
feed_format_class=feed_format_class,
):
with (self.REMOTES_PATH / relative_path).open() as feed_opened:
feed_tree = etree.parse(feed_opened)
feed_tree = etree.parse( # nosec: B320
feed_opened,
parser=utils.XML_PARSER,
)
feed_root = feed_tree.getroot()
feed_format = feed_format_class(self.archive_feed)

Expand Down
7 changes: 7 additions & 0 deletions src/feedarchiver/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,15 @@
import logging
import tracemalloc

from lxml import etree # nosec B410

logger = logging.getLogger(__name__)

# Configure the XML parser as securely as possible since we're parsing XML from
# untrusted sources:
# https://lxml.de/FAQ.html#how-do-i-use-lxml-safely-as-a-web-service-endpoint
# - `resolve_entities=False` stops entity substitution, mitigating both
#   entity-expansion DoS ("billion laughs") and external-entity (XXE)
#   inclusion of local files or URLs.
# - `no_network=True` is the current lxml default, but it is pinned
#   explicitly so a future change to the library default cannot silently
#   re-enable network access while fetching external resources.
XML_PARSER = etree.XMLParser(resolve_entities=False, no_network=True)

TRUE_STRS = {"1", "true", "yes", "on"}
DEBUG = ( # noqa: F841
"DEBUG" in os.environ and os.environ["DEBUG"].strip().lower() in TRUE_STRS
Expand Down

0 comments on commit ef2b366

Please sign in to comment.