Skip to content

Commit

Permalink
lib/url.py: Strengthen URL escaping.
Browse files Browse the repository at this point in the history
Use the same implementation that Zulip uses to sanitize stream names
and URLs into safe URLs.

Closes #35
  • Loading branch information
refeed committed Jul 14, 2022
1 parent 6d188a8 commit 0d03636
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 30 deletions.
6 changes: 3 additions & 3 deletions lib/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)

from .url import (
Expand Down Expand Up @@ -87,7 +87,7 @@ def format_message_html(
site_url,
html_root,
sanitize_stream(stream_name, stream_id),
sanitize_topic(topic_name),
sanitize(topic_name),
msg_id,
)
anchor_html = '<a name="{0}"></a>'.format(html.escape(msg_id))
Expand Down Expand Up @@ -184,7 +184,7 @@ def topic_list_html(topic_data):
"""

def item_html(topic_name, message_data):
link_html = f'<a href="topic/{html.escape(sanitize_topic(topic_name))}.html">{html.escape(topic_name)}</a>'
link_html = f'<a href="topic/{html.escape(sanitize(topic_name))}.html">{html.escape(topic_name)}</a>'
topic_info = topic_info_string(message_data)
return f"<li> {link_html} ({html.escape(topic_info)}) </li>"

Expand Down
6 changes: 3 additions & 3 deletions lib/populate.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
)
from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)


Expand Down Expand Up @@ -222,7 +222,7 @@ def populate_incremental(
p = (
json_root
/ Path(sanitize_stream(s["name"], s["stream_id"]))
/ Path(sanitize_topic(topic_name) + ".json")
/ Path(sanitize(topic_name) + ".json")
)
topic_exists = p.exists()
old = []
Expand Down Expand Up @@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data):
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
stream_dir = json_root / Path(sanitized_stream_name)

sanitized_topic_name = sanitize_topic(topic_name)
sanitized_topic_name = sanitize(topic_name)
topic_fn = sanitized_topic_name + ".json"

out = open_outfile(stream_dir, topic_fn, "w")
Expand Down
34 changes: 15 additions & 19 deletions lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,29 +66,25 @@ def archive_message_url(

## String cleaning functions

# remove non-alnum ascii symbols from string
def sanitize(s):
return (
"".join(
filter(
lambda x: x.isalnum or x == " ",
s.encode("ascii", "ignore").decode("utf-8"),
)
)
.replace(" ", "-")
.replace("?", "%3F")
)

def sanitize(s):
"""
Sanitize the string to a safe string that can be used in URLs
# create a unique sanitized identifier for a topic
def sanitize_topic(topic_name):
return (
urllib.parse.quote(topic_name, safe="~()*!.'")
.replace(".", "%2E")
.replace("%", ".")
)
This is copied from Zulip's core code:
https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L8
"""
return urllib.parse.quote(s, safe=b"").replace(".", "%2E").replace("%", ".")


# create a unique sanitized identifier for a stream
def sanitize_stream(stream_name, stream_id):
    """
    Build the URL form of a stream, which looks something like 99-Foo-bar.

    Spaces in the stream name are turned into hyphens, the result is run
    through ``sanitize``, and the stream id is prepended with a hyphen
    separator.

    This is copied from Zulip's core code:
    https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L15
    """
    id_prefix = str(stream_id)
    hyphenated_name = stream_name.replace(" ", "-")
    return "-".join((id_prefix, sanitize(hyphenated_name)))
4 changes: 2 additions & 2 deletions lib/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)

from .files import (
Expand Down Expand Up @@ -219,7 +219,7 @@ def write_topic_messages(
stream_id = stream["id"]

sanitized_stream_name = sanitize_stream(stream_name, stream_id)
sanitized_topic_name = sanitize_topic(topic_name)
sanitized_topic_name = sanitize(topic_name)

messages = read_zulip_messages_for_topic(
json_root, sanitized_stream_name, sanitized_topic_name
Expand Down
21 changes: 18 additions & 3 deletions tests/testCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,26 @@ def assert_equal(v1, v2):


def test_sanitize():
assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar")
assert_equal(
url.sanitize_stream(stream_name="foo bar", stream_id=7),
"7-foo-bar",
)
assert_equal(
url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7),
"7-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D",
)

assert_equal(
url.sanitize_topic(topic_name="pick a place for lunch"),
"pick.20a.20place.20for.20lunch",
url.sanitize("pick a place for lunch *"),
"pick.20a.20place.20for.20lunch.20.2A",
)
assert_equal(
url.sanitize("!!cute-turlte/tortoise (🐢)?"),
".21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
)
assert_equal(
url.sanitize('"the mighty turtle 🐢"'),
".22the.20mighty.20turtle.20.F0.9F.90.A2.22",
)


Expand Down

0 comments on commit 0d03636

Please sign in to comment.