Skip to content

Commit

Permalink
lib/url.py: Strengthen URL escaping.
Browse files Browse the repository at this point in the history
Use an implementation similar to the one Zulip uses to sanitize its
stream and topic names into a safe URL.
The only difference is that the function now prepends a non-dot character
as a prefix, so that the result is safe to use as a filename, especially for
the Jekyll use case, where files whose names start with a dot are ignored.

Closes zulip#35
Closes zulip#51
  • Loading branch information
refeed committed Jul 6, 2022
1 parent 5da606f commit 8b57ace
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 27 deletions.
6 changes: 3 additions & 3 deletions lib/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)

from .url import (
Expand Down Expand Up @@ -87,7 +87,7 @@ def format_message_html(
site_url,
html_root,
sanitize_stream(stream_name, stream_id),
sanitize_topic(topic_name),
sanitize(topic_name),
msg_id,
)
anchor_html = '<a name="{0}"></a>'.format(html.escape(msg_id))
Expand Down Expand Up @@ -184,7 +184,7 @@ def topic_list_html(topic_data):
"""

def item_html(topic_name, message_data):
link_html = f'<a href="topic/{html.escape(sanitize_topic(topic_name))}.html">{html.escape(topic_name)}</a>'
link_html = f'<a href="topic/{html.escape(sanitize(topic_name))}.html">{html.escape(topic_name)}</a>'
topic_info = topic_info_string(message_data)
return f"<li> {link_html} ({html.escape(topic_info)}) </li>"

Expand Down
6 changes: 3 additions & 3 deletions lib/populate.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
)
from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)


Expand Down Expand Up @@ -222,7 +222,7 @@ def populate_incremental(
p = (
json_root
/ Path(sanitize_stream(s["name"], s["stream_id"]))
/ Path(sanitize_topic(topic_name) + ".json")
/ Path(sanitize(topic_name) + ".json")
)
topic_exists = p.exists()
old = []
Expand Down Expand Up @@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data):
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
stream_dir = json_root / Path(sanitized_stream_name)

sanitized_topic_name = sanitize_topic(topic_name)
sanitized_topic_name = sanitize(topic_name)
topic_fn = sanitized_topic_name + ".json"

out = open_outfile(stream_dir, topic_fn, "w")
Expand Down
23 changes: 7 additions & 16 deletions lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,15 @@ def archive_message_url(

## String cleaning functions

# remove non-alnum ascii symbols from string
def sanitize(s):
# Sanitize the string into a safe string that can be used in a URL.
# This copies Zulip's way of encoding stream and topic names into URLs.
return (
"".join(
filter(
lambda x: x.isalnum or x == " ",
s.encode("ascii", "ignore").decode("utf-8"),
)
)
.replace(" ", "-")
.replace("?", "%3F")
)


# create a unique sanitized identifier for a topic
def sanitize_topic(topic_name):
return (
urllib.parse.quote(topic_name, safe="~()*!.'")
# We add a non-dot prefix so that the resulting string is safe to use
# as a filename, especially for the Jekyll use case, where files whose
# names start with a dot are ignored.
"A-" +
urllib.parse.quote(s, safe=b"")
.replace(".", "%2E")
.replace("%", ".")
)
Expand Down
4 changes: 2 additions & 2 deletions lib/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from .url import (
sanitize_stream,
sanitize_topic,
sanitize,
)

from .files import (
Expand Down Expand Up @@ -219,7 +219,7 @@ def write_topic_messages(
stream_id = stream["id"]

sanitized_stream_name = sanitize_stream(stream_name, stream_id)
sanitized_topic_name = sanitize_topic(topic_name)
sanitized_topic_name = sanitize(topic_name)

messages = read_zulip_messages_for_topic(
json_root, sanitized_stream_name, sanitized_topic_name
Expand Down
19 changes: 16 additions & 3 deletions tests/testCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,24 @@ def assert_equal(v1, v2):


def test_sanitize():
assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar")
assert_equal(
url.sanitize_stream(stream_name="foo bar", stream_id=7),
"7-A-foo.20bar")
assert_equal(
url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7),
"7-A-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D")

assert_equal(
url.sanitize_topic(topic_name="pick a place for lunch"),
"pick.20a.20place.20for.20lunch",
url.sanitize("pick a place for lunch"),
"A-pick.20a.20place.20for.20lunch",
)
assert_equal(
url.sanitize("!!cute-turlte/tortoise (🐢)?"),
"A-.21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
)
assert_equal(
url.sanitize('"the mighty turtle 🐢"'),
"A-.22the.20mighty.20turtle.20.F0.9F.90.A2.22",
)


Expand Down

0 comments on commit 8b57ace

Please sign in to comment.