diff --git a/lib/html.py b/lib/html.py
index 71b9b5a7e0e6b8..9d3fee06728258 100644
--- a/lib/html.py
+++ b/lib/html.py
@@ -17,7 +17,7 @@
from .url import (
sanitize_stream,
- sanitize_topic,
+ sanitize,
)
from .url import (
@@ -87,7 +87,7 @@ def format_message_html(
site_url,
html_root,
sanitize_stream(stream_name, stream_id),
- sanitize_topic(topic_name),
+ sanitize(topic_name),
msg_id,
)
anchor_html = ''.format(html.escape(msg_id))
@@ -184,7 +184,7 @@ def topic_list_html(topic_data):
"""
def item_html(topic_name, message_data):
- link_html = f'{html.escape(topic_name)}'
+ link_html = f'{html.escape(topic_name)}'
topic_info = topic_info_string(message_data)
return f"
{link_html} ({html.escape(topic_info)}) "
diff --git a/lib/populate.py b/lib/populate.py
index 1ab11f91ac8538..fcbbfecf12ba02 100644
--- a/lib/populate.py
+++ b/lib/populate.py
@@ -50,7 +50,7 @@
)
from .url import (
sanitize_stream,
- sanitize_topic,
+ sanitize,
)
@@ -222,7 +222,7 @@ def populate_incremental(
p = (
json_root
/ Path(sanitize_stream(s["name"], s["stream_id"]))
- / Path(sanitize_topic(topic_name) + ".json")
+ / Path(sanitize(topic_name) + ".json")
)
topic_exists = p.exists()
old = []
@@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data):
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
stream_dir = json_root / Path(sanitized_stream_name)
- sanitized_topic_name = sanitize_topic(topic_name)
+ sanitized_topic_name = sanitize(topic_name)
topic_fn = sanitized_topic_name + ".json"
out = open_outfile(stream_dir, topic_fn, "w")
diff --git a/lib/url.py b/lib/url.py
index bee5c6853a2d6e..341b560956aaad 100644
--- a/lib/url.py
+++ b/lib/url.py
@@ -66,24 +66,15 @@ def archive_message_url(
## String cleaning functions
-# remove non-alnum ascii symbols from string
def sanitize(s):
+ # Convert the string into a form that is safe to use in a URL.
+ # This mirrors Zulip's way of encoding stream and topic names in URLs.
return (
- "".join(
- filter(
- lambda x: x.isalnum or x == " ",
- s.encode("ascii", "ignore").decode("utf-8"),
- )
- )
- .replace(" ", "-")
- .replace("?", "%3F")
- )
-
-
-# create a unique sanitized identifier for a topic
-def sanitize_topic(topic_name):
- return (
- urllib.parse.quote(topic_name, safe="~()*!.'")
+ # We prepend a non-dot prefix so that the resulting string is safe to
+ # use as a filename, especially for the Jekyll use case, since Jekyll
+ # ignores files whose names begin with a dot.
+ "A-" +
+ urllib.parse.quote(s, safe=b"")
.replace(".", "%2E")
.replace("%", ".")
)
diff --git a/lib/website.py b/lib/website.py
index 8b590d32885245..051d52bc1319a1 100644
--- a/lib/website.py
+++ b/lib/website.py
@@ -21,7 +21,7 @@
from .url import (
sanitize_stream,
- sanitize_topic,
+ sanitize,
)
from .files import (
@@ -219,7 +219,7 @@ def write_topic_messages(
stream_id = stream["id"]
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
- sanitized_topic_name = sanitize_topic(topic_name)
+ sanitized_topic_name = sanitize(topic_name)
messages = read_zulip_messages_for_topic(
json_root, sanitized_stream_name, sanitized_topic_name
diff --git a/tests/testCommon.py b/tests/testCommon.py
index 296e798c0e0e47..4b85e7ed2dc626 100644
--- a/tests/testCommon.py
+++ b/tests/testCommon.py
@@ -24,11 +24,24 @@ def assert_equal(v1, v2):
def test_sanitize():
- assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar")
+ assert_equal(
+ url.sanitize_stream(stream_name="foo bar", stream_id=7),
+ "7-A-foo.20bar")
+ assert_equal(
+ url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7),
+ "7-A-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D")
assert_equal(
- url.sanitize_topic(topic_name="pick a place for lunch"),
- "pick.20a.20place.20for.20lunch",
+ url.sanitize("pick a place for lunch"),
+ "A-pick.20a.20place.20for.20lunch",
+ )
+ assert_equal(
+ url.sanitize("!!cute-turlte/tortoise (🐢)?"),
+ "A-.21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
+ )
+ assert_equal(
+ url.sanitize('"the mighty turtle 🐢"'),
+ "A-.22the.20mighty.20turtle.20.F0.9F.90.A2.22",
)