From 8b57ace42663fa530f8a3ad07df110b8d74cc0c6 Mon Sep 17 00:00:00 2001
From: "Rafid Aslam (refeed)" <rafidteam@gmail.com>
Date: Wed, 6 Jul 2022 14:28:04 +0700
Subject: [PATCH] lib/url.py: Strengthen URL escaping.

Use an implementation similar to how Zulip sanitizes its stream names
and URLs into safe URLs.
The only difference is that the function now prepends a non-dot
character as a prefix so that the result is safe to use as a filename,
especially for the Jekyll use case, where files whose names start with
a dot are ignored.

Closes https://github.com/zulip/zulip-archive/issues/35
Closes https://github.com/zulip/zulip-archive/issues/51
---
 lib/html.py         |  6 +++---
 lib/populate.py     |  6 +++---
 lib/url.py          | 23 +++++++----------------
 lib/website.py      |  4 ++--
 tests/testCommon.py | 19 ++++++++++++++++---
 5 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/lib/html.py b/lib/html.py
index 71b9b5a7e0e6b8..9d3fee06728258 100644
--- a/lib/html.py
+++ b/lib/html.py
@@ -17,7 +17,7 @@
 
 from .url import (
     sanitize_stream,
-    sanitize_topic,
+    sanitize,
 )
 
 from .url import (
@@ -87,7 +87,7 @@ def format_message_html(
         site_url,
         html_root,
         sanitize_stream(stream_name, stream_id),
-        sanitize_topic(topic_name),
+        sanitize(topic_name),
         msg_id,
     )
     anchor_html = '<a name="{0}"></a>'.format(html.escape(msg_id))
@@ -184,7 +184,7 @@ def topic_list_html(topic_data):
     """
 
     def item_html(topic_name, message_data):
-        link_html = f'<a href="topic/{html.escape(sanitize_topic(topic_name))}.html">{html.escape(topic_name)}</a>'
+        link_html = f'<a href="topic/{html.escape(sanitize(topic_name))}.html">{html.escape(topic_name)}</a>'
         topic_info = topic_info_string(message_data)
         return f"<li> {link_html} ({html.escape(topic_info)}) </li>"
 
diff --git a/lib/populate.py b/lib/populate.py
index 1ab11f91ac8538..fcbbfecf12ba02 100644
--- a/lib/populate.py
+++ b/lib/populate.py
@@ -50,7 +50,7 @@
 )
 from .url import (
     sanitize_stream,
-    sanitize_topic,
+    sanitize,
 )
 
 
@@ -222,7 +222,7 @@ def populate_incremental(
             p = (
                 json_root
                 / Path(sanitize_stream(s["name"], s["stream_id"]))
-                / Path(sanitize_topic(topic_name) + ".json")
+                / Path(sanitize(topic_name) + ".json")
             )
             topic_exists = p.exists()
             old = []
@@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data):
     sanitized_stream_name = sanitize_stream(stream_name, stream_id)
     stream_dir = json_root / Path(sanitized_stream_name)
 
-    sanitized_topic_name = sanitize_topic(topic_name)
+    sanitized_topic_name = sanitize(topic_name)
     topic_fn = sanitized_topic_name + ".json"
 
     out = open_outfile(stream_dir, topic_fn, "w")
diff --git a/lib/url.py b/lib/url.py
index bee5c6853a2d6e..341b560956aaad 100644
--- a/lib/url.py
+++ b/lib/url.py
@@ -66,24 +66,15 @@ def archive_message_url(
 
 ## String cleaning functions
 
-# remove non-alnum ascii symbols from string
 def sanitize(s):
+    # Sanitize the string into a safe string that can be used in a URL.
+    # This copies Zulip's way of encoding stream and topic names for URLs.
     return (
-        "".join(
-            filter(
-                lambda x: x.isalnum or x == " ",
-                s.encode("ascii", "ignore").decode("utf-8"),
-            )
-        )
-        .replace(" ", "-")
-        .replace("?", "%3F")
-    )
-
-
-# create a unique sanitized identifier for a topic
-def sanitize_topic(topic_name):
-    return (
-        urllib.parse.quote(topic_name, safe="~()*!.'")
+        # We add a non-dot prefix so that the resulting string is safe to
+        # use as a filename, especially for the Jekyll use case, where files
+        # whose names start with a dot are ignored.
+        "A-" +
+        urllib.parse.quote(s, safe=b"")
         .replace(".", "%2E")
         .replace("%", ".")
     )
diff --git a/lib/website.py b/lib/website.py
index 8b590d32885245..051d52bc1319a1 100644
--- a/lib/website.py
+++ b/lib/website.py
@@ -21,7 +21,7 @@
 
 from .url import (
     sanitize_stream,
-    sanitize_topic,
+    sanitize,
 )
 
 from .files import (
@@ -219,7 +219,7 @@ def write_topic_messages(
     stream_id = stream["id"]
 
     sanitized_stream_name = sanitize_stream(stream_name, stream_id)
-    sanitized_topic_name = sanitize_topic(topic_name)
+    sanitized_topic_name = sanitize(topic_name)
 
     messages = read_zulip_messages_for_topic(
         json_root, sanitized_stream_name, sanitized_topic_name
diff --git a/tests/testCommon.py b/tests/testCommon.py
index 296e798c0e0e47..4b85e7ed2dc626 100644
--- a/tests/testCommon.py
+++ b/tests/testCommon.py
@@ -24,11 +24,24 @@ def assert_equal(v1, v2):
 
 
 def test_sanitize():
-    assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar")
+    assert_equal(
+        url.sanitize_stream(stream_name="foo bar", stream_id=7),
+        "7-A-foo.20bar")
+    assert_equal(
+        url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7),
+        "7-A-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D")
 
     assert_equal(
-        url.sanitize_topic(topic_name="pick a place for lunch"),
-        "pick.20a.20place.20for.20lunch",
+        url.sanitize("pick a place for lunch"),
+        "A-pick.20a.20place.20for.20lunch",
+    )
+    assert_equal(
+        url.sanitize("!!cute-turlte/tortoise (🐢)?"),
+        "A-.21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
+    )
+    assert_equal(
+        url.sanitize('"the mighty turtle 🐢"'),
+        "A-.22the.20mighty.20turtle.20.F0.9F.90.A2.22",
     )