diff --git a/lib/html.py b/lib/html.py index 71b9b5a7e0e6b8..9d3fee06728258 100644 --- a/lib/html.py +++ b/lib/html.py @@ -17,7 +17,7 @@ from .url import ( sanitize_stream, - sanitize_topic, + sanitize, ) from .url import ( @@ -87,7 +87,7 @@ def format_message_html( site_url, html_root, sanitize_stream(stream_name, stream_id), - sanitize_topic(topic_name), + sanitize(topic_name), msg_id, ) anchor_html = ''.format(html.escape(msg_id)) @@ -184,7 +184,7 @@ def topic_list_html(topic_data): """ def item_html(topic_name, message_data): - link_html = f'{html.escape(topic_name)}' + link_html = f'{html.escape(topic_name)}' topic_info = topic_info_string(message_data) return f"
  • {link_html} ({html.escape(topic_info)})
  • " diff --git a/lib/populate.py b/lib/populate.py index 1ab11f91ac8538..fcbbfecf12ba02 100644 --- a/lib/populate.py +++ b/lib/populate.py @@ -50,7 +50,7 @@ ) from .url import ( sanitize_stream, - sanitize_topic, + sanitize, ) @@ -222,7 +222,7 @@ def populate_incremental( p = ( json_root / Path(sanitize_stream(s["name"], s["stream_id"])) - / Path(sanitize_topic(topic_name) + ".json") + / Path(sanitize(topic_name) + ".json") ) topic_exists = p.exists() old = [] @@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data): sanitized_stream_name = sanitize_stream(stream_name, stream_id) stream_dir = json_root / Path(sanitized_stream_name) - sanitized_topic_name = sanitize_topic(topic_name) + sanitized_topic_name = sanitize(topic_name) topic_fn = sanitized_topic_name + ".json" out = open_outfile(stream_dir, topic_fn, "w") diff --git a/lib/url.py b/lib/url.py index bee5c6853a2d6e..341b560956aaad 100644 --- a/lib/url.py +++ b/lib/url.py @@ -66,24 +66,15 @@ def archive_message_url( ## String cleaning functions -# remove non-alnum ascii symbols from string def sanitize(s): + # Sanitize the string to a safe string that can be used as the URL + # It copies zulip's way of encoding stream and topic name to URL return ( - "".join( - filter( - lambda x: x.isalnum or x == " ", - s.encode("ascii", "ignore").decode("utf-8"), - ) - ) - .replace(" ", "-") - .replace("?", "%3F") - ) - - -# create a unique sanitized identifier for a topic -def sanitize_topic(topic_name): - return ( - urllib.parse.quote(topic_name, safe="~()*!.'") + # We add a prefix of non-dot character so that the resulting string + # is safe to use as the filename especially for jekyll usecase + # where it ignores file that has dot prefix in its filename + "A-" + + urllib.parse.quote(s, safe=b"") .replace(".", "%2E") .replace("%", ".") ) diff --git a/lib/website.py b/lib/website.py index 8b590d32885245..051d52bc1319a1 100644 --- a/lib/website.py +++ b/lib/website.py @@ -21,7 +21,7 @@ from .url import ( sanitize_stream, - sanitize_topic, + sanitize, ) from .files import ( @@ -219,7 +219,7 @@ def write_topic_messages( stream_id = stream["id"] sanitized_stream_name = sanitize_stream(stream_name, stream_id) - sanitized_topic_name = sanitize_topic(topic_name) + sanitized_topic_name = sanitize(topic_name) messages = read_zulip_messages_for_topic( json_root, sanitized_stream_name, sanitized_topic_name diff --git a/tests/testCommon.py b/tests/testCommon.py index 296e798c0e0e47..4b85e7ed2dc626 100644 --- a/tests/testCommon.py +++ b/tests/testCommon.py @@ -24,11 +24,24 @@ def assert_equal(v1, v2): def test_sanitize(): - assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar") + assert_equal( + url.sanitize_stream(stream_name="foo bar", stream_id=7), + "7-A-foo.20bar") + assert_equal( + url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7), + "7-A-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D") assert_equal( - url.sanitize_topic(topic_name="pick a place for lunch"), - "pick.20a.20place.20for.20lunch", + url.sanitize("pick a place for lunch"), + "A-pick.20a.20place.20for.20lunch", + ) + assert_equal( + url.sanitize("!!cute-turlte/tortoise (🐢)?"), + "A-.21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F", + ) + assert_equal( + url.sanitize('"the mighty turtle 🐢"'), + "A-.22the.20mighty.20turtle.20.F0.9F.90.A2.22", )