fix: don't produce <string> elements in _resolve_placeholders

There's no such HTML element, it was probably an attempt to work around lxml's shortcomings.
questionpy-org · Jun 18, 2024 · af68006 · af68006
1 parent 4adabe7
commit af68006
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 53 deletions.
diff --git a/questionpy_sdk/webserver/question_ui.py b/questionpy_sdk/webserver/question_ui.py
@@ -1,14 +1,16 @@
 #  This file is part of the QuestionPy SDK. (https://questionpy.org)
 #  The QuestionPy SDK is free software released under terms of the MIT license. See LICENSE.md.
 #  (c) Technische Universität Berlin, innoCampus <[email protected]>
+from __future__ import annotations
 
 import re
 from functools import cached_property
 from random import Random
 from typing import Any
 
+import lxml.html
+import lxml.html.clean
 from lxml import etree
-from lxml.html.clean import Cleaner
 from pydantic import BaseModel
 
 
@@ -111,6 +113,35 @@ def _int_to_roman(index: int) -> str:
     return roman_num
 
 
+def _add_text_before(before: etree._Element, text: str) -> None:
+    """Add plain text before the given sibling.
+
+    LXML doesn't represent text as nodes, but as attributes of the parent or of the preceding node, which makes this
+    less trivial than one might expect.
+    """
+    prev = before.getprevious()
+    if prev is None:
+        parent = _require_parent(before)
+        # The parent's 'text' attribute sets the text before the first child.
+        parent.text = ("" if parent.text is None else parent.text) + text
+    else:
+        prev.tail = ("" if prev.tail is None else prev.tail) + text
+
+
+def _require_parent(node: etree._Element) -> etree._Element:
+    parent = node.getparent()
+    if parent is None:
+        msg = f"Node '{node}' on line '{node.sourceline}' somehow has no parent."
+        raise ValueError(msg)
+    return parent
+
+
+def _remove_preserving_tail(node: etree._Element) -> None:
+    if node.tail is not None:
+        _add_text_before(node, node.tail)
+    _require_parent(node).remove(node)
+
+
 class QuestionMetadata:
     def __init__(self) -> None:
         self.correct_response: dict[str, str] = {}
@@ -128,6 +159,7 @@ class QuestionDisplayOptions(BaseModel):
 
 class QuestionUIRenderer:
     """General renderer for the question UI except for the formulation part."""
+
     XHTML_NAMESPACE: str = "http://www.w3.org/1999/xhtml"
     QPY_NAMESPACE: str = "http://questionpy.org/ns/question"
 
@@ -183,7 +215,7 @@ def _resolve_placeholders(self) -> None:
                 continue
             parts = p_instruction.text.strip().split()
             key = parts[0]
-            clean_option = parts[1] if len(parts) > 1 else "clean"
+            clean_option = parts[1].lower() if len(parts) > 1 else "clean"
 
             parent = p_instruction.getparent()
             if parent is None:
@@ -195,32 +227,24 @@ def _resolve_placeholders(self) -> None:
 
             raw_value = self._placeholders[key]
 
-            if clean_option.lower() not in {"clean", "noclean"}:
-                if clean_option.lower() != "plain":
-                    msg = f"clean_option expected to have value 'plain', found '{clean_option}' instead"
-                    raise ValueError(msg)
-                # Treat the value as plain text
-                root = etree.Element("string")
-                root.text = etree.CDATA(raw_value)
-                parent.replace(p_instruction, root)
-                continue
-
-            if clean_option.lower() == "clean":
-                cleaner = Cleaner()
-                cleaned_value = etree.fromstring(cleaner.clean_html(raw_value))
-                # clean_html wraps the result in <p> or <div>
-                # Remove the wrapping from clean_html
-                content = ""
-                if cleaned_value.text:
-                    content += cleaned_value.text
-                for child in cleaned_value:
-                    content += etree.tostring(child, encoding="unicode", with_tail=True)
-                replacement = content
+            if clean_option == "plain":
+                # Treat the value as plain text.
+                _add_text_before(p_instruction, raw_value)
             else:
-                replacement = raw_value
-
-            p_instruction.addnext(etree.fromstring(f"<string>{replacement}</string>"))
-            parent.remove(p_instruction)
+                # html.clean works on different element classes than etree, so we need to use different parse functions.
+                # Since the HTML elements are subclasses of the etree elements though, we can reuse them without dumping
+                # and reparsing.
+
+                # It doesn't really matter what element we wrap the fragment with, as we'll unwrap it immediately.
+                fragment = lxml.html.fragment_fromstring(raw_value, create_parent=True)
+                if clean_option != "noclean":
+                    lxml.html.clean.clean(fragment)
+                if fragment.text is not None:
+                    _add_text_before(p_instruction, fragment.text)
+                for child in fragment:
+                    p_instruction.addprevious(child)
+
+            _remove_preserving_tail(p_instruction)
 
     def _hide_unwanted_feedback(self) -> None:
         """Hides elements marked with `qpy:feedback` if the type of feedback is disabled in `options`."""

diff --git a/tests/webserver/conftest.py b/tests/webserver/conftest.py
@@ -76,12 +76,12 @@ def normalize_element(element: etree._Element) -> etree._Element:
     return element
 
 
-def assert_xhtml_is_equal(xhtml1: str, xhtml2: str) -> None:
-    parser = etree.XMLParser(remove_blank_text=True)
-    tree1 = etree.fromstring(xhtml1, parser)
-    tree2 = etree.fromstring(xhtml2, parser)
+def assert_html_is_equal(actual: str, expected: str) -> None:
+    parser = etree.HTMLParser(remove_blank_text=True)
+    actual_tree = etree.fromstring(actual, parser)
+    expected_tree = etree.fromstring(expected, parser)
 
-    normalize_element(tree1)
-    normalize_element(tree2)
+    normalize_element(actual_tree)
+    normalize_element(expected_tree)
 
-    assert etree.tostring(tree1, method="c14n") == etree.tostring(tree2, method="c14n")
+    assert etree.tostring(actual_tree, method="c14n") == etree.tostring(expected_tree, method="c14n")
diff --git a/tests/webserver/test_question_ui.py b/tests/webserver/test_question_ui.py
@@ -12,7 +12,7 @@
     QuestionMetadata,
     QuestionUIRenderer,
 )
-from tests.webserver.conftest import assert_xhtml_is_equal
+from tests.webserver.conftest import assert_html_is_equal
 
 
 @pytest.fixture
@@ -83,20 +83,19 @@ def test_should_extract_correct_metadata(xml_content: str) -> None:
     }
 )
 def test_should_resolve_placeholders(result: str) -> None:
-    # TODO: remove <string> surrounding the resolved placeholder
     expected = """
     <div>
-        <div><string>My simple description.</string></div>
-        <span>By default cleaned parameter: <string>Value of param <b>one</b>.</string></span>
-        <span>Explicitly cleaned parameter: <string>Value of param <b>one</b>.</string></span>
-        <span>Noclean parameter: <string>Value of param <b>one</b>.<script>'Oh no, danger!'</script></string></span>
+        <div>My simple description.</div>
+        <span>By default cleaned parameter: Value of param <b>one</b>.</span>
+        <span>Explicitly cleaned parameter: Value of param <b>one</b>.</span>
+        <span>Noclean parameter: Value of param <b>one</b>.<script>'Oh no, danger!'</script></span>
         <span>Plain parameter:
-            <string><![CDATA[Value of param <b>one</b>.<script>'Oh no, danger!'</script>]]></string>
+            Value of param &lt;b>one&lt;/b>.&lt;script>'Oh no, danger!'&lt;/script>
         </span>
     </div>
     """
 
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("feedbacks")
@@ -107,7 +106,7 @@ def test_should_hide_inline_feedback(result: str) -> None:
             <span>No feedback</span>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("feedbacks")
@@ -119,7 +118,7 @@ def test_should_show_inline_feedback(result: str) -> None:
             <span>Specific feedback</span>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.parametrize(
@@ -151,7 +150,7 @@ def test_element_visibility_based_on_role(user_context: str, expected: str, xml_
     renderer = QuestionUIRenderer(xml_content, {}, options)
     result = renderer.html
 
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("inputs")
@@ -180,7 +179,7 @@ def test_should_set_input_values(result: str) -> None:
             <input name="my_radio" type="radio" value="radio_value_2" class="qpy-input" checked="checked"/>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("inputs")
@@ -201,7 +200,7 @@ def test_should_disable_inputs(result: str) -> None:
             <input name="my_radio" type="radio" value="radio_value_2" disabled="disabled" class="qpy-input"/>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("validations")
@@ -220,7 +219,7 @@ def test_should_soften_validations(result: str) -> None:
         </div>
     """
 
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("buttons")
@@ -236,7 +235,7 @@ def test_should_defuse_buttons(result: str) -> None:
             <input class="btn btn-primary qpy-input" type="button" value="Button"/>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.skip("""1. The text directly in the root of <qpy:formulation> is not copied in render_part.
@@ -254,7 +253,7 @@ def test_should_format_floats_in_en(result: str) -> None:
             Strip zeros: <span>1.1</span>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("shuffle")
@@ -287,7 +286,7 @@ def test_should_replace_shuffled_index(result: str) -> None:
             </fieldset>
         </div>
         """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.render_params(
@@ -307,7 +306,7 @@ def test_clean_up(result: str) -> None:
             <regular>Normal Content</regular>
         </div>
     """
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)
 
 
 @pytest.mark.ui_file("qpy-urls")
@@ -324,4 +323,4 @@ def test_should_replace_qpy_urls(result: str) -> None:
         </div>
     """
 
-    assert_xhtml_is_equal(result, expected)
+    assert_html_is_equal(result, expected)