diff --git a/CHANGELOG.md b/CHANGELOG.md index 67eb58f..5a6f82c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,16 @@ ## Latest changes +### Feature + +* Add `as_json` keyword parameter to `convert` + ### Internal +* Moves `classes.typeclass` functions to `transformer.py` and renames the `convert` function to `transform`. This is so that we can have a new `convert` function in our `__init__.py` with the new parameter and this feature will not introduce a breaking change. * Fix CI after main branch change + ## 2.0.0 No new features in this release, but we now support python 3.9 and 3.10 diff --git a/setup.cfg b/setup.cfg index 81afad0..16e905e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -88,3 +88,9 @@ warn_unused_ignores = True warn_redundant_casts = True warn_unused_configs = True warn_unreachable = True + + +[coverage:report] +exclude_lines = + pragma: no cover + if TYPE_CHECKING: diff --git a/soup2dict/__init__.py b/soup2dict/__init__.py index cdc8b84..3e97d0d 100644 --- a/soup2dict/__init__.py +++ b/soup2dict/__init__.py @@ -1,95 +1,44 @@ -from typing import Any, Dict, Optional, Union +import json +from typing import Union, overload -from bs4 import BeautifulSoup, element -from classes import typeclass +from bs4 import BeautifulSoup +from typing_extensions import Literal +from soup2dict.transformer import transform -def _get_key_name( - instance: Any, -): - instance_type = type(instance) - if instance_type == element.Tag: - return instance.name - - return str(instance_type.__name__).lower() - - -def _attribute_at_prepender( - instance: dict, +@overload +def convert( + soup: BeautifulSoup, + *, + as_json: Literal[False], ) -> dict: - prefixed = {} - for key, attribute_value in instance.items(): - prefixed['@{key}'.format(key=key)] = attribute_value - return prefixed - - -@typeclass -def convert(instance) -> dict: - """Convert beautifulsoup to dict. This is a typeclass definition.""" - + """Return dict when as_json is false.""" -@convert.instance(BeautifulSoup) -def _convert_bs(instance: BeautifulSoup) -> dict: - """Handle The Soup.""" - return convert(instance.contents) +@overload +def convert( + soup: BeautifulSoup, + *, + as_json: Literal[True], +) -> str: + """Return json string when as_json is True.""" -@convert.instance(element.ResultSet) -@convert.instance(list) -def _convert_rs(instance: Union[element.ResultSet, list]) -> dict: - """Handle list and ResultSet types.""" - transformed: Dict[str, Any] = {} - for soup_element in instance: - parsed = convert(soup_element) - if not parsed: - continue - - key_name = _get_key_name(soup_element) - - dict_value = transformed.get(key_name, []) - dict_value.append(parsed) - transformed[key_name] = dict_value - - return transformed - - -@convert.instance(element.NavigableString) # type: ignore -@convert.instance(element.Comment) -@convert.instance(element.CData) -@convert.instance(element.ProcessingInstruction) -@convert.instance(element.XMLProcessingInstruction) -@convert.instance(element.Declaration) -@convert.instance(element.Doctype) -@convert.instance(element.Stylesheet) -@convert.instance(element.Script) -@convert.instance(element.TemplateString) -def _convert_ns( - instance: Union[ - element.NavigableString, - element.Comment, - element.CData, - element.ProcessingInstruction, - element.XMLProcessingInstruction, - element.Declaration, - element.Doctype, - element.Stylesheet, - element.Script, - element.TemplateString, - ], -) -> Optional[str]: - """Handle NavigableString type.""" - return str(instance).strip().strip('\n') or None +@overload +def convert( + soup: BeautifulSoup, +) -> dict: + """When as_json is not specified return value is a dict.""" -@convert.instance(element.Tag) -def _convert_tag(instance: element.Tag) -> dict: - """Handle Tag type.""" - tag_result = _attribute_at_prepender(instance.attrs) - tag_result['#text'] = ' '.join( - [text.replace('\n ', ' ') for text in instance.stripped_strings], - ) - tag_result.update(convert(instance.contents)) +def convert( + soup: BeautifulSoup, + *, + as_json: bool = False, +) -> Union[dict, str]: + """Run soup to dict transformer and dumps to json if as_json is True.""" + if as_json: + return json.dumps(transform(soup)) - return tag_result + return transform(soup) diff --git a/soup2dict/transformer.py b/soup2dict/transformer.py new file mode 100644 index 0000000..efdabb5 --- /dev/null +++ b/soup2dict/transformer.py @@ -0,0 +1,95 @@ +from typing import Any, Dict, Optional, Union + +from bs4 import BeautifulSoup, element +from classes import typeclass + + +def _get_key_name( + instance: Any, +): + instance_type = type(instance) + + if instance_type == element.Tag: + return instance.name + + return str(instance_type.__name__).lower() + + +def _attribute_at_prepender( + instance: dict, +) -> dict: + prefixed = {} + for key, attribute_value in instance.items(): + prefixed['@{key}'.format(key=key)] = attribute_value + return prefixed + + +@typeclass +def transform(instance) -> dict: + """Convert beautifulsoup to dict. This is a typeclass definition.""" + + +@transform.instance(BeautifulSoup) # type: ignore +def _transform_bs(instance: BeautifulSoup) -> dict: + """Handle The Soup.""" + return transform(instance.contents) + + +@transform.instance(element.ResultSet) # type: ignore +@transform.instance(list) +def _transform_rs(instance: Union[element.ResultSet, list]) -> dict: + """Handle list and ResultSet types.""" + transformed: Dict[str, Any] = {} + + for soup_element in instance: + parsed = transform(soup_element) + if not parsed: + continue + + key_name = _get_key_name(soup_element) + + dict_value = transformed.get(key_name, []) + dict_value.append(parsed) + transformed[key_name] = dict_value + + return transformed + + +@transform.instance(element.NavigableString) # type: ignore +@transform.instance(element.Comment) +@transform.instance(element.CData) +@transform.instance(element.ProcessingInstruction) +@transform.instance(element.XMLProcessingInstruction) +@transform.instance(element.Declaration) +@transform.instance(element.Doctype) +@transform.instance(element.Stylesheet) +@transform.instance(element.Script) +@transform.instance(element.TemplateString) +def _transform_ns( + instance: Union[ + element.NavigableString, + element.Comment, + element.CData, + element.ProcessingInstruction, + element.XMLProcessingInstruction, + element.Declaration, + element.Doctype, + element.Stylesheet, + element.Script, + element.TemplateString, + ], +) -> Optional[str]: + """Handle NavigableString type.""" + return str(instance).strip().strip('\n') or None + + +@transform.instance(element.Tag) # type: ignore +def _transform_tag(instance: element.Tag) -> dict: + """Handle Tag type.""" + tag_result = _attribute_at_prepender(instance.attrs) + tag_result['#text'] = ' '.join( + [text.replace('\n ', ' ') for text in instance.stripped_strings], + ) + tag_result.update(transform(instance.contents)) + + return tag_result diff --git a/tests/test_attributes.py b/tests/test_attributes.py index b68ffdb..b733721 100644 --- a/tests/test_attributes.py +++ b/tests/test_attributes.py @@ -8,7 +8,7 @@ def test_attributes_are_at_prepended(): html_doc = """
""" - soup = BeautifulSoup(html_doc, 'html.parser') + soup: BeautifulSoup = BeautifulSoup(html_doc, 'html.parser') main_element = convert(soup)['main'][0] assert main_element['@class'] == ['bob', 'arne'] assert main_element['@abc'] == 'test' diff --git a/tests/test_is_dumped_to_json.py b/tests/test_is_dumped_to_json.py new file mode 100644 index 0000000..143a8ec --- /dev/null +++ b/tests/test_is_dumped_to_json.py @@ -0,0 +1,45 @@ +import json + +from bs4 import BeautifulSoup + +from soup2dict import convert + +html_doc = """ +
+
+
+ test + test + test +
+""" + +expected_result = { + 'main': [ + {'#text': ''}, + { + '#text': 'test test test', + 'sub': [ + { + '#text': 'test', + 'navigablestring': ['test'], + }, + { + '#text': 'test', + 'navigablestring': ['test'], + }, + { + '#text': 'test', + 'navigablestring': ['test'], + }, + ], + }, + ], +} + + +def test_as_json_true_dumps_to_json(): + """Convert result should be json when as_json is true.""" + soup = BeautifulSoup(html_doc, 'html.parser') + + assert convert(soup, as_json=True) == json.dumps(expected_result)