Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/add as_json parameter to convert directly to json #10

Merged
merged 9 commits into from
Dec 26, 2021
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@

## Latest changes

### Feature

* Add `as_json` keyword parameter to `convert`

### Internal

* Move the `classes.typeclass` functions to `transformer.py` and rename the typeclass `convert` function to `transform`. This lets `__init__.py` expose a new `convert` function that accepts the new parameter, so the feature does not introduce a breaking change.
* Fix CI after main branch change


## 2.0.0

No new features in this release, but we now support Python 3.9 and 3.10.
Expand Down
6 changes: 6 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ warn_unused_ignores = True
warn_redundant_casts = True
warn_unused_configs = True
warn_unreachable = True


[coverage:report]
exclude_lines =
pragma: no cover
if TYPE_CHECKING:
115 changes: 32 additions & 83 deletions soup2dict/__init__.py
Original file line number Diff line number Diff line change
@@ -1,95 +1,44 @@
from typing import Any, Dict, Optional, Union
import json
from typing import Union, overload

from bs4 import BeautifulSoup, element
from classes import typeclass
from bs4 import BeautifulSoup
from typing_extensions import Literal

from soup2dict.transformer import transform

def _get_key_name(
instance: Any,
):
instance_type = type(instance)

if instance_type == element.Tag:
return instance.name

return str(instance_type.__name__).lower()


def _attribute_at_prepender(
instance: dict,
@overload
def convert(
soup: BeautifulSoup,
*,
as_json: Literal[False],
) -> dict:
prefixed = {}
for key, attribute_value in instance.items():
prefixed['@{key}'.format(key=key)] = attribute_value
return prefixed


@typeclass
def convert(instance) -> dict:
"""Convert beautifulsoup to dict. This is a typeclass definition."""

"""Return dict when as_json is false."""

@convert.instance(BeautifulSoup)
def _convert_bs(instance: BeautifulSoup) -> dict:
"""Handle The Soup."""
return convert(instance.contents)

@overload
def convert(
soup: BeautifulSoup,
*,
as_json: Literal[True],
) -> str:
"""Return json string when as_json is True."""

@convert.instance(element.ResultSet)
@convert.instance(list)
def _convert_rs(instance: Union[element.ResultSet, list]) -> dict:
"""Handle list and ResultSet types."""
transformed: Dict[str, Any] = {}

for soup_element in instance:
parsed = convert(soup_element)
if not parsed:
continue

key_name = _get_key_name(soup_element)

dict_value = transformed.get(key_name, [])
dict_value.append(parsed)
transformed[key_name] = dict_value

return transformed


@convert.instance(element.NavigableString) # type: ignore
@convert.instance(element.Comment)
@convert.instance(element.CData)
@convert.instance(element.ProcessingInstruction)
@convert.instance(element.XMLProcessingInstruction)
@convert.instance(element.Declaration)
@convert.instance(element.Doctype)
@convert.instance(element.Stylesheet)
@convert.instance(element.Script)
@convert.instance(element.TemplateString)
def _convert_ns(
instance: Union[
element.NavigableString,
element.Comment,
element.CData,
element.ProcessingInstruction,
element.XMLProcessingInstruction,
element.Declaration,
element.Doctype,
element.Stylesheet,
element.Script,
element.TemplateString,
],
) -> Optional[str]:
"""Handle NavigableString type."""
return str(instance).strip().strip('\n') or None
@overload
def convert(
soup: BeautifulSoup,
) -> dict:
"""When as_json is not specified return value is a dict."""


@convert.instance(element.Tag)
def _convert_tag(instance: element.Tag) -> dict:
"""Handle Tag type."""
tag_result = _attribute_at_prepender(instance.attrs)
tag_result['#text'] = ' '.join(
[text.replace('\n ', ' ') for text in instance.stripped_strings],
)
tag_result.update(convert(instance.contents))
def convert(
soup: BeautifulSoup,
*,
as_json: bool = False,
) -> Union[dict, str]:
"""Run soup to dict transformer and dumps to json if as_json is True."""
if as_json:
return json.dumps(transform(soup))

return tag_result
return transform(soup)
95 changes: 95 additions & 0 deletions soup2dict/transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from typing import Any, Dict, Optional, Union

from bs4 import BeautifulSoup, element
from classes import typeclass


def _get_key_name(
    instance: Any,
):
    """Return the dict key under which a transformed element is stored.

    Objects whose type is exactly ``element.Tag`` are keyed by their
    ``name`` attribute; everything else is keyed by the lower-cased name
    of its class.
    """
    kind = type(instance)
    # Exact type comparison: subclasses of Tag take the class-name branch.
    return instance.name if kind == element.Tag else str(kind.__name__).lower()


def _attribute_at_prepender(
instance: dict,
) -> dict:
prefixed = {}
for key, attribute_value in instance.items():
prefixed['@{key}'.format(key=key)] = attribute_value
return prefixed


@typeclass
def transform(instance) -> dict:
    """Convert beautifulsoup to dict. This is a typeclass definition.

    The body holds no logic of its own: the behaviour for each input type
    comes from the functions registered through ``transform.instance(...)``.
    """


@transform.instance(BeautifulSoup)  # type: ignore
def _transform_bs(instance: BeautifulSoup) -> dict:
    """Transform a whole soup by delegating to its ``contents``."""
    top_level_nodes = instance.contents
    return transform(top_level_nodes)


@transform.instance(element.ResultSet)  # type: ignore
@transform.instance(list)
def _transform_rs(instance: Union[element.ResultSet, list]) -> dict:
    """Group the transformed members of a list or ResultSet by key name.

    Every member is transformed individually; members whose result is
    falsy are dropped. The remaining results are appended, in iteration
    order, to a list stored under the member's ``_get_key_name`` key.
    """
    grouped: Dict[str, Any] = {}

    for child in instance:
        child_result = transform(child)
        if child_result:
            # setdefault creates the list the first time a key is seen.
            grouped.setdefault(_get_key_name(child), []).append(child_result)

    return grouped


@transform.instance(element.NavigableString)  # type: ignore
@transform.instance(element.Comment)
@transform.instance(element.CData)
@transform.instance(element.ProcessingInstruction)
@transform.instance(element.XMLProcessingInstruction)
@transform.instance(element.Declaration)
@transform.instance(element.Doctype)
@transform.instance(element.Stylesheet)
@transform.instance(element.Script)
@transform.instance(element.TemplateString)
def _transform_ns(
    instance: Union[
        element.NavigableString,
        element.Comment,
        element.CData,
        element.ProcessingInstruction,
        element.XMLProcessingInstruction,
        element.Declaration,
        element.Doctype,
        element.Stylesheet,
        element.Script,
        element.TemplateString,
    ],
) -> Optional[str]:
    """Return the stripped text of a string-like node, or None when blank.

    The node is converted with ``str`` and surrounding whitespace is
    removed. If nothing is left, ``None`` is returned instead of an empty
    string.
    """
    # Argument-less ``str.strip()`` already removes newlines together with
    # all other surrounding whitespace, so one call is sufficient.
    return str(instance).strip() or None


@transform.instance(element.Tag)  # type: ignore
def _transform_tag(instance: element.Tag) -> dict:
    """Transform a tag into a dict of attributes, text and children.

    Attributes are stored under ``@``-prefixed keys, the tag's stripped
    strings joined by single spaces under ``#text``, and the transformed
    ``contents`` are merged in last.
    """
    # Later entries win on key clashes, same as assignment followed by update.
    return {
        **_attribute_at_prepender(instance.attrs),
        '#text': ' '.join(
            fragment.replace('\n ', ' ')
            for fragment in instance.stripped_strings
        ),
        **transform(instance.contents),
    }
2 changes: 1 addition & 1 deletion tests/test_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def test_attributes_are_at_prepended():
html_doc = """
<main class="bob arne" abc="test" style="color: red;" />
"""
soup = BeautifulSoup(html_doc, 'html.parser')
soup: BeautifulSoup = BeautifulSoup(html_doc, 'html.parser')
main_element = convert(soup)['main'][0]
assert main_element['@class'] == ['bob', 'arne']
assert main_element['@abc'] == 'test'
Expand Down
45 changes: 45 additions & 0 deletions tests/test_is_dumped_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json

from bs4 import BeautifulSoup

from soup2dict import convert

# Fixture markup: an empty <main> followed by a <main> that holds three
# identical <sub> children.
html_doc = """
<main>
</main>
<main>
<sub>test</sub>
<sub>test</sub>
<sub>test</sub>
</main>
"""

# Dict form expected for the fixture above; the test compares its json dump
# with the output of convert(..., as_json=True).
expected_result = {
'main': [
{'#text': ''},
{
'#text': 'test test test',
'sub': [
{
'#text': 'test',
'navigablestring': ['test'],
},
{
'#text': 'test',
'navigablestring': ['test'],
},
{
'#text': 'test',
'navigablestring': ['test'],
},
],
},
],
}


def test_as_json_true_dumps_to_json():
    """With as_json=True, convert returns the json dump of the dict form."""
    soup = BeautifulSoup(html_doc, 'html.parser')

    actual = convert(soup, as_json=True)
    expected_json = json.dumps(expected_result)

    assert actual == expected_json