Skip to content

Commit

Permalink
Feature/add as_json parameter to convert directly to json (#10)
Browse files Browse the repository at this point in the history
* Adds as_json parameter to convert

* add changelog entry

* Be more specific about why we moved the convert function

* Fix typing issues and handle @overload when no as_json param is sent

* Make as_json a forced keyword argument

* Try to fix Literal not being available in stdlib typing in 3.7
  • Loading branch information
thomasborgen authored Dec 26, 2021
1 parent 80810ef commit 6cf30fb
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 84 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@

## Latest changes

### Feature

* Add `as_json` keyword parameter to `convert`

### Internal

* Move the `classes.typeclass` functions to `transformer.py` and rename the typeclass `convert` function to `transform`. This lets us add a new `convert` function with the `as_json` parameter in `__init__.py` without introducing a breaking change.
* Fix CI after main branch change


## 2.0.0

No new features in this release, but we now support python 3.9 and 3.10
Expand Down
6 changes: 6 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ warn_unused_ignores = True
warn_redundant_casts = True
warn_unused_configs = True
warn_unreachable = True


[coverage:report]
exclude_lines =
    pragma: no cover
    if TYPE_CHECKING:
115 changes: 32 additions & 83 deletions soup2dict/__init__.py
Original file line number Diff line number Diff line change
@@ -1,95 +1,44 @@
from typing import Any, Dict, Optional, Union
import json
from typing import Union, overload

from bs4 import BeautifulSoup, element
from classes import typeclass
from bs4 import BeautifulSoup
from typing_extensions import Literal

from soup2dict.transformer import transform

def _get_key_name(
    instance: Any,
):
    """Return the dictionary key under which ``instance`` is grouped.

    Objects whose exact type is ``element.Tag`` are keyed by their tag name;
    anything else is keyed by its lowercased class name.
    """
    kind = type(instance)

    # Exact type comparison: subclasses of Tag take the class-name branch.
    if kind != element.Tag:
        return str(kind.__name__).lower()

    return instance.name


def _attribute_at_prepender(
instance: dict,
@overload
def convert(
soup: BeautifulSoup,
*,
as_json: Literal[False],
) -> dict:
prefixed = {}
for key, attribute_value in instance.items():
prefixed['@{key}'.format(key=key)] = attribute_value
return prefixed


@typeclass
def convert(instance) -> dict:
    """Convert a BeautifulSoup object tree to a dict.

    This is the typeclass definition only and has no body of its own; the
    behaviour for each supported type is registered separately with
    ``@convert.instance(...)``.
    """

"""Return dict when as_json is false."""

@convert.instance(BeautifulSoup)
def _convert_bs(instance: BeautifulSoup) -> dict:
    """Convert a whole soup by delegating to its top-level contents."""
    top_level_nodes = instance.contents
    return convert(top_level_nodes)

@overload
def convert(
    soup: BeautifulSoup,
    *,
    as_json: Literal[True],
) -> str:
    """Typing overload: ``as_json=True`` makes ``convert`` return a JSON string."""

@convert.instance(element.ResultSet)
@convert.instance(list)
def _convert_rs(instance: Union[element.ResultSet, list]) -> dict:
    """Convert a sequence of soup elements into a dict of lists.

    Each element is converted on its own; falsy results are dropped, and the
    remaining ones are appended, in order, to a list stored under the key
    returned by ``_get_key_name`` for that element.
    """
    grouped: Dict[str, Any] = {}

    for node in instance:
        node_result = convert(node)
        if node_result:
            # setdefault creates the list the first time a key is seen.
            grouped.setdefault(_get_key_name(node), []).append(node_result)

    return grouped


@convert.instance(element.NavigableString)  # type: ignore
@convert.instance(element.Comment)
@convert.instance(element.CData)
@convert.instance(element.ProcessingInstruction)
@convert.instance(element.XMLProcessingInstruction)
@convert.instance(element.Declaration)
@convert.instance(element.Doctype)
@convert.instance(element.Stylesheet)
@convert.instance(element.Script)
@convert.instance(element.TemplateString)
def _convert_ns(
    instance: Union[
        element.NavigableString,
        element.Comment,
        element.CData,
        element.ProcessingInstruction,
        element.XMLProcessingInstruction,
        element.Declaration,
        element.Doctype,
        element.Stylesheet,
        element.Script,
        element.TemplateString,
    ],
) -> Optional[str]:
    """Convert a string-like soup node into its stripped text.

    Returns:
        The node's text with surrounding whitespace removed, or ``None`` when
        nothing is left after stripping.
    """
    # A bare ``strip()`` already removes newlines along with all other
    # surrounding whitespace, so no separate ``strip('\n')`` pass is needed.
    return str(instance).strip() or None
@overload
def convert(
    soup: BeautifulSoup,
) -> dict:
    """Typing overload: without ``as_json`` the return value is a dict."""


@convert.instance(element.Tag)
def _convert_tag(instance: element.Tag) -> dict:
"""Handle Tag type."""
tag_result = _attribute_at_prepender(instance.attrs)
tag_result['#text'] = ' '.join(
[text.replace('\n ', ' ') for text in instance.stripped_strings],
)
tag_result.update(convert(instance.contents))
def convert(
soup: BeautifulSoup,
*,
as_json: bool = False,
) -> Union[dict, str]:
"""Run soup to dict transformer and dumps to json if as_json is True."""
if as_json:
return json.dumps(transform(soup))

return tag_result
return transform(soup)
95 changes: 95 additions & 0 deletions soup2dict/transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from typing import Any, Dict, Optional, Union

from bs4 import BeautifulSoup, element
from classes import typeclass


def _get_key_name(
    instance: Any,
):
    """Return the dictionary key under which ``instance`` is grouped.

    Objects whose exact type is ``element.Tag`` are keyed by their tag name;
    anything else is keyed by its lowercased class name.
    """
    kind = type(instance)

    # Exact type comparison: subclasses of Tag take the class-name branch.
    if kind != element.Tag:
        return str(kind.__name__).lower()

    return instance.name


def _attribute_at_prepender(
instance: dict,
) -> dict:
prefixed = {}
for key, attribute_value in instance.items():
prefixed['@{key}'.format(key=key)] = attribute_value
return prefixed


@typeclass
def transform(instance) -> dict:
    """Convert a BeautifulSoup object tree to a dict.

    This is the typeclass definition only and has no body of its own; the
    behaviour for each supported type is registered separately with
    ``@transform.instance(...)``.
    """


@transform.instance(BeautifulSoup)  # type: ignore
def _transform_bs(instance: BeautifulSoup) -> dict:
    """Transform a whole soup by delegating to its top-level contents."""
    top_level_nodes = instance.contents
    return transform(top_level_nodes)


@transform.instance(element.ResultSet)  # type: ignore
@transform.instance(list)
def _transform_rs(instance: Union[element.ResultSet, list]) -> dict:
    """Transform a sequence of soup elements into a dict of lists.

    Each element is transformed on its own; falsy results are dropped, and
    the remaining ones are appended, in order, to a list stored under the
    key returned by ``_get_key_name`` for that element.
    """
    grouped: Dict[str, Any] = {}

    for node in instance:
        node_result = transform(node)
        if node_result:
            # setdefault creates the list the first time a key is seen.
            grouped.setdefault(_get_key_name(node), []).append(node_result)

    return grouped


@transform.instance(element.NavigableString)  # type: ignore
@transform.instance(element.Comment)
@transform.instance(element.CData)
@transform.instance(element.ProcessingInstruction)
@transform.instance(element.XMLProcessingInstruction)
@transform.instance(element.Declaration)
@transform.instance(element.Doctype)
@transform.instance(element.Stylesheet)
@transform.instance(element.Script)
@transform.instance(element.TemplateString)
def _transform_ns(
    instance: Union[
        element.NavigableString,
        element.Comment,
        element.CData,
        element.ProcessingInstruction,
        element.XMLProcessingInstruction,
        element.Declaration,
        element.Doctype,
        element.Stylesheet,
        element.Script,
        element.TemplateString,
    ],
) -> Optional[str]:
    """Transform a string-like soup node into its stripped text.

    Returns:
        The node's text with surrounding whitespace removed, or ``None`` when
        nothing is left after stripping.
    """
    # A bare ``strip()`` already removes newlines along with all other
    # surrounding whitespace, so no separate ``strip('\n')`` pass is needed.
    return str(instance).strip() or None


@transform.instance(element.Tag)  # type: ignore
def _transform_tag(instance: element.Tag) -> dict:
    """Transform a tag into a dict of attributes, text and children.

    The result holds the tag's attributes under ``@``-prefixed keys, the
    space-joined stripped strings of the tag under ``#text``, and finally
    the transformed contents, which are merged in last.
    """
    node_dict = _attribute_at_prepender(instance.attrs)

    text_parts = [
        text.replace('\n ', ' ') for text in instance.stripped_strings
    ]
    node_dict['#text'] = ' '.join(text_parts)

    # Merged last, so keys produced by the children win on a name clash.
    child_entries = transform(instance.contents)
    node_dict.update(child_entries)

    return node_dict
2 changes: 1 addition & 1 deletion tests/test_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def test_attributes_are_at_prepended():
html_doc = """
<main class="bob arne" abc="test" style="color: red;" />
"""
soup = BeautifulSoup(html_doc, 'html.parser')
soup: BeautifulSoup = BeautifulSoup(html_doc, 'html.parser')
main_element = convert(soup)['main'][0]
assert main_element['@class'] == ['bob', 'arne']
assert main_element['@abc'] == 'test'
Expand Down
45 changes: 45 additions & 0 deletions tests/test_is_dumped_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json

from bs4 import BeautifulSoup

from soup2dict import convert

# Two sibling <main> tags: the first contains only whitespace, the second
# wraps three identical <sub> tags.
html_doc = """
<main>
</main>
<main>
<sub>test</sub>
<sub>test</sub>
<sub>test</sub>
</main>
"""

# Dict whose JSON dump the test below compares against
# convert(soup, as_json=True) for html_doc.
expected_result = {
    'main': [
        {'#text': ''},
        {
            '#text': 'test test test',
            'sub': [
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
            ],
        },
    ],
}


def test_as_json_true_dumps_to_json():
    """With ``as_json=True`` convert returns the JSON dump of the dict."""
    parsed_html = BeautifulSoup(html_doc, 'html.parser')

    actual_json = convert(parsed_html, as_json=True)
    expected_json = json.dumps(expected_result)

    assert actual_json == expected_json

0 comments on commit 6cf30fb

Please sign in to comment.