-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/add as_json parameter to convert directly to json (#10)
* Adds as_json parameter to convert
* Add changelog entry
* Be more specific about why we moved the convert function
* Fix typing issues and handle @overload when no as_json param is sent
* Make as_json a forced keyword argument
* Try to fix Literal not being available in stdlib typing in 3.7
- Loading branch information
1 parent
80810ef
commit 6cf30fb
Showing 6 changed files with 185 additions and 84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,95 +1,44 @@ | ||
from typing import Any, Dict, Optional, Union | ||
import json | ||
from typing import Union, overload | ||
|
||
from bs4 import BeautifulSoup, element | ||
from classes import typeclass | ||
from bs4 import BeautifulSoup | ||
from typing_extensions import Literal | ||
|
||
from soup2dict.transformer import transform | ||
|
||
def _get_key_name( | ||
instance: Any, | ||
): | ||
instance_type = type(instance) | ||
|
||
if instance_type == element.Tag: | ||
return instance.name | ||
|
||
return str(instance_type.__name__).lower() | ||
|
||
|
||
def _attribute_at_prepender( | ||
instance: dict, | ||
@overload | ||
def convert( | ||
soup: BeautifulSoup, | ||
*, | ||
as_json: Literal[False], | ||
) -> dict: | ||
prefixed = {} | ||
for key, attribute_value in instance.items(): | ||
prefixed['@{key}'.format(key=key)] = attribute_value | ||
return prefixed | ||
|
||
|
||
@typeclass | ||
def convert(instance) -> dict: | ||
"""Convert beautifulsoup to dict. This is a typeclass definition.""" | ||
|
||
"""Return dict when as_json is false.""" | ||
|
||
@convert.instance(BeautifulSoup) | ||
def _convert_bs(instance: BeautifulSoup) -> dict: | ||
"""Handle The Soup.""" | ||
return convert(instance.contents) | ||
|
||
@overload | ||
def convert( | ||
soup: BeautifulSoup, | ||
*, | ||
as_json: Literal[True], | ||
) -> str: | ||
"""Return json string when as_json is True.""" | ||
|
||
@convert.instance(element.ResultSet) | ||
@convert.instance(list) | ||
def _convert_rs(instance: Union[element.ResultSet, list]) -> dict: | ||
"""Handle list and ResultSet types.""" | ||
transformed: Dict[str, Any] = {} | ||
|
||
for soup_element in instance: | ||
parsed = convert(soup_element) | ||
if not parsed: | ||
continue | ||
|
||
key_name = _get_key_name(soup_element) | ||
|
||
dict_value = transformed.get(key_name, []) | ||
dict_value.append(parsed) | ||
transformed[key_name] = dict_value | ||
|
||
return transformed | ||
|
||
|
||
@convert.instance(element.NavigableString) # type: ignore | ||
@convert.instance(element.Comment) | ||
@convert.instance(element.CData) | ||
@convert.instance(element.ProcessingInstruction) | ||
@convert.instance(element.XMLProcessingInstruction) | ||
@convert.instance(element.Declaration) | ||
@convert.instance(element.Doctype) | ||
@convert.instance(element.Stylesheet) | ||
@convert.instance(element.Script) | ||
@convert.instance(element.TemplateString) | ||
def _convert_ns( | ||
instance: Union[ | ||
element.NavigableString, | ||
element.Comment, | ||
element.CData, | ||
element.ProcessingInstruction, | ||
element.XMLProcessingInstruction, | ||
element.Declaration, | ||
element.Doctype, | ||
element.Stylesheet, | ||
element.Script, | ||
element.TemplateString, | ||
], | ||
) -> Optional[str]: | ||
"""Handle NavigableString type.""" | ||
return str(instance).strip().strip('\n') or None | ||
@overload | ||
def convert( | ||
soup: BeautifulSoup, | ||
) -> dict: | ||
"""When as_json is not specified return value is a dict.""" | ||
|
||
|
||
@convert.instance(element.Tag) | ||
def _convert_tag(instance: element.Tag) -> dict: | ||
"""Handle Tag type.""" | ||
tag_result = _attribute_at_prepender(instance.attrs) | ||
tag_result['#text'] = ' '.join( | ||
[text.replace('\n ', ' ') for text in instance.stripped_strings], | ||
) | ||
tag_result.update(convert(instance.contents)) | ||
def convert( | ||
soup: BeautifulSoup, | ||
*, | ||
as_json: bool = False, | ||
) -> Union[dict, str]: | ||
"""Run soup to dict transformer and dumps to json if as_json is True.""" | ||
if as_json: | ||
return json.dumps(transform(soup)) | ||
|
||
return tag_result | ||
return transform(soup) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from typing import Any, Dict, Optional, Union | ||
|
||
from bs4 import BeautifulSoup, element | ||
from classes import typeclass | ||
|
||
|
||
def _get_key_name(
    instance: Any,
):
    """Return the dictionary key under which a soup element is stored.

    An element whose exact type is ``element.Tag`` is keyed by its tag
    name; every other element is keyed by its lowercased class name.
    """
    kind = type(instance)

    # Exact type comparison: subclasses of Tag are keyed by class name.
    if kind != element.Tag:
        return str(kind.__name__).lower()

    return instance.name
|
||
|
||
def _attribute_at_prepender( | ||
instance: dict, | ||
) -> dict: | ||
prefixed = {} | ||
for key, attribute_value in instance.items(): | ||
prefixed['@{key}'.format(key=key)] = attribute_value | ||
return prefixed | ||
|
||
|
||
@typeclass
def transform(instance) -> dict:
    """Turn a beautifulsoup object into a dict.

    Typeclass definition only: the per-type behaviour is supplied by the
    functions registered with ``transform.instance``.
    """
|
||
|
||
@transform.instance(BeautifulSoup)  # type: ignore
def _transform_bs(instance: BeautifulSoup) -> dict:
    """Transform a whole soup by transforming its ``contents``."""
    top_level_nodes = instance.contents
    return transform(top_level_nodes)
|
||
|
||
@transform.instance(element.ResultSet)  # type: ignore
@transform.instance(list)
def _transform_rs(instance: Union[element.ResultSet, list]) -> dict:
    """Transform a list or ResultSet of soup elements into one dict.

    Each element is transformed on its own; falsy results are dropped and
    the remaining ones are grouped, in order, in a list under the key
    derived from the element.
    """
    grouped: Dict[str, Any] = {}

    for node in instance:
        converted = transform(node)
        if converted:
            # Create the bucket on first sight of the key, then append.
            grouped.setdefault(_get_key_name(node), []).append(converted)

    return grouped
|
||
|
||
@transform.instance(element.NavigableString)  # type: ignore
@transform.instance(element.Comment)
@transform.instance(element.CData)
@transform.instance(element.ProcessingInstruction)
@transform.instance(element.XMLProcessingInstruction)
@transform.instance(element.Declaration)
@transform.instance(element.Doctype)
@transform.instance(element.Stylesheet)
@transform.instance(element.Script)
@transform.instance(element.TemplateString)
def _transform_ns(
    instance: Union[
        element.NavigableString,
        element.Comment,
        element.CData,
        element.ProcessingInstruction,
        element.XMLProcessingInstruction,
        element.Declaration,
        element.Doctype,
        element.Stylesheet,
        element.Script,
        element.TemplateString,
    ],
) -> Optional[str]:
    """Transform a string-like soup node into its stripped text.

    Registered for ``NavigableString`` and the other string node types
    named in the decorators above.

    :param instance: the string-like node to convert.
    :return: ``str(instance)`` without leading or trailing whitespace, or
        ``None`` when nothing is left after stripping.
    """
    # Argument-less str.strip() already removes newlines along with all
    # other whitespace, so a separate strip('\n') pass is not needed.
    return str(instance).strip() or None
|
||
|
||
@transform.instance(element.Tag)  # type: ignore
def _transform_tag(instance: element.Tag) -> dict:
    """Transform a Tag into a dict of attributes, text and children.

    The result starts from the ``@``-prefixed attributes, gains a
    ``#text`` entry with the tag's stripped strings joined by single
    spaces, and is finally updated with the transformed ``contents``.
    """
    joined_text = ' '.join(
        piece.replace('\n ', ' ') for piece in instance.stripped_strings
    )

    tag_result = _attribute_at_prepender(instance.attrs)
    tag_result['#text'] = joined_text
    # Children are merged last, after attributes and '#text' are in place.
    tag_result.update(transform(instance.contents))

    return tag_result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import json | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from soup2dict import convert | ||
|
||
# Markup fixture: an empty <main> followed by a <main> with three <sub> tags.
html_doc = """
<main>
</main>
<main>
<sub>test</sub>
<sub>test</sub>
<sub>test</sub>
</main>
"""

# Dictionary form the test below expects for html_doc (it is compared after
# json.dumps): repeated tags are collected in a list under the tag name.
expected_result = {
    'main': [
        {'#text': ''},
        {
            '#text': 'test test test',
            'sub': [
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
                {
                    '#text': 'test',
                    'navigablestring': ['test'],
                },
            ],
        },
    ],
}
|
||
|
||
def test_as_json_true_dumps_to_json():
    """With as_json=True, convert must return the JSON dump of the dict."""
    soup = BeautifulSoup(html_doc, 'html.parser')

    actual_json = convert(soup, as_json=True)
    expected_json = json.dumps(expected_result)

    assert actual_json == expected_json