From eb341d13f68030d383e1bf2912e3a3fe6b2a1070 Mon Sep 17 00:00:00 2001 From: Simon van Lierde <90462640+simonvanlierde@users.noreply.github.com> Date: Wed, 9 Oct 2024 22:49:43 +0200 Subject: [PATCH] 2024 10 iri improvements (#2) * Update .gitignore and README * Avoid boolean trap in VocabIRI methods * Modernize typing * Include IRI in query for easier results handling * Refactor subject bool to triple_position enum * Add graph_url check in VocabIRI * Add some unit tests for VocabIRI and ProductIRI * Add error handling and tests to convert_json_object * Add back prefix to triples query --------- Co-authored-by: Chris Mutel --- .gitignore | 36 +++++++++ README.md | 2 +- sentier_data_tools/iri/main.py | 138 ++++++++++++++++++-------------- sentier_data_tools/iri/utils.py | 20 ++++- tests/iri/__init__.py | 0 tests/iri/test_main.py | 109 +++++++++++++++++++++++++ tests/iri/test_utils.py | 63 +++++++++++++++ 7 files changed, 305 insertions(+), 63 deletions(-) create mode 100644 tests/iri/__init__.py create mode 100644 tests/iri/test_main.py create mode 100644 tests/iri/test_utils.py diff --git a/.gitignore b/.gitignore index 3fc369b..8c2a8aa 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,39 @@ dmypy.json # Pyre type checker .pyre/ + + +### MacOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### VS Code ### +.vscode/ +*.code-workspace + +# Local History for Visual Studio Code +.history/ diff --git a/README.md b/README.md index ec7bbf4..4304d18 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ please [file an issue][Issue Tracker] along with a detailed description. You can build the documentation locally by installing the documentation Conda environment: ```bash -conda env create -f docs/environment.yml +conda env create -f docs/environment.yaml ``` activating the environment diff --git a/sentier_data_tools/iri/main.py b/sentier_data_tools/iri/main.py index 3157213..c5b6f15 100644 --- a/sentier_data_tools/iri/main.py +++ b/sentier_data_tools/iri/main.py @@ -1,9 +1,14 @@ -from typing import List, Optional, Union +"""Module for querying RDF triples from sentier.dev vocabularies. + +This module provides base classes and utility functions to handle IRIs +and retrieve RDF triples from vocabularies like products and units using SPARQL queries. +""" from rdflib import Graph, URIRef from sentier_data_tools.iri.utils import ( VOCAB_FUSEKI, + TriplePosition, convert_json_object, display_value_for_uri, execute_sparql_query, @@ -13,50 +18,56 @@ class VocabIRI(URIRef): - def triples(self, subject: bool = True, limit: Optional[int] = 25) -> List[tuple]: - """Return a list of triples with `rdflib` objects""" - if subject: - QUERY = f""" - SELECT ?p ?o - FROM <{self.graph_url}> - WHERE {{ - <{str(self)}> ?p ?o - }} - """ - else: - QUERY = f""" - PREFIX skos: - - SELECT ?s ?p - FROM <{self.graph_url}> - WHERE {{ - ?s ?p <{str(self)}> - }} - """ + """Base class for standard queries for IRIs from sentier.dev vocabularies.""" + + def triples( + self, + *, + iri_position: TriplePosition = TriplePosition.SUBJECT, + limit: int | None = 25, + ) -> list[tuple]: + """Get triples from a sentier.dev vocabulary for the given IRI. + + Args: + iri_position (TriplePosition, optional): The IRI position in the triple + (SUBJECT, PREDICATE, or OBJECT). Defaults to TriplePosition.SUBJECT. + limit (int | None, optional): The maximum number of triples to return. + Defaults to 25. + + Returns: + list[tuple]: A list of triples from a sentier.dev vocabulary. + """ + # Ensure a vocabulary graph_url is defined in a subclass + if not getattr(self, "graph_url", None): + error_msg = ( + f"{self.__class__.__name__} must define a 'graph_url' attribute " + "to indicate the vocabulary graph URL." + ) + logger.error(error_msg) + raise AttributeError(error_msg) + + # pylint: disable=no-member + QUERY = f""" + PREFIX skos: + + SELECT ?s ?p ?o + FROM <{self.graph_url}> + WHERE {{ + VALUES ?{iri_position.value} {{ <{str(self)}> }} + ?s ?p ?o + }} + """ + if limit is not None: QUERY += f"LIMIT {int(limit)}" logger.debug(f"Executing query:\n{QUERY}") results = execute_sparql_query(QUERY) logger.info(f"Retrieved {len(results)} triples from {VOCAB_FUSEKI}") - if subject: - return [ - ( - URIRef(str(self)), - convert_json_object(line["p"]), - convert_json_object(line["o"]), - ) - for line in results - ] - else: - return [ - ( - convert_json_object(line["s"]), - convert_json_object(line["p"]), - URIRef(str(self)), - ) - for line in results - ] + return [ + tuple(convert_json_object(line[key]) for key in ["s", "p", "o"]) + for line in results + ] def __repr__(self) -> str: return self.display() @@ -64,25 +75,32 @@ def __repr__(self) -> str: def display(self) -> str: return display_value_for_uri(str(self), self.kind, self.graph_url) - def graph(self, subject: bool = True) -> Graph: - """Return an `rdflib` graph of the data from the sentier.dev vocabulary for this IRI""" + def graph( + self, + *, + iri_position: TriplePosition = TriplePosition.SUBJECT, + ) -> Graph: + """Return an `rdflib` graph of the data from the sentier.dev vocabulary for this IRI.""" graph = Graph() - for triple in self.triples(subject=subject, limit=None): + for triple in self.triples( + iri_position=iri_position, + limit=None, + ): graph.add(triple) return graph def narrower( self, include_self: bool = False, raw_strings: bool = False - ) -> Union[list["VocabIRI"], list[str]]: + ) -> list["VocabIRI"] | list[str]: QUERY = f""" -PREFIX skos: - -SELECT ?o ?s -FROM <{self.graph_url}> -WHERE {{ - <{str(self)}> skos:narrower+ ?o . - ?o skos:broader ?s . -}}""" + PREFIX skos: + + SELECT ?o ?s + FROM <{self.graph_url}> + WHERE {{ + <{str(self)}> skos:narrower+ ?o . + ?o skos:broader ?s . + }}""" logger.debug(f"Executing query:\n{QUERY}") results = [ (elem["s"]["value"], elem["o"]["value"]) @@ -98,16 +116,16 @@ def narrower( def broader( self, include_self: bool = False, raw_strings: bool = False - ) -> Union[list["VocabIRI"], list[str]]: + ) -> list["VocabIRI"] | list[str]: QUERY = f""" -PREFIX skos: - -SELECT ?o ?s -FROM <{self.graph_url}> -WHERE {{ - <{str(self)}> skos:broader+ ?o . - ?o skos:narrower ?s . -}}""" + PREFIX skos: + + SELECT ?o ?s + FROM <{self.graph_url}> + WHERE {{ + <{str(self)}> skos:broader+ ?o . + ?o skos:narrower ?s . + }}""" logger.debug(f"Executing query:\n{QUERY}") results = [ (elem["s"]["value"], elem["o"]["value"]) diff --git a/sentier_data_tools/iri/utils.py b/sentier_data_tools/iri/utils.py index 6dd69ef..96737e0 100644 --- a/sentier_data_tools/iri/utils.py +++ b/sentier_data_tools/iri/utils.py @@ -1,6 +1,7 @@ import locale import platform from collections import defaultdict, deque +from enum import Enum from functools import lru_cache import os from typing import Union @@ -8,6 +9,8 @@ from rdflib import Literal, URIRef from SPARQLWrapper import JSON, SPARQLWrapper +from sentier_data_tools.logs import stdout_feedback_logger as logger + if language := os.environ.get("SDT_LOCALE"): pass elif platform.system() == "Windows": @@ -31,13 +34,18 @@ def execute_sparql_query(query: str) -> list: return sparql.queryAndConvert()["results"]["bindings"] -def convert_json_object(obj: dict) -> Union[URIRef, Literal]: +def convert_json_object(obj: dict) -> URIRef | Literal: + if "value" not in obj: + error_msg = f"Missing 'value' key in object: {obj}" + logger.error(error_msg) + raise ValueError(error_msg) + if obj["type"] == "literal": return Literal( obj["value"], lang=obj.get("xml:lang"), datatype=obj.get("datatype") ) else: - return URIRef(obj["value"]) + return URIRef(str(obj["value"])) @lru_cache(maxsize=2048) @@ -99,3 +107,11 @@ def resolve_hierarchy( queue.append(code) return ordered + + +class TriplePosition(Enum): + """Represents the position of an object in a triple store.""" + + SUBJECT = "s" + PREDICATE = "p" + OBJECT = "o" diff --git a/tests/iri/__init__.py b/tests/iri/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/iri/test_main.py b/tests/iri/test_main.py new file mode 100644 index 0000000..890377b --- /dev/null +++ b/tests/iri/test_main.py @@ -0,0 +1,109 @@ +"""Unit tests for IRI classes and their helper functions. + +These tests focus on verifying the core functionality of the VocabIRI class and its +subclasses (ProductIRI and UnitIRI). The primary behavior of ProductIRI and UnitIRI, +such as SPARQL querying and triple retrieval, should be covered in integration tests. +""" + +from unittest.mock import patch + +import pytest +from rdflib import Literal, URIRef + +from sentier_data_tools.iri.main import ProductIRI, VocabIRI +from sentier_data_tools.iri.utils import TriplePosition + + +# Test Vocab IRI class without graph_url attribute +class IncompleteVocabIRI(VocabIRI): + """Subclass of VocabIRI without graph_url to test the get_url error.""" + + +@pytest.fixture +def incomplete_vocab_iri() -> IncompleteVocabIRI: + """Incomplete VocabIRI subclass fixture for testing.""" + return IncompleteVocabIRI("https://example.org/incomplete/123") + + +def test_vocab_iri_missing_graph_url( + incomplete_vocab_iri: IncompleteVocabIRI, +) -> None: + """Test that AttributeError is raised when graph_url is missing.""" + with pytest.raises(AttributeError, match="must define a 'graph_url' attribute"): + incomplete_vocab_iri.triples() + + +# Test base functionality of ProductIRI for each TriplePosition value + + +@pytest.fixture +def product_iri() -> ProductIRI: + """ProductIRI fixture for testing.""" + return ProductIRI("https://example.com/product/123") + + +def mock_sparql_result( + mock_execute_sparql_query: patch, iri_value: URIRef, position: TriplePosition +) -> None: + """A helper function to mock the SPARQL query result for a given TriplePosition.""" + default_values = { + "s": {"type": "uri", "value": "https://example.com/default_subject"}, + "p": {"type": "uri", "value": "https://example.com/default_predicate"}, + "o": {"type": "literal", "value": "default_object"}, + } + + # Update the default values based on the IRI position in the triple + if position == TriplePosition.SUBJECT: + default_values["s"] = {"type": "uri", "value": iri_value} + elif position == TriplePosition.PREDICATE: + default_values["p"] = {"type": "uri", "value": iri_value} + elif position == TriplePosition.OBJECT: + default_values["o"] = {"type": "uri", "value": iri_value} + + mock_execute_sparql_query.return_value = [default_values] + + +@pytest.mark.parametrize( + "position", + [TriplePosition.SUBJECT, TriplePosition.PREDICATE, TriplePosition.OBJECT], +) +@patch("sentier_data_tools.iri.main.execute_sparql_query") +def test_product_iri_triples_for_all_positions( + mock_execute_sparql_query: patch, product_iri: ProductIRI, position: TriplePosition +) -> None: + """Test that ProductIRI works for all values of TriplePosition.""" + product_iri_str = str(product_iri) + + # Mock the SPARQL result + mock_sparql_result(mock_execute_sparql_query, product_iri_str, position) + + # Call the triples method + triples = product_iri.triples(iri_position=position) + + # Ensure that triples are returned + assert triples, "Expected triples but got empty results" + + # Unpack the triple elements (subject, predicate, object) + subject, predicate, obj = triples[0] + + # Common assertions for subject and predicate types + assert isinstance(subject, URIRef) + assert isinstance(predicate, URIRef) + assert len(triples) == 1 + + # Check the expected values based on the position + if position == TriplePosition.SUBJECT: + assert isinstance(obj, Literal) + assert str(subject) == product_iri_str + assert str(predicate) == "https://example.com/default_predicate" + assert str(obj) == "default_object" + elif position == TriplePosition.PREDICATE: + assert isinstance(obj, Literal) + assert str(subject) == "https://example.com/default_subject" + assert str(predicate) == product_iri_str + assert str(obj) == "default_object" + elif position == TriplePosition.OBJECT: + assert isinstance(obj, URIRef) + assert str(subject) == "https://example.com/default_subject" + assert str(predicate) == "https://example.com/default_predicate" + assert str(obj) == product_iri_str diff --git a/tests/iri/test_utils.py b/tests/iri/test_utils.py new file mode 100644 index 0000000..bd49a33 --- /dev/null +++ b/tests/iri/test_utils.py @@ -0,0 +1,63 @@ +import pytest +from rdflib import Literal, URIRef + +from sentier_data_tools.iri.utils import convert_json_object + + +def test_convert_json_object_literal_with_language() -> None: + """Test that a literal object is correctly converted.""" + obj = { + "type": "literal", + "value": "Hello World", + "xml:lang": "en", + } + result = convert_json_object(obj) + assert isinstance(result, Literal) + assert result.value == "Hello World" + assert result.language == "en" + assert result.datatype is None + + +def test_convert_json_object_literal_with_datatype() -> None: + """Test that a literal object with a datatype is correctly converted.""" + obj = { + "type": "literal", + "value": "42", + "datatype": "http://www.w3.org/2001/XMLSchema#int", + } + result = convert_json_object(obj) + print(result) + assert isinstance(result, Literal) + assert result.value == 42 + assert result.datatype == URIRef("http://www.w3.org/2001/XMLSchema#int") + + +def test_convert_json_object_uri() -> None: + """Test that a URI object is correctly converted to URIRef.""" + obj = { + "type": "uri", + "value": "https://example.com", + } + result = convert_json_object(obj) + assert isinstance(result, URIRef) + assert str(result) == "https://example.com" + + +def test_convert_json_object_missing_value_key() -> None: + """Test that a ValueError is raised when the 'value' key is missing.""" + obj = { + "type": "literal", + } + with pytest.raises(ValueError, match="Missing 'value' key in object:"): + convert_json_object(obj) + + +def test_convert_json_object_unknown_type() -> None: + """Test that a non-literal and non-uri type returns URIRef (default behavior).""" + obj = { + "type": "unknown", + "value": "https://example.com/unknown", + } + result = convert_json_object(obj) + assert isinstance(result, URIRef) + assert str(result) == "https://example.com/unknown"