Skip to content

Commit

Permalink
Add min and max lat-lon options
Browse files: browse the repository at this point in the history
  • Loading branch information
ghanse committed Dec 6, 2024
1 parent 16eb579 commit 4614e87
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 50 deletions.
24 changes: 18 additions & 6 deletions dbldatagen/datasets/basic_geometries.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,18 @@ class BasicGeometriesProvider(DatasetProvider.NoAssociatedDatasetsMixin, Dataset
"""
MIN_LOCATION_ID = 1000000
MAX_LOCATION_ID = 9223372036854775807
DEFAULT_MIN_LAT = -90.0
DEFAULT_MAX_LAT = 90.0
DEFAULT_MIN_LON = -180.0
DEFAULT_MAX_LON = 180.0
COLUMN_COUNT = 2
ALLOWED_OPTIONS = [
"geometryType",
"maxVertices",
"minLatitude",
"maxLatitude",
"minLongitude",
"maxLongitude",
"random"
]

Expand All @@ -45,6 +53,10 @@ def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
generateRandom = options.get("random", False)
geometryType = options.get("geometryType", "point")
maxVertices = options.get("maxVertices", 1 if geometryType == "point" else 3)
minLatitude = options.get("minLatitude", self.DEFAULT_MIN_LAT)
maxLatitude = options.get("maxLatitude", self.DEFAULT_MAX_LAT)
minLongitude = options.get("minLongitude", self.DEFAULT_MIN_LON)
maxLongitude = options.get("maxLongitude", self.DEFAULT_MAX_LON)

assert tableName is None or tableName == DatasetProvider.DEFAULT_TABLE_NAME, "Invalid table name"
if rows is None or rows < 0:
Expand All @@ -62,9 +74,9 @@ def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
if maxVertices > 1:
w.warn('Ignoring property maxVertices for point geometries')
df_spec = (
df_spec.withColumn("lat", "float", minValue=-90.0, maxValue=90.0,
df_spec.withColumn("lat", "float", minValue=minLatitude, maxValue=maxLatitude,
step=1e-5, random=generateRandom, omit=True)
.withColumn("lon", "float", minValue=-180.0, maxValue=180.0,
.withColumn("lon", "float", minValue=minLongitude, maxValue=maxLongitude,
step=1e-5, random=generateRandom, omit=True)
.withColumn("wkt", "string", expr="concat('POINT(', lon, ' ', lat, ')')")
)
Expand All @@ -75,9 +87,9 @@ def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
j = 0
while j < maxVertices:
df_spec = (
df_spec.withColumn(f"lat_{j}", "float", minValue=-90.0, maxValue=90.0,
df_spec.withColumn(f"lat_{j}", "float", minValue=minLatitude, maxValue=maxLatitude,
step=1e-5, random=generateRandom, omit=True)
.withColumn(f"lon_{j}", "float", minValue=-180.0, maxValue=180.0,
.withColumn(f"lon_{j}", "float", minValue=minLongitude, maxValue=maxLongitude,
step=1e-5, random=generateRandom, omit=True)
)
j = j + 1
Expand All @@ -93,9 +105,9 @@ def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
j = 0
while j < maxVertices:
df_spec = (
df_spec.withColumn(f"lat_{j}", "float", minValue=-90.0, maxValue=90.0,
df_spec.withColumn(f"lat_{j}", "float", minValue=minLatitude, maxValue=maxLatitude,
step=1e-5, random=generateRandom, omit=True)
.withColumn(f"lon_{j}", "float", minValue=-180.0, maxValue=180.0,
.withColumn(f"lon_{j}", "float", minValue=minLongitude, maxValue=maxLongitude,
step=1e-5, random=generateRandom, omit=True)
)
j = j + 1
Expand Down
94 changes: 50 additions & 44 deletions tests/test_standard_dataset_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,56 +9,62 @@
class TestStandardDatasetProviders:

# BASIC GEOMETRIES tests:
@pytest.mark.parametrize("providerName, providerOptions", [
("basic/geometries",
{"rows": 50, "partitions": 4, "random": False, "geometryType": "point", "maxVertices": 1}),
("basic/geometries",
{"rows": 100, "partitions": -1, "random": False, "geometryType": "point", "maxVertices": 2}),
("basic/geometries",
{"rows": -1, "partitions": 4, "random": True, "geometryType": "point"}),
("basic/geometries", {}),
("basic/geometries",
{"rows": 5000, "partitions": -1, "random": True, "geometryType": "lineString"}),
("basic/geometries",
{"rows": -1, "partitions": -1, "random": False, "geometryType": "lineString", "maxVertices": 2}),
("basic/geometries",
{"rows": -1, "partitions": 4, "random": True, "geometryType": "lineString", "maxVertices": 1}),
("basic/geometries",
{"rows": 5000, "partitions": 4, "geometryType": "lineString", "maxVertices": 2}),
("basic/geometries",
{"rows": 5000, "partitions": -1, "random": False, "geometryType": "polygon"}),
("basic/geometries",
{"rows": -1, "partitions": -1, "random": True, "geometryType": "polygon", "maxVertices": 3}),
("basic/geometries",
{"rows": -1, "partitions": 4, "random": True, "geometryType": "polygon", "maxVertices": 2}),
("basic/geometries",
{"rows": 5000, "partitions": 4, "geometryType": "polygon", "maxVertices": 5}),
@pytest.mark.parametrize("providerName, providerOptions, expectation", [
("basic/geometries", {}, does_not_raise()),
("basic/geometries", {"rows": 50, "partitions": 4, "random": False,
"geometryType": "point", "maxVertices": 1}, does_not_raise()),
("basic/geometries", {"rows": 100, "partitions": -1, "random": False,
"geometryType": "point", "maxVertices": 2}, does_not_raise()),
("basic/geometries", {"rows": -1, "partitions": 4, "random": True,
"geometryType": "point"}, does_not_raise()),
("basic/geometries", {"rows": 5000, "partitions": -1, "random": True,
"geometryType": "lineString"}, does_not_raise()),
("basic/geometries", {"rows": -1, "partitions": -1, "random": False,
"geometryType": "lineString", "maxVertices": 2}, does_not_raise()),
("basic/geometries", {"rows": -1, "partitions": 4, "random": True,
"geometryType": "lineString", "maxVertices": 1}, does_not_raise()),
("basic/geometries", {"rows": 5000, "partitions": 4,
"geometryType": "lineString", "maxVertices": 2}, does_not_raise()),
("basic/geometries", {"rows": 5000, "partitions": -1, "random": False,
"geometryType": "polygon"}, does_not_raise()),
("basic/geometries", {"rows": -1, "partitions": -1, "random": True,
"geometryType": "polygon", "maxVertices": 3}, does_not_raise()),
("basic/geometries", {"rows": -1, "partitions": 4, "random": True,
"geometryType": "polygon", "maxVertices": 2}, does_not_raise()),
("basic/geometries", {"rows": 5000, "partitions": 4,
"geometryType": "polygon", "maxVertices": 5}, does_not_raise()),
("basic/geometries",
{"rows": 5000, "partitions": 4, "geometryType": "polygon", "minLatitude": 45.0,
"maxLatitude": 50.0, "minLongitude": -85.0, "maxLongitude": -75.0}, does_not_raise()),
("basic/geometries",
{"rows": -1, "partitions": -1, "geometryType": "multipolygon"}, pytest.raises(ValueError))
])
def test_basic_geometries_retrieval(self, providerName, providerOptions):
ds = dg.Datasets(spark, providerName).get(**providerOptions)
assert ds is not None
def test_basic_geometries_retrieval(self, providerName, providerOptions, expectation):
with expectation:
ds = dg.Datasets(spark, providerName).get(**providerOptions)
assert ds is not None

df = ds.build()
assert df.count() >= 0
assert "wkt" in df.columns
df = ds.build()
assert df.count() >= 0
assert "wkt" in df.columns

geometryType = providerOptions.get("geometryType", None)
row = df.first().asDict()
if geometryType == "point" or geometryType is None:
assert "POINT" in row["wkt"]
geometryType = providerOptions.get("geometryType", None)
row = df.first().asDict()
if geometryType == "point" or geometryType is None:
assert "POINT" in row["wkt"]

if geometryType == "lineString":
assert "LINESTRING" in row["wkt"]
if geometryType == "lineString":
assert "LINESTRING" in row["wkt"]

if geometryType == "polygon":
assert "POLYGON" in row["wkt"]
if geometryType == "polygon":
assert "POLYGON" in row["wkt"]

random = providerOptions.get("random", None)
if random:
print("")
leadingRows = df.limit(100).collect()
ids = [r.location_id for r in leadingRows]
assert ids != sorted(ids)
random = providerOptions.get("random", None)
if random:
print("")
leadingRows = df.limit(100).collect()
ids = [r.location_id for r in leadingRows]
assert ids != sorted(ids)

# BASIC PROCESS HISTORIAN tests:
@pytest.mark.parametrize("providerName, providerOptions", [
Expand Down

0 comments on commit 4614e87

Please sign in to comment.