Merge pull request #735 from IBM/fix-dpk-pyprojects
Update all transforms to use single package library with [extra]
touma-I authored Oct 28, 2024
2 parents 0711226 + 1b84023 commit 620eb79
Showing 104 changed files with 437 additions and 456 deletions.
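In short, the PR drops the separate per-runtime packages in favor of optional extras on the single `data-prep-toolkit` package. As a rough sketch of the install pattern this moves to (package and extra names taken from the Makefile changes below):

```shell
# Before: separate runtime packages
pip install data-prep-toolkit-ray
pip install data-prep-toolkit-spark

# After: a single package with optional extras
pip install "data-prep-toolkit[ray]"
pip install "data-prep-toolkit[spark]"
```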
155 changes: 111 additions & 44 deletions .make.defaults
@@ -63,6 +63,11 @@ DPK_PYTHON_LIB_DIR=$(REPOROOT)/data-processing-lib/python
DPK_RAY_LIB_DIR=$(REPOROOT)/data-processing-lib/ray
DPK_SPARK_LIB_DIR=$(REPOROOT)/data-processing-lib/spark

DPK_PYTHON_BUILD_DIR=$(REPOROOT)/data-processing-lib
DPK_RAY_BUILD_DIR=$(REPOROOT)/data-processing-lib
DPK_SPARK_BUILD_DIR=$(REPOROOT)/data-processing-lib


KFPv2?=0

#######################################################################################
@@ -221,6 +226,7 @@ __check_defined = \
--platform $(DOCKER_PLATFORM) \
--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg DPK_WHEEL_FILE_NAME=$(DPK_WHEEL_FILE_NAME) \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) .
$(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE)
@@ -241,74 +247,133 @@ __check_defined = \

# Build and image using the local Dockerfile and make the data-processing-lib/python
# available in the current directory for use by the Dockerfile (i.e. to install the library).
.PHONY: .defaults.python-lib-src-image
.defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source
ifeq ($(USE_REPO_LIB_SRC), 1)
$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
endif
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
-rm -rf data-processing-lib-python
#.PHONY: .defaults.python-lib-src-image
#.defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
# @# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
#endif
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python

.PHONY: .default.build-lib-wheel
.default.build-lib-wheel:
make -C $(REPOROOT)/data-processing-lib build-pkg-dist
rm -rf data-processing-dist && mkdir data-processing-dist
cp $(REPOROOT)/data-processing-lib/dist/*.whl data-processing-dist

# Build and image using the local Dockerfile and make the wheel for data-processing-lib
# available in the current directory for use by the Dockerfile (i.e. to install the library).
.PHONY: .defaults.python-lib-whl-image
.defaults.python-lib-whl-image:: .default.build-lib-wheel
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
	@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the wheel file for the library
@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf data-processing-dist
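# Rough sketch of the flow above (the transform Dockerfiles are not part of this hunk):
#   1. build the data-prep-toolkit wheel and copy it into ./data-processing-dist
#   2. pass its file name to "docker build" via --build-arg DPK_WHEEL_FILE_NAME=...
#   3. the Dockerfile is then assumed to COPY the wheel and pip install it, e.g.
#      pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}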

# Build an image using the local Dockerfile and make the data-processing-lib/ray
# available in the current directory for use by the Dockerfile (i.e. to install the library).
# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
.PHONY: .defaults.ray-lib-src-image
.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
@# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
ifeq ($(USE_REPO_LIB_SRC), 1)
$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
endif
#.PHONY: .defaults.ray-lib-src-image
#.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
# @# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
# $(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
#endif
# if [ -e ../python ]; then \
# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
# fi
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python
# -rm -rf data-processing-lib-ray
# -rm -rf python-transform


# Build an image using the local Dockerfile and make the data-processing wheel
# available in the current directory for use by the Dockerfile (i.e. to install the library).
# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
.PHONY: .defaults.ray-lib-whl-image
.defaults.ray-lib-whl-image:: .default.build-lib-wheel
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
@# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE) and library wheel
@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
-rm -rf data-processing-lib-python
-rm -rf data-processing-lib-ray
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf python-transform
-rm -rf data-processing-dist


# Build the base spark image used by spark-based transforms
.PHONY: .defaults.spark-lib-base-image
.defaults.spark-lib-base-image-spark:
.defaults.spark-lib-base-image:
$(MAKE) -C $(DPK_SPARK_LIB_DIR) image

# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
.PHONY: .defaults.spark-lib-src-image
.defaults.spark-lib-src-image:: .defaults.spark-lib-base-image
@# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
#.PHONY: .defaults.spark-lib-src-image
#.defaults.spark-lib-src-image:: .defaults.spark-lib-base-image
# @# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
# $(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
# $(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib
#endif
# if [ -e ../python ]; then \
# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
# fi
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python
# -rm -rf data-processing-lib-spark
# -rm -rf python-transform

.PHONY: .defaults.spark-lib-whl-image
.defaults.spark-lib-whl-image:: .default.build-lib-wheel .defaults.spark-lib-base-image
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
	@# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE) and library wheel
$(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability
ifeq ($(USE_REPO_LIB_SRC), 1)
$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
$(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib
endif
@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image
-rm -rf data-processing-lib-python
-rm -rf data-processing-lib-spark
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf python-transform
-rm -rf data-processing-dist




# Install the source from the given directory into an existing venv
# Expected PYTHON_PROJECT_DIR and uses EXTRA_INDEX_URL if set.
# PYTHON_PROJECT_DIR is expected to have src and pyproject.toml
.PHONY: .defaults.install-src-venv
.defaults.install-src-venv::
@echo Begin installing source from $(PYTHON_PROJECT_DIR) into venv
$(call check_defined, PYTHON_PROJECT_DIR)
@echo Begin installing source from $(PYTHON_PROJECT_BUILD_DIR) into venv
$(call check_defined, PYTHON_PROJECT_BUILD_DIR)
@source venv/bin/activate; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \
pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \
if [ -e $(PYTHON_PROJECT_BUILD_DIR)/requirements.txt ]; then \
pip install -r $(PYTHON_PROJECT_BUILD_DIR)/requirements.txt; \
fi; \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR)
@echo Done installing source from $(PYTHON_PROJECT_DIR) into venv
if [ -e $(PYTHON_PROJECT_BUILD_DIR)/pyproject.toml ]; then \
if [ -z "$(PROJECT_BUILD_EXTRA)" ]; then \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_BUILD_DIR); \
else \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_BUILD_DIR)[$(PROJECT_BUILD_EXTRA)]; \
fi;\
fi
@echo Done installing source from $(PYTHON_PROJECT_BUILD_DIR) into venv
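# For illustration only: a call such as
#   $(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_RAY_BUILD_DIR) PROJECT_BUILD_EXTRA=ray .defaults.install-src-venv
# resolves the final editable install above to roughly
#   pip install -e $(REPOROOT)/data-processing-lib[ray]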

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.python-lib-src-venv
@@ -338,7 +403,7 @@ ifeq ($(USE_REPO_LIB_SRC), 1)
@echo Installing Python data processing library source to existing venv
@source venv/bin/activate; \
$(MAKE) PIP_TARGET=data-prep-toolkit .defaults.pip-uninstall; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_PYTHON_BUILD_DIR) .defaults.install-src-venv; \
echo Installed source from Python processing library for `which $(PYTHON)`
else
@# Help: DO NOT install Python data processing library source into existing venv
@@ -363,18 +428,19 @@ ifeq ($(USE_REPO_LIB_SRC), 1)
@# Help: Install Ray and Python data processing library source into existing venv
@echo Installing Ray and Python data processing library source to existing venv
@source venv/bin/activate; \
$(MAKE) PIP_TARGET=data-prep-toolkit-ray .defaults.pip-uninstall; \
$(MAKE) PIP_TARGET=data-prep-toolkit[ray] .defaults.pip-uninstall; \
$(MAKE) PIP_TARGET=data-prep-toolkit .defaults.pip-uninstall; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_RAY_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_PYTHON_BUILD_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_RAY_BUILD_DIR) PROJECT_BUILD_EXTRA=ray .defaults.install-src-venv; \
echo Installed source from Python and Ray data processing libraries for `which $(PYTHON)`
else
@# Help: DO NOT install Python or Ray data processing library source into existing venv
@echo USE_REPO_LIB_SRC!=1 so do NOT installing Python or Ray data processing library source into existing venv
endif
# Install the module python library if it has one
@if [ -d ../python ]; then \
source venv/bin/activate; \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=../python .defaults.install-src-venv; \
fi

# Install local requirements last as it generally includes our lib source
Expand All @@ -389,18 +455,18 @@ ifeq ($(USE_REPO_LIB_SRC), 1)
@# Help: Install Spark and Python data processing library source into existing venv
@echo Installing Spark and Python data processing library source to existing venv
@source venv/bin/activate; \
$(MAKE) PIP_TARGET=data-prep-toolkit-spark .defaults.pip-uninstall; \
$(MAKE) PIP_TARGET=data-prep-toolkit[spark] .defaults.pip-uninstall; \
$(MAKE) PIP_TARGET=data-prep-toolkit .defaults.pip-uninstall; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_SPARK_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_PYTHON_BUILD_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=$(DPK_SPARK_BUILD_DIR) PROJECT_BUILD_EXTRA=spark .defaults.install-src-venv; \
echo Installed source from Python and Spark processing libraries for `which $(PYTHON)`
else
@# Help: DO NOT install Python or Spark data processing library source into existing venv
@echo USE_REPO_LIB_SRC!=1 so do NOT installing Python or Spark data processing library source into existing venv
endif
if [ -d ../python ]; then \
source venv/bin/activate; \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_BUILD_DIR=../python .defaults.install-src-venv; \
fi

# Run tests in test directory from that dir after adding ../src to PYTHONPATH
@@ -652,3 +718,4 @@ endif
fi
${PYTHON} -m twine check dist/*
${PYTHON} -m twine upload --verbose --non-interactive dist/*

31 changes: 31 additions & 0 deletions data-processing-lib/README.md
@@ -0,0 +1,31 @@
# Data Processing Library
This library provides a Python framework for developing _transforms_
on data stored in files (currently Parquet files are supported)
and running them in a [ray](https://www.ray.io/) cluster.
Data files may be stored in the local file system or COS/S3.
For more details see the [documentation](../doc/overview.md).

### Virtual Environment
The project uses `pyproject.toml` and a Makefile for operations.
To do development, first establish the virtual environment
```shell
make venv
```
and then either activate it
```shell
source venv/bin/activate
```
or set up your IDE to use the `venv` directory when developing in this project.

## Library Artifact Build and Publish
To test, build and publish the library:
```shell
make test build publish
```

To bump the version number, edit the Makefile to change VERSION and rerun
the above. This will require committing both the `Makefile` and the
automatically updated `pyproject.toml` file.
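
The Ray and Spark runtimes are exposed as optional extras of this single package (see the
`[tool.setuptools.dynamic.optional-dependencies]` section of `pyproject.toml`). A minimal sketch of
pulling them into the venv, assuming the venv is activated:
```shell
pip install -e ".[ray]"    # adds the Ray runtime dependencies
pip install -e ".[spark]"  # adds the Spark runtime dependencies
```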



8 changes: 4 additions & 4 deletions data-processing-lib/pyproject.toml
@@ -15,20 +15,20 @@ dynamic = ["dependencies", "optional-dependencies"]
[project_urls]
Repository = "https://github.com/IBM/data-prep-kit"
Issues = "https://github.com/IBM/data-prep-kit/issues"
Documentation = "https://ibm.github.io/data-prep-kit/"
Documentation = "https://ibm.github.io/data-prep-kit/doc"
"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop"

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic.dependencies]
file = ["requirements.txt"]
file = ["python/requirements.txt"]

[tool.setuptools.dynamic.optional-dependencies]
dev = { file = ["requirements-dev.txt"]}
ray = { file = ["requirements-ray.txt"]}
spark = { file = ["requirements-spark.txt"]}
ray = { file = ["ray/requirements.txt"]}
spark = { file = ["spark/requirements.txt"]}

[tool.setuptools.packages.find]
where = ["python/src", "ray/src", "spark/src"]
7 changes: 4 additions & 3 deletions data-processing-lib/python/Makefile
@@ -27,15 +27,16 @@ publish:: publish-dist

publish-dist :: .check-env .defaults.publish-dist

venv:: pyproject.toml
@# Help: Create the virtual environment using pyproject.toml
venv::
@# Help: Create the virtual environment using pyproject.toml for installing python library
# pyproject.toml is now common for python, [ray] and [spark]
rm -r dist venv || true
rm -rf src/*egg-info || true
rm makeenv || true
$(PYTHON) -m venv venv
source venv/bin/activate; \
pip install --upgrade pip; \
pip install -e .; \
pip install -e ../ ; \
pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1

image::
56 changes: 0 additions & 56 deletions data-processing-lib/python/pyproject.toml

This file was deleted.

File renamed without changes.
2 changes: 1 addition & 1 deletion data-processing-lib/ray/Makefile
@@ -29,7 +29,7 @@ publish:: publish-dist

publish-dist :: .check-env .defaults.publish-dist

venv:: pyproject.toml
venv::
$(MAKE) .defaults.ray-lib-src-venv
pip install moto==5.0.5 markupsafe==2.0.1

