Skip to content

Commit

Permalink
Merge pull request #900 from touma-I/filter-simplify
Browse files Browse the repository at this point in the history
Refactored filter transform as its own dpk_filter named module
  • Loading branch information
touma-I authored Dec 23, 2024
2 parents 25103fa + 28eaaf8 commit d372c2b
Show file tree
Hide file tree
Showing 106 changed files with 568 additions and 1,931 deletions.
9 changes: 6 additions & 3 deletions transforms/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_transforms"
version = "1.0.0a1"
version = "1.0.0a2"
requires-python = ">=3.10,<3.13"
keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Transforms using Ray"
Expand Down Expand Up @@ -33,7 +33,6 @@ all = { file = [
## "language/pii_redactor/python/requirements.txt",

"universal/profiler/python/requirements.txt",
"universal/filter/python/requirements.txt",
"universal/resize/python/requirements.txt",

"language/lang_id/requirements.txt",
Expand All @@ -48,6 +47,7 @@ all = { file = [
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/requirements.txt",

"universal/filter/requirements.txt",
"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/fdedup/requirements.txt",
Expand All @@ -72,6 +72,7 @@ language = { file = [
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/requirements.txt",

"universal/filter/requirements.txt",
"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/fdedup/requirements.txt",
Expand All @@ -93,7 +94,6 @@ code_profiler = { file = ["code/code_profiler/python/requirements.txt"]}
pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]}

profiler = { file = ["universal/profiler/python/requirements.txt"]}
filter = { file = ["universal/filter/python/requirements.txt"]}
resize = { file = ["universal/resize/python/requirements.txt"]}

######## Named transforms
Expand All @@ -104,6 +104,7 @@ lang_id = { file = ["language/lang_id/requirements.txt"]}
pdf2parquet = { file = ["language/pdf2parquet/requirements.txt"]}
text_encoder = { file = ["language/text_encoder/requirements.txt"]}

filter = { file = ["universal/filter/requirements.txt"]}
doc_id = { file = ["universal/doc_id/requirements.txt"]}
hap = { file = ["universal/hap/requirements.txt"]}
ededup = { file = ["universal/ededup/requirements.txt"]}
Expand Down Expand Up @@ -134,6 +135,8 @@ dpk_ededup = "universal/ededup/dpk_ededup"
dpk_fdedup = "universal/fdedup/dpk_fdedup"
dpk_tokenization = "universal/tokenization/dpk_tokenization"
dpk_similarity = "language/similarity/dpk_similarity"
dpk_filter = "universal/filter/dpk_filter"


#[tool.setuptools.package-data]
#"*" = ["*.txt"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@ FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
Expand All @@ -16,22 +13,10 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=dpk:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root dpk_filter/ dpk_filter/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/filter_transform_python.py .

# copy some of the samples in
COPY ./src/filter_local.py local/
RUN pip install --no-cache-dir -r requirements.txt

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip
Expand All @@ -13,25 +14,9 @@ COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

## Copy the python version of the transform
COPY --chown=ray:users python-transform/ python-transform
RUN cd python-transform && pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/filter_transform_ray.py .

# copy some of the samples in
COPY src/filter_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/
COPY --chown=ray:users dpk_filter/ dpk_filter/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Grant non-root users the necessary permissions to the ray directory
RUN chmod 755 /home/ray
Expand All @@ -43,4 +28,4 @@ ENV PYTHONPATH /home/ray
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
LABEL git-commit=$GIT_COMMIT
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
FROM ${BASE_IMAGE}
FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest

USER root
# install pytest
Expand All @@ -13,23 +12,14 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=spark:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]

COPY --chown=spark:root python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

# Install project source
COPY --chown=spark:root src/ src/
COPY --chown=spark:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/filter_transform_spark.py .
## Copy the python version of the transform
COPY --chown=spark:root dpk_filter/ dpk_filter/
COPY --chown=spark:root requirements.txt requirements.txt
RUN pip install -r requirements.txt

# copy some of the samples in
COPY src/filter_local_spark.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

USER spark

Expand Down
89 changes: 18 additions & 71 deletions transforms/universal/filter/Makefile
Original file line number Diff line number Diff line change
@@ -1,79 +1,26 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/transforms/.make.cicd.targets

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
################################################################################

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

kind-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-save-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-build; \
fi

run-spark-cli-sample:
$(MAKE) venv
source venv/bin/activate && \
$(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \
--spark_local_config_filepath ../config/spark_profile_local.yml \
--data_local_config "{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}" \
--filter_criteria_list "[ 'docq_total_words > 100 AND docq_total_words < 200', 'ibmkenlm_docq_perplex_score < 230']" \
--filter_columns_to_drop "[ 'extra', 'cluster' ]"
Loading

0 comments on commit d372c2b

Please sign in to comment.