diff --git a/.make.versions b/.make.versions index ed36fe8c8..564caa214 100644 --- a/.make.versions +++ b/.make.versions @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release. DPK_MINOR_VERSION=2 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set. -DPK_MICRO_VERSION=2 +DPK_MICRO_VERSION=3 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev2 +DPK_VERSION_SUFFIX=.dev0 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.4.dev0 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. @@ -59,3 +59,11 @@ else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif +################################################################################ +# This defines the transforms' package version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transforms (e.g., pyproject.toml). 
+TRANSFORMS_PKG_VERSION=0.2.3.dev0 diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index 4fcc97ed9..69e914f0c 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev1" +version = "0.2.4.dev0" requires-python = ">=3.10,<3.13" keywords = [ "data", diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 2e827ea82..40bf6b2a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2.dev2" +version = "0.2.3.dev0" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index d7058f2ae..f09b2f32a 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 04b6bc7a2..01c5b3e17 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index df27ad1cf..aa7a6dd3a 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/release-notes.md b/release-notes.md index 15f23c542..4b7b8d553 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,42 @@ # Data Prep Kit Release notes +## Release 0.2.2 - 11/25/2024 + +### General +1. Update RAG example to use granite model +1. Updated transforms with Docling 2 +1. 
Added single package for dpk with extra for \[spark\] and \[ray\] +1. Added single package for transforms with extra for \[all\] or \[individual-transform-name\] + + +### data-prep-toolkit libraries (python, ray, spark) + +1. Fix metadata logging even when actors crash +1. Add multilock for ray workers downloads/cleanup +1. Multiple updates to spark runtime +1. Added support for python 3.12 +1. refactoring of data access code + + +### KFP Workloads + +1. Modify superpipeline params type Str/json +1. Set kuberay apiserver version +1. Add Super pipeline for code transforms + + +### Transforms + +1. Enhance pdf2parquet with docling2 support for extracting HTML, DOCS, etc. +1. Added web2parquet transform +1. Added HAP transform + +### HTTP Connector 0.2.3 + +1. Enhanced parameter/configuration allows the user to customize crawler settings +1. implement subdomain focus feature in data-prep-connector + + ## Release 0.2.2- HTTP Connector Module - 10/23/2024 ### General diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 5e6f41bb2..be84b2f20 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index bbb84b749..cec7f9c5f 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 15a4be4c1..d56fed1e8 100644 --- 
a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk-code2parquet-transform-python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code2parquet-transform-python==0.2.3.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 492603d54..334c86fed 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 8608c6d6e..27706b467 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 933152e3f..9b760c1c3 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = 
{text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 5f201c8ae..17cbce67d 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index 0bd936ef2..ef627d39f 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 290429f95..eceee32ed 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff 
--git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index ecaf4d7bb..3703ec55f 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index c2d0d8793..915a462dc 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index adff71cfc..5fb1bcf26 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-header-cleanser-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 30f2f001e..3345d3a5a 100644 --- a/transforms/code/license_select/python/pyproject.toml 
+++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 368287e5d..2f67f6a80 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 815121787..ce5979d62 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-license-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 22d92fd8c..a1bc05ab4 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = 
"Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.3.dev0", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 791b8d253..659ee62ef 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-malware-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 186198d83..e5736a9c7 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 368287e5d..2f67f6a80 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index bf3e5f9f4..d8288d189 100644 --- 
a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-proglang-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index 80440a362..9581c8941 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index c24d0c3e2..207ab9249 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 
29b594fac..4fb356038 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 72406e945..23538b8c7 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 2993d6b12..4aa2d8111 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index dc13d5f94..ec56ac2c7 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-doc_quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git 
a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb new file mode 100644 index 000000000..669a4d30d --- /dev/null +++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8435e1f7-0c2e-49f4-a77a-b525ee6c532b", + "metadata": {}, + "source": [ + "# Html2Parquet Transform Sample Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9420989-ec8a-4fde-9a93-dc25096389f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "20663a67-5aa1-4b61-b989-94201613e41f", + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "\n", + "from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d85491b-0093-46e7-8653-ca8052ea59f0", + "metadata": {}, + "source": [ + "## Specify input/output folders and parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e75f6922-eb0f-4164-a536-f96393e04604", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": \"/path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n", + " \"output_folder\": \"/path/to/your/output/folder\",\n", + "}\n", + "\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.zip', '.html']\"),\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "0dcd1249-1eb8-4b33-9827-626f90c840b4", + "metadata": {}, + "source": [ + "## Invoke the html2parquet transformation" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4d2354db-1bb3-4a71-98df-f0f148af3a02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:09:40 INFO - html2parquet parameters are : {'output_format': , 'favor_precision': , 'favor_recall': }\n", + "17:09:40 INFO - pipeline id pipeline_id\n", + "17:09:40 INFO - code location None\n", + "17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", + "17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n", + "17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n", + "17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n", + "17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n", + "17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n", + "17:09:47 INFO - done flushing in 0.0 sec\n", + "17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params))\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3c66468d-703f-427f-a1dd-a758edd334de", + 
"metadata": {}, + "source": [ + "## Checking the output Parquet file" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e2bee8da-c566-4e45-bca1-354dfd04b0df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledocumentcontentsdocument_idsizedate_acquired
0ai-alliance-index.htmlai-alliance-index.html![](https://images.prismic.io/ai-alliance/Ztf3...f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...3942024-11-13T17:09:40.947095
\n", + "
" + ], + "text/plain": [ + " title document \\\n", + "0 ai-alliance-index.html ai-alliance-index.html \n", + "\n", + " contents \\\n", + "0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n", + "\n", + " document_id size \\\n", + "0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n", + "\n", + " date_acquired \n", + "0 2024-11-13T17:09:40.947095 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "table = pq.read_table('/path/to/your/output/folder/sample.parquet')\n", + "table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cde6e37d-c437-490f-8e01-f4f51a123484", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md index 0d25553e1..35e781007 100644 --- 
a/transforms/language/html2parquet/python/README.md +++ b/transforms/language/html2parquet/python/README.md @@ -1,25 +1,52 @@ -# html2parquet Transform +# HTML to Parquet Transform -This tranforms iterate through zip of HTML files or single HTML files and generates parquet files containing the converted document in string. +--- -The HTML conversion is using the [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). +## Description -## Output format +This transform iterates through zipped collections of HTML files or single HTML files and generates Parquet files containing the extracted content, leveraging the [Trafilatura library](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for extraction of text, tables, images, and other components. -The output format will contain the following colums +--- + +## Contributors + +- Sungeun An (sungeun.an@ibm.com) +- Syed Zawad (szawad@ibm.com) + +--- + +## Date + +**Last updated:** 10/16/24 +**Update details:** Enhanced table and image extraction features by adding the corresponding Trafilatura parameters. + +--- + +## Input and Output + +### Input +- Accepted Formats: Single HTML files or zipped collections of HTML files. 
+- Sample Input Files: [sample html files](test-data/input) + +### Output +- Format: Parquet files with the following structure: ```jsonc { - "title": "string", // the member filename - "document": "string", // the base of the source archive - "contents": "string", // the content of the HTML + "title": "string", // the member filename + "document": "string", // the base of the source archive + "contents": "string", // the content of the HTML "document_id": "string", // the document id, a hash of `contents` "size": "string", // the size of `contents` "date_acquired": "date", // the date when the transform was executing } ``` + + ## Parameters +### User-Configurable Parameters + The table below provides the parameters that users can adjust to control the behavior of the extraction: | Parameter | Default | Description | @@ -28,6 +55,8 @@ The table below provides the parameters that users can adjust to control the beh | `favor_precision` | `True` | Prefers less content but more accurate extraction. Options: `True`, `False`. | | `favor_recall` | `True` | Extracts more content when uncertain. Options: `True`, `False`. | +### Default Parameters + The table below provides the parameters that are enabled by default to ensure a comprehensive extraction process: | Parameter | Default | Description | @@ -43,6 +72,7 @@ The table below provides the parameters that are enabled by default to ensure a - To prioritize extracting more content over accuracy, set `favor_recall=True` and `favor_precision=False`. - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. 
+ ## Example ### Sample HTML @@ -155,3 +185,27 @@ Chicago | ## Contact Us ``` +## Usage + +### Command-Line Interface (CLI) + +Run the transform with the following command: + +``` +python ../html2parquet/python/src/html2parquet_transform_python.py \ + --data_local_config "{'input_folder': '../html2parquet/python/test-data/input', 'output_folder': '../html2parquet/python/test-data/expected'}" \ + --data_files_to_use '[".html", ".zip"]' +``` + +- When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. + + +### Sample Notebook + +See the [sample notebook](../notebooks/html2parquet.ipynb) +for an example. + + +## Further Resources + +- [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index dfd0c3928..3a7a6efbc 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt index af6ffe1e5..f21e65774 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 873883e49..5e888748c 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ 
[project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 151d05a3e..9aa193432 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2.dev2 -data-prep-toolkit[ray]==0.2.2.dev2 +dpk-html2parquet-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]==0.2.3.dev0 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index c5de6826a..a69724a2d 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index a405f7afc..06bec1ab9 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index ac45a167e..dba929905 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python 
= ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-lang_id-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 2912af252..310909164 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 2b414c59e..34831cde8 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.3.dev0 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 958210865..0abcc1d96 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b96f16615..4549851d0 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ 
b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_pii_redactor_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 87dad3c1c..dc15beb6e 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 2eb79e69b..3ac880bba 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index ef08f697a..f1b2c09d5 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray 
Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-text_encoder-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index badb8bbd9..eb25124c6 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev4" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -85,6 +85,9 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} #[tool.setuptools.package-data] #"*" = ["*.txt"] +#[tool.setuptools.packages.find] +#where = ["src"] + #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 9012f685b..b0527bdd6 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.3.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 8b48a970f..934c95182 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/transform.config b/transforms/transform.config index c226171c6..7bafba684 100644 --- a/transforms/transform.config +++ b/transforms/transform.config @@ -7,11 +7,3 @@ # expected files and is used to define the transform's image name. 
TRANSFORM_NAME=data-prep-kit-transforms -################################################################################ -# This defines the transforms' package version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=0.2.2.dev2 diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 0e2658087..1a962662d 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 368287e5d..2f67f6a80 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 5a5941155..da34dded3 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2.dev2", - 
"data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_doc_id_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 36f345c09..369a1bb72 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 735104f20..da28e715f 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 75baaef62..aa73a106a 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 9e3885e50..424e220fd 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = 
"0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_ededup_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_ededup_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index 97be33d54..08b20ed75 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 4e69a72e4..3e5dfc16d 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index cb8c6306a..485d6de21 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt index 6ee40ef7f..81e48e5ee 100644 --- a/transforms/universal/fdedup/ray/requirements.txt +++ 
b/transforms/universal/fdedup/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.2.dev2 -dpk_fdedup_transform_python==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.3.dev0 +dpk_fdedup_transform_python==0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 tqdm==4.66.3 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index f77df2010..8a072b31b 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index c373ffbb7..bfb0f04a2 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,5 +1,5 @@ -dpk_fdedup_transform_python==0.2.2.dev2 -data-prep-toolkit[spark]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.3.dev0 +data-prep-toolkit[spark]==0.2.3.dev0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 64f148799..fcf0f6419 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 9f1feff29..100626f60 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ 
-data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a794a1a0b..64776e0c1 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-filter-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 7b60dba46..ef46c9a1b 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 389788363..bf7c85577 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git 
a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 505dd9ceb..1250d1f77 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index abbb1a30c..38e78938b 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 0ed65f625..7c4c8eb94 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,6 +1,6 @@ -data-prep-toolkit[ray]==0.2.2.dev2 -dpk-hap-transform-python==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.3.dev0 +dpk-hap-transform-python==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 998161e31..ff9a24244 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.3.dev0", ] 
[build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 5d475fe12..da9327917 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index f867fb070..d3cd47bf6 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 95775e3a6..39d9788f8 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} 
diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 89801e4ad..526140ada 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 6060653fa..ac8d729ec 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_profiler_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_profiler_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 455684b4f..6ba790301 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/python/pyproject.toml 
b/transforms/universal/resize/python/pyproject.toml index 082f37f0c..6fdad69d0 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 368287e5d..2f67f6a80 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 1490303bb..c266a39f4 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 538c12d20..7de14c673 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Spark 
Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index bc352f0fd..dbb8e84ba 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 5e00dbaa1..8a1920162 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.3.dev0 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 095cb63e0..c094b9e7e 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - 
"dpk-tokenization-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-tokenization-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 5c989591d..1af3f12a4 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2.dev2 -data_prep_connector>=0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 +data_prep_connector>=0.2.3 \ No newline at end of file diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb index 2bd55f0bc..ea802d734 100644 --- a/transforms/universal/web2parquet/web2parquet.ipynb +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -5,12 +5,12 @@ "id": "afd55886-5f5b-4794-838e-ef8179fb0394", "metadata": {}, "source": [ - "##### **** These pip install need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release\n", + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirements file that includes the right release. Example for transform developers working from a git clone:\n", + "##### \n", "\n", - "##### **** example: \n", "```\n", - "python -m venv && source venv/bin/activate\n", - "pip install -r requirements.txt\n", + "make venv \n", + "source venv/bin/activate \n", "pip install jupyterlab\n", "```" ]