Skip to content

Commit

Permalink
Merge pull request #916 from pankajskku/dev
Browse files Browse the repository at this point in the history
Refactoring code profiler transform to new pythonic code layout #913
  • Loading branch information
touma-I authored Jan 8, 2025
2 parents a06a728 + 8992c2a commit 9773a03
Show file tree
Hide file tree
Showing 582 changed files with 4,859 additions and 11,354 deletions.
2 changes: 1 addition & 1 deletion transforms/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ build-pkg-dist:
fi \
done
# Only needs to build the whl
git show --no-patch > src/data/gitshow.txt
git show --no-patch > src/gitshow.txt
$(MAKE) BUILD_WHEEL_EXTRA_ARG=-w .defaults.build-dist
-rm -fr src

Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions transforms/code/code_profiler/.make.subdirs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python
ray
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,19 @@ RUN pip install --no-cache-dir pytest
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-dist data-processing-dist
COPY --chown=dpk:root data-processing-dist/ data-processing-dist/
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root requirements.txt requirements.txt

RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/code_profiler_transform_python.py .
# END OF STEPS destined for a data-prep-kit base image

# copy some of the samples in
COPY ./src/code_profiler_local.py local/

# Copy the tree-sitter bindings (this is the important part)
COPY --chown=dpk:root ../../input/tree-sitter-bindings/ /home/dpk/input/tree-sitter-bindings/

# copy test
# COPY test/ test/
# COPY test-data/ test-data/
COPY --chown=dpk:root dpk_code_profiler/ dpk_code_profiler/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Set environment
ENV PYTHONPATH /home/dpk
Expand Down
Original file line number Diff line number Diff line change
@@ -1,37 +1,22 @@
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

COPY --chown=ray:users python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/code_profiler_transform_ray.py .

# copy some of the samples in
COPY ./src/code_profiler_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/
## Copy the python version of the transform
COPY --chown=ray:users dpk_code_profiler/ dpk_code_profiler/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Grant non-root users the necessary permissions to the ray directory
RUN chmod 755 /home/ray
Expand Down
77 changes: 30 additions & 47 deletions transforms/code/code_profiler/Makefile
Original file line number Diff line number Diff line change
@@ -1,49 +1,32 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image:
@echo "Skipping test-image step as per configuration."

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:

.PHONY: workflow-test
workflow-test:

.PHONY: workflow-upload
workflow-upload:

.PHONY: workflow-build
workflow-build:
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################
# Default host architecture for the tree-sitter bindings.
# `?=` assigns only when unset, so a user's environment value
# (e.g. `export RUNTIME_HOST_ARCH=mach-arm64`, as the README instructs)
# takes precedence over this default instead of being clobbered.
export RUNTIME_HOST_ARCH ?= x86_64

.PHONY: run-cli-sample
# Run the python runtime of the transform from the CLI against
# test-data/input, writing results to ./output.
run-cli-sample:
	@# Use $(MAKE), not `make`, so -j/-n and the jobserver propagate to the sub-make.
	$(MAKE) venv
	source venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
	--language "language" \
	--contents "contents"

.PHONY: run-ray-cli-sample
# Run the ray runtime of the transform from the CLI against
# test-data/input, writing results to ./output.
run-ray-cli-sample:
	@# Use $(MAKE), not `make`, so -j/-n and the jobserver propagate to the sub-make.
	$(MAKE) venv
	source venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
	--run_locally True \
	--language "language" \
	--contents "contents"
110 changes: 95 additions & 15 deletions transforms/code/code_profiler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,13 @@ As shown in Table 2, the framework standardizes code representation by categoriz
| | **Ocaml** | Yes | NA | Yes |


* [python](python/README.md) - provides the base python-based syntactic concept extractor
implementation.
* [ray](ray/README.md) - provides the base ray-based syntactic concept extractor
implementation.



**Offline Path for Syntactic Rule Generation**

The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs.

The implementation for UI-based offline customization tool is present [here](python/src/offline-customizations). To run the tool, use the following command.
The implementation for the UI-based offline customization tool is present [here](dpk_code_profiler/offline-customizations). To run the tool, use the following command.

`streamlit run LLM_runner_app.py`
`streamlit run generic_LLM_runner_app.py`

The high-level system design is as follows:

Expand All @@ -62,12 +55,99 @@ For each new target language, the offline phase is utilized to create determinis

In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.

The following runtimes are available:
* [python](python/README.md) - provides the base python-based transformation
implementation and python runtime.
* [ray](ray/README.md) - enables the running of the base python transformation
in a Ray runtime

Please refer to the playbook at `transforms/code/code_profiler/notebook_example/code-profiler.ipynb` to run the pythonic code profiler.
## Configuration and command line Options

The set of dictionary keys holding [code_profiler_transform](dpk_code_profiler/transform.py)
configuration for values are as follows:

* content - specifies the column name in the dataframe that has the code snippet
* language - specifies the programming languages of the code snippet

## Running

Copy your input parquet file to `transforms/code/code_profiler/test-data/input`, the output will be created in a directory `transforms/code/code_profiler/output`

### Running the samples

The code profiler can be run on mach-arm64 and x86_64 host architecture.
Depending on your host architecture, please change the `RUNTIME_HOST_ARCH` in the Makefile.
```
# values possible mach-arm64, x86_64
export RUNTIME_HOST_ARCH=x86_64
```
If you are using a Mac, you may need to permit your Mac to load the .so from the security settings. Generally, the pop-up appears under the Security tab while running the transform.

![alt text](image.png)

To run the samples, use the following `make` target

* `run-cli-sample` - runs dpk_code_profiler/transform.py using command line args

This target will activate the virtual environment and sets up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.

## Testing

Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md)

Currently we have:
- [Unit test](test/test_code_profiler_python.py)
- [Integration test](test/test_code_profiler.py)


## Code Profiler Ray Transform
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.


### Configuration and command line Options

Code Profiler configuration and command line options are the same as for the base python transform.

### Running

#### Launched Command Line Options
When running the transform with the Ray launcher (i.e., TransformLauncher),
In addition to those available to the transform as defined here,
the set of
[ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md) are available.

#### Running the samples
To run the samples, use the following `make` target

* `run-ray-cli-sample` - runs dpk_code_profiler/ray/transform.py using command line args

This target will activate the virtual environment and sets up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-ray-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.


### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.
Loading

0 comments on commit 9773a03

Please sign in to comment.