Skip to content

Commit

Permalink
Merge pull request #916 from pankajskku/dev
Browse files Browse the repository at this point in the history
Refactoring code profiler transform to new pythonic code layout #913
  • Loading branch information
touma-I authored Jan 8, 2025
2 parents a06a728 + 8992c2a commit 9773a03
Show file tree
Hide file tree
Showing 582 changed files with 4,859 additions and 11,354 deletions.
2 changes: 1 addition & 1 deletion transforms/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ build-pkg-dist:
fi \
done
# Only needs to build the whl
git show --no-patch > src/data/gitshow.txt
git show --no-patch > src/gitshow.txt
$(MAKE) BUILD_WHEEL_EXTRA_ARG=-w .defaults.build-dist
-rm -fr src

Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions transforms/code/code_profiler/.make.subdirs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python
ray
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,19 @@ RUN pip install --no-cache-dir pytest
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-dist data-processing-dist
COPY --chown=dpk:root data-processing-dist/ data-processing-dist/
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root requirements.txt requirements.txt

RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/code_profiler_transform_python.py .
# END OF STEPS destined for a data-prep-kit base image

# copy some of the samples in
COPY ./src/code_profiler_local.py local/

# Copy the tree-sitter bindings (this is the important part)
COPY --chown=dpk:root ../../input/tree-sitter-bindings/ /home/dpk/input/tree-sitter-bindings/

# copy test
# COPY test/ test/
# COPY test-data/ test-data/
COPY --chown=dpk:root dpk_code_profiler/ dpk_code_profiler/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Set environment
ENV PYTHONPATH /home/dpk
Expand Down
Original file line number Diff line number Diff line change
@@ -1,37 +1,22 @@
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

ARG DPK_WHEEL_FILE_NAME

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

COPY --chown=ray:users python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/code_profiler_transform_ray.py .

# copy some of the samples in
COPY ./src/code_profiler_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/
## Copy the python version of the transform
COPY --chown=ray:users dpk_code_profiler/ dpk_code_profiler/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Grant non-root users the necessary permissions to the ray directory
RUN chmod 755 /home/ray
Expand Down
77 changes: 30 additions & 47 deletions transforms/code/code_profiler/Makefile
Original file line number Diff line number Diff line change
@@ -1,49 +1,32 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image:
@echo "Skipping test-image step as per configuration."

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:

.PHONY: workflow-test
workflow-test:

.PHONY: workflow-upload
workflow-upload:

.PHONY: workflow-build
workflow-build:
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform and is used to match against
# expected files and is used to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################
# Default host architecture for the tree-sitter bindings.
# `?=` assigns only when unset, so a user's environment value
# (e.g. `export RUNTIME_HOST_ARCH=mach-arm64`, as the README instructs)
# takes precedence over this default instead of being clobbered.
export RUNTIME_HOST_ARCH ?= x86_64

.PHONY: run-cli-sample
# Run the python runtime of the transform from the CLI against
# test-data/input, writing results to ./output.
run-cli-sample:
	@# Use $(MAKE), not `make`, so -j/-n and the jobserver propagate to the sub-make.
	$(MAKE) venv
	source venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
	--language "language" \
	--contents "contents"

.PHONY: run-ray-cli-sample
# Run the ray runtime of the transform from the CLI against
# test-data/input, writing results to ./output.
run-ray-cli-sample:
	@# Use $(MAKE), not `make`, so -j/-n and the jobserver propagate to the sub-make.
	$(MAKE) venv
	source venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
	--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
	--run_locally True \
	--language "language" \
	--contents "contents"
110 changes: 95 additions & 15 deletions transforms/code/code_profiler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,13 @@ As shown in Table 2, the framework standardizes code representation by categoriz
| | **Ocaml** | Yes | NA | Yes |


* [python](python/README.md) - provides the base python-based syntactic concept extractor
implementation.
* [ray](ray/README.md) - provides the base ray-based syntactic concept extractor
implementation.



**Offline Path for Syntactic Rule Generation**

The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs.

The implementation for UI-based offline customization tool is present [here](python/src/offline-customizations). To run the tool, use the following command.
The implementation for the UI-based offline customization tool is present [here](dpk_code_profiler/offline-customizations). To run the tool, use the following command.

`streamlit run LLM_runner_app.py`
`streamlit run generic_LLM_runner_app.py`

The high-level system design is as follows:

Expand All @@ -62,12 +55,99 @@ For each new target language, the offline phase is utilized to create determinis

In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.

The following runtimes are available:
* [python](python/README.md) - provides the base python-based transformation
implementation and python runtime.
* [ray](ray/README.md) - enables the running of the base python transformation
in a Ray runtime

Please refer to the playbook at `transforms/code/code_profiler/notebook_example/code-profiler.ipynb` to run the pythonic code profiler.
## Configuration and command line Options

The set of dictionary keys holding [code_profiler_transform](dpk_code_profiler/transform.py)
configuration for values are as follows:

* content - specifies the column name in the dataframe that has the code snippet
* language - specifies the programming languages of the code snippet

## Running

Copy your input parquet file to `transforms/code/code_profiler/test-data/input`, the output will be created in a directory `transforms/code/code_profiler/output`

### Running the samples

The code profiler can be run on mach-arm64 and x86_64 host architecture.
Depending on your host architecture, please change the `RUNTIME_HOST_ARCH` in the Makefile.
```
# values possible mach-arm64, x86_64
export RUNTIME_HOST_ARCH=x86_64
```
If you are using a Mac, you may need to permit your Mac to load the .so from the security settings. Generally, the pop-up appears under the Security tab while running the transform.

![alt text](image.png)

To run the samples, use the following `make` target

* `run-cli-sample` - runs dpk_code_profiler/transform.py using command line args

This target will activate the virtual environment and sets up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.

## Testing

Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md)

Currently we have:
- [Unit test](test/test_code_profiler_python.py)
- [Integration test](test/test_code_profiler.py)


## Code Profiler Ray Transform
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.


### Configuration and command line Options

Code Profiler configuration and command line options are the same as for the base python transform.

### Running

#### Launched Command Line Options
When running the transform with the Ray launcher (i.e., TransformLauncher),
In addition to those available to the transform as defined here,
the set of
[ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md) are available.

#### Running the samples
To run the samples, use the following `make` target

* `run-ray-cli-sample` - runs dpk_code_profiler/ray/transform.py using command line args

This target will activate the virtual environment and sets up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-ray-cli-sample
...
```
Then
```shell
ls output
```
To see results of the transform.


### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.
Loading

0 comments on commit 9773a03

Please sign in to comment.