generated from tu-studio/hpc-cluster-ml-workflow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 05250aa
Showing
26 changed files
with
948 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Ignore all files in this directory | ||
** | ||
# Except this file | ||
!requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
/config.local | ||
/tmp | ||
/cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[core] | ||
remote = myremote | ||
[cache] | ||
shared = group | ||
type = symlink | ||
['remote "myremote"'] | ||
url = webdavs://tubcloud.tu-berlin.de/remote.php/dav/files/cf531c5e-2043-103b-8745-111da40a61ee/dvcR | ||
timeout = 600 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Add patterns of files dvc should ignore, which could improve | ||
# the performance. Learn more at | ||
# https://dvc.org/doc/user-guide/dvcignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Builds the project Docker image whenever the Dockerfile, the Python
# requirements, or this workflow change. Pushes to the default/feature
# branches also publish the image to Docker Hub; pull requests only verify
# that the image still builds.
name: Docker Image CI

on:
  push:
    paths:
      - 'Dockerfile'
      - 'requirements.txt'
      - '.github/workflows/docker-image.yml'
  pull_request:
    paths:
      - 'Dockerfile'
      - 'requirements.txt'
      - '.github/workflows/docker-image.yml'

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Export every non-comment KEY=VALUE line of global.env to the following
      # steps. `read` puts everything after the first '=' into `value`, so
      # values that themselves contain '=' are preserved.
      - name: Load Environment Variables from global.env
        run: |
          grep -v '^#' global.env | grep '=' | while IFS='=' read -r key value; do
            if [[ -n "$key" && -n "$value" ]]; then
              echo "$key=$value" >> "$GITHUB_ENV"
            fi
          done

      # Repository secrets are not available to pull requests from forks, and a
      # pull request must never publish an image, so log in only on push events.
      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          # Build-only on pull requests; build and publish on push.
          push: ${{ github.event_name != 'pull_request' }}
          # The namespace comes from global.env so that the pushed tag is the
          # same one batchjob.sh pulls
          # ($TUSTU_DOCKERHUB_USERNAME/$TUSTU_PROJECT_NAME-image:latest) and is
          # non-empty even when secrets are unavailable.
          tags: ${{ env.TUSTU_DOCKERHUB_USERNAME }}/${{ env.TUSTU_PROJECT_NAME }}-image:latest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Ignore typical temporary and system files | ||
.DS_Store | ||
__pycache__/ | ||
|
||
# Ignore Python virtual environments | ||
myenv | ||
venv | ||
|
||
# Ignore data files | ||
data/*/* | ||
logs/*/* | ||
/dvclive | ||
/models | ||
/temp | ||
/exp-logs/* | ||
|
||
# Not ignore | ||
!logs/*/.gitkeep | ||
!*.dvc | ||
|
||
# Ignore personal notes | ||
NOTES.md | ||
|
||
# Ignore singularity image | ||
ml-pipeline-image_latest.sif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Use an official Debian runtime as a parent image
FROM debian:11-slim

# Install the build toolchain and headers needed to compile CPython, plus the
# tools used by the pipeline at runtime (git, ssh client, rsync).
# ca-certificates is listed explicitly so that wget can verify the TLS
# certificate of python.org regardless of which other package happens to pull
# it in. The apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    libbz2-dev \
    libffi-dev \
    libgdbm-dev \
    libncurses5-dev \
    libreadline-dev \
    libsqlite3-dev \
    libssl-dev \
    openssh-client \
    python3-pip \
    rsync \
    wget \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# CPython release that is compiled from source below (final releases only).
# Override with `--build-arg PYTHON_VERSION=x.y.z`.
ARG PYTHON_VERSION=3.12.4

# Download, build and install CPython. The download is done with certificate
# verification enabled. `make altinstall` installs pythonX.Y without replacing
# the distribution's own python3 binary; the source tree and archive are
# deleted in the same layer to save space, and a python3 symlink is created in
# /usr/local/bin. ${PYTHON_VERSION%.*} strips the patch component
# (3.12.4 -> 3.12) to match the name of the installed interpreter.
RUN wget "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" \
    && tar -xf "Python-${PYTHON_VERSION}.tgz" \
    && cd "Python-${PYTHON_VERSION}" \
    && ./configure --enable-optimizations \
    && make -j"$(nproc)" \
    && make altinstall \
    && cd .. \
    && rm -rf "Python-${PYTHON_VERSION}" "Python-${PYTHON_VERSION}.tgz" \
    && ln -s "/usr/local/bin/python${PYTHON_VERSION%.*}" /usr/local/bin/python3

# Set the working directory
WORKDIR /home/app

# Copy the python requirements list to /home/app and install them.
# --no-cache-dir keeps pip's download cache out of the image layer.
COPY requirements.txt .
RUN python3 -m pip install --no-cache-dir -r requirements.txt \
    && rm requirements.txt
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# ml-training-pipeline | ||
|
||
This repository provides a comprehensive template for managing reproducible machine learning training pipelines in the context of audio. The template uses [DVC](https://dvc.org/) (data version control) and is tailored to experiments on the remote SLURM-based [HPC cluster of the Technical University of Berlin](https://www.tu.berlin/campusmanagement/angebot/high-performance-computing-hpc).
|
||
## Features | ||
|
||
|
||
## Install and Setup | ||
|
||
``` | ||
git clone https://github.com/tu-studio/dataset-pipeline-template | ||
``` | ||
|
||
|
||
Create and set up a virtual environment inside the repository. If you choose a name other than *venv* or *myenv*, make sure to add the directory name of your virtual environment to the .gitignore.
|
||
|
||
``` | ||
cd ml-training-pipeline | ||
python3 -m venv venv | ||
echo venv/ >> .gitignore | ||
source venv/bin/activate | ||
pip install -r requirements.txt | ||
``` | ||
|
||
|
||
Initialize a DVC repository.
|
||
``` | ||
dvc init | ||
``` | ||
|
||
Add a WebDAV server as remote storage to your dvc repository. | ||
|
||
``` | ||
dvc remote add -d myremote webdavs://tubcloud.tu-berlin.de/remote.php/dav/files/cf531c5e-2043-103b-8745-111da40a61ee/DVC | ||
``` | ||
|
||
Add your username and password for server access to a private config file (will be ignored by git).
|
||
``` | ||
dvc remote modify --local myremote user 'yourusername' | ||
dvc remote modify --local myremote password 'yourpassword' | ||
dvc remote modify myremote ask_password true | ||
``` | ||
|
||
Add the raw data folder to the dvc repository. | ||
|
||
``` | ||
dvc add data/raw | ||
``` | ||
|
||
|
||
## Usage | ||
|
||
|
||
|
||
## Contributors | ||
|
||
- [Michael Witte](https://github.com/michaelwitte) | ||
- [Fares Schulz](https://github.com/faressc) | ||
|
||
## License | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# TODO | ||
|
||
- Document in the README.md that a global git config must be available, because we push to the repository. Singularity automatically mounts the $HOME directory, so the global config found there is used; for Docker this is not the case, so another solution has to be found.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/bin/bash

#SBATCH -J exp_job
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --ntasks-per-core=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:tesla:1
#SBATCH --mem=100GB
#SBATCH --time=1:00:00
#SBATCH --partition=gpu
#SBATCH --output=./logs/slurm/slurm-%j.out

# Runs one DVC experiment inside the project singularity image.
#
# Optional environment variables supplied by the submitter:
#   INDEX       name of the per-job workspace below ../$TUSTU_TEMP_PATH
#               (defaults to 0)
#   EXP_PARAMS  extra arguments passed to `dvc exp run`

# Load necessary modules
module load singularity/4.0.2

# Set environment variables defined in global.env. `set -a` marks every
# variable assigned while sourcing the file for export, so they are also
# visible inside the container.
set -a
. ./global.env
set +a

# Single source of truth for the image file name, so that the image that is
# removed, pulled and executed is always the same file.
IMAGE_FILE="${TUSTU_PROJECT_NAME}-image_latest.sif"

# Remove the previous singularity image if it exists
rm -f "$IMAGE_FILE"

# Pull the latest docker image from Docker Hub and convert it to a singularity image. Using cached singularity image if nothing changed
singularity pull "$IMAGE_FILE" "docker://$TUSTU_DOCKERHUB_USERNAME/$TUSTU_PROJECT_NAME-image:latest"

echo "Starting singularity execution..."

# Run the singularity container. DEFAULT_DIR carries the submission directory
# into the container so that the copied workspace can share its DVC cache.
# NOTE: the script below is wrapped in single quotes, so it must not contain
# any single-quote characters.
DEFAULT_DIR="$PWD" singularity exec --nv "$IMAGE_FILE" bash -c '
    echo "Checking directory existence..."
    if [ ! -d "../$TUSTU_TEMP_PATH" ]; then
        mkdir -p "../$TUSTU_TEMP_PATH"
        echo "The directory ../$TUSTU_TEMP_PATH has been created."
    else
        echo "The directory ../$TUSTU_TEMP_PATH exists."
    fi
    if [ -z "$INDEX" ]
    then
        echo "Creating new index 0..."
        INDEX=0
    fi
    mkdir "../$TUSTU_TEMP_PATH/$INDEX"
    echo "Copying files..."
    # Copy all tracked files plus the local DVC credentials and the git
    # metadata into the per-job workspace, preserving relative paths.
    {
        git ls-files;
        echo ".dvc/config.local";
        echo ".git";
    } | while IFS= read -r file; do
        cp -r --parents "$file" "../$TUSTU_TEMP_PATH/$INDEX/"
    done
    # Abort instead of running dvc in the original checkout when the
    # workspace cannot be entered.
    cd "../$TUSTU_TEMP_PATH/$INDEX" || exit 1
    echo "Setting DVC cache directory..."
    dvc cache dir "$DEFAULT_DIR/.dvc/cache"
    # dvc config cache.shared group
    # dvc config cache.type symlink
    echo "Pulling data with DVC..."
    dvc pull
    echo "Running experiment..."
    # EXP_PARAMS is intentionally unquoted so that it splits into the
    # individual arguments of dvc exp run. The workspace is only removed when
    # every step of the chain succeeded.
    dvc exp run $EXP_PARAMS &&
    echo "Pushing experiment..." &&
    dvc exp push origin &&
    echo "Cleaning up..." &&
    cd .. &&
    rm -rf "$INDEX"
'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
/raw | ||
/processed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
outs: | ||
- md5: 258f409b9cb543c7cae3902ab7eb808f.dir | ||
size: 65879160 | ||
nfiles: 2 | ||
hash: md5 | ||
path: raw |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
stages: | ||
preprocess: | ||
cmd: python3 source/preprocess.py | ||
deps: | ||
- source/preprocess.py | ||
- data/raw/ | ||
params: | ||
- preprocess.input_file | ||
- preprocess.target_file | ||
- preprocess.input_size | ||
- preprocess.test_split | ||
- preprocess.output_dir | ||
outs: | ||
- data/processed/ | ||
train: | ||
cmd: python3 source/train.py | ||
deps: | ||
- source/train.py | ||
- source/model.py | ||
- data/processed/ | ||
params: | ||
- train.name | ||
- train.batch_size | ||
- train.epochs | ||
- train.train_mode | ||
- train.device | ||
outs: | ||
- models/checkpoints/ | ||
- dvclive/ | ||
- exp-logs/tensorboard/ | ||
export: | ||
cmd: python3 source/export.py | ||
deps: | ||
- source/export.py | ||
- models/checkpoints/ | ||
params: | ||
- preprocess.input_size | ||
- train.name | ||
- train.train_mode | ||
outs: | ||
- models/exports/ | ||
save_logs: | ||
cmd: python3 source/utils/save_logs.py | ||
outs: | ||
- exp-logs/slurm | ||
|
||
metrics: | ||
- dvclive/metrics.json | ||
plots: | ||
- dvclive/plots/metrics: | ||
x: step |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import itertools | ||
import subprocess | ||
import os | ||
|
||
# Submit experiment for hyperparameter combination | ||
def submit_batch_job(index, test_split, batch_size):
    """Submit ``batchjob.sh`` to SLURM for one hyperparameter combination.

    The combination is handed to the batch script through two environment
    variables: ``EXP_PARAMS`` (the ``-S`` overrides for ``dvc exp run``) and
    ``INDEX`` (the name of the per-job workspace).

    Args:
        index: Identifier of this submission; exported as ``INDEX``.
        test_split: Value for ``preprocess.test_split``.
        batch_size: Value for ``train.batch_size``.

    Returns:
        The ``subprocess.CompletedProcess`` of the ``sbatch`` call, so callers
        can inspect ``returncode``. Earlier versions returned ``None``;
        callers that ignore the result are unaffected.
    """
    # Local import: only needed for the failure message below.
    import sys

    # Set dynamic parameters for the batch job as environment variables.
    # os.environ is merged in, otherwise the PATH is not found.
    env = {
        **os.environ,
        "EXP_PARAMS": f"-S preprocess.test_split={test_split} -S train.batch_size={batch_size}",
        "INDEX": str(index),
    }
    # Run sbatch through bash (otherwise `module` is not found in the job).
    result = subprocess.run(['/usr/bin/bash', '-c', 'sbatch batchjob.sh'], env=env)
    # Report a rejected submission instead of silently dropping it; the
    # remaining combinations are still submitted by the caller.
    if result.returncode != 0:
        print(
            f"sbatch submission for index {index} failed with exit code {result.returncode}",
            file=sys.stderr,
        )
    return result
|
||
if __name__ == "__main__":
    # Hyperparameter grid: one batch job is submitted per element of the
    # cartesian product, numbered consecutively from 0.
    test_split_list = [0.2, 0.3]
    batch_size_list = [2048, 4096]
    grid = itertools.product(test_split_list, batch_size_list)
    for job_index, combination in enumerate(grid):
        submit_batch_job(job_index, *combination)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
TUSTU_PROJECT_NAME=ml-pipeline | ||
TUSTU_DOCKERHUB_USERNAME=tustudio | ||
TUSTU_LOGS_PATH=logs | ||
TUSTU_TEMP_PATH=temp |
Empty file.
Empty file.
Oops, something went wrong.