Week 3 init (#20)

truskovskiyk · Nov 22, 2023 · 150d3df · 150d3df
1 parent 933ebd8
commit 150d3df
Show file tree

Hide file tree

Showing 21 changed files with 876 additions and 38 deletions.
diff --git a/.github/workflows/week-3-ci.yml b/.github/workflows/week-3-ci.yml
@@ -1,12 +1,9 @@
 name: week-3-ci
 
 on:
-  # pull_request:
-  #   branches:
-  #     - main
   push:
     branches:
-      - course-04-2023-week-3
+      - course-27-10-2023-week-3
 env:
   IMAGE_MAIN_NAME: nlp-sample
   IMAGE_MAIN_TAG: latest
@@ -55,18 +52,18 @@ jobs:
         run: |
           docker run -e WANDB_PROJECT=${{ secrets.WANDB_PROJECT }} -e WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} nlp-sample:latest make test_all
 
-      - name: Push
-        uses: docker/build-push-action@v2
-        with:
-          context: week-3/nlp-sample
-          file: week-3/nlp-sample/Dockerfile
-          push: true
-          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:${{ env.IMAGE_MAIN_TAG }}
-          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache
-          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache,mode=max
+      # - name: Push
+      #   uses: docker/build-push-action@v2
+      #   with:
+      #     context: week-3/nlp-sample
+      #     file: week-3/nlp-sample/Dockerfile
+      #     push: true
+      #     tags: ${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:${{ env.IMAGE_MAIN_TAG }}
+      #     cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache
+      #     cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/${{ env.IMAGE_MAIN_NAME }}:buildcache,mode=max
 
   cml-test:
-    # needs: test
+
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2

diff --git a/week-3/README.md b/week-3/README.md
@@ -2,6 +2,7 @@
 
 - [Python project](https://github.com/navdeep-G/samplemod.git)
 - [ML project](https://github.com/ashleve/lightning-hydra-template.git)
+- [Advanced features](https://github.com/Lightning-AI/lightning)
 
 # Configuration 
 
@@ -10,17 +11,14 @@
 
 # Example ML model with testing
 
-
 [nlp-sample](./nlp-sample)
 
-
 # Experiments
 
 https://neptune.ai/blog/best-ml-experiment-tracking-tools
 
 ## AIM 
 
-https://neptune.ai/blog/best-ml-experiment-tracking-tools
 https://github.com/aimhubio/aim
 
 
@@ -35,3 +33,79 @@ kubectl port-forward svc/my-aim-service  8080:80 --namespace default
 - https://github.com/ivylee/model-cards-and-datasheets
 - https://arxiv.org/abs/1810.03993
 
+
+# LLMs for everything
+
+
+## LoRA & PEFT
+
+- https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+- https://github.com/huggingface/peft
+
+## Experiments 
+
+- https://github.com/georgian-io/LLM-Finetuning-Hub
+- https://medium.com/georgian-impact-blog/the-practical-guide-to-llms-llama-2-cdf21d540ce3
+
+## Run example
+
+```
+python lora_training/mistral_classification.py training-llm --pretrained-ckpt mistralai/Mistral-7B-v0.1 --epochs 1 --train-sample-fraction 0.3
+python lora_training/mistral_classification.py training-llm --pretrained-ckpt facebook/opt-350m --epochs 1 --train-sample-fraction 0.3
+
+python lora_training/mistral_classification.py inference-llm
+```
+
+
+https://github.com/brevdev/notebooks/blob/main/mistral-finetune-own-data.ipynb
+
+## Run RLHF example
+
+
+```
+docker build -t rlhf:latest .
+docker run --net=host --gpus all -it -v ${PWD}:/main rlhf:latest /bin/bash
+
+accelerate config
+python sft_llama2.py
+
+```
+
+https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2/scripts
+
+
+## Eval:
+
+- https://github.com/explodinggradients/ragas
+- https://github.com/NVIDIA/NeMo-Guardrails
+- https://github.com/guardrail-ml/guardrail
+- https://github.com/promptfoo/promptfoo
+- https://github.com/confident-ai/deepeval
+
+
+
+```
+pip install nemoguardrails
+pip install openai
+export OPENAI_API_KEY=**********
+```
+
+
+
+# Distributed training 
+
+- https://www.anyscale.com/blog/what-is-distributed-training
+- https://www.anyscale.com/blog/training-175b-parameter-language-models-at-1000-gpu-scale-with-alpa-and-ray
+- https://huggingface.co/docs/transformers/perf_train_gpu_many
+- https://github.com/microsoft/DeepSpeed
+
+
+# Hyperparameter search & AutoML
+
+- https://github.com/microsoft/nni
+- https://github.com/autogluon/autogluon
+
+
+# Declarative ML
+
+https://predibase.com/blog/how-to-fine-tune-llama-2-on-your-data-with-scalable-llm-infrastructure
diff --git a/week-3/lora_training/__init__.py b/week-3/lora_training/__init__.py
diff --git a/week-3/lora_training/datasets_prep.py b/week-3/lora_training/datasets_prep.py
@@ -0,0 +1,85 @@
+import pandas as pd
+import datasets
+from datasets import load_dataset
+from sklearn.model_selection import train_test_split
+
+# Prompt templates for the newsgroup classifier. The training template is the
+# inference template with the label appended, so a fine-tuned model is queried
+# in exactly the format it was trained on.
+TRAINING_CLASSIFIER_PROMPT_v2 = """### Sentence:{sentence} ### Class:{label}"""
+INFERENCE_CLASSIFIER_PROMPT_v2 = """### Sentence:{sentence} ### Class:"""
+
+def clean_newsgroup_data(texts, labels):
+    """Drop malformed rows and pick one example text per label.
+
+    A (text, label) pair is kept only when both members are ``str``.
+
+    Args:
+        texts: Iterable of candidate texts.
+        labels: Iterable of candidate labels, paired positionally with texts.
+
+    Returns:
+        A tuple ``(label2data, clean_data, clean_labels)`` where
+        ``label2data`` maps each kept label to the first kept text seen for
+        it, and the two lists hold the kept texts and labels in input order.
+    """
+    kept_pairs = [
+        (text, label)
+        for text, label in zip(texts, labels)
+        if isinstance(text, str) and isinstance(label, str)
+    ]
+    clean_data = [text for text, _ in kept_pairs]
+    clean_labels = [label for _, label in kept_pairs]
+
+    label2data = {}
+    for text, label in kept_pairs:
+        # setdefault keeps the first text recorded for each label.
+        label2data.setdefault(label, text)
+
+    return label2data, clean_data, clean_labels
+
+def get_newsgroup_instruction_data(mode, texts, labels):
+    """Render one prompt string per (text, label) pair.
+
+    Args:
+        mode: ``"train"`` to embed the label in each prompt, or
+            ``"inference"`` to leave the class slot empty.
+        texts: Iterable of input sentences.
+        labels: Iterable of labels, paired positionally with texts.
+
+    Returns:
+        List of formatted prompts, one per zipped (text, label) pair.
+
+    Raises:
+        ValueError: If mode is neither ``"train"`` nor ``"inference"``.
+    """
+    if mode == "train":
+        prompt = TRAINING_CLASSIFIER_PROMPT_v2
+    elif mode == "inference":
+        prompt = INFERENCE_CLASSIFIER_PROMPT_v2
+    else:
+        # Fail fast with a clear message; an unknown mode used to surface as
+        # an UnboundLocalError from inside the formatting loop.
+        raise ValueError(f"mode must be 'train' or 'inference', got {mode!r}")
+
+    # str.format ignores unused keyword arguments, so passing the label is
+    # harmless for the inference template, which has no {label} field.
+    return [
+        prompt.format(sentence=text, label=label)
+        for text, label in zip(texts, labels)
+    ]
+
+def get_newsgroup_data_for_ft(mode="train", train_sample_fraction=0.99):
+    """Build instruction-formatted train/test datasets from 20 Newsgroups.
+
+    Loads the ``rungalileo/20_Newsgroups_Fixed`` dataset, drops rows whose
+    text or label is not a string, takes a stratified sample of the training
+    split, and renders every row into a prompt. The test split is never
+    sampled. The same ``mode`` is applied to both splits, so in ``"train"``
+    mode the test prompts also contain their labels.
+
+    Args:
+        mode: ``"train"`` or ``"inference"``; selects the prompt template.
+        train_sample_fraction: Passed to ``train_test_split`` as
+            ``train_size``. A float of exactly ``1.0`` keeps the whole
+            training split without sampling.
+
+    Returns:
+        A tuple ``(train_dataset, test_dataset)`` of ``datasets.Dataset``
+        objects, each with the columns ``"instructions"`` and ``"labels"``.
+    """
+    newsgroup_dataset = load_dataset("rungalileo/20_Newsgroups_Fixed")
+    _, train_data, train_labels = clean_newsgroup_data(
+        newsgroup_dataset["train"]["text"],
+        newsgroup_dataset["train"]["label"],
+    )
+    _, test_data, test_labels = clean_newsgroup_data(
+        newsgroup_dataset["test"]["text"],
+        newsgroup_dataset["test"]["label"],
+    )
+
+    # sample n points from training data
+    train_df = pd.DataFrame(data={"text": train_data, "label": train_labels})
+    # train_test_split rejects a float train_size of 1.0, so a request for
+    # the full training split skips the sampling step instead of crashing.
+    use_full_split = (
+        isinstance(train_sample_fraction, float) and train_sample_fraction == 1.0
+    )
+    if not use_full_split:
+        train_df, _ = train_test_split(
+            train_df,
+            train_size=train_sample_fraction,
+            stratify=train_df["label"],
+            random_state=42,
+        )
+    train_data = train_df["text"]
+    train_labels = train_df["label"]
+
+    train_instructions = get_newsgroup_instruction_data(mode, train_data, train_labels)
+    test_instructions = get_newsgroup_instruction_data(mode, test_data, test_labels)
+
+    # preserve_index=False keeps the shuffled pandas index left behind by
+    # train_test_split from leaking into the train dataset as an extra
+    # "__index_level_0__" column, so both splits share the same schema.
+    train_dataset = datasets.Dataset.from_pandas(
+        pd.DataFrame(
+            data={
+                "instructions": train_instructions,
+                "labels": train_labels,
+            }
+        ),
+        preserve_index=False,
+    )
+    test_dataset = datasets.Dataset.from_pandas(
+        pd.DataFrame(
+            data={
+                "instructions": test_instructions,
+                "labels": test_labels,
+            }
+        ),
+        preserve_index=False,
+    )
+
+    return train_dataset, test_dataset