Commit

added docstrings
bcdurak committed Dec 11, 2024
1 parent 4472b1a commit 05f77b9
Showing 1 changed file with 55 additions and 1 deletion.
llm-finetuning-simple/run.py: 56 changes (55 additions & 1 deletion)
@@ -23,11 +23,39 @@ def prepare_data(
dataset_size: int,
max_length: int,
) -> Annotated[Dataset, "tokenized_dataset"]:
"""
Prepare and tokenize the dataset for fine-tuning.
This step loads a specified dataset, tokenizes it with a given base model's
tokenizer, and prepares it for training by formatting the input as
question-answer prompts.
Args:
base_model_id (str): Identifier of the base model to use for
tokenization.
dataset_name (str): Name of the dataset to load from Hugging Face
datasets.
dataset_size (int): Number of samples to use from the dataset.
max_length (int): Maximum sequence length for tokenization.
Returns:
Annotated[Dataset, "tokenized_dataset"]: Tokenized dataset ready for
training.
"""
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(dataset_name, split=f"train[:{dataset_size}]")

def tokenize_function(example):
"""
Tokenize a single example by formatting it as a question-answer prompt.
Args:
example (dict): A single dataset example.
Returns:
dict: Tokenized input with input_ids, attention_mask, etc.
"""
prompt = f"Question: {example['question']}\n" \
f"Answer: {example['answers']['text'][0]}"
return tokenizer(prompt, truncation=True, padding="max_length",
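        # The remaining lines of this hunk are collapsed in the diff view.
        # A minimal sketch of how such a tokenization step is typically
        # completed (an assumption, not the committed code): finish the
        # tokenizer call with the step's max_length and map it over the
        # dataset, e.g.
        #     return tokenizer(prompt, truncation=True,
        #                      padding="max_length", max_length=max_length)
        # and then, back in prepare_data:
        #     tokenized_dataset = dataset.map(tokenize_function)
        #     return tokenized_dataset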
@@ -54,6 +82,18 @@ def finetune(
num_train_epochs: int,
per_device_train_batch_size: int
) -> None:
"""
Fine-tune a pre-trained language model on the prepared dataset.
This step loads the base model, sets up training arguments, and performs
fine-tuning using the Hugging Face Trainer.
Args:
base_model_id (str): Identifier of the base model to fine-tune.
tokenized_dataset (Dataset): Tokenized dataset prepared for training.
num_train_epochs (int): Number of training epochs.
per_device_train_batch_size (int): Batch size per device during training.
"""
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
base_model_id,
@@ -97,11 +137,25 @@ def finetune(

@pipeline
def llm_finetune_pipeline(base_model_id: str):
"""
ZenML pipeline for fine-tuning a language model.
This pipeline orchestrates the data preparation and fine-tuning steps
for a language model on a specified dataset.
Args:
base_model_id (str): Identifier of the base model to fine-tune.
"""
tokenized_dataset = prepare_data(base_model_id)
finetune(base_model_id, tokenized_dataset)
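    # Note: only base_model_id is passed explicitly here; the remaining step
    # parameters (dataset name/size, max_length, epochs, batch size) are
    # presumably supplied through the YAML run configuration used below, or
    # through step defaults not visible in this diff.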


if __name__ == "__main__":
"""
Entry point for the script that allows configuration via command-line argument.
Expects a YAML configuration file path to be provided.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--config',
@@ -110,4 +164,4 @@ def llm_finetune_pipeline(base_model_id: str):
help='Path to the YAML config file'
)
args = parser.parse_args()
llm_finetune_pipeline.with_options(config_path=args.config)()
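
For reference, the pipeline is driven entirely by the YAML file passed via --config. A minimal sketch of such a config is shown below; the layout follows ZenML's run-configuration format (pipeline-level parameters plus per-step parameters), and the model, dataset, and hyperparameter values are illustrative rather than taken from this commit:

    parameters:
      base_model_id: distilgpt2
    steps:
      prepare_data:
        parameters:
          dataset_name: squad
          dataset_size: 100
          max_length: 512
      finetune:
        parameters:
          num_train_epochs: 3
          per_device_train_batch_size: 8

With a file like this saved as config.yaml, the script would be run as:

    python run.py --config config.yaml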
