From 05f77b90011ffef2fb0c6eb85a2273c16deb3f56 Mon Sep 17 00:00:00 2001
From: Baris Can Durak
Date: Wed, 11 Dec 2024 17:24:00 +0100
Subject: [PATCH] added docstrings

---
 llm-finetuning-simple/run.py | 56 +++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/llm-finetuning-simple/run.py b/llm-finetuning-simple/run.py
index a8f8c957..3dcb20f0 100644
--- a/llm-finetuning-simple/run.py
+++ b/llm-finetuning-simple/run.py
@@ -23,11 +23,39 @@ def prepare_data(
     dataset_size: int,
     max_length: int,
 ) -> Annotated[Dataset, "tokenized_dataset"]:
+    """
+    Prepare and tokenize the dataset for fine-tuning.
+
+    This step loads a specified dataset, tokenizes it with a given base model's
+    tokenizer, and prepares it for training by formatting the input as
+    question-answer prompts.
+
+    Args:
+        base_model_id (str): Identifier of the base model to use for
+            tokenization.
+        dataset_name (str): Name of the dataset to load from Hugging Face
+            datasets.
+        dataset_size (int): Number of samples to use from the dataset.
+        max_length (int): Maximum sequence length for tokenization.
+
+    Returns:
+        Annotated[Dataset, "tokenized_dataset"]: Tokenized dataset ready for
+            training.
+    """
     tokenizer = AutoTokenizer.from_pretrained(base_model_id)
     tokenizer.pad_token = tokenizer.eos_token
     dataset = load_dataset(dataset_name, split=f"train[:{dataset_size}]")
 
     def tokenize_function(example):
+        """
+        Tokenize a single example by formatting it as a question-answer prompt.
+
+        Args:
+            example (dict): A single dataset example.
+
+        Returns:
+            dict: Tokenized input with input_ids, attention_mask, etc.
+        """
         prompt = f"Question: {example['question']}\n" \
                  f"Answer: {example['answers']['text'][0]}"
         return tokenizer(prompt, truncation=True, padding="max_length",
@@ -54,6 +82,18 @@ def finetune(
     num_train_epochs: int,
     per_device_train_batch_size: int
 ) -> None:
+    """
+    Fine-tune a pre-trained language model on the prepared dataset.
+
+    This step loads the base model, sets up training arguments, and performs
+    fine-tuning using the Hugging Face Trainer.
+
+    Args:
+        base_model_id (str): Identifier of the base model to fine-tune.
+        tokenized_dataset (Dataset): Tokenized dataset prepared for training.
+        num_train_epochs (int): Number of training epochs.
+        per_device_train_batch_size (int): Batch size per device during training.
+    """
     torch.cuda.empty_cache()
     model = AutoModelForCausalLM.from_pretrained(
         base_model_id,
@@ -97,11 +137,25 @@ def finetune(
 
 @pipeline
 def llm_finetune_pipeline(base_model_id: str):
+    """
+    ZenML pipeline for fine-tuning a language model.
+
+    This pipeline orchestrates the data preparation and fine-tuning steps
+    for a language model on a specified dataset.
+
+    Args:
+        base_model_id (str): Identifier of the base model to fine-tune.
+    """
     tokenized_dataset = prepare_data(base_model_id)
     finetune(base_model_id, tokenized_dataset)
 
 
 if __name__ == "__main__":
+    """
+    Entry point for the script that allows configuration via a command-line argument.
+
+    Expects a YAML configuration file path to be provided.
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--config',
@@ -110,4 +164,4 @@ def llm_finetune_pipeline(base_model_id: str):
         help='Path to the YAML config file'
     )
     args = parser.parse_args()
-    llm_finetune_pipeline.with_options(config_path=args.config)()
+    llm_finetune_pipeline.with_options(config_path=args.config)()
\ No newline at end of file
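
Note on running the patched script: run.py is driven entirely by the YAML file
passed via --config and handed to ZenML's with_options(config_path=...). The
sketch below is illustrative only: the file name, the model id, the dataset,
and the exact config keys (top-level "parameters" plus per-step
"steps.<name>.parameters") are assumptions about ZenML's run-configuration
schema and this example's layout, not something introduced by this patch.

# Illustrative sketch, not part of the patch: writes an assumed run
# configuration and shows how run.py would be invoked with it. All values
# below are placeholders chosen to match the step signatures shown above.
from pathlib import Path
from textwrap import dedent

assumed_config = dedent("""\
    # Pipeline-level parameter consumed by llm_finetune_pipeline(base_model_id)
    parameters:
      base_model_id: distilgpt2          # placeholder model id
    # Per-step parameters matching the signatures of prepare_data / finetune
    steps:
      prepare_data:
        parameters:
          dataset_name: squad            # placeholder dataset name
          dataset_size: 100
          max_length: 128
      finetune:
        parameters:
          num_train_epochs: 1
          per_device_train_batch_size: 2
""")

Path("config.yaml").write_text(assumed_config)

# The script is then launched as in run.py's __main__ block:
#   python llm-finetuning-simple/run.py --config config.yaml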