Commit

added docstrings
bcdurak committed Dec 11, 2024
1 parent 4472b1a commit 05f77b9
Showing 1 changed file with 55 additions and 1 deletion.
llm-finetuning-simple/run.py: 56 changes (55 additions & 1 deletion)
@@ -23,11 +23,39 @@ def prepare_data(
dataset_size: int,
max_length: int,
) -> Annotated[Dataset, "tokenized_dataset"]:
"""
Prepare and tokenize the dataset for fine-tuning.
This step loads a specified dataset, tokenizes it with a given base model's
tokenizer, and prepares it for training by formatting the input as
question-answer prompts.
Args:
base_model_id (str): Identifier of the base model to use for
tokenization.
dataset_name (str): Name of the dataset to load from Hugging Face
datasets.
dataset_size (int): Number of samples to use from the dataset.
max_length (int): Maximum sequence length for tokenization.
Returns:
Annotated[Dataset, "tokenized_dataset"]: Tokenized dataset ready for
training.
"""
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(dataset_name, split=f"train[:{dataset_size}]")

def tokenize_function(example):
"""
Tokenize a single example by formatting it as a question-answer prompt.
Args:
example (dict): A single dataset example.
Returns:
dict: Tokenized input with input_ids, attention_mask, etc.
"""
prompt = f"Question: {example['question']}\n" \
f"Answer: {example['answers']['text'][0]}"
return tokenizer(prompt, truncation=True, padding="max_length",
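        # The remaining lines of this hunk are collapsed in the diff view.
        # A minimal sketch of how such a tokenization step is typically
        # completed (an assumption, not the committed code): finish the
        # tokenizer call with the step's max_length and map it over the
        # dataset, e.g.
        #     return tokenizer(prompt, truncation=True,
        #                      padding="max_length", max_length=max_length)
        # and then, back in prepare_data:
        #     tokenized_dataset = dataset.map(tokenize_function)
        #     return tokenized_dataset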
@@ -54,6 +82,18 @@ def finetune(
num_train_epochs: int,
per_device_train_batch_size: int
) -> None:
"""
Fine-tune a pre-trained language model on the prepared dataset.
This step loads the base model, sets up training arguments, and performs
fine-tuning using the Hugging Face Trainer.
Args:
base_model_id (str): Identifier of the base model to fine-tune.
tokenized_dataset (Dataset): Tokenized dataset prepared for training.
num_train_epochs (int): Number of training epochs.
per_device_train_batch_size (int): Batch size per device during training.
"""
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
base_model_id,
@@ -97,11 +137,25 @@ def finetune(

@pipeline
def llm_finetune_pipeline(base_model_id: str):
"""
ZenML pipeline for fine-tuning a language model.
This pipeline orchestrates the data preparation and fine-tuning steps
for a language model on a specified dataset.
Args:
base_model_id (str): Identifier of the base model to fine-tune.
"""
tokenized_dataset = prepare_data(base_model_id)
finetune(base_model_id, tokenized_dataset)
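    # Note: only base_model_id is passed explicitly here; the remaining step
    # parameters (dataset name/size, max_length, epochs, batch size) are
    # presumably supplied through the YAML run configuration used below, or
    # through step defaults not visible in this diff.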


if __name__ == "__main__":
"""
Entry point for the script that allows configuration via command-line argument.
Expects a YAML configuration file path to be provided.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--config',
@@ -110,4 +164,4 @@ def llm_finetune_pipeline(base_model_id: str):
help='Path to the YAML config file'
)
args = parser.parse_args()
llm_finetune_pipeline.with_options(config_path=args.config)()
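
For reference, the pipeline is driven entirely by the YAML file passed via --config. A minimal sketch of such a config is shown below; the layout follows ZenML's run-configuration format (pipeline-level parameters plus per-step parameters), and the model, dataset, and hyperparameter values are illustrative rather than taken from this commit:

    parameters:
      base_model_id: distilgpt2
    steps:
      prepare_data:
        parameters:
          dataset_name: squad
          dataset_size: 100
          max_length: 512
      finetune:
        parameters:
          num_train_epochs: 3
          per_device_train_batch_size: 8

With a file like this saved as config.yaml, the script would be run as:

    python run.py --config config.yaml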
