From f393f96d747a75b3a304bba4e6303e2aee7e277a Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 12 Dec 2024 09:45:19 +0100 Subject: [PATCH] changing the classifier example --- classifier-e2e/README.md | 100 ++++++++++++++---------- classifier-e2e/run_full.ipynb | 20 ++++- classifier-e2e/run_skip_basics.ipynb | 15 +++- classifier-e2e/steps/deploy_endpoint.py | 6 +- classifier-e2e/steps/model_evaluator.py | 35 ++++----- classifier-e2e/steps/model_trainer.py | 7 +- 6 files changed, 112 insertions(+), 71 deletions(-) diff --git a/classifier-e2e/README.md b/classifier-e2e/README.md index e932141a..17aeadb9 100644 --- a/classifier-e2e/README.md +++ b/classifier-e2e/README.md @@ -11,58 +11,76 @@ pinned: false license: apache-2.0 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# ZenML MLOps Breast Cancer Classification Demo -# 📜 ZenML Stack Show Case +## 🌍 Project Overview -This project aims to demonstrate the power of stacks. The code in this -project assumes that you have quite a few stacks registered already. +This is a minimalistic MLOps project demonstrating how to put machine learning +workflows into production using ZenML. The project focuses on building a breast +cancer classification model with end-to-end ML pipeline management. -## default - * `default` Orchestrator - * `default` Artifact Store +### Key Features -```commandline -zenml stack set default -python run.py --training-pipeline +- 🔬 Feature engineering pipeline +- 🤖 Model training pipeline +- 🧪 Batch inference pipeline +- 📊 Artifact and model lineage tracking +- 🔗 Integration with Weights & Biases for experiment tracking + +## 🚀 Installation + +1. Clone the repository +2. Install requirements: + ```bash + pip install -r requirements.txt + ``` +3. Install ZenML integrations: + ```bash + zenml integration install sklearn xgboost wandb -y + zenml login + zenml init + ``` +4. 
You need to register a stack with a [Weights & Biases Experiment Tracker](https://docs.zenml.io/stack-components/experiment-trackers/wandb). + +## 🧠 Project Structure + +- `steps/`: Contains individual pipeline steps +- `pipelines/`: Pipeline definitions +- `run.py`: Main script to execute pipelines + +## 🔍 Workflow and Execution + +First, you need to set your stack: + +```bash +zenml stack set stack-with-wandb ``` -## local-sagemaker-step-operator-stack - * `default` Orchestrator - * `s3` Artifact Store - * `local` Image Builder - * `aws` Container Registry - * `Sagemaker` Step Operator +### 1. Data Loading and Feature Engineering -```commandline -zenml stack set local-sagemaker-step-operator-stack -zenml integration install aws -y -python run.py --training-pipeline +- Uses the Breast Cancer dataset from scikit-learn +- Splits data into training and inference sets +- Preprocesses data for model training + +```bash +python run.py --feature-pipeline ``` -## sagemaker-airflow-stack - * `Airflow` Orchestrator - * `s3` Artifact Store - * `local` Image Builder - * `aws` Container Registry - * `Sagemaker` Step Operator - -```commandline -zenml stack set sagemaker-airflow-stack -zenml integration install airflow -y -pip install apache-airflow-providers-docker apache-airflow~=2.5.0 -zenml stack up +### 2. Model Training + +- Supports multiple model types (SGD, XGBoost) +- Evaluates and compares model performance +- Tracks model metrics with Weights & Biases + +```bash python run.py --training-pipeline ``` -## sagemaker-stack - * `Sagemaker` Orchestrator - * `s3` Artifact Store - * `local` Image Builder - * `aws` Container Registry - * `Sagemaker` Step Operator +### 3. 
Batch Inference -```commandline -zenml stack set sagemaker-stack -python run.py --training-pipeline +- Loads production model +- Generates predictions on new data + +```bash +python run.py --inference-pipeline ``` diff --git a/classifier-e2e/run_full.ipynb b/classifier-e2e/run_full.ipynb index d29002e0..26f1fb90 100644 --- a/classifier-e2e/run_full.ipynb +++ b/classifier-e2e/run_full.ipynb @@ -941,10 +941,17 @@ " .ravel()\n", " .tolist(),\n", " }\n", - " log_model_metadata(metadata={\"wandb_url\": wandb.run.url})\n", - " log_artifact_metadata(\n", + "\n", + " try:\n", + " if get_step_context().model:\n", + " log_metadata(metadata=metadata, infer_model=True)\n", + " except StepContextError:\n", + " # If a model is not configured, it is not able to log metadata\n", + " pass\n", + "\n", + " log_metadata(\n", " metadata=metadata,\n", - " artifact_name=\"breast_cancer_classifier\",\n", + " artifact_version_id=get_step_context().inputs[\"model\"].id,\n", " )\n", "\n", " wandb.log({\"train_accuracy\": metadata[\"train_accuracy\"]})\n", @@ -1073,6 +1080,7 @@ { "cell_type": "code", "execution_count": null, + "id": "1e2130b9", "metadata": {}, "outputs": [], "source": [ @@ -1083,6 +1091,7 @@ { "cell_type": "code", "execution_count": null, + "id": "476cbf5c", "metadata": {}, "outputs": [], "source": [ @@ -1091,6 +1100,7 @@ }, { "cell_type": "markdown", + "id": "75df10e7", "metadata": {}, "source": [ "Now full run executed on local stack and experiment is tracked using Model Control Plane and Weights&Biases.\n", @@ -1103,6 +1113,7 @@ { "cell_type": "code", "execution_count": null, + "id": "bfd6345f", "metadata": {}, "outputs": [], "source": [ @@ -1113,6 +1124,7 @@ { "cell_type": "code", "execution_count": null, + "id": "24358031", "metadata": {}, "outputs": [], "source": [ @@ -1136,7 +1148,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.3" } }, "nbformat": 4, diff --git 
a/classifier-e2e/run_skip_basics.ipynb b/classifier-e2e/run_skip_basics.ipynb index 376f31c3..62da7a32 100644 --- a/classifier-e2e/run_skip_basics.ipynb +++ b/classifier-e2e/run_skip_basics.ipynb @@ -829,10 +829,17 @@ " .ravel()\n", " .tolist(),\n", " }\n", - " log_model_metadata(metadata={\"wandb_url\": wandb.run.url})\n", - " log_artifact_metadata(\n", + "\n", + " try:\n", + " if get_step_context().model:\n", + " log_metadata(metadata=metadata, infer_model=True)\n", + " except StepContextError:\n", + " # If a model is not configured, it is not able to log metadata\n", + " pass\n", + "\n", + " log_metadata(\n", " metadata=metadata,\n", - " artifact_name=\"breast_cancer_classifier\",\n", + " artifact_version_id=get_step_context().inputs[\"model\"].id,\n", " )\n", "\n", " wandb.log({\"train_accuracy\": metadata[\"train_accuracy\"]})\n", @@ -1211,7 +1218,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.3" } }, "nbformat": 4, diff --git a/classifier-e2e/steps/deploy_endpoint.py b/classifier-e2e/steps/deploy_endpoint.py index ad166525..9f1c56b8 100644 --- a/classifier-e2e/steps/deploy_endpoint.py +++ b/classifier-e2e/steps/deploy_endpoint.py @@ -7,6 +7,7 @@ from utils.aws import get_aws_config from utils.sagemaker_materializer import SagemakerPredictorMaterializer from zenml import ArtifactConfig, get_step_context, log_artifact_metadata, step +from zenml.enums import ArtifactType @step( @@ -16,7 +17,10 @@ def deploy_endpoint() -> ( Annotated[ Predictor, - ArtifactConfig(name="sagemaker_endpoint", is_deployment_artifact=True), + ArtifactConfig( + name="sagemaker_endpoint", + artifact_type=ArtifactType.SERVICE + ), ] ): role, session, region = get_aws_config() diff --git a/classifier-e2e/steps/model_evaluator.py b/classifier-e2e/steps/model_evaluator.py index dd335ae5..54ee2fbf 100644 --- a/classifier-e2e/steps/model_evaluator.py +++ b/classifier-e2e/steps/model_evaluator.py @@ -21,12 +21,7 @@ 
import wandb from sklearn.base import ClassifierMixin from sklearn.metrics import confusion_matrix -from zenml import ( - get_step_context, - log_artifact_metadata, - log_model_metadata, - step, -) +from zenml import step, log_metadata, get_step_context from zenml.client import Client from zenml.exceptions import StepContextError from zenml.logger import get_logger @@ -60,12 +55,12 @@ def model_evaluator( step to force the pipeline run to fail early and all subsequent steps to be skipped. - This step is parameterized to configure the step independently of the step code, - before running it in a pipeline. In this example, the step can be configured - to use different values for the acceptable model performance thresholds and - to control whether the pipeline run should fail if the model performance - does not meet the minimum criteria. See the documentation for more - information: + This step is parameterized to configure the step independently of the step + code, before running it in a pipeline. In this example, the step can be + configured to use different values for the acceptable model performance + thresholds and to control whether the pipeline run should fail if the model + performance does not meet the minimum criteria. See the documentation for + more information: https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines @@ -89,17 +84,19 @@ def model_evaluator( dataset_tst.drop(columns=[target]), dataset_tst[target], ) - logger.info(f"Train accuracy={trn_acc*100:.2f}%") - logger.info(f"Test accuracy={tst_acc*100:.2f}%") + logger.info(f"Train accuracy={trn_acc * 100:.2f}%") + logger.info(f"Test accuracy={tst_acc * 100:.2f}%") messages = [] if trn_acc < min_train_accuracy: messages.append( - f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" + f"Train accuracy {trn_acc * 100:.2f}% is below " + f"{min_train_accuracy * 100:.2f}% !" 
) if tst_acc < min_test_accuracy: messages.append( - f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !" + f"Test accuracy {tst_acc * 100:.2f}% is below " + f"{min_test_accuracy * 100:.2f}% !" ) else: for message in messages: @@ -115,14 +112,14 @@ def model_evaluator( } try: if get_step_context().model: - log_model_metadata(metadata={"wandb_url": wandb.run.url}) + log_metadata(metadata=metadata, infer_model=True) except StepContextError: # if model not configured not able to log metadata pass - log_artifact_metadata( + log_metadata( metadata=metadata, - artifact_name="breast_cancer_classifier", + artifact_version_id=get_step_context().inputs["model"].id, ) wandb.log( diff --git a/classifier-e2e/steps/model_trainer.py b/classifier-e2e/steps/model_trainer.py index aef08e27..3a04f1d8 100644 --- a/classifier-e2e/steps/model_trainer.py +++ b/classifier-e2e/steps/model_trainer.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# from typing import Optional @@ -23,6 +22,7 @@ from typing_extensions import Annotated from utils.sagemaker_materializer import SagemakerMaterializer from zenml import ArtifactConfig, step +from zenml.enums import ArtifactType from zenml.logger import get_logger logger = get_logger(__name__) @@ -39,7 +39,10 @@ def model_trainer( target: Optional[str] = "target", ) -> Annotated[ ClassifierMixin, - ArtifactConfig(name="breast_cancer_classifier", is_model_artifact=True), + ArtifactConfig( + name="breast_cancer_classifier", + artifact_type=ArtifactType.MODEL, + ), ]: """Configure and train a model on the training dataset.