Merge pull request #1 from qubole/qubole-mlflow-fresh-strart

Qubole mlflow integration

AgrawalAmey authored Oct 25, 2018
2 parents 41f4932 + b099975 commit d21c3e3
Showing 9 changed files with 3,833 additions and 3,422 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -95,3 +95,14 @@ example/tutorial/R/*.nb.html

# travis_wait command logs
travis_wait*.log

.history

mlflow/java/scoring/bin/*
mlflow/java/client/bin/*
mlflow/java/.settings/*
mlflow/java/client/.classpath
mlflow/java/scoring/.classpath
mlflow/java/scoring/.settings/
mlflow/java/client/.settings/

2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -7,4 +7,4 @@ codecov
coverage
pypi-publisher
scikit-learn
scipy
scipy
75 changes: 75 additions & 0 deletions examples/qubole/readme.md
@@ -0,0 +1,75 @@
# Running MLflow in Qubole Mode


When a project is run in `"qubole"` mode, MLflow launches a `ShellCommand` on Qubole Data Service (QDS) to execute the project.

## Setting up the cluster

Install the `mlflow` package on the cluster using the node bootstrap script:

```sh
/usr/lib/a-4.2.0-py-3.5.3/pip install mlflow
/usr/lib/a-4.2.0-py-2.7.13/pip install mlflow
```
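
To confirm the bootstrap step worked, you can ask each environment's pip whether the package is visible (a minimal sanity check, assuming the same pip paths as above):

```sh
# Verify mlflow is installed in both Python environments
/usr/lib/a-4.2.0-py-3.5.3/pip show mlflow
/usr/lib/a-4.2.0-py-2.7.13/pip show mlflow
```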

## Start the tracking server

To run a long-lived, shared MLflow tracking server, launch a dedicated EC2 instance for it:

1. Create an EC2 instance from the "Anaconda with Python 3" AMI. A t2.micro (free-tier) instance is sufficient for a test environment; this AMI already has conda and many of the other needed packages pre-installed.
2. Install MLflow: `pip install mlflow`.
3. Open port 5000 for the MLflow server (see, for example, "How to open a web server port on EC2 instance"). Opening port 5000 to the Internet allows anyone to access your server, so it is recommended to open the port only within an AWS VPC that your Qubole clusters have access to.
4. Configure your AWS credentials on the instance. The optimal configuration for MLflow remote tracking is to use the `--default-artifact-root` option to store your artifacts in an S3 bucket.
5. SSH into your EC2 instance, e.g. `ssh -i ~/.ssh/<key>.pem ubuntu@<hostname>.<region>.compute.amazonaws.com`.
6. Configure your S3 credentials via the AWS CLI; for more information, refer to "Configuring the AWS CLI".

Start the tracking server:
```sh
mlflow server --default-artifact-root s3://<bucket-name> --host 0.0.0.0
```
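
A foreground server dies with the SSH session, so you may want to run it detached; one minimal sketch uses `nohup` (a process manager such as systemd works just as well):

```sh
# Keep the tracking server alive after the SSH session ends,
# logging output to mlflow-server.log
nohup mlflow server --default-artifact-root s3://<bucket-name> \
    --host 0.0.0.0 > mlflow-server.log 2>&1 &
```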
For more information, refer to the MLflow documentation under "Running a Tracking Server".

To test connectivity, go to `http://<mlflow-server-dns>:5000` in a browser; the UI should look similar to the following:

![](https://docs.databricks.com/_static/images/mlflow/mlflow-web-ui.png)

## Run the job

### Set tracking server variable

Set the environment variable `MLFLOW_TRACKING_URI` so the launched run reports to your tracking server.
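
For example, pointing at the tracking server set up in the previous section:

```sh
export MLFLOW_TRACKING_URI=http://<mlflow-server-dns>:5000
```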

### Create cluster spec file
Running the remote job requires a `cluster-spec.json` file, passed via `--cluster-spec`, with the following structure:

```json
{
  "aws": {
    "s3_experiment_bucket": "<bucket-name>",
    "s3_experiment_base_path": "<directory>"
  },
  "qubole": {
    "api_token": "<qubole-api-token>",
    "api_url": "https://api.qubole.com/api/",
    "version": "v1.2",
    "poll_interval": 5,
    "skip_ssl_cert_check": false,
    "cloud_name": "AWS"
  },
  "cluster": {
    "label": "mlflow-test"
  },
  "command": {
    "name": "mlflow-test",
    "tags": ["mlflow"],
    "notify": false
  }
}
```
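
To catch malformed spec files before submitting a run, a small pre-flight check can be useful; the sketch below is illustrative only (the `REQUIRED_KEYS` mapping and `check_cluster_spec` are hypothetical helpers, not part of MLflow, and assume the sections shown above are all required):

```python
import json

# Hypothetical helper: top-level sections and the keys each must contain
REQUIRED_KEYS = {
    "aws": ["s3_experiment_bucket", "s3_experiment_base_path"],
    "qubole": ["api_token", "api_url", "version"],
    "cluster": ["label"],
    "command": ["name"],
}


def check_cluster_spec(path):
    """Raise ValueError if the cluster spec is missing any required key."""
    with open(path) as f:
        spec = json.load(f)
    for section, keys in REQUIRED_KEYS.items():
        missing = [k for k in keys if k not in spec.get(section, {})]
        if missing:
            raise ValueError("Section %r is missing keys %s" % (section, missing))
    return spec


spec = check_cluster_spec("cluster-spec.json")
```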

### Example

A toy example can be launched using the following command:

```sh
mlflow run git@github.com:mlflow/mlflow-example.git -P alpha=0.5 -m qubole --cluster-spec example/qubole_run_remote/cluster_spec.json
```
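
The same run can also be launched programmatically through `mlflow.projects.run`, whose updated signature (see the diff below) accepts `mode` and `cluster_spec`; a sketch, assuming the spec file path from the CLI example above:

```python
import mlflow.projects

# mode and cluster_spec mirror the -m and --cluster-spec CLI flags
submitted_run = mlflow.projects.run(
    uri="git@github.com:mlflow/mlflow-example.git",
    parameters={"alpha": "0.5"},
    mode="qubole",
    cluster_spec="example/qubole_run_remote/cluster_spec.json",
)
```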
16 changes: 13 additions & 3 deletions mlflow/projects/__init__.py
@@ -21,6 +21,7 @@


import mlflow.projects.databricks
import mlflow.projects.qubole
from mlflow.utils import process
from mlflow.utils.logging_utils import eprint
from mlflow.utils.mlflow_tags import MLFLOW_GIT_BRANCH_NAME
@@ -41,6 +42,9 @@ def _run(uri, entry_point="main", version=None, parameters=None, experiment_id=N
"""
if mode == "databricks":
mlflow.projects.databricks.before_run_validations(mlflow.get_tracking_uri(), cluster_spec)
elif mode == "qubole":
mlflow.projects.qubole.before_run_validations(mlflow.get_tracking_uri(), cluster_spec)


    exp_id = experiment_id or _get_experiment_id()
    parameters = parameters or {}
@@ -70,6 +74,12 @@ def _run(uri, entry_point="main", version=None, parameters=None, experiment_id=N
            remote_run=active_run,
            uri=uri, entry_point=entry_point, work_dir=work_dir, parameters=parameters,
            experiment_id=exp_id, cluster_spec=cluster_spec)
    elif mode == "qubole":
        from mlflow.projects.qubole import run_qubole
        return run_qubole(
            remote_run=active_run,
            uri=uri, entry_point=entry_point, work_dir=work_dir, parameters=parameters,
            experiment_id=exp_id, cluster_spec=cluster_spec)
    elif mode == "local" or mode is None:
        # Synchronously create a conda environment (even though this may take some time) to avoid
        # failures due to multiple concurrent attempts to create the same conda env.
@@ -85,7 +95,7 @@ def _run(uri, entry_point="main", version=None, parameters=None, experiment_id=N
        return _invoke_mlflow_run_subprocess(
            work_dir=work_dir, entry_point=entry_point, parameters=parameters, experiment_id=exp_id,
            use_conda=use_conda, storage_dir=storage_dir, run_id=active_run.info.run_uuid)
    supported_modes = ["local", "databricks"]
    supported_modes = ["local", "databricks", "qubole"]
    raise ExecutionException("Got unsupported execution mode %s. Supported "
                             "values: %s" % (mode, supported_modes))

@@ -112,8 +122,8 @@ def run(uri, entry_point="main", version=None, parameters=None, experiment_id=No
                    environment variable ``$SHELL``) to run ``.sh`` files.
    :param version: For Git-based projects, either a commit hash or a branch name.
    :param experiment_id: ID of experiment under which to launch the run.
    :param mode: Execution mode of the run: "local" or "databricks".
    :param mode: Execution mode of the run: "local", "databricks" or "qubole".
    :param cluster_spec: When ``mode`` is "databricks", path to a JSON file containing a
    :param cluster_spec: When ``mode`` is "databricks" or "qubole", path to a JSON file containing a
        `Databricks cluster specification
        <https://docs.databricks.com/api/latest/jobs.html#clusterspec>`_
        to use when launching a run.
