Add support for vLLM containers (#16)
This commit adds support for vLLM containers, along with example documentation for using them.
The schema has been updated to allow the vLLM container. These changes work toward the
migration to the v2.x architecture.
petermuller authored Jun 4, 2024
1 parent 646e0fa commit b3bf672
Showing 6 changed files with 84 additions and 2 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -219,6 +219,24 @@ you can do so.
the corresponding entry in `config.yaml`. For container images, you can provide a path to a directory
from which a Docker container will be built (default), a path to a tarball, an ECR repository ARN with an
optional tag, or a public registry path.
- We provide out-of-the-box support for HuggingFace TGI and TEI containers as well as vLLM containers. The
`example_config.yaml` file provides examples for TGI and TEI; to use vLLM instead, change only the
`inferenceContainer`, `baseImage`, and `path` options, as shown in the snippet below. All other options can
remain the same as in the TGI and TEI model definition examples.
```yaml
ecsModels:
- modelName: mistralai/Mistral-7B-Instruct-v0.2
modelId: mistral7b-vllm
deploy: true
streaming: true
modelType: textgen
instanceType: g5.xlarge
inferenceContainer: vllm # vLLM-specific config
containerConfig:
image:
baseImage: vllm/vllm-openai:v0.4.2 # vLLM-specific config
path: lib/serve/ecs-model/vllm # vLLM-specific config
```
- If you are deploying the LISA Chat User Interface, you can optionally specify the path to the pre-built
website assets using the top level `webAppAssetsPath` parameter in `config.yaml`. Specifying this path
(typically `lib/user-interface/react/dist`) will avoid using a container to build and bundle the assets
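For reference, that parameter is a single top-level key in `config.yaml`; a minimal sketch using the typical path mentioned above:
```yaml
webAppAssetsPath: lib/user-interface/react/dist
```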
13 changes: 11 additions & 2 deletions lib/schema.ts
@@ -513,12 +513,21 @@ const EcsModelConfigSchema = z
  streaming: z.boolean().nullable().default(null),
  modelType: z.nativeEnum(ModelType),
  instanceType: z.enum(VALID_INSTANCE_KEYS),
- inferenceContainer: z.union([z.literal('tgi'), z.literal('tei'), z.literal('instructor')]),
+ inferenceContainer: z
+   .union([z.literal('tgi'), z.literal('tei'), z.literal('instructor'), z.literal('vllm')])
+   .refine((data) => {
+     return !data.includes('.'); // string cannot contain a period
+   }),
  containerConfig: ContainerConfigSchema,
  autoScalingConfig: AutoScalingConfigSchema,
  loadBalancerConfig: LoadBalancerConfigSchema,
  localModelCode: z.string().default('/opt/model-code'),
- modelHosting: z.string().default('ecs'),
+ modelHosting: z
+   .string()
+   .default('ecs')
+   .refine((data) => {
+     return !data.includes('.'); // string cannot contain a period
+   }),
})
.refine(
(data) => {
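For context, a minimal standalone sketch (not part of the commit) of how the refined union behaves under Zod's `safeParse`:
```ts
import { z } from 'zod';

// Mirrors the committed schema: a literal union plus a no-period refinement.
const inferenceContainer = z
    .union([z.literal('tgi'), z.literal('tei'), z.literal('instructor'), z.literal('vllm')])
    .refine((data) => !data.includes('.'));

console.log(inferenceContainer.safeParse('vllm').success);   // true
console.log(inferenceContainer.safeParse('sglang').success); // false: not in the union
```
Note that none of the union's literals can contain a period, so the refinement there is effectively a guard for future widening; on the free-form `modelHosting` string it does real work, because the hosting, model type, and inference container values are later joined with periods into a provider identifier (see `main.py` below).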
14 changes: 14 additions & 0 deletions lib/serve/ecs-model/vllm/Dockerfile
@@ -0,0 +1,14 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

##### DOWNLOAD MOUNTPOINT FOR AMAZON S3
ARG MOUNTS3_DEB_URL
RUN apt update -y && apt install -y wget rsync && \
wget ${MOUNTS3_DEB_URL} && \
apt install -y ./mount-s3.deb && \
rm mount-s3.deb

COPY src/entrypoint.sh ./entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
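As a usage sketch (not part of the commit), the image could be built locally by supplying both build args. The image tag and the Mountpoint for Amazon S3 `.deb` URL below are illustrative assumptions:
```bash
# Build the vLLM serving image; the Dockerfile expects the .deb URL's
# basename to be mount-s3.deb, since it installs ./mount-s3.deb afterwards.
docker build \
  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.4.2 \
  --build-arg MOUNTS3_DEB_URL=https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb \
  -t lisa-vllm:latest \
  lib/serve/ecs-model/vllm
```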
35 changes: 35 additions & 0 deletions lib/serve/ecs-model/vllm/src/entrypoint.sh
@@ -0,0 +1,35 @@
#!/bin/bash
set -e

declare -a vars=("S3_BUCKET_MODELS" "LOCAL_MODEL_PATH" "MODEL_NAME" "S3_MOUNT_POINT" "THREADS")

# Check the necessary environment variables
for var in "${vars[@]}"; do
if [[ -z "${!var}" ]]; then
echo "$var must be set"
exit 1
fi
done

# Mount the S3 models bucket at the mount point on the ephemeral NVMe drive
echo "Creating S3 mountpoint for bucket ${S3_BUCKET_MODELS} at container mount point path ${S3_MOUNT_POINT}/${MODEL_NAME}"
mkdir -p ${S3_MOUNT_POINT}
mount-s3 ${S3_BUCKET_MODELS} ${S3_MOUNT_POINT}

echo "Downloading model ${S3_BUCKET_MODELS} to container path ${LOCAL_MODEL_PATH}"
mkdir -p ${LOCAL_MODEL_PATH}

# Copy the model from the S3 mount in parallel (one rsync per top-level entry), skipping *.bin weight files
ls ${S3_MOUNT_POINT}/${MODEL_NAME} | xargs -n1 -P${THREADS} -I% rsync -Pa --exclude "*.bin" ${S3_MOUNT_POINT}/${MODEL_NAME}/% ${LOCAL_MODEL_PATH}/

ADDITIONAL_ARGS=""
if [[ -n "${MAX_TOTAL_TOKENS}" ]]; then
ADDITIONAL_ARGS+=" --max-model-len ${MAX_TOTAL_TOKENS}"
fi

# Start the webserver
echo "Starting vLLM"
python3 -m vllm.entrypoints.openai.api_server \
--model ${LOCAL_MODEL_PATH} \
--served-model-name ${MODEL_NAME} \
--port 8080 ${ADDITIONAL_ARGS}
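For illustration only, a hypothetical manual run of the resulting container; in LISA these variables are injected by the ECS task definition, and a FUSE mount inside a container typically also requires `--device /dev/fuse` and `--cap-add SYS_ADMIN`:
```bash
docker run --rm -p 8080:8080 \
  --device /dev/fuse --cap-add SYS_ADMIN \
  -e S3_BUCKET_MODELS=my-models-bucket \
  -e MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2 \
  -e S3_MOUNT_POINT=/tmp/s3-models \
  -e LOCAL_MODEL_PATH=/opt/model-code \
  -e THREADS=8 \
  -e MAX_TOTAL_TOKENS=4096 \
  lisa-vllm:latest
```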
1 change: 1 addition & 0 deletions lib/serve/index.ts
@@ -126,6 +126,7 @@ export class LisaServeApplicationStack extends Stack {
this.modelsPs.grantRead(restApi.taskRole);
// Add parameter as container environment variable for both RestAPI and RagAPI
restApi.container.addEnvironment('REGISTERED_MODELS_PS_NAME', this.modelsPs.parameterName);
restApi.node.addDependency(this.modelsPs);

// Update
this.restApi = restApi;
5 changes: 5 additions & 0 deletions lib/serve/rest-api/src/main.py
@@ -83,9 +83,14 @@ async def lifespan(app: FastAPI):  # type: ignore
registered_models = json.loads(response["Parameter"]["Value"])
for model in registered_models:
provider = model["provider"]
# provider format is `modelHosting.modelType.inferenceContainer`, example: "ecs.textgen.tgi"
[_, _, inference_container] = provider.split(".")
model_name = model["modelName"]
model_type = model["modelType"]

if inference_container not in ["tgi", "tei", "instructor"]: # stopgap for supporting new containers for v2
continue # not implementing new providers inside the existing cache; cache is on deprecation path

# Get default model kwargs
validator = registry.get_assets(provider)["validator"]
model_kwargs = validator().dict()
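To make the provider convention concrete, a small standalone sketch (the format is taken from the comment above; not part of the commit):
```python
# Provider strings follow `modelHosting.modelType.inferenceContainer`,
# which is why neither component may contain a period.
provider = "ecs.textgen.vllm"
model_hosting, model_type, inference_container = provider.split(".")

# The legacy v1 cache only knows these containers; vLLM models are
# skipped here while the cache is deprecated in favor of the v2 architecture.
if inference_container not in ["tgi", "tei", "instructor"]:
    print(f"Skipping {provider} in the legacy model cache")
```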
