Enable OpenTelemetry Tracing for ChatQnA on Gaudi with TGI serving

Signed-off-by: louie-tsai <[email protected]>
opea-project · Jan 3, 2025 · f020688 · f020688
1 parent 1d7ac82
commit f020688
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 4 deletions.
diff --git a/ChatQnA/README.md b/ChatQnA/README.md
@@ -344,3 +344,22 @@ OPEA microservice deployment can easily be monitored through Grafana dashboards
 
 ![chatqna dashboards](./assets/img/chatqna_dashboards.png)
 ![tgi dashboard](./assets/img/tgi_dashboard.png)
+
+## Tracing Services with OpenTelemetry Tracing and Jaeger
+
+> NOTE: limited support. Only LLM inference serving with TGI on Gaudi is enabled for this feature.
+
+OPEA microservices and TGI/TEI serving can easily be traced through the Jaeger dashboard in conjunction with the OpenTelemetry Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.
+
+Tracing data is exported to the Jaeger collector at `http://{EXTERNAL_IP}:4318/v1/traces`.
+Users can get the external IP with the command below.
+
+```bash
+ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
+```
+
+For TGI serving on Gaudi, users can see different services such as opea, TEI, and TGI.
+![Screenshot from 2024-12-27 11-58-18](https://github.com/user-attachments/assets/6126fa70-e830-4780-bd3f-83cb6eff064e)
+
+Here is a screenshot of a single trace of a TGI serving request.
+![Screenshot from 2024-12-27 11-26-25](https://github.com/user-attachments/assets/3a7c51c6-f422-41eb-8e82-c3df52cd48b8)
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -25,6 +25,7 @@ services:
       INDEX_NAME: ${INDEX_NAME}
       TEI_ENDPOINT: http://tei-embedding-service:80
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
   tei-embedding-service:
     image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
     container_name: tei-embedding-gaudi-server
@@ -37,7 +38,7 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
+    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   retriever:
     image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest}
     container_name: retriever-redis-server
@@ -55,9 +56,11 @@ services:
       INDEX_NAME: ${INDEX_NAME}
       TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
     restart: unless-stopped
   tei-reranking-service:
     image: ghcr.io/huggingface/tei-gaudi:1.5.0
+    privileged: true
     container_name: tei-reranking-gaudi-server
     ports:
       - "8808:80"
@@ -76,9 +79,10 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       MAX_WARMUP_SEQUENCE_LENGTH: 512
-    command: --model-id ${RERANK_MODEL_ID} --auto-truncate
+    command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   tgi-service:
     image: ghcr.io/huggingface/tgi-gaudi:2.0.6
+    privileged: true
     container_name: tgi-gaudi-server
     ports:
       - "8005:80"
@@ -101,7 +105,22 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096
+    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+  jaeger:  # trace backend that receives spans from the OPEA services and TGI/TEI
+    image: jaegertracing/all-in-one:latest  # NOTE(review): floating tag; consider pinning a release
+    container_name: jaeger
+    ports:
+      - "16686:16686"  # Jaeger query UI
+      - "4317:4317"  # OTLP over gRPC (target of OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
+      - "4318:4318"  # OTLP over HTTP (target of TELEMETRY_ENDPOINT, path /v1/traces)
+      - "9411:9411"  # Zipkin-compatible collector (see COLLECTOR_ZIPKIN_HOST_PORT)
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      COLLECTOR_ZIPKIN_HOST_PORT: ":9411"  # host:port form from the Jaeger docs; quoted so YAML keeps it a string
+    restart: unless-stopped
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
@@ -127,6 +146,7 @@ services:
       - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
       - LLM_MODEL=${LLM_MODEL_ID}
       - LOGFLAG=${LOGFLAG}
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
     ipc: host
     restart: always
   chatqna-gaudi-ui-server:

diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -2,7 +2,7 @@
 
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-pushd "../../../../../" > /dev/null
+pushd "../" > /dev/null
 source .set_env.sh
 popd > /dev/null
 
@@ -14,3 +14,7 @@ export INDEX_NAME="rag-redis"
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
 export LOGFLAG=""
+# Set OpenTelemetry Tracing Endpoint
+export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
+export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces