feat(ml): introduce support of onnxruntime-rocm for AMD GPU #11063

Draft: wants to merge 17 commits into main
38 changes: 20 additions & 18 deletions .github/workflows/docker.yml
@@ -48,21 +48,21 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
suffix: ["", "-cuda", "-openvino", "-armnn"]
suffix: ['', '-cuda', '-rocm', '-openvino', '-armnn']
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Re-tag image
run: |
REGISTRY_NAME="ghcr.io"
REPOSITORY=${{ github.repository_owner }}/immich-machine-learning
TAG_OLD=main${{ matrix.suffix }}
TAG_NEW=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }}
docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_NEW $REGISTRY_NAME/$REPOSITORY:$TAG_OLD

retag_server:
name: Re-Tag Server
@@ -71,7 +71,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
suffix: [""]
suffix: ['']
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
@@ -87,12 +87,11 @@
TAG_NEW=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }}
docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_NEW $REGISTRY_NAME/$REPOSITORY:$TAG_OLD


build_and_push_ml:
name: Build and Push ML
needs: pre-job
if: ${{ needs.pre-job.outputs.should_run_ml == 'true' }}
- runs-on: ubuntu-latest
+ runs-on: mich
env:
image: immich-machine-learning
context: machine-learning
@@ -109,6 +108,10 @@
device: cuda
suffix: -cuda

+ - platforms: linux/amd64
+ device: rocm
+ suffix: -rocm

- platforms: linux/amd64
device: openvino
suffix: -openvino
@@ -192,7 +195,6 @@
BUILD_SOURCE_REF=${{ github.ref_name }}
BUILD_SOURCE_COMMIT=${{ github.sha }}


build_and_push_server:
name: Build and Push Server
runs-on: ubuntu-latest
4 changes: 2 additions & 2 deletions docker/docker-compose.dev.yml
@@ -85,12 +85,12 @@ services:
image: immich-machine-learning-dev:latest
# extends:
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
ports:
- 3003:3003
volumes:
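If you build the dev image yourself, the same `DEVICE` build argument selects the ROCm flavor. A minimal sketch of the dev service with only that change applied (all other fields as in the diff above):

```yaml
# Sketch: docker-compose.dev.yml ML service built for ROCm instead of CPU.
# Only the DEVICE build arg differs from the diff above.
services:
  immich-machine-learning:
    image: immich-machine-learning-dev:latest
    build:
      context: ../machine-learning
      dockerfile: Dockerfile
      args:
        - DEVICE=rocm # selects the onnxruntime-rocm build path
    ports:
      - 3003:3003
```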
4 changes: 2 additions & 2 deletions docker/docker-compose.prod.yml
@@ -29,12 +29,12 @@ services:
image: immich-machine-learning:latest
# extends:
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
ports:
- 3003:3003
volumes:
4 changes: 2 additions & 2 deletions docker/docker-compose.yml
@@ -32,12 +32,12 @@ services:

immich-machine-learning:
container_name: immich_machine_learning
- # For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag.
+ # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino] to the image tag.
# Example tag: ${IMMICH_VERSION:-release}-cuda
image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}
# extends: # uncomment this section for hardware acceleration - see https://immich.app/docs/features/ml-hardware-acceleration
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
volumes:
- model-cache:/cache
env_file:
7 changes: 7 additions & 0 deletions docker/hwaccel.ml.yml
@@ -26,6 +26,13 @@ services:
capabilities:
- gpu

+ rocm:
+ group_add:
+ - video
+ devices:
+ - /dev/dri:/dev/dri
+ - /dev/kfd:/dev/kfd

openvino:
device_cgroup_rules:
- 'c 189:* rmw'
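For reference, the new `rocm` service grants the container ROCm compute access (`/dev/kfd`) and the DRM render nodes (`/dev/dri`), with the `video` group providing permission to open them. A minimal sketch of the same wiring inlined into a service, for setups that don't use `extends` (the image tag is assumed from this PR's naming scheme):

```yaml
# Sketch: ROCm device access wired directly into the ML service,
# equivalent to extending the `rocm` service from hwaccel.ml.yml.
services:
  immich-machine-learning:
    image: ghcr.io/immich-app/immich-machine-learning:release-rocm
    group_add:
      - video               # grants access to the GPU device nodes on most distros
    devices:
      - /dev/dri:/dev/dri   # DRM render nodes
      - /dev/kfd:/dev/kfd   # ROCm compute (KFD) interface
```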
9 changes: 7 additions & 2 deletions docs/docs/features/ml-hardware-acceleration.md
@@ -11,6 +11,7 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.

- ARM NN (Mali)
- CUDA (NVIDIA GPUs with [compute capability](https://developer.nvidia.com/cuda-gpus) 5.2 or higher)
+ - ROCm (AMD GPUs)
- OpenVINO (Intel discrete GPUs such as Iris Xe and Arc)

## Limitations
@@ -41,6 +42,10 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.
- The installed driver must be >= 535 (it must support CUDA 12.2).
- On Linux (except for WSL2), you also need to have [NVIDIA Container Toolkit][nvct] installed.

+ #### ROCm
+
+ - The GPU must be supported by ROCm. If it isn't officially supported, you can try setting the `HSA_OVERRIDE_GFX_VERSION` environment variable to a supported version, e.g. `HSA_OVERRIDE_GFX_VERSION=10.3.0` (see the sketch below).
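A minimal sketch of passing that override through Docker Compose, assuming a card that can impersonate `gfx1030` (the `10.3.0` value is only the doc's example; pick the version matching your GPU family):

```yaml
# Sketch: only needed for GPUs that ROCm does not officially support.
services:
  immich-machine-learning:
    environment:
      HSA_OVERRIDE_GFX_VERSION: '10.3.0' # example value; depends on your GPU
```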

#### OpenVINO

- The server must have a discrete GPU, i.e. Iris Xe or Arc. Expect issues when attempting to use integrated graphics.
@@ -50,12 +55,12 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.

1. If you do not already have it, download the latest [`hwaccel.ml.yml`][hw-file] file and ensure it's in the same folder as the `docker-compose.yml`.
2. In the `docker-compose.yml` under `immich-machine-learning`, uncomment the `extends` section and change `cpu` to the appropriate backend.
- 3. Still in `immich-machine-learning`, add one of -[armnn, cuda, openvino] to the `image` section's tag at the end of the line.
+ 3. Still in `immich-machine-learning`, add one of -[armnn, cuda, rocm, openvino] to the `image` section's tag at the end of the line (see the sketch after this list).
4. Redeploy the `immich-machine-learning` container with these updated settings.
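Applied to the ROCm backend added in this PR, steps 2 and 3 leave the service looking roughly like this (a sketch, assuming the default compose layout with `hwaccel.ml.yml` next to `docker-compose.yml`):

```yaml
# Sketch: immich-machine-learning after steps 2-3, targeting ROCm.
services:
  immich-machine-learning:
    image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}-rocm
    extends:
      file: hwaccel.ml.yml
      service: rocm
    volumes:
      - model-cache:/cache
```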

### Confirming Device Usage

- You can confirm the device is being recognized and used by checking its utilization. There are many tools to display this, such as `nvtop` for NVIDIA or Intel and `intel_gpu_top` for Intel.
+ You can confirm the device is being recognized and used by checking its utilization. There are many tools to display this, such as `nvtop` for NVIDIA or Intel, `intel_gpu_top` for Intel, and `radeontop` for AMD.

You can also check the logs of the `immich-machine-learning` container. When a Smart Search or Face Detection job begins, or when you search with text in Immich, you should either see a log for `Available ORT providers` containing the relevant provider (e.g. `CUDAExecutionProvider` in the case of CUDA), or a `Loaded ANN model` log entry without errors in the case of ARM NN.
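With the ROCm backend added in this PR, the provider to look for should be `ROCMExecutionProvider` (the same provider class the patch below touches).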

4 changes: 2 additions & 2 deletions docs/docs/guides/remote-machine-learning.md
@@ -23,12 +23,12 @@ name: immich_remote_ml
services:
immich-machine-learning:
container_name: immich_machine_learning
- # For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag.
+ # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino] to the image tag.
# Example tag: ${IMMICH_VERSION:-release}-cuda
image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}
# extends:
# file: hwaccel.ml.yml
- # service: # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
+ # service: # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
volumes:
- model-cache:/cache
restart: always
150 changes: 150 additions & 0 deletions machine-learning/0001-fix-rocm-conv-thread-safety.patch
@@ -0,0 +1,150 @@
From 350e3237eadb738a0d96295a62f2eed96653c315 Mon Sep 17 00:00:00 2001
From: mertalev <[email protected]>
Date: Fri, 20 Dec 2024 00:59:21 -0500
Subject: [PATCH 1/1] fix: avoid race condition for rocm conv algo caching

---
onnxruntime/core/providers/rocm/nn/conv.cc | 8 ++++----
onnxruntime/core/providers/rocm/nn/conv.h | 14 ++++++++++++--
.../core/providers/rocm/nn/conv_transpose.cc | 8 ++++----
3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index d7f47d07a8..98b6b69212 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)

if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_fwd_results.clear();
}

ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
@@ -278,7 +277,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
}

- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+ const std::size_t algo_key = HashConvAlgoKey(x_dims_miopen, w_dims);
+ if (!s_.cached_benchmark_fwd_results.contains(algo_key)) {
miopenConvAlgoPerf_t perf;
int algo_count = 1;
const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
@@ -301,9 +301,9 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
algo_search_workspace.get(),
max_ws_size,
false)); // Do not do exhaustive algo search.
- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
+ s_.cached_benchmark_fwd_results.insert(algo_key, {perf.fwd_algo, perf.memory});
}
- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
+ const auto& perf = s_.cached_benchmark_fwd_results.at(algo_key);
s_.fwd_algo = perf.fwd_algo;
s_.workspace_bytes = perf.memory;
} else {
diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
index bc9846203e..b1ca5f8e4b 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.h
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
@@ -43,6 +43,11 @@ struct vector_hash {
}
};

+inline std::size_t HashConvAlgoKey(const TensorShapeVector& x_dims, const TensorShapeVector& w_dims) {
+ vector_hash vh;
+ return vh(x_dims) ^ vh(w_dims);
+}
+
template <typename Key, typename T,
typename Hash = std::hash<Key>,
typename KeyEqual = std::equal_to<Key>,
@@ -52,6 +57,7 @@ class lru_unordered_map {
lru_unordered_map(size_t max_size) : max_size_(max_size) {}

void insert(const Key& key, const T& value) {
+ std::lock_guard<std::mutex> guard(mutex_);
auto it = items_.find(key);
if (it != items_.end()) {
it->second.value = value;
@@ -69,6 +75,7 @@ class lru_unordered_map {
}

T& at(const Key& key) {
+ std::lock_guard<std::mutex> guard(mutex_);
auto it = items_.find(key);
if (it == items_.end()) {
throw std::out_of_range("There is no such key in cache");
@@ -78,6 +85,7 @@ class lru_unordered_map {
}

bool contains(const Key& key) const {
+ std::lock_guard<std::mutex> guard(mutex_);
return items_.find(key) != items_.end();
}

@@ -86,6 +94,7 @@ class lru_unordered_map {
}

void clear() {
+ std::lock_guard<std::mutex> guard(mutex_);
items_.clear();
lru_list_.clear();
}
@@ -106,6 +115,7 @@ class lru_unordered_map {
size_t max_size_;
std::unordered_map<Key, value_type, Hash, KeyEqual, MapAllocator> items_;
list_type lru_list_;
+ mutable std::mutex mutex_;
};

// cached miopen descriptors
@@ -148,8 +158,8 @@ struct MiopenConvState {
decltype(AlgoPerfType().memory) memory;
};

- lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
- lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+ lru_unordered_map<std::size_t, PerfFwdResultParams> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+ lru_unordered_map<std::size_t, PerfBwdResultParams> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};

// Some properties needed to support asymmetric padded Conv nodes
bool post_slicing_required;
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index 7447113fdf..dea9bf2a05 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy

if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_bwd_results.clear();
}

ConvTransposeAttributes::Prepare p;
@@ -127,7 +126,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy

y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());

- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
+ const std::size_t algo_key = HashConvAlgoKey(x_dims, w_dims);
+ if (!s_.cached_benchmark_bwd_results.contains(algo_key)) {
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());

miopenConvAlgoPerf_t perf;
@@ -147,10 +147,10 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
algo_search_workspace.get(),
AlgoSearchWorkspaceSize,
false));
- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
+ s_.cached_benchmark_bwd_results.insert(algo_key, {perf.bwd_data_algo, perf.memory});
}

- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
+ const auto& perf = s_.cached_benchmark_bwd_results.at(algo_key);
s_.bwd_data_algo = perf.bwd_data_algo;
s_.workspace_bytes = perf.memory;
}
--
2.43.0
