feat(ml): introduce support of onnxruntime-rocm for AMD GPU #11063

Draft: wants to merge 17 commits into main
38 changes: 20 additions & 18 deletions .github/workflows/docker.yml
@@ -48,21 +48,21 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
suffix: ["", "-cuda", "-openvino", "-armnn"]
suffix: ['', '-cuda', '-rocm', '-openvino', '-armnn']
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Re-tag image
run: |
REGISTRY_NAME="ghcr.io"
REPOSITORY=${{ github.repository_owner }}/immich-machine-learning
TAG_OLD=main${{ matrix.suffix }}
TAG_NEW=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }}
docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_NEW $REGISTRY_NAME/$REPOSITORY:$TAG_OLD

retag_server:
name: Re-Tag Server
@@ -71,7 +71,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
suffix: [""]
suffix: ['']
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
@@ -87,12 +87,11 @@
TAG_NEW=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }}
docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_NEW $REGISTRY_NAME/$REPOSITORY:$TAG_OLD


build_and_push_ml:
name: Build and Push ML
needs: pre-job
if: ${{ needs.pre-job.outputs.should_run_ml == 'true' }}
- runs-on: ubuntu-latest
+ runs-on: mich
env:
image: immich-machine-learning
context: machine-learning
@@ -109,6 +108,10 @@
device: cuda
suffix: -cuda

+ - platforms: linux/amd64
+ device: rocm
+ suffix: -rocm

- platforms: linux/amd64
device: openvino
suffix: -openvino
@@ -192,7 +195,6 @@
BUILD_SOURCE_REF=${{ github.ref_name }}
BUILD_SOURCE_COMMIT=${{ github.sha }}


build_and_push_server:
name: Build and Push Server
runs-on: ubuntu-latest
4 changes: 2 additions & 2 deletions docker/docker-compose.dev.yml
@@ -85,12 +85,12 @@ services:
image: immich-machine-learning-dev:latest
# extends:
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
ports:
- 3003:3003
volumes:
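If you build the dev image yourself, the same `DEVICE` build argument selects the ROCm flavor. A minimal sketch of the dev service with only that change applied (all other fields as in the diff above):

```yaml
# Sketch: docker-compose.dev.yml ML service built for ROCm instead of CPU.
# Only the DEVICE build arg differs from the diff above.
services:
  immich-machine-learning:
    image: immich-machine-learning-dev:latest
    build:
      context: ../machine-learning
      dockerfile: Dockerfile
      args:
        - DEVICE=rocm # selects the onnxruntime-rocm build path
    ports:
      - 3003:3003
```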
4 changes: 2 additions & 2 deletions docker/docker-compose.prod.yml
@@ -29,12 +29,12 @@ services:
image: immich-machine-learning:latest
# extends:
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
+ - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference
ports:
- 3003:3003
volumes:
4 changes: 2 additions & 2 deletions docker/docker-compose.yml
@@ -32,12 +32,12 @@ services:

immich-machine-learning:
container_name: immich_machine_learning
- # For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag.
+ # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino] to the image tag.
# Example tag: ${IMMICH_VERSION:-release}-cuda
image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}
# extends: # uncomment this section for hardware acceleration - see https://immich.app/docs/features/ml-hardware-acceleration
# file: hwaccel.ml.yml
- # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
+ # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
volumes:
- model-cache:/cache
env_file:
7 changes: 7 additions & 0 deletions docker/hwaccel.ml.yml
@@ -26,6 +26,13 @@ services:
capabilities:
- gpu

+ rocm:
+ group_add:
+ - video
+ devices:
+ - /dev/dri:/dev/dri
+ - /dev/kfd:/dev/kfd

openvino:
device_cgroup_rules:
- 'c 189:* rmw'
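For reference, the new `rocm` service grants the container ROCm compute access (`/dev/kfd`) and the DRM render nodes (`/dev/dri`), with the `video` group providing permission to open them. A minimal sketch of the same wiring inlined into a service, for setups that don't use `extends` (the image tag is assumed from this PR's naming scheme):

```yaml
# Sketch: ROCm device access wired directly into the ML service,
# equivalent to extending the `rocm` service from hwaccel.ml.yml.
services:
  immich-machine-learning:
    image: ghcr.io/immich-app/immich-machine-learning:release-rocm
    group_add:
      - video               # grants access to the GPU device nodes on most distros
    devices:
      - /dev/dri:/dev/dri   # DRM render nodes
      - /dev/kfd:/dev/kfd   # ROCm compute (KFD) interface
```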
9 changes: 7 additions & 2 deletions docs/docs/features/ml-hardware-acceleration.md
@@ -11,6 +11,7 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.

- ARM NN (Mali)
- CUDA (NVIDIA GPUs with [compute capability](https://developer.nvidia.com/cuda-gpus) 5.2 or higher)
+ - ROCm (AMD GPUs)
- OpenVINO (Intel discrete GPUs such as Iris Xe and Arc)

## Limitations
@@ -41,6 +42,10 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.
- The installed driver must be >= 535 (it must support CUDA 12.2).
- On Linux (except for WSL2), you also need to have [NVIDIA Container Toolkit][nvct] installed.

+ #### ROCm
+
+ - The GPU must be supported by ROCm. If it isn't officially supported, you can try setting the `HSA_OVERRIDE_GFX_VERSION` environment variable to a supported version, e.g. `HSA_OVERRIDE_GFX_VERSION=10.3.0` (see the sketch below).
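A minimal sketch of passing that override through Docker Compose, assuming a card that can impersonate `gfx1030` (the `10.3.0` value is only the doc's example; pick the version matching your GPU family):

```yaml
# Sketch: only needed for GPUs that ROCm does not officially support.
services:
  immich-machine-learning:
    environment:
      HSA_OVERRIDE_GFX_VERSION: '10.3.0' # example value; depends on your GPU
```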

#### OpenVINO

- The server must have a discrete GPU, i.e. Iris Xe or Arc. Expect issues when attempting to use integrated graphics.
@@ -50,12 +55,12 @@ You do not need to redo any machine learning jobs after enabling hardware acceleration.

1. If you do not already have it, download the latest [`hwaccel.ml.yml`][hw-file] file and ensure it's in the same folder as the `docker-compose.yml`.
2. In the `docker-compose.yml` under `immich-machine-learning`, uncomment the `extends` section and change `cpu` to the appropriate backend.
- 3. Still in `immich-machine-learning`, add one of -[armnn, cuda, openvino] to the `image` section's tag at the end of the line.
+ 3. Still in `immich-machine-learning`, add one of -[armnn, cuda, rocm, openvino] to the `image` section's tag at the end of the line (see the sketch after this list).
4. Redeploy the `immich-machine-learning` container with these updated settings.
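Applied to the ROCm backend added in this PR, steps 2 and 3 leave the service looking roughly like this (a sketch, assuming the default compose layout with `hwaccel.ml.yml` next to `docker-compose.yml`):

```yaml
# Sketch: immich-machine-learning after steps 2-3, targeting ROCm.
services:
  immich-machine-learning:
    image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}-rocm
    extends:
      file: hwaccel.ml.yml
      service: rocm
    volumes:
      - model-cache:/cache
```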

### Confirming Device Usage

- You can confirm the device is being recognized and used by checking its utilization. There are many tools to display this, such as `nvtop` for NVIDIA or Intel and `intel_gpu_top` for Intel.
+ You can confirm the device is being recognized and used by checking its utilization. There are many tools to display this, such as `nvtop` for NVIDIA or Intel, `intel_gpu_top` for Intel, and `radeontop` for AMD.

You can also check the logs of the `immich-machine-learning` container. When a Smart Search or Face Detection job begins, or when you search with text in Immich, you should either see a log for `Available ORT providers` containing the relevant provider (e.g. `CUDAExecutionProvider` in the case of CUDA), or a `Loaded ANN model` log entry without errors in the case of ARM NN.
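With the ROCm backend added in this PR, the provider to look for should be `ROCMExecutionProvider` (the same provider class the patch below touches).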

4 changes: 2 additions & 2 deletions docs/docs/guides/remote-machine-learning.md
@@ -23,12 +23,12 @@ name: immich_remote_ml
services:
immich-machine-learning:
container_name: immich_machine_learning
- # For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag.
+ # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino] to the image tag.
# Example tag: ${IMMICH_VERSION:-release}-cuda
image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}
# extends:
# file: hwaccel.ml.yml
- # service: # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
+ # service: # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
volumes:
- model-cache:/cache
restart: always
150 changes: 150 additions & 0 deletions machine-learning/0001-fix-rocm-conv-thread-safety.patch
@@ -0,0 +1,150 @@
From 350e3237eadb738a0d96295a62f2eed96653c315 Mon Sep 17 00:00:00 2001
From: mertalev <[email protected]>
Date: Fri, 20 Dec 2024 00:59:21 -0500
Subject: [PATCH 1/1] fix: avoid race condition for rocm conv algo caching

---
onnxruntime/core/providers/rocm/nn/conv.cc | 8 ++++----
onnxruntime/core/providers/rocm/nn/conv.h | 14 ++++++++++++--
.../core/providers/rocm/nn/conv_transpose.cc | 8 ++++----
3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index d7f47d07a8..98b6b69212 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)

if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_fwd_results.clear();
}

ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
@@ -278,7 +277,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
}

- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+ const std::size_t algo_key = HashConvAlgoKey(x_dims_miopen, w_dims);
+ if (!s_.cached_benchmark_fwd_results.contains(algo_key)) {
miopenConvAlgoPerf_t perf;
int algo_count = 1;
const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
@@ -301,9 +301,9 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
algo_search_workspace.get(),
max_ws_size,
false)); // Do not do exhaustive algo search.
- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
+ s_.cached_benchmark_fwd_results.insert(algo_key, {perf.fwd_algo, perf.memory});
}
- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
+ const auto& perf = s_.cached_benchmark_fwd_results.at(algo_key);
s_.fwd_algo = perf.fwd_algo;
s_.workspace_bytes = perf.memory;
} else {
diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
index bc9846203e..b1ca5f8e4b 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.h
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
@@ -43,6 +43,11 @@ struct vector_hash {
}
};

+inline std::size_t HashConvAlgoKey(const TensorShapeVector& x_dims, const TensorShapeVector& w_dims) {
+ vector_hash vh;
+ return vh(x_dims) ^ vh(w_dims);
+}
+
template <typename Key, typename T,
typename Hash = std::hash<Key>,
typename KeyEqual = std::equal_to<Key>,
@@ -52,6 +57,7 @@ class lru_unordered_map {
lru_unordered_map(size_t max_size) : max_size_(max_size) {}

void insert(const Key& key, const T& value) {
+ std::lock_guard<std::mutex> guard(mutex_);
auto it = items_.find(key);
if (it != items_.end()) {
it->second.value = value;
@@ -69,6 +75,7 @@ class lru_unordered_map {
}

T& at(const Key& key) {
+ std::lock_guard<std::mutex> guard(mutex_);
auto it = items_.find(key);
if (it == items_.end()) {
throw std::out_of_range("There is no such key in cache");
@@ -78,6 +85,7 @@ class lru_unordered_map {
}

bool contains(const Key& key) const {
+ std::lock_guard<std::mutex> guard(mutex_);
return items_.find(key) != items_.end();
}

@@ -86,6 +94,7 @@ class lru_unordered_map {
}

void clear() {
+ std::lock_guard<std::mutex> guard(mutex_);
items_.clear();
lru_list_.clear();
}
@@ -106,6 +115,7 @@ class lru_unordered_map {
size_t max_size_;
std::unordered_map<Key, value_type, Hash, KeyEqual, MapAllocator> items_;
list_type lru_list_;
+ mutable std::mutex mutex_;
};

// cached miopen descriptors
@@ -148,8 +158,8 @@ struct MiopenConvState {
decltype(AlgoPerfType().memory) memory;
};

- lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
- lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+ lru_unordered_map<std::size_t, PerfFwdResultParams> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+ lru_unordered_map<std::size_t, PerfBwdResultParams> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};

// Some properties needed to support asymmetric padded Conv nodes
bool post_slicing_required;
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index 7447113fdf..dea9bf2a05 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy

if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_bwd_results.clear();
}

ConvTransposeAttributes::Prepare p;
@@ -127,7 +126,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy

y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());

- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
+ const std::size_t algo_key = HashConvAlgoKey(x_dims, w_dims);
+ if (!s_.cached_benchmark_bwd_results.contains(algo_key)) {
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());

miopenConvAlgoPerf_t perf;
@@ -147,10 +147,10 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
algo_search_workspace.get(),
AlgoSearchWorkspaceSize,
false));
- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
+ s_.cached_benchmark_bwd_results.insert(algo_key, {perf.bwd_data_algo, perf.memory});
}

- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
+ const auto& perf = s_.cached_benchmark_bwd_results.at(algo_key);
s_.bwd_data_algo = perf.bwd_data_algo;
s_.workspace_bytes = perf.memory;
}
--
2.43.0
