Skip to content

Commit

Permalink
Add GitHub Actions workflows for running continuous tests with Pytest
Browse files Browse the repository at this point in the history
Changes:
- Adds `wheel_tests.yml`, which will be used to run continuous jobs that build artifacts and run CPU/CUDA tests. Jobs are run via workflow calls to `build_artifacts.yml`/`pytest_cpu.yml`/`pytest_cuda.yml`.
- Adds testing of CUDA tests on H100 gpus
- Make script executable
- Change the name of GPU scripts and workflows to CUDA to be more clear as to what is being tested
PiperOrigin-RevId: 702497163
  • Loading branch information
nitins17 authored and Google-ML-Automation committed Dec 20, 2024
1 parent 043c260 commit da64a88
Show file tree
Hide file tree
Showing 9 changed files with 439 additions and 3 deletions.
148 changes: 148 additions & 0 deletions .github/workflows/build_artifacts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# CI - Build JAX Artifacts
# This workflow builds JAX wheels (jax, jaxlib, jax-cuda-plugin, and jax-cuda-pjrt) and optionally
# uploads them to a Google Cloud Storage (GCS) bucket. It can be triggered manually via
# workflow_dispatch or called by other workflows via workflow_call.
name: CI - Build JAX Artifacts

# Two entry points are supported:
#   * workflow_dispatch - manual runs; every input is of type `choice`.
#   * workflow_call     - runs started by another workflow; inputs are typed
#                         strings/booleans and additionally control the upload.
on:
  workflow_dispatch:
    inputs:
      runner:
        description: 'Which runner should the workflow run on?'
        type: choice
        required: true
        default: 'linux-x86-n2-16'
        options:
          - 'linux-x86-n2-16'
          - 'linux-arm64-c4a-64'
          - 'windows-x86-n2-16'
      artifact:
        description: 'Which JAX artifact to build?'
        type: choice
        required: true
        default: 'jaxlib'
        options:
          - 'jax'
          - 'jaxlib'
          - 'jax-cuda-plugin'
          - 'jax-cuda-pjrt'
      python:
        description: 'Which python version should the artifact be built for?'
        type: choice
        required: false
        default: '3.12'
        # Versions stay quoted so that 3.10 is not read as the float 3.1.
        options:
          - '3.10'
          - '3.11'
          - '3.12'
          - '3.13'
      clone_main_xla:
        description: 'Should latest XLA be used?'
        type: choice
        required: false
        default: '0'
        options:
          - '1'
          - '0'
      # Only offered on manual runs; workflow_call has no such input.
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: choice
        required: false
        default: 'no'
        # Quoted so the options are the strings yes/no rather than booleans.
        options:
          - 'yes'
          - 'no'

  workflow_call:
    inputs:
      runner:
        description: 'Which runner should the workflow run on?'
        type: string
        required: true
        default: 'linux-x86-n2-16'
      artifact:
        description: 'Which JAX artifact to build?'
        type: string
        required: true
        default: 'jaxlib'
      python:
        description: 'Which python version should the artifact be built for?'
        type: string
        required: false
        default: '3.12'
      clone_main_xla:
        description: 'Should latest XLA be used?'
        type: string
        required: false
        default: '0'
      # The two upload inputs below exist only for workflow_call.
      upload_artifacts:
        description: 'Should the artifacts be uploaded to a GCS bucket?'
        type: boolean
        required: true
        default: true
      upload_url_prefix:
        description: 'GCS location prefix to where the artifacts should be uploaded'
        type: string
        required: true
        default: 'gs://general-ml-ci-transient/jax-github-actions/jax/${{ github.workflow }}/${{ github.run_number }}'

# The only permission this workflow requests is read access to repository
# contents.
permissions:
  contents: read

jobs:
  # Builds the requested artifact with ci/build_artifacts.sh and, when
  # `upload_artifacts` is set, copies the wheels found under dist/ to GCS.
  build-artifacts:
    name: "Build ${{ inputs.artifact }} (${{ inputs.runner }}, Python ${{ inputs.python }}, clone main XLA=${{ inputs.clone_main_xla }})"

    runs-on: ${{ inputs.runner }}

    # Linux runners build inside the ml-build image for their architecture.
    # The Windows branch evaluates to null, i.e. no image is named for it.
    container: ${{ (contains(inputs.runner, 'linux-x86') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest') ||
      (contains(inputs.runner, 'linux-arm64') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest') ||
      (contains(inputs.runner, 'windows-x86') && null) }}

    defaults:
      run:
        # Explicitly set the shell to bash to override Windows's default (cmd)
        shell: bash

    env:
      JAXCI_HERMETIC_PYTHON_VERSION: "${{ inputs.python }}"
      JAXCI_CLONE_MAIN_XLA: "${{ inputs.clone_main_xla }}"

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2

      - name: Enable RBE if building on Linux x86 or Windows x86
        if: contains(inputs.runner, 'linux-x86') || contains(inputs.runner, 'windows-x86')
        run: echo "JAXCI_BUILD_ARTIFACT_WITH_RBE=1" >> "$GITHUB_ENV"

      # Halt for testing
      - name: Wait For Connection
        uses: google-ml-infra/actions/ci_connection@main
        with:
          halt-dispatch-input: ${{ inputs.halt-for-connection }}

      - name: Build ${{ inputs.artifact }}
        run: ./ci/build_artifacts.sh "${{ inputs.artifact }}"

      # PLATFORM is written to GITHUB_ENV so that the upload steps below can
      # read it as an ordinary environment variable.
      - name: Set PLATFORM env var for use in artifact upload URL
        run: |
          os=$(uname -s | awk '{print tolower($0)}')
          arch=$(uname -m)
          # Adjust name for Windows
          if [[ $os =~ "msys_nt" ]]; then
            os="windows"
          fi
          echo "PLATFORM=${os}_${arch}" >> "$GITHUB_ENV"

      - name: Upload artifacts to a GCS bucket (non-Windows runs)
        if: >-
          ${{ inputs.upload_artifacts && !contains(inputs.runner, 'windows-x86') }}
        run: gsutil -m cp -r $(pwd)/dist/*.whl "${{ inputs.upload_url_prefix }}"/$PLATFORM/python${JAXCI_HERMETIC_PYTHON_VERSION}/

      # Set shell to cmd to avoid path errors when using gcloud commands on Windows.
      # cmd does not expand bash-style $VAR / ${VAR}; environment variables
      # must be written as %VAR% or they end up literally in the GCS path.
      - name: Upload artifacts to a GCS bucket (Windows runs)
        if: >-
          ${{ inputs.upload_artifacts && contains(inputs.runner, 'windows-x86') }}
        shell: cmd
        run: gsutil -m cp -r dist/*.whl "${{ inputs.upload_url_prefix }}"/%PLATFORM%/python%JAXCI_HERMETIC_PYTHON_VERSION%/
95 changes: 95 additions & 0 deletions .github/workflows/pytest_cpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# CI - Pytest CPU
#
# This workflow runs the CPU tests with Pytest. It can only be triggered by other workflows via
# `workflow_call`. It is used by the `wheel_tests.yml` workflow to run the Pytest CPU tests as a
# continuous job.
#
# It consists of the following job:
# run-tests:
# - Downloads the jaxlib wheel from a GCS bucket. This wheel is usually built by the
# build-jaxlib-artifact job in the `wheel_tests.yml` workflow.
# - Executes the `run_pytest_cpu.sh` script, which performs the following actions:
# - Installs the downloaded jaxlib wheel.
# - Runs the CPU tests with Pytest.
name: CI - Pytest CPU

# Reusable workflow: workflow_call is the only trigger.
on:
  workflow_call:
    inputs:
      runner:
        description: "Which runner should the workflow run on?"
        type: string
        required: true
        default: "linux-x86-n2-16"
      python:
        # This workflow tests wheels rather than building them, so the
        # description matches the wording used in pytest_cuda.yml.
        description: "Which python version to test?"
        type: string
        required: true
        default: "3.12"
      enable-x64:
        description: "Should x64 mode be enabled?"
        type: string
        required: true
        default: "0"
      download_url_prefix:
        description: "GCS location prefix from where the artifacts should be downloaded"
        type: string
        required: true
        # NOTE(review): this default ends in github.run_attempt, while the
        # upload_url_prefix default in build_artifacts.yml stops at
        # github.run_number - confirm callers always pass an explicit prefix.
        default: 'gs://general-ml-ci-transient/jax-github-actions/jax/${{ github.workflow }}/${{ github.run_number }}/${{ github.run_attempt }}'
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: boolean
        required: false
        default: false

jobs:
  # Downloads the jaxlib wheel for this platform/Python version from GCS and
  # runs ci/run_pytest_cpu.sh against it.
  run-tests:
    name: "Pytest CPU (${{ inputs.runner }}, Python ${{ inputs.python }}, x64=${{ inputs.enable-x64 }})"

    runs-on: ${{ inputs.runner }}

    # Linux runners use the ml-build image for their architecture.
    # The Windows branch evaluates to null, i.e. no image is named for it.
    container: ${{ (contains(inputs.runner, 'linux-x86') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest') ||
      (contains(inputs.runner, 'linux-arm64') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest') ||
      (contains(inputs.runner, 'windows-x86') && null) }}

    defaults:
      run:
        # Explicitly set the shell to bash to override Windows's default (cmd)
        shell: bash

    env:
      JAXCI_HERMETIC_PYTHON_VERSION: "${{ inputs.python }}"
      JAXCI_PYTHON: "python${{ inputs.python }}"
      JAXCI_ENABLE_X64: "${{ inputs.enable-x64 }}"

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2

      # Halt for testing
      - name: Wait For Connection
        uses: google-ml-infra/actions/ci_connection@main
        with:
          halt-dispatch-input: ${{ inputs.halt-for-connection }}

      # PLATFORM is written to GITHUB_ENV so that the download steps below can
      # read it as an ordinary environment variable.
      - name: Set Platform env var for use in artifact download URL
        run: |
          os=$(uname -s | awk '{print tolower($0)}')
          arch=$(uname -m)
          # Adjust name for Windows
          if [[ $os =~ "msys_nt" ]]; then
            os="windows"
          fi
          echo "PLATFORM=${os}_${arch}" >> "$GITHUB_ENV"

      # The runner name comes from `inputs`: this workflow defines no matrix,
      # so a `matrix.runner` lookup would be empty and the Windows check
      # could never match.
      - name: Download jaxlib wheel from GCS (non-Windows runs)
        if: ${{ !contains(inputs.runner, 'windows-x86') }}
        run: >-
          mkdir -p $(pwd)/dist &&
          gsutil -m cp -r "${{ inputs.download_url_prefix }}"/$PLATFORM/python${JAXCI_HERMETIC_PYTHON_VERSION}/jaxlib*.whl $(pwd)/dist/

      # This step runs under cmd, which does not expand bash-style
      # $VAR / ${VAR}; environment variables are written as %VAR%.
      - name: Download jaxlib wheel from GCS (Windows runs)
        if: ${{ contains(inputs.runner, 'windows-x86') }}
        shell: cmd
        run: >-
          mkdir dist &&
          gsutil -m cp -r "${{ inputs.download_url_prefix }}"/%PLATFORM%/python%JAXCI_HERMETIC_PYTHON_VERSION%/jaxlib*.whl dist/

      - name: Install Python dependencies
        run: $JAXCI_PYTHON -m pip install -r build/requirements.in

      - name: Run Pytest CPU tests
        run: ./ci/run_pytest_cpu.sh
80 changes: 80 additions & 0 deletions .github/workflows/pytest_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# CI - Pytest CUDA
#
# This workflow runs the CUDA tests with Pytest. It can only be triggered by other workflows via
# `workflow_call`. It is used by the `wheel_tests.yml` workflow to run the Pytest CUDA tests as a
# continuous job.
#
# It consists of the following job:
# run-tests:
# - Downloads the jaxlib and CUDA artifacts from a GCS bucket. These wheels are usually built by
# the artifact build jobs in the `wheel_tests.yml` workflow.
# - Executes the `run_pytest_cuda.sh` script, which performs the following actions:
# - Installs the downloaded jaxlib wheel.
# - Runs the CUDA tests with Pytest.
name: CI - Pytest CUDA

# Reusable workflow: workflow_call is the only trigger.
on:
  workflow_call:
    inputs:
      runner:
        description: 'Which runner should the workflow run on?'
        type: string
        required: true
        default: 'linux-x86-n2-16'
      python:
        description: 'Which python version to test?'
        type: string
        required: true
        default: '3.12'
      cuda:
        description: 'Which CUDA version to test?'
        type: string
        required: true
        # Kept as a quoted string; the job matches it with contains().
        default: '12.3'
      enable-x64:
        description: 'Should x64 mode be enabled?'
        type: string
        required: true
        default: '0'
      download_url_prefix:
        description: 'GCS location prefix from where the artifacts should be downloaded'
        type: string
        required: true
        default: 'gs://general-ml-ci-transient/jax-github-actions/jax/${{ github.workflow }}/${{ github.run_number }}/${{ github.run_attempt }}'
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: boolean
        required: false
        default: false

jobs:
  # Downloads every wheel for this platform/Python version from GCS and runs
  # ci/run_pytest_cuda.sh against them.
  run-tests:
    name: "Pytest CUDA (${{ inputs.runner }}, CUDA ${{ inputs.cuda }}, Python ${{ inputs.python }}, x64=${{ inputs.enable-x64 }})"

    runs-on: ${{ inputs.runner }}

    # TODO: Update to the generic ML ecosystem test containers when they are ready.
    # The image is selected by CUDA version; only 12.3 and 12.1 are mapped.
    container: >-
      ${{ (contains(inputs.cuda, '12.3') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest') ||
      (contains(inputs.cuda, '12.1') && 'us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/nosla-cuda12.1-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest') }}

    env:
      JAXCI_HERMETIC_PYTHON_VERSION: "${{ inputs.python }}"
      JAXCI_PYTHON: "python${{ inputs.python }}"
      JAXCI_ENABLE_X64: "${{ inputs.enable-x64 }}"

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2

      # Halt for testing
      - name: Wait For Connection
        uses: google-ml-infra/actions/ci_connection@main
        with:
          halt-dispatch-input: ${{ inputs.halt-for-connection }}

      # PLATFORM is written to GITHUB_ENV so that the download step below can
      # read it as an ordinary environment variable.
      - name: Set Platform env var for use in artifact download URL
        run: |
          os=$(uname -s | awk '{print tolower($0)}')
          arch=$(uname -m)
          echo "PLATFORM=${os}_${arch}" >> $GITHUB_ENV

      - name: Download artifacts from GCS
        run: >-
          mkdir -p $(pwd)/dist &&
          gsutil -m cp -r "${{ inputs.download_url_prefix }}"/$PLATFORM/python${JAXCI_HERMETIC_PYTHON_VERSION}/*.whl $(pwd)/dist/

      - name: Install Python dependencies
        run: $JAXCI_PYTHON -m pip install -r build/requirements.in

      - name: Run Pytest CUDA tests
        run: ./ci/run_pytest_cuda.sh
Loading

0 comments on commit da64a88

Please sign in to comment.