# poolside-linux-binary-manywheel #18

# This is essentially a partial copy of ./generated-linux-binary-manywheel-nightly.yml.
# Because the original is autogenerated, this copy has to be re-synced with it manually
# from time to time, but keeping a separate copy avoids conflicts.
name: poolside-linux-binary-manywheel
on:
  # only manual triggers for now
  workflow_dispatch:
    inputs:
      # Selects whether the built wheels are also uploaded to CodeArtifact.
      # The publish steps gate on `github.event.inputs.publish == 'true'`, so
      # the options are written as quoted strings rather than YAML booleans.
      publish:
        description: Upload to CodeArtifact
        type: choice
        required: true
        default: "true"
        options:
          - "true"
          - "false"
# Workflow-wide environment. Number- and boolean-looking values are quoted so
# they reach the build scripts as the literal strings written here.
env:
  # Needed for conda builds
  ANACONDA_USER: pytorch
  BINARY_ENV_FILE: /tmp/env
  BUILD_ENVIRONMENT: linux-binary-manywheel
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
  PYTORCH_ROOT: /pytorch
  BUILDER_ROOT: /builder
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: "1"
  # All vars below are from the auto-generated ./generated-linux-binary-manywheel-nightly.yml
  PACKAGE_TYPE: manywheel
  # TODO: This is a legacy variable that we eventually want to get rid of in
  # favor of GPU_ARCH_VERSION
  DESIRED_CUDA: cu126
  GPU_ARCH_VERSION: "12.6"
  GPU_ARCH_TYPE: cuda
  # Note: we might need to fix a specific version of this image or build one ourselves
  DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.6-main
  USE_SPLIT_BUILD: "False"
  # Folded scalar: the lines below are joined with single spaces into one
  # "|"-separated string, identical to the former one-line value.
  PYTORCH_EXTRA_INSTALL_REQUIREMENTS: >-
    nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
    nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64'
  MAX_JOBS: "32"
  TORCH_CUDA_ARCH_LIST: "8.6;9.0+PTX"
  # For publish:
  CODEARTIFACT_DOMAIN: poolside
  CODEARTIFACT_REPOSITORY: poolside-packages-python
# Every run shares one concurrency group, so only a single build is active at
# a time for now; starting a new run cancels the one already in progress.
concurrency:
  group: "poolside-nightly-pytorch-build"
  cancel-in-progress: true
jobs:
  # Builds the CUDA 12.6 manywheel once per Python version in the matrix,
  # uploads the result as a GitHub artifact and, when the `publish` input is
  # 'true', uploads it to CodeArtifact with twine.
  build:
    if: ${{ github.repository_owner == 'poolsideai' }}
    runs-on: "ubuntu-22.04-64-pytorchci"
    permissions: # required for AWS Credentials
      id-token: write
      contents: read
    strategy:
      matrix:
        desired_python: ["3.10", "3.12"]
        # Each entry matches one desired_python value above and adds the
        # major/minor parts that BUILD_NAME is assembled from.
        include:
          - desired_python: "3.10"
            desired_python_major: "3"
            desired_python_minor: "10"
          - desired_python: "3.12"
            desired_python_major: "3"
            desired_python_minor: "12"
    env:
      BUILD_NAME: manywheel-py${{ matrix.desired_python_major }}_${{ matrix.desired_python_minor }}-cuda12_6
      DESIRED_PYTHON: ${{ matrix.desired_python }}
    timeout-minutes: 210
    steps:
      - name: Make the env permanent during this workflow (but not the secrets)
        shell: bash
        run: |
          # Append KEY=value lines to the file named by GITHUB_ENV so that
          # later steps see these variables.
          # PR_NUMBER is not set in the env blocks visible in this workflow, so
          # it is written with an empty value.
          {
            echo "PYTORCH_ROOT=${{ env.PYTORCH_ROOT }}"
            echo "BUILDER_ROOT=${{ env.BUILDER_ROOT }}"
            echo "PACKAGE_TYPE=${{ env.PACKAGE_TYPE }}"
            echo "DESIRED_CUDA=${{ env.DESIRED_CUDA }}"
            echo "GPU_ARCH_VERSION=${{ env.GPU_ARCH_VERSION }}"
            echo "GPU_ARCH_TYPE=${{ env.GPU_ARCH_TYPE }}"
            echo "DOCKER_IMAGE=${{ env.DOCKER_IMAGE }}"
            echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}"
            echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}"
            echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}"
            echo "ANACONDA_USER=${{ env.ANACONDA_USER }}"
            echo "BINARY_ENV_FILE=${{ env.BINARY_ENV_FILE }}"
            echo "BUILD_ENVIRONMENT=${{ env.BUILD_ENVIRONMENT }}"
            echo "BUILD_NAME=${{ env.BUILD_NAME }}"
            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
            echo "SHA1=${{ env.SHA1 }}"
            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
            echo "MAX_JOBS=${{ env.MAX_JOBS }}"
            echo "TORCH_CUDA_ARCH_LIST=${{ env.TORCH_CUDA_ARCH_LIST }}"
          } >> "${GITHUB_ENV}"
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
      - name: Checkout PyTorch to pytorch dir
        uses: malfet/checkout@silent-checkout
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          quiet-checkout: true
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      - name: Checkout pytorch/builder to builder dir
        uses: malfet/checkout@silent-checkout
        with:
          ref: main
          submodules: recursive
          repository: pytorch/builder
          path: builder
          quiet-checkout: true
      - name: Clean pytorch/builder checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: builder
      - name: Build PyTorch binary
        run: |
          set -x
          mkdir -p "${RUNNER_TEMP}/artifacts"
          # Start a detached container with the two checkouts and the artifact
          # directory mounted; `docker run --detach` prints the container id,
          # which the `docker exec` calls below use to address it.
          container_name=$(docker run \
            -e BINARY_ENV_FILE \
            -e BUILD_ENVIRONMENT \
            -e DESIRED_CUDA \
            -e DESIRED_DEVTOOLSET \
            -e DESIRED_PYTHON \
            -e GITHUB_ACTIONS \
            -e GPU_ARCH_TYPE \
            -e GPU_ARCH_VERSION \
            -e LIBTORCH_VARIANT \
            -e PACKAGE_TYPE \
            -e PYTORCH_FINAL_PACKAGE_DIR \
            -e PYTORCH_ROOT \
            -e BUILDER_ROOT \
            -e SKIP_ALL_TESTS \
            -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
            -e USE_SPLIT_BUILD \
            -e MAX_JOBS \
            -e TORCH_CUDA_ARCH_LIST \
            --tty \
            --detach \
            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
            -v "${GITHUB_WORKSPACE}/builder:/builder" \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -w / \
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
          else
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ env.PACKAGE_TYPE }}/build.sh"
          fi
          # Give the files written to /artifacts inside the container the
          # runner user's uid/gid.
          docker exec -t "${container_name}" chown -R "$(id -u):$(id -g)" /artifacts
      - name: Cleanup docker
        if: always()
        shell: bash
        run: |
          # stop the container for clean worker stop
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
      # upload to github artifacts (as we might not publish)
      # NOTE(review): the version tag of this action was garbled in the text
      # this workflow was recovered from; v4 is assumed - confirm the exact pin.
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ env.BUILD_NAME }}
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/*
      - name: Install publish dependencies
        if: github.event.inputs.publish == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install twine
      - name: Configure AWS credentials for publish
        if: github.event.inputs.publish == 'true'
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/gh-action-publish-artifacts-role
          aws-region: us-east-1
      - name: Publish to CodeArtifact
        if: github.event.inputs.publish == 'true'
        shell: bash
        env:
          # Handed to the script through the environment instead of being
          # interpolated into the script text.
          CODEARTIFACT_DOMAIN_OWNER: ${{ secrets.AWS_ACCOUNT_ID }}
        run: |
          set -euo pipefail
          # Assign first and export afterwards: `export VAR=$(cmd)` would hide
          # a failing aws call behind the exit status of `export`.
          TWINE_PASSWORD="$(aws codeartifact get-authorization-token \
            --domain "${CODEARTIFACT_DOMAIN}" \
            --domain-owner "${CODEARTIFACT_DOMAIN_OWNER}" \
            --query authorizationToken \
            --output text)"
          # Keep the short-lived token out of the job log.
          echo "::add-mask::${TWINE_PASSWORD}"
          TWINE_REPOSITORY_URL="$(aws codeartifact get-repository-endpoint \
            --domain "${CODEARTIFACT_DOMAIN}" \
            --domain-owner "${CODEARTIFACT_DOMAIN_OWNER}" \
            --repository "${CODEARTIFACT_REPOSITORY}" \
            --region us-east-1 \
            --format pypi \
            --query repositoryEndpoint \
            --output text)"
          export TWINE_USERNAME=aws TWINE_PASSWORD TWINE_REPOSITORY_URL
          twine upload --verbose "${RUNNER_TEMP}"/artifacts/*