forked from GoogleCloudPlatform/ml-testing-accelerators
-
Notifications
You must be signed in to change notification settings - Fork 0
108 lines (97 loc) · 4.46 KB
/
ci_pytorch.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Example workflow to run TPU tests in Github Actions.
name: CI

# Triggers: decide which events start this workflow.
#
# `push` (currently commented out): run on every push to the `master`
# branch, which includes every PR merge into `master`.
#
# `pull_request` (currently commented out): run for every commit pushed to
# an open PR that targets the `master` branch.
#
# `schedule`: run at the times given by the cron expression(s) below.
on:
  # push:
  #   branches: [ master ]
  # pull_request:
  #   branches: [ master ]
  schedule:
    # Run every day at 5:00 UTC.
    - cron: "0 5 * * *"
# Workflow-level environment variables, visible to the `run` scripts below.
env:
  # Location handed to `gcloud container clusters get-credentials --region`.
  GKE_LOCATION: 'us-central1'
  # Docker image repository used for the test job (and for the optional
  # build/push steps).
  IMAGE: "gcr.io/${{ secrets.GKE_PROJECT }}/pytorch-xla"
  # Image tag passed to the jsonnet template.
  # If building images, use $GITHUB_RUN_ID.
  IMAGE_TAG: 'nightly'
  # Maximum number of times the deploy step polls the Kubernetes job.
  MAX_CHECKS: '120'
  # Argument to `sleep` between two polls (seconds).
  CHECK_SPEED: '15'
jobs:
  # Single job: render a Kubernetes Job from jsonnet, submit it to a GKE
  # cluster, poll until it succeeds/fails (or the poll budget runs out), dump
  # the pod logs, and report the Kubernetes job outcome as the step result.
  pytorchTPU:
    runs-on: ubuntu-latest
    steps:
      # Grab latest code from master or pending PR.
      # NOTE: This affects only the code that runs here on the Github Action
      # machine. It will not affect the code that runs on the Docker image in
      # GKE. To do that, modify the Dockerfile to checkout the desired code.
      # NOTE(review): `github.event.pull_request` is only populated for
      # pull_request events; under the `schedule` trigger `ref` is presumably
      # empty and checkout falls back to the repository default branch.
      - name: Code checkout
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: 1.14.x

      # NOTE(review): this action is pinned to the mutable `master` branch.
      # Consider pinning to a release tag or commit SHA once a version that
      # accepts the inputs below has been confirmed.
      - name: Setup gcloud CLI
        uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      #########################################################################
      # Example of building and pushing a Docker image to use in GKE.
      # For our repo, this is not necessary since we build daily images using
      # Google Cloud Build and push them to gcr.
      - name: Configure Docker
        if: false  # SKIP THIS STEP.
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      - name: Build and Push Docker Image
        if: false  # SKIP THIS STEP.
        run: |
          cd ci_pytorch
          docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" .
          docker push "$IMAGE:$GITHUB_RUN_ID"
        shell: bash
      #########################################################################

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster
      # Switch to `--zone` instead of `--region` if your cluster is zonal.
      - name: Get cluster credentials
        env:
          # Pass the secret through the environment instead of expanding it
          # into the script text, so its value is never parsed as shell code.
          GKE_CLUSTER: ${{ secrets.GKE_CLUSTER }}
        run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --region "$GKE_LOCATION"

      - name: Deploy the job on the kubernetes cluster
        run: |-
          # With `shell: bash` the script runs under `bash -eo pipefail`, so
          # any failing command below aborts the step unless it is explicitly
          # guarded.

          # Render the job manifest from jsonnet and submit it. A failure of
          # either jsonnet or kubectl aborts the step here.
          job_name=$(
            jsonnet -J . ci_pytorch/tpu_test.jsonnet \
              --ext-str "image=$IMAGE" \
              --ext-str "image-tag=$IMAGE_TAG" \
              | kubectl create -f -
          )
          # `kubectl create` prints "job.batch/<name> created"; keep <name>.
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          echo "Waiting on kubernetes job: $job_name"

          # Check on the job periodically. Set the status code depending on
          # what happened to the job in Kubernetes. If we try MAX_CHECKS times
          # and still the job hasn't finished, give up and return the starting
          # non-zero status code.
          i=0
          status_code=2
          while [ "$i" -lt "$MAX_CHECKS" ]; do
            # Plain arithmetic assignment: `((i++))` returns a non-zero status
            # when i is 0, which would abort the script under `bash -e`.
            i=$((i + 1))
            # Any non-zero failure count means failure, so a retrying job
            # whose count moved past 1 between two polls is still detected.
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep -E "Failed:[1-9]"; then
              status_code=1
              break
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0
              break
            else
              printf "."
            fi
            sleep "$CHECK_SPEED"
          done
          echo "Done waiting. Job status code: $status_code"

          # Log retrieval is best effort: nothing below may change the exit
          # code, which must reflect the Kubernetes job outcome only.
          controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}") || controller_uid=""
          pod_names=$(kubectl get pods -l "controller-uid=$controller_uid" -o "jsonpath={.items[*].metadata.name}") || pod_names=""
          if [ -z "$pod_names" ]; then
            echo "No pods found for job: $job_name"
          fi
          # $pod_names is intentionally unquoted: it is a space-separated list
          # and a retried job can own more than one pod.
          for pod_name in $pod_names; do
            echo "GKE pod name: $pod_name"
            kubectl logs -f "$pod_name" --container=train || echo "Could not retrieve logs for pod: $pod_name"
          done
          echo "Done with log retrieval attempt."

          # Uncomment this line if building a single-use testing Docker image.
          # The `|| true` keeps a cleanup failure from masking the job result.
          # gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags || true
          exit "$status_code"
        shell: bash