Skip to content

Commit

Permalink
Basics
Browse files Browse the repository at this point in the history
  • Loading branch information
helmut-hoffer-von-ankershoffen committed Jul 14, 2019
0 parents commit 185912c
Show file tree
Hide file tree
Showing 60 changed files with 8,951 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.idea
*~
.DS_Store
._.DS_Store
.com.apple.timemachine.supported
.docker-sync
.gitattributes

workflow/provision/image/*
!workflow/provision/image/.gitkeep

.ipynb_checkpoints
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 Helmut Hoffer von Ankershoffen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
159 changes: 159 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
.DEFAULT_GOAL := help
SHELL := /bin/bash

# "help" runs a command and must never be shadowed by a file of the same name.
.PHONY: help

# Self-documenting help: every target carries a trailing "## description",
# which is extracted and printed as a two-column table.
help: ## This help panel.
	@printf "%-30s %s\n" "DevOps console for Project Jetson"
	@printf "%-30s %s\n" "=================================="
	@printf "%-30s %s\n" ""
	@printf "%-30s %s\n" "Target" "Help"
	@printf "%-30s %s\n" "------" "----"
	@grep -hE '^[a-zA-Z0-9_-]+:.*##' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*## "} {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

# Catch-all no-op: extra words on the command line (forwarded as a remote
# command by nano-one-exec via MAKECMDGOALS) are silently ignored instead of
# failing as unknown targets.
%: # thanks to chakrit
	@: # thanks to William Pursell


# All targets below run commands rather than produce files — mark them phony.
.PHONY: bootstrap-environment requirements requirements-bootstrap requirements-docker requirements-hosts requirements-packages requirements-ansible

# Full workstation setup: install everything, then print the follow-up
# instructions for finishing the Docker installation.
bootstrap-environment: requirements bootstrap-environment-message ## Bootstrap development environment!

requirements: requirements-bootstrap ## Install requirements on workstation

requirements-bootstrap: ## Prepare basic packages on workstation
	workflow/requirements/macOS/bootstrap
# NOTE(review): "2.2." looks like a truncated Ruby version (e.g. "2.2.10") —
# confirm the intended version; as written rbenv will likely reject it.
	source ~/.bash_profile && rbenv install --skip-existing 2.2.
	source ~/.bash_profile && ansible-galaxy install -r workflow/requirements/macOS/ansible/requirements.yml
	ansible-playbook -i "localhost," workflow/requirements/generic/ansible/playbook.yml --tags "hosts" --ask-become-pass
	source ~/.bash_profile && ansible-playbook -i "localhost," workflow/requirements/macOS/ansible/playbook.yml --ask-become-pass
	source ~/.bash_profile && $(SHELL) -c 'cd workflow/requirements/macOS/docker; . ./daemon_check.sh'

requirements-docker: ## Prepare Docker on workstation
	source ~/.bash_profile && $(SHELL) -c 'cd workflow/requirements/macOS/docker; . ./daemon_check.sh'

requirements-hosts: ## Prepare /etc/hosts on workstation
	ansible-playbook -i "localhost," workflow/requirements/generic/ansible/playbook.yml --tags "hosts" --ask-become-pass

requirements-packages: ## Install packages on workstation
	ansible-playbook -i "localhost," workflow/requirements/macOS/ansible/playbook.yml --ask-become-pass

requirements-ansible: ## Install ansible requirements on workstation for provisioning jetson
	ansible-galaxy install -r workflow/provision/requirements.yml

.PHONY: bootstrap-environment-message

bootstrap-environment-message: ## Echo a message that the app installation is happening now
	@echo ""
	@echo ""
	@echo "Welcome!"
	@echo ""
# Inner quotes around "the whale" must be escaped — unescaped they close the
# string and leave a bare ")" that is a bash syntax error.
	@echo "1) Please follow the instructions to fully install and start Docker - Docker started up when its Icon (\"the whale\") is no longer moving."
	@echo ""
	@echo "2) Click on the Docker icon, goto Preferences / Advanced, set Memory to at least 4GiB and click Apply & Restart."
	@echo ""
	@echo ""


.PHONY: image-download setup-access-secure

# Fetches the Jetpack SD-card image and unpacks it in place; the zip is
# removed afterwards (workflow/provision/image/* is gitignored).
image-download: ## Download Nvidia Jetpack into workflow/provision/image
	cd workflow/provision/image && wget -N -O jetson-nano-sd.zip https://developer.nvidia.com/embedded/dlc/jetson-nano-dev-kit-sd-card-image && unzip -o *.zip && rm -f jetson-nano-sd.zip

# NOTE(review): the "[email protected]" address below looks scrubbed by an
# email-protection filter — presumably admin@nano-one.local; verify.
setup-access-secure: ## Allow passwordless ssh and sudo, disallow ssh with password
	ssh-copy-id -i ~/.ssh/id_rsa [email protected]
	cd workflow/provision && ansible-playbook main.yml --tags "access_secure" -b -K


# Each target runs the provisioning playbook restricted to one Ansible tag.
.PHONY: provision provision-base provision-kernel provision-firewall provision-lxde provision-vnc provision-xrdp provision-k8s provision-build provision-swap provision-performance-mode

provision: ## Provision the Nvidia Jetson Nano
	cd workflow/provision && ansible-playbook main.yml --tags "provision"

provision-base: ## Provision base
	cd workflow/provision && ansible-playbook main.yml --tags "base"

provision-kernel: ## Compile custom kernel for docker - takes ca. 60 minutes
	cd workflow/provision && ansible-playbook main.yml --tags "kernel"

provision-firewall: ## Provision firewall
	cd workflow/provision && ansible-playbook main.yml --tags "firewall"

provision-lxde: ## Provision LXDE
	cd workflow/provision && ansible-playbook main.yml --tags "lxde"

provision-vnc: ## Provision VNC
	cd workflow/provision && ansible-playbook main.yml --tags "vnc"

provision-xrdp: ## Provision XRDP
	cd workflow/provision && ansible-playbook main.yml --tags "xrdp"

provision-k8s: ## Provision Kubernetes
	cd workflow/provision && ansible-playbook main.yml --tags "k8s"

provision-build: ## Provision build environment
	cd workflow/provision && ansible-playbook main.yml --tags "build"

provision-swap: ## Provision swap
	cd workflow/provision && ansible-playbook main.yml --tags "swap"

provision-performance-mode: ## Set performance mode
	cd workflow/provision && ansible-playbook main.yml --tags "performance_mode"

.PHONY: nano-one-ssh nano-one-ssh-build nano-one-exec

# NOTE(review): the "[email protected]" addresses below look scrubbed by an
# email-protection filter — presumably admin@/build@nano-one.local; verify.
nano-one-ssh: ## ssh to nano-one as user admin
	ssh [email protected]

nano-one-ssh-build: ## ssh to nano-one as user build
	ssh [email protected]

# Forwards any extra make goals as the remote command line; the "%:" no-op
# catch-all at the top of this file swallows those extra goals so make does
# not treat them as unknown targets.
nano-one-exec: ## exec command on nano-one - you must pass in arguments e.g. tegrastats
	ssh [email protected] $(filter-out $@,$(MAKECMDGOALS))


.PHONY: k8s-proxy k8s-dashboard-bearer-token-show k8s-dashboard-open

# Blocks the terminal; dashboard access below goes through this proxy.
k8s-proxy: ## Open proxy
	kubectl proxy

k8s-dashboard-bearer-token-show: ## Show dashboard bearer token
	workflow/k8s/dashboard-bearer-token-show

k8s-dashboard-open: ## Open Dashboard
	python -mwebbrowser http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/#!/overview?namespace=default


.PHONY: device-query-deploy device-query-log-show device-query-delete device-query-dev

# NOTE(review): device-query-build-and-push has no explicit rule in this file —
# it resolves to the "%:" no-op catch-all; the actual build is performed by the
# skaffold custom builder. Confirm this is intended.
device-query-deploy: device-query-build-and-push ## Build and deploy device query
	kubectl create namespace jetson-device-query || true
	cd workflow/deploy/device-query && skaffold run

device-query-log-show: ## Show log of pod
	cd workflow/deploy/device-query && ./log-show

# "|| true" keeps the delete idempotent when the namespace is already gone.
device-query-delete: ## Delete device query deployment
	kubectl delete namespace jetson-device-query || true
	cd workflow/deploy/device-query && skaffold delete

device-query-dev: ## Enter build, deploy, tail, watch cycle for device query
	kubectl create namespace jetson-device-query || true
	cd workflow/deploy/device-query && skaffold dev


.PHONY: jupyter-deploy jupyter-open jupyter-log-show jupyter-delete jupyter-dev

# NOTE(review): jupyter-build-and-push has no explicit rule in this file — it
# resolves to the "%:" no-op catch-all; the actual build is done by skaffold.
jupyter-deploy: jupyter-build-and-push ## Build and deploy jupyter
	kubectl create namespace jetson-jupyter || true
	kubectl create secret generic jupyter.polarize.ai --from-file workflow/deploy/jupyter/.basic-auth --namespace=jetson-jupyter || true
	cd workflow/deploy/jupyter && skaffold run

jupyter-open: ## Open browser pointing to jupyter notebook
	python -mwebbrowser http://jupyter.nano-one.local/

jupyter-log-show: ## Show log of pod
	cd workflow/deploy/jupyter && ./log-show

# "|| true" keeps the delete idempotent when the namespace is already gone.
jupyter-delete: ## Delete jupyter deployment
	kubectl delete namespace jetson-jupyter || true
	cd workflow/deploy/jupyter && skaffold delete

jupyter-dev: ## Enter build, deploy, tail, watch cycle for jupyter
	kubectl create namespace jetson-jupyter || true
	kubectl create secret generic jupyter.polarize.ai --from-file workflow/deploy/jupyter/.basic-auth --namespace=jetson-jupyter || true
	cd workflow/deploy/jupyter && skaffold dev
93 changes: 93 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# jetson

Experimenting with Nvidia Jetson Nano, Kubernetes and ML.

Hints:
- Assumes an Nvidia Jetson Nano, TX2 or AGX Xavier as embedded device, called "nano" below for simplicity.
- Assumes a macOS device for development
- Assumes access to a bare-metal Kubernetes cluster the nano can join e.g. set up using https://github.com/helmuthva/ceil/tree/max.
- Assumes basic knowledge of Ansible, Docker and Kubernetes (k8s).


## Features

- [x] basics: Automatically provision requirements on macOS device for development
- [x] basics: Prepare hardware
- [x] basics: Manually provision os
- [x] basics: Automatically provision secure ssh access
- [x] basics: Automatically provision passwordless sudo
- [x] basics: Automatically install basic packages
- [x] basics: Automatically setup LXDE
- [x] basics: Automatically setup VNC
- [x] basics: Automatically setup RDP (optional)
- [x] basics: Automatically setup swap
- [x] basics: Automatically set performance mode
- [X] k8s: Automatically build custom kernel as required by Docker + Kubernetes + Weave networking
- [x] k8s: Automatically join Kubernetes cluster `max` as worker node labeled as `jetson` - see https://github.com/helmuthva/ceil/tree/max reg. `max`
- [x] k8s: Automatically build and deploy CUDA deviceQuery as pod in k8s cluster to validate access to GPU and correct labeling of jetson nodes
- [x] k8s: Build and deploy using Skaffold and kustomize
- [ ] basics: Update to Jetpack 4.2.1 providing support for NGC et al (waiting for release)
- [ ] security: Automatically setup firewall (waiting for iptables fix in Nvidia kernel sources)
- [x] ml: Use Archiconda - the arm flavor of Anaconda - for building Docker containers for arm64
- [x] ml: Automatically build and deploy Jupyter server with support for CUDA accelerated tensorflow and keras as pod in k8s cluster running on jetson node
- [ ] ml: Experiment with containers from NGC
- [ ] community: Author a blog post explaining how to set up ML in Kubernetes on Jetson devices
- [ ] ml: Scale out with Xaviers and deploy Polarize AI core (separate project)


## Bootstrap

1) Execute `make bootstrap-environment` to install requirements on your macOS device and setup hostnames such as `nano-one.local` in your `/etc/hosts`


## Provision

### Manually flash base os, create `admin` account and establish secure access

1) Execute `make image-download` to download and unzip the Nvidia Jetpack image into `workflow/provision/image/`
2) Start the `balenaEtcher` application and flash your micro sd card with the downloaded image
3) Insert the designated micro sd card in your Nvidia Jetson nano and power up
4) Create account with username `admin` and "Administrator" rights via the UI
5) Execute `make setup-access-secure` and enter the password you set for the `admin` user in the step above - passwordless ssh access and sudo will be set up

Hints:
* The `balenaEtcher` application was installed as part of bootstrap on your macOS device

### Automatically provision services, kernel, k8s

1) Execute `make provision` - services will be provisioned, the kernel will be compiled and the Kubernetes cluster will be joined

Hints:
* If you want to provision step by step execute `make help | grep "provision-"` and execute the desired make target e.g. `make provision-kernel`
* SSH into your nano using `make nano-one-ssh` - your ssh public key was uploaded during provisioning so no password is needed
* VNC into your nano by starting the VNC Viewer application which was installed as part of bootstrap and connect to `nano-one.local:5901` - the password is `secret`
* You will have to update the `kubernetes.token` in `workflow/provision/group_vars/all.yml` to a valid join token that can be created using `make k8s-token-create` in `max` cluster


## Build and deploy

1) Execute `make device-query-deploy` to build and deploy a pod into the k8s cluster that queries CUDA capabilities thus validating GPU access from k8s - execute `make device-query-log-show` to show the result after deploying
2) Execute `make jupyter-deploy` to build and deploy a Jupyter server supporting CUDA accelerated TensorFlow + Keras as a k8s pod running on nano - execute `make jupyter-open` to open a browser tab pointing to the Jupyter server

Hints:
- Remote building on nano is implemented using Skaffold and a custom builder: E.g. use `make device-query-dev` to enter a build, deploy, tail, watch cycle.
- Deployments are defined using kustomize - you can thus define overlays for deployments on other clusters easily.
- Archiconda - the arm flavor of Anaconda - is used for installation inside Docker containers, see the Dockerfile of the Jupyter deployment
- To easily inspect the cluster execute the lovely `click` which was installed as part of bootstrap.
- Execute `make help` to show other targets that can be built and deployed


## Additional references

- https://developer.nvidia.com/embedded/learn/get-started-jetson-nano-devkit (intro)
- https://developer.nvidia.com/embedded/jetpack (jetpack)
- https://blog.hackster.io/getting-started-with-the-nvidia-jetson-nano-developer-kit-43aa7c298797 (jetpack,vnc)
- https://devtalk.nvidia.com/default/topic/1051327/jetson-nano-jetpack-4-2-firewall-broken-possible-kernel-compilation-issue-missing-iptables-modules/ (jetpack,firewall,ufw,bug)
- https://devtalk.nvidia.com/default/topic/1052748/jetson-nano/egx-nvidia-docker-runtime-on-nano/ (docker,nvidia,missing)
- https://blog.hypriot.com/post/nvidia-jetson-nano-build-kernel-docker-optimized/ (docker,workaround)
- https://github.com/Technica-Corporation/Tegra-Docker (docker,workaround)
- https://medium.com/@jerry_liang/deploy-gpu-enabled-kubernetes-pod-on-nvidia-jetson-nano-ce738e3bcda9 (k8s)
- https://gist.github.com/buptliuwei/8a340cc151507cb48a071cda04e1f882 (k8s)
- https://github.com/dusty-nv/jetson-inference/ (ml)
- https://docs.nvidia.com/deeplearning/frameworks/install-tf-jetson-platform/index.html (tensorflow)
- https://devtalk.nvidia.com/default/topic/1043951/jetson-agx-xavier/docker-gpu-acceleration-on-jetson-agx-for-ubuntu-18-04-image/post/5296647/#5296647 (docker,tensorflow)
30 changes: 30 additions & 0 deletions workflow/deploy/device-query/builder
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Skaffold custom builder: sync sources to the Jetson nano, compile the CUDA
# deviceQuery sample there, bake it into a Docker image, then tag and
# optionally push each requested image name.
#
# Environment (provided by Skaffold's custom-build contract):
#   IMAGES     - space-separated fully-qualified image names to tag
#   PUSH_IMAGE - "true" when the tagged images must be pushed to the registry
#
# NOTE(review): the "[email protected]" addresses below look scrubbed by an
# email-protection filter — presumably build@nano-one.local; verify.

# Fail fast: without -e a failed remote build would still be tagged/pushed.
set -euo pipefail

echo "Building $IMAGES ..."

## Sync src to nano
rsync -rlptza --delete -P src/ [email protected]:~/device-query

## Build on nano
ssh [email protected] << EOF
echo "Building executable ..."
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
sudo make clean
sudo make
cp deviceQuery ~/device-query/deviceQuery
echo "Building Docker image ..."
docker build -t device_query ~/device-query
EOF

## Tag and possibly push image
for image in $IMAGES
do
    echo "Tagging with $image ..."
    ssh [email protected] "docker tag device_query $image"
    # Explicit string test: the original "if $PUSH_IMAGE" executed the value as
    # a command and evaluated truthy (pushed!) when PUSH_IMAGE was unset/empty.
    if [ "${PUSH_IMAGE:-false}" = "true" ]
    then
        echo "Pushing $image ..."
        ssh [email protected] "docker push $image"
    fi
done
61 changes: 61 additions & 0 deletions workflow/deploy/device-query/kustomize/base/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Deployment running the CUDA deviceQuery image on a Jetson node.
# The pod is privileged and bind-mounts the Tegra GPU device nodes and driver
# libraries from the host so CUDA is usable inside the container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: device-query
  namespace: jetson-device-query
spec:
  replicas: 1
  selector:
    matchLabels:
      app: device-query
  template:
    metadata:
      name: device-query
      labels:
        app: device-query
    spec:
      hostname: device-query
      containers:
        - name: device-query
          # Fixed garbled registry host: "max-one.loc al" -> "max-one.local".
          image: max-one.local:5001/jetson/device-query
          volumeMounts:
            - mountPath: /dev/nvhost-ctrl
              name: nvhost-ctrl
            - mountPath: /dev/nvhost-ctrl-gpu
              name: nvhost-ctrl-gpu
            - mountPath: /dev/nvhost-prof-gpu
              name: nvhost-prof-gpu
            - mountPath: /dev/nvmap
              name: nvmap
            - mountPath: /dev/nvhost-gpu
              name: nvhost-gpu
            - mountPath: /dev/nvhost-as-gpu
              name: nvhost-as-gpu
            # Host Tegra driver libraries, required by the CUDA runtime.
            - mountPath: /usr/lib/aarch64-linux-gnu/tegra
              name: lib
          # Privileged is required for direct access to the GPU device nodes.
          securityContext:
            privileged: true
      volumes:
        - name: nvhost-ctrl
          hostPath:
            path: /dev/nvhost-ctrl
        - name: nvhost-ctrl-gpu
          hostPath:
            path: /dev/nvhost-ctrl-gpu
        - name: nvhost-prof-gpu
          hostPath:
            path: /dev/nvhost-prof-gpu
        - name: nvmap
          hostPath:
            path: /dev/nvmap
        - name: nvhost-gpu
          hostPath:
            path: /dev/nvhost-gpu
        - name: nvhost-as-gpu
          hostPath:
            path: /dev/nvhost-as-gpu
        - name: lib
          hostPath:
            path: /usr/lib/aarch64-linux-gnu/tegra
      # Only schedule on nodes labeled as Jetson devices.
      nodeSelector:
        jetson: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
# Kustomize base for device-query: includes the Deployment manifest.
resources:
- deployment.yaml
Loading

0 comments on commit 185912c

Please sign in to comment.