Skip to content

Commit

Permalink
add pytorch API and controller (#1294)
Browse files Browse the repository at this point in the history
  • Loading branch information
zw0610 authored Jul 7, 2021
1 parent d72641c commit c172880
Show file tree
Hide file tree
Showing 28 changed files with 15,438 additions and 7 deletions.
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,12 @@ resources:
kind: XGBoostJob
path: github.com/kubeflow/tf-operator/pkg/apis/xgboost/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
group: kubeflow.org
kind: PyTorchJob
path: github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1
version: v1
version: "3"
6,874 changes: 6,874 additions & 0 deletions config/crd/bases/kubeflow.org_pytorchjobs.yaml

Large diffs are not rendered by default.

6,899 changes: 6,899 additions & 0 deletions config/crd/bases/kubeflow.org_tfjobs.yaml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions config/crd/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@
# It should be run by config/default
resources:
- bases/kubeflow.org_xgboostjobs.yaml
- bases/kubeflow.org_pytorchjobs.yaml
#+kubebuilder:scaffold:crdkustomizeresource

patchesStrategicMerge:
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix.
# patches here are for enabling the conversion webhook for each CRD
#- patches/webhook_in_xgboostjobs.yaml
#- patches/webhook_in_pytorchjobs.yaml
#+kubebuilder:scaffold:crdkustomizewebhookpatch

# [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix.
# patches here are for enabling the CA injection for each CRD
#- patches/cainjection_in_xgboostjobs.yaml
#- patches/cainjection_in_pytorchjobs.yaml
#+kubebuilder:scaffold:crdkustomizecainjectionpatch

# the following config is for teaching kustomize how to do kustomization for CRDs.
Expand Down
7 changes: 7 additions & 0 deletions config/crd/patches/cainjection_in_pytorchjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# The following patch adds a directive for certmanager to inject CA into the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME)
name: pytorchjobs.kubeflow.org
16 changes: 16 additions & 0 deletions config/crd/patches/webhook_in_pytorchjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# The following patch enables a conversion webhook for the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: pytorchjobs.kubeflow.org
spec:
conversion:
strategy: Webhook
webhook:
clientConfig:
service:
namespace: system
name: webhook-service
path: /convert
conversionReviewVersions:
- v1
24 changes: 24 additions & 0 deletions config/rbac/pytorchjob_editor_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# permissions for end users to edit pytorchjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: pytorchjob-editor-role
rules:
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs/status
verbs:
- get
20 changes: 20 additions & 0 deletions config/rbac/pytorchjob_viewer_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# permissions for end users to view pytorchjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: pytorchjob-viewer-role
rules:
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs
verbs:
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs/status
verbs:
- get
7 changes: 7 additions & 0 deletions config/samples/kubeflow.org_v1_pytorchjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: pytorchjob-sample
spec:
# Add fields here
foo: bar
12 changes: 5 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ import (
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

pytorchv1 "github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1"
xgboostv1 "github.com/kubeflow/tf-operator/pkg/apis/xgboost/v1"
xgboostcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/xgboost"
pytorchcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/pytorch"
//+kubebuilder:scaffold:imports
)

Expand All @@ -45,6 +46,7 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))

utilruntime.Must(xgboostv1.AddToScheme(scheme))
utilruntime.Must(pytorchv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand Down Expand Up @@ -78,12 +80,8 @@ func main() {
os.Exit(1)
}

if err = (&xgboostcontroller.XGBoostJobReconciler{
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("controllers").WithName("XGBoostJob"),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "XGBoostJob")
if err = pytorchcontroller.NewReconciler(mgr).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "PyTorchJob")
os.Exit(1)
}
//+kubebuilder:scaffold:builder
Expand Down
34 changes: 34 additions & 0 deletions pkg/apis/pytorch/v1/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1

import (
common "github.com/kubeflow/common/pkg/apis/common/v1"
)

const (
// EnvKubeflowNamespace is ENV for kubeflow namespace specified by user.
EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"

// DefaultPortName is name of the port used to communicate between Master and
// workers.
DefaultPortName = "pytorchjob-port"
// DefaultContainerName is the name of the PyTorchJob container.
DefaultContainerName = "pytorch"
// DefaultPort is default value of the port.
DefaultPort = 23456
// DefaultRestartPolicy is default RestartPolicy for PyTorchReplicaSpec.
DefaultRestartPolicy = common.RestartPolicyOnFailure
)
106 changes: 106 additions & 0 deletions pkg/apis/pytorch/v1/defaults.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1

import (
"strings"

common "github.com/kubeflow/common/pkg/apis/common/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
)

// Int32 is a helper routine that allocates a new int32 value
// to store v and returns a pointer to it.
func Int32(v int32) *int32 {
return &v
}

func addDefaultingFuncs(scheme *runtime.Scheme) error {
return RegisterDefaults(scheme)
}

// setDefaultPort sets the default ports for pytorch container.
func setDefaultPort(spec *v1.PodSpec) {
index := 0
for i, container := range spec.Containers {
if container.Name == DefaultContainerName {
index = i
break
}
}

hasPyTorchJobPort := false
for _, port := range spec.Containers[index].Ports {
if port.Name == DefaultPortName {
hasPyTorchJobPort = true
break
}
}
if !hasPyTorchJobPort {
spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{
Name: DefaultPortName,
ContainerPort: DefaultPort,
})
}
}

func setDefaultReplicas(spec *common.ReplicaSpec) {
if spec.Replicas == nil {
spec.Replicas = Int32(1)
}
if spec.RestartPolicy == "" {
spec.RestartPolicy = DefaultRestartPolicy
}
}

// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case.
func setTypeNamesToCamelCase(job *PyTorchJob) {
setTypeNameToCamelCase(job, PyTorchReplicaTypeMaster)
setTypeNameToCamelCase(job, PyTorchReplicaTypeWorker)
}

// setTypeNameToCamelCase sets the name of the replica type from any case to correct case.
func setTypeNameToCamelCase(job *PyTorchJob, typ PyTorchReplicaType) {
for t := range job.Spec.PyTorchReplicaSpecs {
if strings.EqualFold(string(t), string(typ)) && t != typ {
spec := job.Spec.PyTorchReplicaSpecs[t]
delete(job.Spec.PyTorchReplicaSpecs, t)
job.Spec.PyTorchReplicaSpecs[typ] = spec
return
}
}
}

// SetDefaults_PyTorchJob sets any unspecified values to defaults.
func SetDefaults_PyTorchJob(job *PyTorchJob) {
// Set default cleanpod policy to None.
if job.Spec.CleanPodPolicy == nil {
policy := common.CleanPodPolicyNone
job.Spec.CleanPodPolicy = &policy
}

// Update the key of PyTorchReplicaSpecs to camel case.
setTypeNamesToCamelCase(job)

for rType, spec := range job.Spec.PyTorchReplicaSpecs {
// Set default replicas to 1.
setDefaultReplicas(spec)
if rType == PyTorchReplicaTypeMaster {
// Set default port to pytorch container of Master.
setDefaultPort(&spec.Template.Spec)
}
}
}
21 changes: 21 additions & 0 deletions pkg/apis/pytorch/v1/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +k8s:deepcopy-gen=package,register
// +k8s:defaulter-gen=TypeMeta
// +k8s:openapi-gen=true

// Package v1 is the v1 version of the API.
// +groupName=kubeflow.org
package v1
34 changes: 34 additions & 0 deletions pkg/apis/pytorch/v1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright YEAR The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package v1 contains API Schema definitions for the kubeflow.org v1 API group
//+kubebuilder:object:generate=true
//+groupName=kubeflow.org
package v1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is group version used to register these objects
GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v1"}

// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
Loading

0 comments on commit c172880

Please sign in to comment.