update kfp to 1.6

This commit is contained in:
pengluan 2021-09-07 18:04:49 +08:00
parent feb158037a
commit 62a675763c
25 changed files with 1829 additions and 102 deletions

View File

@ -1,66 +0,0 @@
# All public images required by the Kubeflow deployment (operators, istio,
# cert-manager, katib, pipeline, serving, knative, ...).
images = [
    'gcr.io/kubeflow-images-public/xgboost-operator:vmaster-g56c2c075',
    'gcr.io/ml-pipeline/metadata-writer:1.0.4',
    'gcr.io/tfx-oss-public/ml_metadata_store_server:v0.21.1',
    'gcr.io/ml-pipeline/envoy:metadata-grpc',
    'mysql:8.0.3',
    'docker.io/kubeflowkatib/katib-db-manager:v1beta1-a96ff59',
    'docker.io/kubeflowkatib/katib-controller:v1beta1-a96ff59',
    'argoproj/argoui:v2.3.0',
    'gcr.io/istio-release/proxy_init:release-1.3-latest-daily',
    'gcr.io/istio-release/kubectl:release-1.3-latest-daily',
    'gcr.io/google_containers/spartakus-amd64:v1.1.0',
    'gcr.io/istio-release/proxyv2:release-1.3-latest-daily',
    'mpioperator/mpi-operator:latest',
    'gcr.io/kubeflow-images-public/admission-webhook:vmaster-ge5452b6f',
    'gcr.io/kubeflow-images-public/tf_operator:vmaster-gda226016',
    'istio/proxyv2:1.3.1',
    'gcr.io/istio-release/galley:release-1.3-latest-daily',
    'quay.io/jetstack/cert-manager-cainjector:v0.11.0',
    'gcr.io/istio-release/citadel:release-1.3-latest-daily',
    'gcr.io/kubeflow-images-public/jupyter-web-app:vmaster-g845af298',
    'python:3.7',
    'gcr.io/istio-release/mixer:release-1.3-latest-daily',
    'gcr.io/istio-release/pilot:release-1.3-latest-daily',
    'gcr.io/spark-operator/spark-operator:v1beta2-1.1.0-2.4.5',
    'gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0',
    'gcr.io/tfx-oss-public/ml_metadata_store_server:0.22.1',
    'gcr.io/tfx-oss-public/ml_metadata_store_server:0.25.1',
    'gcr.io/istio-release/sidecar_injector:release-1.3-latest-daily',
    'quay.io/jetstack/cert-manager-webhook:v0.11.0',
    'gcr.io/kubeflow-images-public/kubernetes-sigs/application:1.0-beta',
    'gcr.io/kubeflow-images-public/centraldashboard:vmaster-g8097cfeb',
    'gcr.io/kubeflow-images-public/xgboost-operator:v0.1.0',
    'quay.io/jetstack/cert-manager-controller:v0.11.0',
    'mysql:8',
    'seldonio/seldon-core-operator:1.4.0',
    'gcr.io/kfserving/kfserving-controller:v0.4.1',
    'gcr.io/istio-release/node-agent-k8s:release-1.3-latest-daily',
    'gcr.io/kubeflow-images-public/notebook-controller:vmaster-g6eb007d0',
    'gcr.io/kubeflow-images-public/pytorch-operator:vmaster-g518f9c76',
    'gcr.io/tfx-oss-public/ml_metadata_store_server:v0.21.1',
    'metacontroller/metacontroller:v0.3.0',
    'prom/prometheus:v2.8.0',
    'gcr.io/kubeflow-images-public/kfam:vmaster-g9f3bfd00',
    'kubeflow/mxnet-operator:v1.0.0-20200625',
    'gcr.io/kubeflow-images-public/profile-controller:vmaster-ga49f658f',
    'gcr.io/kubeflow-images-public/ingress-setup:latest',
]
# Images added/updated for the new pipeline release.
new_pipline = [
    'argoproj/workflow-controller:v2.3.0',
    'gcr.io/ml-pipeline/metadata-envoy:1.0.4',
    'gcr.io/ml-pipeline/api-server:1.0.4',
    'gcr.io/ml-pipeline/argoexec:v2.7.5-license-compliance',
    'gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance',
    'gcr.io/ml-pipeline/persistenceagent:1.0.4',
    'gcr.io/tfx-oss-public/ml_metadata_store_server:0.22.1',
    'gcr.io/ml-pipeline/frontend:1.0.4',
    'gcr.io/ml-pipeline/metadata-writer:1.0.4',
    'gcr.io/ml-pipeline/scheduledworkflow:1.0.4',
    'gcr.io/ml-pipeline/viewer-crd-controller:1.0.4',
    'gcr.io/ml-pipeline/visualization-server:1.0.4',
    'gcr.io/ml-pipeline/application-crd-controller:1.0-beta-non-cluster-role',
    'gcr.io/ml-pipeline/workflow-controller:v2.7.5-license-compliance',
    'gcr.io/ml-pipeline/mysql:5.6',
]
# Katib (hyper-parameter tuning) images.
new_katib = [
    'docker.io/kubeflowkatib/katib-ui:v1beta1-a96ff59',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/katib-db-manager',
    'mysql:5.7',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/file-metrics-collector',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/katib-ui',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/katib-controller',
    'docker.io/kubeflowkatib/mxnet-mnist',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/suggestion-hyperopt',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/tfevent-metrics-collector',
    'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',
    'gcr.io/kubeflow-images-public/katib/v1alpha3/suggestion-chocolate',
]
# NVIDIA GPU device plugin / metrics images.
new_gpu = [
    'nvidia/k8s-device-plugin:v0.7.1',
    'nvidia/dcgm-exporter:2.0.13-2.1.2-ubuntu20.04',
    'nvidia/pod-gpu-metrics-exporter:v1.0.0-alpha',
]
# Model-serving images (KFServing, TF Serving, ONNX Runtime, TensorRT, ...).
new_serving = [
    'gcr.io/kfserving/alibi-explainer:0.2.2',
    'gcr.io/kfserving/logger:0.2.2',
    'tensorflow/serving:1.14.0',
    'tensorflow/serving:1.14.0-gpu',
    'tensorflow/serving:1.11.0',
    'tensorflow/serving:1.11.0-gpu',
    'tensorflow/serving:1.12.0',
    'tensorflow/serving:1.12.0-gpu',
    'tensorflow/serving:1.13.0',
    'tensorflow/serving:1.13.0-gpu',
    'tensorflow/serving:1.14.0',
    'tensorflow/serving:1.14.0-gpu',
    'tensorflow/serving:2.0.0',
    'tensorflow/serving:2.0.0-gpu',
    'tensorflow/serving:2.1.0',
    'tensorflow/serving:2.1.0-gpu',
    'tensorflow/serving:2.2.0',
    'tensorflow/serving:2.2.0-gpu',
    'tensorflow/serving:2.3.0',
    'tensorflow/serving:2.3.0-gpu',
    'tensorflow/serving:2.4.0',
    'tensorflow/serving:2.4.0-gpu',
    'mcr.microsoft.com/onnxruntime/server:v0.5.1',
    'gcr.io/kfserving/sklearnserver:0.2.2',
    'gcr.io/kfserving/xgbserver:0.2.2',
    'gcr.io/kfserving/pytorchserver:0.2.2',
    'nvcr.io/nvidia/tensorrtserver:19.05-py3',
    'gcr.io/kfserving/storage-initializer:0.2.2',
    'gcr.io/knative-releases/knative.dev/serving/cmd/queue:792f6945c7bc73a49a470a5b955c39c8bd174705743abf5fb71aa0f4c04128eb',
]
# Knative images pinned by sha256 digest (the '@sha256' marker is stripped
# when building the private-registry tag below).
knative_sha256 = [
    'gcr.io/knative-releases/knative.dev/serving/cmd/activator@sha256:ffa3d72ee6c2eeb2357999248191a643405288061b7080381f22875cb703e929',
    'gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler@sha256:f89fd23889c3e0ca3d8e42c9b189dc2f93aa5b3a91c64e8aab75e952a210eeb3',
    'gcr.io/knative-releases/knative.dev/serving/cmd/controller@sha256:b86ac8ecc6b2688a0e0b9cb68298220a752125d0a048b8edf2cf42403224393c',
    'gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook@sha256:e6b142c0f82e0e0b8cb670c11eb4eef6ded827f98761bbf4bea7bdb777b80092',
    'gcr.io/knative-releases/knative.dev/net-istio/cmd/controller@sha256:75c7918ca887622e7242ec1965f87036db1dc462464810b72735a8e64111f6f7',
    'gcr.io/knative-releases/knative.dev/serving/cmd/webhook@sha256:7e6df0fda229a13219bbc90ff72a10434a0c64cd7fe13dc534b914247d1087f4',
    'gcr.io/knative-releases/knative.dev/serving/cmd/queue@sha256:d066ae5b642885827506610ae25728d442ce11447b82df6e9cc4c174bb97ecb3',
    'gcr.io/knative-releases/knative.dev/eventing/cmd/controller@sha256:c99f08229c464407e5ba11f942d29b969e0f7dd2e242973d50d480cc45eebf28',
    'gcr.io/knative-releases/knative.dev/eventing/cmd/channel_broker@sha256:5065eaeb3904e8b0893255b11fdcdde54a6bac1d0d4ecc8c9ce4c4c32073d924',
    'gcr.io/knative-releases/knative.dev/eventing/cmd/webhook@sha256:a3046d0426b4617fe9186fb3d983e350de82d2e3f33dcc13441e591e24410901',
    'gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller@sha256:9a084ba0ed6a12862adb3ca00de069f0ec1715fe8d4db6c9921fcca335c675bb',
    'gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher@sha256:8df896444091f1b34185f0fa3da5d41f32e84c43c48df07605c728e0fe49a9a8',
]
# Re-tagged knative images hosted on Docker Hub.
knative = [
    'tencentmusic/knative:serving-activator',
    'tencentmusic/knative:serving-autoscaler',
    'tencentmusic/knative:serving-controller',
    'tencentmusic/knative:serving-webhook',
    'tencentmusic/knative:net-istio-webhook',
    'tencentmusic/knative:net-istio-controller',
]
# Volcano batch-scheduler images.
volcano = ['volcanosh/vc-controller-manager:latest', 'volcanosh/vc-scheduler:latest', 'volcanosh/vc-webhook-manager:latest']
# kube-batch is defined but intentionally excluded from the aggregate below
# (enable via the commented assignment to mirror only kube-batch).
kube_batch = ['kubesigs/kube-batch:v0.5']
# Aggregate every image list that should be mirrored.
images = new_katib + images + new_pipline + new_gpu + new_serving + knative_sha256 + knative + volcano
# images = kube_batch
# De-duplicate AND sort so the generated shell script is deterministic across
# runs (plain list(set(...)) ordering depends on the interpreter hash seed).
images = sorted(set(images))
# Mirror the public images through a private registry so intranet machines can
# pull them, e.g. a registry at localhub.example.com.
HOST = 'localhub.example.com/kubeflow/'
for image in images:
    # print(image)
    # Strip docker's '<none>' placeholder if it appears in a captured name.
    image = image.replace('<none>', '')
    if not image:
        continue  # nothing left to mirror after stripping the placeholder
    # Private-registry name: flatten '/' to '-' so the whole path becomes one
    # repo component, and drop the '@sha256' marker so the digest turns into a
    # valid tag.
    image_name = HOST + image.replace('/', '-').replace('@sha256', '')
    # On an internet-connected machine: pull the public image, re-tag it, and
    # push it to the private registry.
    print('docker pull %s' % image)
    print('docker tag %s %s' % (image, image_name))
    print('docker push %s' % (image_name))
    # On intranet machines: pull from the private registry instead.
    # image=image.replace('@sha256','')
    # print("docker pull %s" % image_name)
    # print("docker tag %s %s"%(image_name,image))

View File

@ -22,6 +22,8 @@ spec:
imagePullPolicy: Always
env:
- name: NAMESPACE_TO_WATCH
value: 'pipeline'
valueFrom:
fieldRef:
fieldPath: metadata.namespace
serviceAccountName: kubeflow-pipelines-cache-deployer-sa
restartPolicy: Always

View File

@ -56,7 +56,9 @@ spec:
name: mysql-secret
key: password
- name: NAMESPACE_TO_WATCH
value: pipeline
valueFrom:
fieldRef:
fieldPath: metadata.namespace
args: ["--db_driver=$(DBCONFIG_DRIVER)",
"--db_host=$(DBCONFIG_HOST_NAME)",
"--db_port=$(DBCONFIG_PORT)",

View File

@ -4,4 +4,4 @@ metadata:
name: mysql-secret
stringData:
username: root
password: admin
password: ""

View File

@ -5,7 +5,7 @@ metadata:
data:
appName: pipeline
appVersion: 1.6.0
dbHost: mysql-service.infra
dbHost: mysql
dbPort: "3306"
mlmdDb: metadb
cacheDb: cachedb
@ -21,7 +21,7 @@ data:
## the IANA Time Zone database, such as "America/New_York" and "Asia/Shanghai".
## Feature stage:
## [Alpha](https://github.com/kubeflow/pipelines/blob/07328e5094ac2981d3059314cc848fbb71437a76/docs/release/feature-stages.md#alpha)
cronScheduleTimezone: "CST"
cronScheduleTimezone: "UTC"
## cacheImage is the image that the mutating webhook will use to patch
## cached steps with. Will be used to echo a message announcing that
## the cached step result will be used. If not set it will default to

View File

@ -9,5 +9,5 @@ spec:
- name: server
env:
- name: NAMESPACE_TO_WATCH
value: 'pipeline'
value: ''
valueFrom: null

View File

@ -9,5 +9,5 @@ spec:
- name: main
env:
- name: NAMESPACE_TO_WATCH
value: 'pipeline'
value: ''
valueFrom: null

View File

@ -9,5 +9,5 @@ spec:
- name: ml-pipeline-persistenceagent
env:
- name: NAMESPACE
value: 'pipeline'
value: ''
valueFrom: null

View File

@ -105,6 +105,10 @@ class Controller(BaseHTTPRequestHandler):
"cpu": "50m",
"memory": "200Mi"
},
"limits": {
"cpu": "500m",
"memory": "1Gi"
},
}
}],
"serviceAccountName":

View File

@ -19,5 +19,7 @@ spec:
image: gcr.io/ml-pipeline/metadata-writer:dummy
env:
- name: NAMESPACE_TO_WATCH
value: 'pipeline'
valueFrom:
fieldRef:
fieldPath: metadata.namespace
serviceAccountName: kubeflow-pipelines-metadata-writer

View File

@ -23,7 +23,9 @@ spec:
name: pipeline-install-config
key: autoUpdatePipelineDefaultVersion
- name: POD_NAMESPACE
value: pipeline
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: OBJECTSTORECONFIG_SECURE
value: "false"
- name: OBJECTSTORECONFIG_BUCKETNAME

View File

@ -18,7 +18,9 @@ spec:
containers:
- env:
- name: NAMESPACE
value: pipeline
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: TTL_SECONDS_AFTER_WORKFLOW_FINISH
value: "86400"
- name: NUM_WORKERS

View File

@ -21,7 +21,9 @@ spec:
name: ml-pipeline-scheduledworkflow
env:
- name: NAMESPACE
value: pipeline
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: CRON_SCHEDULE_TIMEZONE
valueFrom:
configMapKeyRef:

View File

@ -10,5 +10,7 @@ spec:
protocol: TCP
port: 80
targetPort: 3000
nodePort: 30004
type: NodePort
selector:
app: ml-pipeline-ui

View File

@ -11,9 +11,9 @@ bases:
- ../base/pipeline/cluster-scoped
#- ../base/cache-deployer/cluster-scoped
#vars:
## NOTE: var name must be unique globally to allow composition of multiple kustomize
## packages. Therefore, we added prefix `kfp-cluster-scoped-` to distinguish it from
## others.
# NOTE: var name must be unique globally to allow composition of multiple kustomize
# packages. Therefore, we added prefix `kfp-cluster-scoped-` to distinguish it from
# others.
#- name: kfp-cluster-scoped-namespace
# objref:
# # cache deployer sa's metadata.namespace will be first transformed by namespace field in kustomization.yaml
@ -25,4 +25,3 @@ bases:
# fieldpath: metadata.namespace
configurations:
- params.yaml

View File

@ -0,0 +1,16 @@
# Strategic-merge patch applied to Deployments: constrain scheduling to nodes
# labeled kubeflow=true.  The name below is a placeholder; the actual targets
# are selected by the `target` field in kustomization.yaml.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: not-important
spec:
  template:
    spec:
      affinity:
        nodeAffinity:
          # Hard requirement: pods only schedule onto matching nodes.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubeflow
                    operator: In
                    values:
                      # Label values are strings; quoted so YAML does not
                      # parse this as a boolean.
                      - "true"

View File

@ -17,3 +17,132 @@ commonLabels:
# !!! If you want to customize the namespace,
# please also update base/cache-deployer/cluster-scoped/cache-deployer-clusterrolebinding.yaml
namespace: kubeflow
patches:
# Apply the node-affinity patch to every Deployment (pin pods to nodes
# labeled kubeflow=true).
- path: affinity_patch.yaml
  target:
    kind: Deployment
# Tell the Argo workflow-controller which namespace pipeline workflows run in.
- patch: |
    - op: add
      path: /spec/template/spec/containers/0/args/-
      value: "--managed-namespace"
    - op: add
      path: /spec/template/spec/containers/0/args/-
      value: "pipeline"
  target:
    kind: Deployment
    name: workflow-controller
# Override the mysql password.
# NOTE(review): plaintext credential committed to VCS -- consider sourcing it
# from a secret store instead.
- patch: |
    - op: replace
      path: /stringData/password
      value: "admin"
  target:
    kind: Secret
    name: mysql-secret
# Override the mysql service host (external mysql in the `infra` namespace).
- patch: |
    - op: replace
      path: /data/dbHost
      value: "mysql-service.infra"
  target:
    kind: ConfigMap
    name: pipeline-install-config
# Drop minio's volumeMount subPath; clusters deployed by Rancher do not
# support it.
- patch: |
    - op: remove
      path: /spec/template/spec/containers/0/volumeMounts/0/subPath
  target:
    kind: Deployment
    name: minio
# Add a selector to minio-pvc so it binds the pre-created minio PV.
- path: minio_pvc_selector_patch.yaml
  target:
    kind: PersistentVolumeClaim
    name: minio-pvc
# Replace metadata-writer's NAMESPACE_TO_WATCH env var with a fixed value
# (and drop the downward-API valueFrom it would otherwise use).
- patch: |
    - op: replace
      path: /spec/template/spec/containers/0/env/0/value
      value: "pipeline"
    - op: remove
      path: /spec/template/spec/containers/0/env/0/valueFrom
  target:
    kind: Deployment
    name: metadata-writer
# Replace ml-pipeline's POD_NAMESPACE env var the same way.
- patch: |
    - op: replace
      path: /spec/template/spec/containers/0/env/1/value
      value: "pipeline"
    - op: remove
      path: /spec/template/spec/containers/0/env/1/valueFrom
  target:
    kind: Deployment
    name: ml-pipeline
# Replace ml-pipeline-persistenceagent's NAMESPACE env var the same way.
- patch: |
    - op: replace
      path: /spec/template/spec/containers/0/env/0/value
      value: "pipeline"
    - op: remove
      path: /spec/template/spec/containers/0/env/0/valueFrom
  target:
    kind: Deployment
    name: ml-pipeline-persistenceagent
# Switch every Deployment's image pull policy to IfNotPresent (images are
# pre-loaded from the private registry).
- patch: |
    - op: replace
      path: /spec/template/spec/containers/0/imagePullPolicy
      value: "IfNotPresent"
  target:
    kind: Deployment
# Widen some service accounts' permissions: promote the listed RoleBindings
# to ClusterRoleBindings (the `|` in `name` is a kustomize regex alternation
# matching several resources).
- patch: |
    - op: replace
      path: /kind
      value: "ClusterRoleBinding"
    - op: replace
      path: /roleRef/kind
      value: "ClusterRole"
    - op: add
      path: /subjects/0/namespace
      value: "kubeflow"
  target:
    kind: RoleBinding
    name: ml-pipeline-persistenceagent-binding|argo-binding|kubeflow-pipelines-metadata-writer-binding|ml-pipeline|ml-pipeline-scheduledworkflow-binding|ml-pipeline-ui|ml-pipeline-viewer-crd-service-account-binding
# pipeline-runner's subject lives in the workflow namespace instead.
- patch: |
    - op: replace
      path: /kind
      value: "ClusterRoleBinding"
    - op: replace
      path: /roleRef/kind
      value: "ClusterRole"
    - op: add
      path: /subjects/0/namespace
      value: "pipeline"
  target:
    kind: RoleBinding
    name: pipeline-runner-binding
# Promote the matching Roles to ClusterRoles (clearing the now-invalid
# metadata.namespace).
- patch: |
    - op: replace
      path: /kind
      value: "ClusterRole"
    - op: replace
      path: /metadata/namespace
      value: ""
  target:
    kind: Role
    name: ml-pipeline-persistenceagent-role|argo-role|kubeflow-pipelines-metadata-writer-role|ml-pipeline|ml-pipeline-scheduledworkflow-role|ml-pipeline-ui|ml-pipeline-viewer-crd-service-account-role|pipeline-runner
# Remove the mysql / cache / cache-deployer deployments (disabled).
#- patch: |
#  - op: remove
#    path: /
#  target:
#    name: cache.*
#patchesStrategicMerge:
#- remove_patch.yaml

View File

@ -0,0 +1,8 @@
# Strategic-merge patch for the minio PVC: add a selector so the claim binds
# only the pre-created PV labeled kubeflow-pvname=kubeflow-minio-pv.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: minio-pvc
spec:
  selector:
    matchLabels:
      kubeflow-pvname: kubeflow-minio-pv

File diff suppressed because it is too large Load Diff

View File

@ -34,6 +34,9 @@ spec:
fieldRef:
fieldPath: metadata.namespace
resources:
limits:
cpu: 100m
memory: 30Mi
requests:
cpu: 100m
memory: 20Mi

View File

@ -34,3 +34,6 @@ data:
requests:
cpu: 0.01
memory: 32Mi
limits:
cpu: 0.5
memory: 512Mi

View File

@ -48,6 +48,9 @@ spec:
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 100m
memory: 256Mi
requests:
cpu: 100m
memory: 256Mi

View File

@ -29,6 +29,9 @@ spec:
ports:
- containerPort: 2345
resources:
limits:
cpu: "4"
memory: 4Gi
requests:
cpu: 500m
memory: 1Gi

View File

@ -15,7 +15,7 @@ spec:
labels:
app: mysql
spec:
serviceAccountName: mysql
# serviceAccountName: mysql
containers:
# https://dev.mysql.com/doc/refman/5.7/en/server-options.html#option_mysqld_ignore-db-dir
# Ext4, Btrfs etc. volumes root directories have a lost+found directory that should not be treated as a database.
@ -31,14 +31,14 @@ spec:
ports:
- containerPort: 3306
name: mysql
volumeMounts:
- mountPath: /var/lib/mysql
name: mysql-persistent-storage
# volumeMounts:
# - mountPath: /var/lib/mysql
# name: mysql-persistent-storage
resources:
requests:
cpu: 100m
memory: 800Mi
volumes:
- name: mysql-persistent-storage
persistentVolumeClaim:
claimName: mysql-pv-claim
# volumes:
# - name: mysql-persistent-storage
# persistentVolumeClaim:
# claimName: mysql-pv-claim

View File

@ -19,21 +19,31 @@ kubeflow每一部分相应的可以独立升级
```
1.6.0 版本替换为自己的mysql数据库未测试
```
注释pipeline/1.6.0/kustomize/cluster-scoped-resources/kustomization.yaml中的 cache部署 ,namespace的部署
注释pipeline/1.6.0/kustomize/base/installs/generic/kustomization.yaml中的cache部署
注释pipeline/1.6.0/kustomize/env/platform-agnostic/kustomization.yaml 中的 mysql部署
修改pipeline/1.6.0/kustomize/base/installs/generic/mysql-secret.yaml 中的mysql账号密码
修改pipeline/1.6.0/kustomize/base/installs/generic/pipeline-install-config.yaml 中mysql的地址信息
1.6.0 版本更改
为所有的Deployment打上节点亲和度补丁
修改流水线执行的命名空间
修改mysql服务host和密码为自己部署的
minio不要用subPath,rancher部署的集群不支持
minio-pvc 加一个selector
修改 metadata-writer 的 NAMESPACE_TO_WATCH 环境变量
修改 ml-pipeline 的 POD_NAMESPACE 环境变量
修改 ml-pipeline-persistenceagent 的 NAMESPACE 环境变量
所有镜像拉取策略改为IfNotPresent
修改某些跨命名空间的serviceaccount权限
不部署cache模块和mysql模块
部署1.6.0版本
cd pipeline/1.6.0/kustomize
kubectl apply -k cluster-scoped-resources/
kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s
kubectl apply -k env/platform-agnostic/
```
cd pipeline/1.6.0/kustomize
kustomize build cluster-scoped-resources/ | kubectl apply -f -
kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s
kustomize build env/platform-agnostic/ | kubectl apply -f -
kubectl wait applications/pipeline -n kubeflow --for condition=Ready --timeout=1800s
# 注意1.6.0版本初始化时如果检测到mysql里有mlpipeline库不会在里面建表。所以部署前保证mlpipeline库已经建好表
#或者没有mlpipeline库
# 注意需要kustomize版本大于v3.0.0安装可下载releaseshttps://github.com/kubernetes-sigs/kustomize/releases/tag/kustomize%2Fv4.3.0
#如果kubectl版本大于等于v1.22.1也可以直接用kubectl apply -k 安装。
1.0.4版本替换为自己的mysql
```