From d232c2415e9ad8b9cf34a86a7215a19076857ba7 Mon Sep 17 00:00:00 2001 From: chendile Date: Sun, 3 Sep 2023 18:19:47 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0cube=20studio=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=8C=96=E9=83=A8=E7=BD=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- install/kubernetes/all_image.py | 91 ++++++++--------------- install/kubernetes/create_ns_secret.sh | 3 +- install/kubernetes/ns.yaml | 38 ---------- install/kubernetes/pull_image_kubeflow.sh | 60 ++++++++------- install/kubernetes/pv-pvc-automl.yaml | 45 ++++++++++- install/kubernetes/pv-pvc-infra.yaml | 18 +++-- install/kubernetes/pv-pvc-jupyter.yaml | 18 +++-- install/kubernetes/pv-pvc-kubeflow.yaml | 8 +- install/kubernetes/pv-pvc-pipeline.yaml | 29 +++++--- install/kubernetes/pv-pvc-service.yaml | 18 +++-- install/kubernetes/sa-rbac.yaml | 11 ++- install/kubernetes/start-mini.sh | 7 +- install/kubernetes/start.sh | 58 ++++----------- install/kubernetes/virtual.yaml | 61 ++++++++++----- 14 files changed, 221 insertions(+), 244 deletions(-) delete mode 100644 install/kubernetes/ns.yaml diff --git a/install/kubernetes/all_image.py b/install/kubernetes/all_image.py index 7d3836cf..b85d08a1 100644 --- a/install/kubernetes/all_image.py +++ b/install/kubernetes/all_image.py @@ -1,58 +1,52 @@ # 所需要的所有镜像 kubeflow = [ - 'mysql:5.7', # 数据库 - 'bitnami/redis:4.0.14', # 缓存 - 'alpine:3.10', - "busybox", - "ccr.ccs.tencentyun.com/cube-studio/kubeflow:training-operator", # 分布式训练 - 'ccr.ccs.tencentyun.com/cube-studio/spark-operator:v1beta2-1.3.7-3.1.1', # spark serverless + 'mysql:8.0.32', # 数据库 + 'bitnami/redis:6.2.12', # 缓存 + "busybox:1.36.0", + "kubeflow/training-operator:v1-8a066f9", # 分布式训练 ] kubernetes_dashboard = [ - # 'kubernetesui/dashboard:v2.6.1', # k8s dashboard - # 'kubernetesui/metrics-scraper:v1.0.8', # k8s dashboard 上的指标监控 - 'kubernetesui/dashboard:v2.2.0', # k8s dashboard - 'kubernetesui/metrics-scraper:v1.0.6', # k8s dashboard 
上的指标监控 + 'kubernetesui/dashboard:v2.6.1', # k8s dashboard + 'kubernetesui/metrics-scraper:v1.0.8', # k8s dashboard 上的指标监控 ] new_gpu = [ - 'nvidia/k8s-device-plugin:v0.7.1', # gpu k8s插件 - 'nvidia/dcgm-exporter:2.3.1-2.6.1-ubuntu20.04', # gpu监控 - 'tkestack/gpu-manager:1.0.3' + 'nvidia/k8s-device-plugin:v0.11.0-ubuntu20.04', # gpu k8s插件 + 'nvidia/dcgm-exporter:3.1.7-3.1.4-ubuntu20.04', # gpu监控 ] new_prometheus = [ - 'quay.io/prometheus/alertmanager:v0.15.0', # 报警 + "prom/prometheus:v2.27.1", # peomethues数据库 + 'prom/node-exporter:v1.5.0', # 机器指标 + 'quay.io/prometheus-operator/prometheus-config-reloader:v0.46.0', # prometheus配置翻译 - "quay.io/prometheus/prometheus:v2.27.1", # peomethues数据库 - 'quay.io/coreos/kube-state-metrics:v1.3.1', # 状态 指标 - 'quay.io/prometheus/node-exporter:v0.15.2', # 机器指标 - 'quay.io/coreos/kube-rbac-proxy:v0.3.1', # 指标 - 'quay.io/coreos/addon-resizer:1.0', # 指标 "quay.io/prometheus-operator/prometheus-operator:v0.46.0", # prometheus 部署工具 - "k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1", # peometheus指标翻译为自定义指标 + 'bitnami/kube-rbac-proxy:0.14.1', # 指标 + 'carlosedp/addon-resizer:v1.8.4', # 指标 + 'grafana/grafana:9.1.5', # 监控看板 + "ccr.ccs.tencentyun.com/cube-studio/prometheus-adapter:v0.9.1", # peometheus指标翻译为自定义指标 ] istio = [ - "istio/proxyv2:1.14.1", # ingressgateway - "istio/pilot:1.14.1" # 数据面 + "istio/proxyv2:1.15.0", # ingressgateway + "istio/pilot:1.15.0" # 数据面 ] volcano = [ - 'volcanosh/vc-controller-manager:v1.4.0', # 控制器 - 'volcanosh/vc-scheduler:v1.4.0', # 调度器 - 'volcanosh/vc-webhook-manager:v1.4.0' # 拦截器 + 'volcanosh/vc-controller-manager:v1.7.0', # 控制器 + 'volcanosh/vc-scheduler:v1.7.0', # 调度器 + 'volcanosh/vc-webhook-manager:v1.7.0' # 拦截器 ] nni = [ 'frameworkcontroller/frameworkcontroller' # 超参搜索 ] pipeline = [ - 'minio/minio', - 'quay.io/argoproj/argoexec:v3.4.3', - 'quay.io/argoproj/workflow-controller:latest', - 'quay.io/argoproj/workflow-controller:v3.4.3', - 'quay.io/argoproj/argocli:latest' + 
'minio/minio:RELEASE.2023-04-20T17-56-55Z', + 'argoproj/argoexec:v3.4.3', + 'argoproj/workflow-controller:v3.4.3', + 'argoproj/argocli:v3.4.3' ] cube_studio = [ # notebook基础镜像 @@ -96,19 +90,14 @@ cube_studio = [ 'ccr.ccs.tencentyun.com/cube-studio/onnxruntime:latest-cuda', # 任务模板的镜像 - "ubuntu:18.04", + "ubuntu:20.04", + 'python:3.9', "ccr.ccs.tencentyun.com/cube-studio/datax:latest", "ccr.ccs.tencentyun.com/cube-studio/volcano:20211001", "ccr.ccs.tencentyun.com/cube-studio/ray:gpu-20210601", "ccr.ccs.tencentyun.com/cube-studio/sklearn_estimator:v1", - "ccr.ccs.tencentyun.com/cube-studio/xgb_train_and_predict:v1", - "ccr.ccs.tencentyun.com/cube-studio/tf2.3_keras_train:latest", - "ccr.ccs.tencentyun.com/cube-studio/tf2.3_plain_train:latest", - "ccr.ccs.tencentyun.com/cube-studio/tf_distributed_train:latest", - "ccr.ccs.tencentyun.com/cube-studio/tf2.3_model_evaluation:latest", - "ccr.ccs.tencentyun.com/cube-studio/tf_distributed_eval:latest", - "ccr.ccs.tencentyun.com/cube-studio/tf_model_offline_predict:latest", - "ccr.ccs.tencentyun.com/cube-studio/pytorch_distributed_train_k8s:20201010", + "ccr.ccs.tencentyun.com/cube-studio/xgb:20230801", + "ccr.ccs.tencentyun.com/cube-studio/pytorch:20201010", "ccr.ccs.tencentyun.com/cube-studio/horovod:20210401", "ccr.ccs.tencentyun.com/cube-studio/video-audio:20210601", "ccr.ccs.tencentyun.com/cube-studio/video-audio:20210601", @@ -116,27 +105,11 @@ cube_studio = [ "ccr.ccs.tencentyun.com/cube-studio/kaldi_distributed_on_volcano:v2", "ccr.ccs.tencentyun.com/cube-studio/volcano:offline-predict-20220101", "ccr.ccs.tencentyun.com/cube-studio/object_detection_on_darknet:v1", - "ccr.ccs.tencentyun.com/cube-studio/deploy-service:20211001" + "ccr.ccs.tencentyun.com/cube-studio/deploy-service:20250501", # 用户可能使用的基础镜像 - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda11.0.3-cudnn8', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda11.0.3-cudnn8-python3.7', - 
'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda11.0.3-cudnn8-python3.8', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.2-cudnn7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.2-cudnn7-python3.7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.2-cudnn7-python3.8', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7-python3.6', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7-python3.7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7-python3.8', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.0-cudnn7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.0-cudnn7-python3.6', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.0-cudnn7-python3.7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.0-cudnn7-python3.8', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda9.1-cudnn7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda9.1-cudnn7-python3.6', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda9.1-cudnn7-python3.7', - 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda9.1-cudnn7-python3.8', + 'ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda11.8.0-cudnn8-python3.9', + ] # images = kubeflow + kubernetes_dashboard + new_gpu + new_prometheus + istio+ volcano + nni+ pipeline+cube_studio @@ -160,8 +133,10 @@ for image in images: # # 拉取公有镜像 image = image.replace('@sha256', '') + # print("docker pull %s && docker tag %s %s &" % (image_name,image_name,image)) print("docker pull %s &" % (image,)) + print('') print('wait') diff --git a/install/kubernetes/create_ns_secret.sh b/install/kubernetes/create_ns_secret.sh index fbef98f7..167faa32 100755 --- a/install/kubernetes/create_ns_secret.sh +++ b/install/kubernetes/create_ns_secret.sh @@ -1,10 +1,9 @@ -for namespace in 'infra' 'kubeflow' 'istio-system' 'pipeline' 'automl' 'jupyter' 'service' 'monitoring' 'logging' 'kube-system' +for namespace in 
'infra' 'kubeflow' 'istio-system' 'pipeline' 'automl' 'jupyter' 'service' 'monitoring' 'logging' 'kube-system' 'aihub' do kubectl create ns $namespace kubectl delete secret docker-registry hubsecret -n $namespace kubectl create secret docker-registry hubsecret --docker-server=https://index.docker.io/v1/ --docker-username=xxx --docker-password=xxxx -n $namespace -# kubectl create secret docker-registry oa-hubsecret --docker-server=docker.oa.com:8080 --docker-username=xxx --docker-password=xxxx -n $namespace kubectl label ns $namespace istio-injection=disabled --overwrite # kubectl label namespace $namespace istio-inhection=enabled --overwrite done diff --git a/install/kubernetes/ns.yaml b/install/kubernetes/ns.yaml deleted file mode 100644 index 30a6d2d9..00000000 --- a/install/kubernetes/ns.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: infra ---- -apiVersion: v1 -kind: Namespace -metadata: - name: pipeline ---- -apiVersion: v1 -kind: Namespace -metadata: - name: automl - ---- -apiVersion: v1 -kind: Namespace -metadata: - name: service - - ---- -apiVersion: v1 -kind: Namespace -metadata: - name: jupyter - ---- -apiVersion: v1 -kind: Namespace -metadata: - name: kubeflow ---- -apiVersion: v1 -kind: Namespace -metadata: - name: istio-system \ No newline at end of file diff --git a/install/kubernetes/pull_image_kubeflow.sh b/install/kubernetes/pull_image_kubeflow.sh index ebdb6ad3..13873a66 100644 --- a/install/kubernetes/pull_image_kubeflow.sh +++ b/install/kubernetes/pull_image_kubeflow.sh @@ -1,33 +1,31 @@ -docker pull ccr.ccs.tencentyun.com/cube-studio/bitnami-redis:4.0.14 && docker tag ccr.ccs.tencentyun.com/cube-studio/bitnami-redis:4.0.14 bitnami/redis:4.0.14 & -docker pull ccr.ccs.tencentyun.com/cube-studio/kubeflow:training-operator && docker tag ccr.ccs.tencentyun.com/cube-studio/kubeflow:training-operator ccr.ccs.tencentyun.com/cube-studio/kubeflow:training-operator & -docker pull 
ccr.ccs.tencentyun.com/cube-studio/istio-pilot:1.14.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/istio-pilot:1.14.1 istio/pilot:1.14.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-webhook-manager:v1.4.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-webhook-manager:v1.4.0 volcanosh/vc-webhook-manager:v1.4.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/nvidia-k8s-device-plugin:v0.7.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/nvidia-k8s-device-plugin:v0.7.1 nvidia/k8s-device-plugin:v0.7.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/nvidia-dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 && docker tag ccr.ccs.tencentyun.com/cube-studio/nvidia-dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 nvidia/dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-kube-state-metrics:v1.3.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-kube-state-metrics:v1.3.1 quay.io/coreos/kube-state-metrics:v1.3.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-node-exporter:v0.15.2 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-node-exporter:v0.15.2 quay.io/prometheus/node-exporter:v0.15.2 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-argoexec:v3.4.3 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-argoexec:v3.4.3 quay.io/argoproj/argoexec:v3.4.3 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-alertmanager:v0.15.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-alertmanager:v0.15.0 quay.io/prometheus/alertmanager:v0.15.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-workflow-controller:v3.4.3 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-workflow-controller:v3.4.3 quay.io/argoproj/workflow-controller:v3.4.3 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-addon-resizer:1.0 && docker tag 
ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-addon-resizer:1.0 quay.io/coreos/addon-resizer:1.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/spark-operator:v1beta2-1.3.7-3.1.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/spark-operator:v1beta2-1.3.7-3.1.1 ccr.ccs.tencentyun.com/cube-studio/spark-operator:v1beta2-1.3.7-3.1.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/kubernetesui-dashboard:v2.2.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/kubernetesui-dashboard:v2.2.0 kubernetesui/dashboard:v2.2.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-kube-rbac-proxy:v0.3.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-coreos-kube-rbac-proxy:v0.3.1 quay.io/coreos/kube-rbac-proxy:v0.3.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-operator-prometheus-config-reloader:v0.46.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-operator-prometheus-config-reloader:v0.46.0 quay.io/prometheus-operator/prometheus-config-reloader:v0.46.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/grafana-grafana:9.1.5 && docker tag ccr.ccs.tencentyun.com/cube-studio/grafana-grafana:9.1.5 grafana/grafana:9.1.5 & -docker pull ccr.ccs.tencentyun.com/cube-studio/alpine:3.10 && docker tag ccr.ccs.tencentyun.com/cube-studio/alpine:3.10 alpine:3.10 & -docker pull ccr.ccs.tencentyun.com/cube-studio/istio-proxyv2:1.14.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/istio-proxyv2:1.14.1 istio/proxyv2:1.14.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/k8s.gcr.io-prometheus-adapter-prometheus-adapter:v0.9.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/k8s.gcr.io-prometheus-adapter-prometheus-adapter:v0.9.1 k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-operator-prometheus-operator:v0.46.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-operator-prometheus-operator:v0.46.0 
quay.io/prometheus-operator/prometheus-operator:v0.46.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-workflow-controller:latest && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-workflow-controller:latest quay.io/argoproj/workflow-controller:latest & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-argocli:latest && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-argoproj-argocli:latest quay.io/argoproj/argocli:latest & -docker pull ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-scheduler:v1.4.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-scheduler:v1.4.0 volcanosh/vc-scheduler:v1.4.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-controller-manager:v1.4.0 && docker tag ccr.ccs.tencentyun.com/cube-studio/volcanosh-vc-controller-manager:v1.4.0 volcanosh/vc-controller-manager:v1.4.0 & -docker pull ccr.ccs.tencentyun.com/cube-studio/kubernetesui-metrics-scraper:v1.0.6 && docker tag ccr.ccs.tencentyun.com/cube-studio/kubernetesui-metrics-scraper:v1.0.6 kubernetesui/metrics-scraper:v1.0.6 & -docker pull ccr.ccs.tencentyun.com/cube-studio/minio-minio && docker tag ccr.ccs.tencentyun.com/cube-studio/minio-minio minio/minio & -docker pull ccr.ccs.tencentyun.com/cube-studio/mysql:5.7 && docker tag ccr.ccs.tencentyun.com/cube-studio/mysql:5.7 mysql:5.7 & -docker pull ccr.ccs.tencentyun.com/cube-studio/busybox && docker tag ccr.ccs.tencentyun.com/cube-studio/busybox busybox & -docker pull ccr.ccs.tencentyun.com/cube-studio/frameworkcontroller-frameworkcontroller && docker tag ccr.ccs.tencentyun.com/cube-studio/frameworkcontroller-frameworkcontroller frameworkcontroller/frameworkcontroller & -docker pull ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-prometheus:v2.27.1 && docker tag ccr.ccs.tencentyun.com/cube-studio/quay.io-prometheus-prometheus:v2.27.1 quay.io/prometheus/prometheus:v2.27.1 & +docker pull istio/proxyv2:1.15.0 & +docker pull 
ccr.ccs.tencentyun.com/tkeimages/gpu-manager:latest & +docker pull volcanosh/vc-webhook-manager:v1.7.0 & +docker pull busybox:1.36.0 & +docker pull postgres:11.5 & +docker pull prom/node-exporter:v1.5.0 & +docker pull volcanosh/vc-scheduler:v1.7.0 & +docker pull argoproj/argoexec:v3.4.3 & +docker pull ccr.ccs.tencentyun.com/cube-studio/spark-operator:1.3.7-3.1.1 & +docker pull prom/prometheus:v2.27.1 & +docker pull quay.io/prometheus-operator/prometheus-config-reloader:v0.46.0 & +docker pull frameworkcontroller/frameworkcontroller & +docker pull kubeflow/training-operator:v1-8a066f9 & +docker pull nvidia/k8s-device-plugin:v0.11.0-ubuntu20.04 & +docker pull argoproj/argocli:v3.4.3 & +docker pull quay.io/prometheus-operator/prometheus-operator:v0.46.0 & +docker pull volcanosh/vc-controller-manager:v1.7.0 & +docker pull nvidia/dcgm-exporter:3.1.7-3.1.4-ubuntu20.04 & +docker pull mysql:8.0.32 & +docker pull kubernetesui/metrics-scraper:v1.0.8 & +docker pull minio/minio:RELEASE.2023-04-20T17-56-55Z & +docker pull carlosedp/addon-resizer:v1.8.4 & +docker pull kubernetesui/dashboard:v2.6.1 & +docker pull istio/pilot:1.15.0 & +docker pull bitnami/redis:6.2.12 & +docker pull bitnami/kube-rbac-proxy:0.14.1 & +docker pull ccr.ccs.tencentyun.com/cube-studio/label-studio:1.7.3 & +docker pull argoproj/workflow-controller:v3.4.3 & +docker pull grafana/grafana:9.1.5ccr.ccs.tencentyun.com/cube-studio/prometheus-adapter:v0.9.1 & wait diff --git a/install/kubernetes/pv-pvc-automl.yaml b/install/kubernetes/pv-pvc-automl.yaml index d81bb25a..6e8fbe03 100755 --- a/install/kubernetes/pv-pvc-automl.yaml +++ b/install/kubernetes/pv-pvc-automl.yaml @@ -28,6 +28,45 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - automl-pvname: automl-kubeflow-user-workspace + storageClassName: "" + volumeName: automl-kubeflow-user-workspace +# selector: +# matchLabels: +# automl-pvname: automl-kubeflow-user-workspace + + + +# 模型归档 +--- +apiVersion: v1 +kind: PersistentVolume 
+metadata: + name: automl-kubeflow-archives + labels: + automl-pvname: automl-kubeflow-archives +spec: + capacity: + storage: 500Gi + accessModes: + - ReadWriteMany + hostPath: + path: /data/k8s/kubeflow/pipeline/archives + persistentVolumeReclaimPolicy: Retain +--- +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: kubeflow-archives + namespace: automl +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 500Gi + storageClassName: "" + volumeName: automl-kubeflow-archives +# selector: +# matchLabels: +# automl-pvname: automl-kubeflow-archives + diff --git a/install/kubernetes/pv-pvc-infra.yaml b/install/kubernetes/pv-pvc-infra.yaml index cf799fde..e894e1e9 100755 --- a/install/kubernetes/pv-pvc-infra.yaml +++ b/install/kubernetes/pv-pvc-infra.yaml @@ -26,10 +26,11 @@ spec: resources: requests: storage: 100Gi - selector: - matchLabels: - infra-pvname: infra-kubeflow-global-pv - +# selector: +# matchLabels: +# infra-pvname: infra-kubeflow-global-pv + storageClassName: "" + volumeName: infra-kubeflow-global-pv --- apiVersion: v1 @@ -58,7 +59,8 @@ spec: resources: requests: storage: 1Gi - selector: - matchLabels: - infra-pvname: infra-kubeflow - +# selector: +# matchLabels: +# infra-pvname: infra-kubeflow + storageClassName: "" + volumeName: infra-kubeflow diff --git a/install/kubernetes/pv-pvc-jupyter.yaml b/install/kubernetes/pv-pvc-jupyter.yaml index 70424acd..65960da5 100755 --- a/install/kubernetes/pv-pvc-jupyter.yaml +++ b/install/kubernetes/pv-pvc-jupyter.yaml @@ -27,10 +27,11 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - jupyter-pvname: jupyter-kubeflow-user-workspace - +# selector: +# matchLabels: +# jupyter-pvname: jupyter-kubeflow-user-workspace + storageClassName: "" + volumeName: jupyter-kubeflow-user-workspace # 模型归档 --- @@ -60,7 +61,8 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - jupyter-pvname: jupyter-kubeflow-archives - +# selector: +# matchLabels: +# 
jupyter-pvname: jupyter-kubeflow-archives + storageClassName: "" + volumeName: jupyter-kubeflow-archives diff --git a/install/kubernetes/pv-pvc-kubeflow.yaml b/install/kubernetes/pv-pvc-kubeflow.yaml index 66b0beee..b369699c 100755 --- a/install/kubernetes/pv-pvc-kubeflow.yaml +++ b/install/kubernetes/pv-pvc-kubeflow.yaml @@ -29,6 +29,8 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - kubeflow-pvname: kubeflow-kubeflow-user-workspace +# selector: +# matchLabels: +# kubeflow-pvname: kubeflow-kubeflow-user-workspace + storageClassName: "" + volumeName: kubeflow-kubeflow-user-workspace \ No newline at end of file diff --git a/install/kubernetes/pv-pvc-pipeline.yaml b/install/kubernetes/pv-pvc-pipeline.yaml index 349680b0..7307153d 100755 --- a/install/kubernetes/pv-pvc-pipeline.yaml +++ b/install/kubernetes/pv-pvc-pipeline.yaml @@ -26,10 +26,11 @@ spec: resources: requests: storage: 100Gi - selector: - matchLabels: - pipeline-pvname: pipeline-kubeflow-global-pv - +# selector: +# matchLabels: +# pipeline-pvname: pipeline-kubeflow-global-pv + storageClassName: "" + volumeName: pipeline-kubeflow-global-pv # 模型训练 --- @@ -40,7 +41,7 @@ metadata: labels: pipeline-pvname: pipeline-kubeflow-user-workspace spec: -# storageClassName: pipeline + capacity: storage: 500Gi accessModes: @@ -48,6 +49,7 @@ spec: hostPath: path: /data/k8s/kubeflow/pipeline/workspace persistentVolumeReclaimPolicy: Retain + --- kind: PersistentVolumeClaim apiVersion: v1 @@ -60,9 +62,11 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - pipeline-pvname: pipeline-kubeflow-user-workspace +# selector: +# matchLabels: +# pipeline-pvname: pipeline-kubeflow-user-workspace + storageClassName: "" + volumeName: pipeline-kubeflow-user-workspace # 模型部署 --- apiVersion: v1 @@ -108,9 +112,10 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - pipeline-pvname: pipeline-kubeflow-archives - +# selector: +# matchLabels: +# pipeline-pvname: 
pipeline-kubeflow-archives + storageClassName: "" + volumeName: pipeline-kubeflow-archives diff --git a/install/kubernetes/pv-pvc-service.yaml b/install/kubernetes/pv-pvc-service.yaml index 6d67a18b..d8c4ede3 100755 --- a/install/kubernetes/pv-pvc-service.yaml +++ b/install/kubernetes/pv-pvc-service.yaml @@ -27,10 +27,11 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - service-pvname: service-kubeflow-user-workspace - +# selector: +# matchLabels: +# service-pvname: service-kubeflow-user-workspace + storageClassName: "" + volumeName: service-kubeflow-user-workspace # 模型归档 --- @@ -60,8 +61,9 @@ spec: resources: requests: storage: 500Gi - selector: - matchLabels: - service-pvname: service-kubeflow-archives - +# selector: +# matchLabels: +# service-pvname: service-kubeflow-archives + storageClassName: "" + volumeName: service-kubeflow-archives diff --git a/install/kubernetes/sa-rbac.yaml b/install/kubernetes/sa-rbac.yaml index 452f03dd..cf983a23 100755 --- a/install/kubernetes/sa-rbac.yaml +++ b/install/kubernetes/sa-rbac.yaml @@ -5,10 +5,7 @@ metadata: name: kubeflow-clusterrole rules: - apiGroups: ["*"] - resources: ["pods","pods/log","services","endpoints","configmaps","nodes","deployments","mpijobs","tfjobs","pytorchjobs","frameworks"] - verbs: ["create", "delete", "deletecollection", "patch", "update", "get", "list", "watch"] -- apiGroups: ["*"] - resources: ["*"] + resources: ["pods","pods/exec","pods/log","services","endpoints","events","configmaps","nodes","deployments","mpijobs","tfjobs","pytorchjobs","frameworks","jobs","sparkapplications","mxjobs","paddlejobs","xgboostjobs"] verbs: ["create", "delete", "deletecollection", "patch", "update", "get", "list", "watch"] --- apiVersion: v1 @@ -18,10 +15,11 @@ metadata: namespace: pipeline --- -kind: ClusterRoleBinding +kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: name: kubeflow-pipeline + namespace: pipeline subjects: - kind: ServiceAccount name: kubeflow-pipeline @@ 
-39,10 +37,11 @@ metadata: name: nni namespace: automl --- -kind: ClusterRoleBinding +kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: name: automl-nni-clusterrolebinding + namespace: automl subjects: - kind: ServiceAccount name: nni diff --git a/install/kubernetes/start-mini.sh b/install/kubernetes/start-mini.sh index a4378c48..3c3930e7 100644 --- a/install/kubernetes/start-mini.sh +++ b/install/kubernetes/start-mini.sh @@ -9,7 +9,7 @@ kubectl label node $node train=true cpu=true notebook=true service=true org=publ sh create_ns_secret.sh kubectl apply -f sa-rbac.yaml # 部署dashboard -kubectl apply -f dashboard/v2.2.0-cluster.yaml +kubectl apply -f dashboard/v2.6.1-cluster.yaml # 部署mysql kubectl create -f mysql/pv-pvc-hostpath.yaml kubectl create -f mysql/service.yaml @@ -25,10 +25,7 @@ kubectl create -f redis/master.yaml # 部署istio kubectl apply -f istio/install-crd.yaml kubectl wait crd/envoyfilters.networking.istio.io --for condition=established --timeout=60s -kubectl apply -f istio/install.yaml -# k8s 1.21+ -# kubectl delete -f istio/install.yaml -# kubectl apply -f istio/install-1.15.0.yaml +kubectl apply -f istio/install-1.15.0.yaml kubectl wait crd/virtualservices.networking.istio.io --for condition=established --timeout=60s kubectl wait crd/gateways.networking.istio.io --for condition=established --timeout=60s diff --git a/install/kubernetes/start.sh b/install/kubernetes/start.sh index 1a995f5e..f2b54ff0 100644 --- a/install/kubernetes/start.sh +++ b/install/kubernetes/start.sh @@ -3,7 +3,7 @@ bash init_node.sh iptables -P FORWARD ACCEPT iptables -P INPUT ACCEPT iptables -P OUTPUT ACCEPT -mkdir -p ~/.kube && cp config ~/.kube/config && cp ~/.kube/config /etc/kubernetes/admin.conf +mkdir -p ~/.kube /etc/kubernetes/ && rm -rf ~/.kube/config /etc/kubernetes/admin.conf && cp config ~/.kube/config && cp ~/.kube/config /etc/kubernetes/admin.conf mkdir -p kubeconfig && echo "" > kubeconfig/dev-kubeconfig curl -LO 
https://dl.k8s.io/release/v1.24.0/bin/linux/amd64/kubectl && chmod +x kubectl && cp kubectl /usr/bin/ && mv kubectl /usr/local/bin/ node=`kubectl get node -o wide |grep $1 |awk '{print $1}'| head -n 1` @@ -13,20 +13,15 @@ kubectl label node $node train=true cpu=true notebook=true service=true org=publ sh create_ns_secret.sh kubectl apply -f sa-rbac.yaml # 部署dashboard -kubectl apply -f dashboard/v2.2.0-cluster.yaml -# 高版本k8s部署2.6.1版本 -#kubectl apply -f dashboard/v2.6.1-cluster.yaml +kubectl apply -f dashboard/v2.6.1-cluster.yaml # 部署mysql kubectl create -f mysql/pv-pvc-hostpath.yaml kubectl create -f mysql/service.yaml kubectl create -f mysql/configmap-mysql.yaml kubectl create -f mysql/deploy.yaml # 部署redis -kubectl create -f redis/pv-hostpath.yaml -kubectl create -f redis/configmap.yaml -kubectl create -f redis/service.yaml -# 如果自己需要使用pv来保存redis队列数据,可以修改master.yaml -kubectl create -f redis/master.yaml +kubectl delete -f redis/redis.yaml +kubectl create -f redis/redis.yaml # 部署prometheus cd prometheus @@ -36,18 +31,11 @@ kubectl apply -f ./operator/operator-crd.yml kubectl apply -f ./operator/operator-rbac.yml kubectl wait crd/podmonitors.monitoring.coreos.com --for condition=established --timeout=60s kubectl apply -f ./operator/operator-dp.yml -kubectl apply -f ./alertmanater/alertmanager-main-sa.yml -kubectl apply -f ./alertmanater/alertmanager-main-secret.yml -kubectl apply -f ./alertmanater/alertmanager-main-svc.yml -kubectl apply -f ./alertmanater/alertmanager-main.yml kubectl apply -f ./node-exporter/node-exporter-sa.yml kubectl apply -f ./node-exporter/node-exporter-rbac.yml kubectl apply -f ./node-exporter/node-exporter-svc.yml kubectl apply -f ./node-exporter/node-exporter-ds.yml -kubectl apply -f ./kube-state-metrics/kube-state-metrics-sa.yml -kubectl apply -f ./kube-state-metrics/kube-state-metrics-rbac.yml -kubectl apply -f ./kube-state-metrics/kube-state-metrics-svc.yml -kubectl apply -f ./kube-state-metrics/kube-state-metrics-dp.yml + kubectl apply -f 
./grafana/pv-pvc-hostpath.yml kubectl apply -f ./grafana/grafana-sa.yml kubectl apply -f ./grafana/grafana-source.yml @@ -71,6 +59,8 @@ kubectl delete -f ./prometheus/prometheus-main.yml sleep 5 kubectl apply -f ./prometheus/pv-pvc-hostpath.yaml kubectl apply -f ./prometheus/prometheus-main.yml +sleep 5 +# 部署sm kubectl apply -f ./servicemonitor/alertmanager-sm.yml kubectl apply -f ./servicemonitor/coredns-sm.yml kubectl apply -f ./servicemonitor/kube-apiserver-sm.yml @@ -82,6 +72,8 @@ kubectl apply -f ./servicemonitor/node-exporter-sm.yml kubectl apply -f ./servicemonitor/prometheus-operator-sm.yml kubectl apply -f ./servicemonitor/prometheus-sm.yml kubectl apply -f ./servicemonitor/pushgateway-sm.yml + +# 部署prometheus_adapter kubectl apply -f ./prometheus_adapter/metric_rule.yaml kubectl apply -f ./prometheus_adapter/prometheus_adapter.yaml cd ../ @@ -92,35 +84,16 @@ kubectl apply -f gpu/nvidia-device-plugin.yml kubectl apply -f gpu/dcgm-exporter.yaml kubectl apply -f gpu/dcgm-exporter-sm.yaml -# 部署frameworkcontroller nni超参搜索使用 -kubectl create serviceaccount frameworkcontroller --namespace kubeflow -kubectl create clusterrolebinding frameworkcontroller-kubeflow --clusterrole=cluster-admin --user=system:serviceaccount:kubeflow:frameworkcontroller -kubectl create -f frameworkcontroller/frameworkcontroller-with-default-config.yaml -sleep 5 -kubectl wait crd/frameworks.frameworkcontroller.microsoft.com --for condition=established --timeout=60s - -kubectl create serviceaccount frameworkbarrier --namespace pipeline -kubectl create serviceaccount frameworkbarrier --namespace automl -kubectl create serviceaccount frameworkbarrier --namespace kubeflow -kubectl create clusterrole frameworkbarrier --verb=get,list,watch --resource=frameworks -kubectl create clusterrolebinding frameworkbarrier-pipeline --clusterrole=frameworkbarrier --user=system:serviceaccount:pipeline:frameworkbarrier -kubectl create clusterrolebinding frameworkbarrier-automl --clusterrole=frameworkbarrier 
--user=system:serviceaccount:automl:frameworkbarrier -kubectl create clusterrolebinding frameworkbarrier-kubeflow --clusterrole=frameworkbarrier --user=system:serviceaccount:kubeflow:frameworkbarrier - # 部署volcano kubectl delete -f volcano/volcano-development.yaml -kubectl delete secret volcano-admission-secret -n kubeflow kubectl apply -f volcano/volcano-development.yaml kubectl wait crd/jobs.batch.volcano.sh --for condition=established --timeout=60s # 部署istio kubectl apply -f istio/install-crd.yaml kubectl wait crd/envoyfilters.networking.istio.io --for condition=established --timeout=60s -# 在k8s 1.21-部署 -kubectl apply -f istio/install.yaml -# 在k8s 1.21+部署 -# kubectl delete -f istio/install.yaml -# kubectl apply -f istio/install-1.15.0.yaml + +kubectl apply -f istio/install-1.15.0.yaml kubectl wait crd/virtualservices.networking.istio.io --for condition=established --timeout=60s kubectl wait crd/gateways.networking.istio.io --for condition=established --timeout=60s @@ -133,16 +106,11 @@ kubectl apply -f argo/minio-pv-pvc-hostpath.yaml kubectl apply -f argo/pipeline-runner-rolebinding.yaml kubectl apply -f argo/install-3.4.3-all.yaml -# 部署trainjob:tfjob/pytorchjob/mpijob/mxnetjob/xgboostjobs +# 部署trainjob:tfjob/pytorchjob/mpijob/mxnetjob/xgboostjobs/paddlepaddle kubectl apply -f kubeflow/sa-rbac.yaml + kubectl apply -k kubeflow/train-operator/manifests/overlays/standalone -# 部署sparkjob -kubectl apply -f spark/install.yaml - -# 部署paddlejob -kubectl apply -f paddle/crd.yaml -kubectl apply -f paddle/operator.yaml # 部署管理平台 kubectl delete configmap kubernetes-config -n infra diff --git a/install/kubernetes/virtual.yaml b/install/kubernetes/virtual.yaml index b4f9fd2b..271a9ed6 100644 --- a/install/kubernetes/virtual.yaml +++ b/install/kubernetes/virtual.yaml @@ -1,20 +1,3 @@ -## 通过vs代理访问其他的服务,包括pipline,argp,minio,grafana等 -#apiVersion: networking.istio.io/v1alpha3 -#kind: VirtualService -#metadata: -# name: infra-kubeflow-dashboard -# namespace: infra -#spec: -# 
gateways: -# - kubeflow/kubeflow-gateway -# hosts: -# - "*" # 管理平台的域名 kubeflow.local.com -# http: -# - route: -# - destination: -# host: kubeflow-dashboard.infra.svc.cluster.local -# port: -# number: 80 --- apiVersion: networking.istio.io/v1alpha3 @@ -42,6 +25,50 @@ spec: --- apiVersion: networking.istio.io/v1alpha3 kind: VirtualService +metadata: + name: kube-system-k8s-dashboard-user1 + namespace: kube-system +spec: + gateways: + - kubeflow/kubeflow-gateway + hosts: + - "*" # 配置自己管理的域名kubeflow.local.com + http: + - match: + - uri: + prefix: /k8s/dashboard/user1/ + rewrite: + uri: / + route: + - destination: + host: kubernetes-dashboard-user1.kube-system.svc.cluster.local + port: + number: 9090 +--- +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: kubeflow-labelstudio + namespace: kubeflow +spec: + gateways: + - kubeflow/kubeflow-gateway + hosts: + - "*" # 配置自己管理的域名 kubeflow.local.com + http: + - match: + - uri: + prefix: /labelstudio/ + rewrite: + uri: /labelstudio/ + route: + - destination: + host: labelstudio.kubeflow.svc.cluster.local + port: + number: 8080 +--- +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService metadata: name: monitoring-grafana namespace: monitoring