mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2025-02-05 14:19:59 +08:00
delete katib
This commit is contained in:
parent
9576ed1801
commit
ee91eccf3f
@ -10,7 +10,7 @@ cube-studio is a one-stop cloud-native machine learning platform open sourced by
|
||||
- 1、data management: feature store, online and offline features; dataset management, structured data and media data, data labeling platform
|
||||
- 2、develop: notebook(vscode/jupyter); docker image management; image build online
|
||||
- 3、train: pipeline drag and drop online; open template market; distributed computing/training tasks, example tf/pytorch/mxnet/spark/ray/horovod/kaldi/volcano; batch priority scheduling; resource monitoring/alarm/balancing; cron scheduling
|
||||
- 4、automl: nni, katib, ray
|
||||
- 4、automl: nni, ray
|
||||
- 5、inference: model manager; serverless traffic control; tf/pytorch/onnx/tensorrt model deploy, tfserving/torchserve/onnxruntime/triton inference; VGPU; load balancing, high availability, elastic scaling
|
||||
- 6、infra: multi-user; multi-project; multi-cluster; edge cluster mode; blockchain sharing;
|
||||
|
||||
|
@ -9,7 +9,7 @@ cube是 腾讯音乐 开源的一站式云原生机器学习平台,目前主
|
||||
- 1、数据管理:特征平台,支持在/离线特征;数据源管理,支持结构数据和媒体标注数据管理;
|
||||
- 2、在线开发:在线的vscode/jupyter代码开发;在线镜像调试,支持免dockerfile,增量构建;
|
||||
- 3、训练编排:任务流编排,在线拖拉拽;开放的模板市场,支持tf/pytorch/mxnet/spark/ray/horovod/kaldi/volcano等分布式计算/训练任务;task的单节点debug,分布式任务的批量优先级调度,聚合日志;任务运行资源监控,报警;定时调度,支持补录,忽略,重试,依赖,并发限制,定时任务算力的智能修正;
|
||||
- 4、超参搜索:nni,katib,ray的超参搜索;
|
||||
- 4、超参搜索:nni,ray的超参搜索;
|
||||
- 5、推理服务:tf/pytorch/onnx模型的推理服务,serverless流量管控,triton gpu推理加速,依据gpu利用率/qps等指标的hpa能力,虚拟化gpu,虚拟显存等服务化能力;
|
||||
- 6、资源统筹:多集群多项目组资源统筹,联邦调度,边缘计算;
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
完整的平台包含
|
||||
- 1、机器的标准化
|
||||
- 2、分布式存储(单机可忽略)、k8s集群、监控体系(prometheus/efk/zipkin)
|
||||
- 3、基础能力(tf/pytorch/mxnet/valcano/ray等分布式,nni/katib超参搜索)
|
||||
- 3、基础能力(tf/pytorch/mxnet/volcano/ray等分布式,nni/ray超参搜索)
|
||||
- 4、平台web部分(oa/权限/项目组、在线构建镜像、在线开发、pipeline拖拉拽、超参搜索、推理服务管理等)
|
||||
|
||||
# 组件说明
|
||||
|
@ -744,8 +744,8 @@ ADMIN_USER='admin'
|
||||
PIPELINE_NAMESPACE = 'pipeline'
|
||||
# 服务pipeline运行的空间,必填service
|
||||
SERVICE_PIPELINE_NAMESPACE='service'
|
||||
# 超参搜索命名空间,必填katib
|
||||
AUTOML_NAMESPACE = 'katib'
|
||||
# 超参搜索命名空间,必填automl
|
||||
AUTOML_NAMESPACE = 'automl'
|
||||
# notebook必填空间,必填jupyter
|
||||
NOTEBOOK_NAMESPACE = 'jupyter'
|
||||
# 内部服务命名空间,必填service
|
||||
@ -753,12 +753,12 @@ SERVICE_NAMESPACE = 'service'
|
||||
# 服务链路追踪地址
|
||||
SERVICE_PIPELINE_ZIPKIN='http://xx.xx.xx.xx:9401'
|
||||
SERVICE_PIPELINE_JAEGER='tracing.service'
|
||||
# katib任务默认镜像
|
||||
KATIB_JOB_DEFAULT_IMAGE='ccr.ccs.tencentyun.com/cube-studio/katib'
|
||||
# katib的tfjob任务默认镜像
|
||||
KATIB_TFJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0'
|
||||
# katib的pytorchjob任务默认镜像
|
||||
KATIB_PYTORCHJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0'
|
||||
# automl任务默认镜像
|
||||
AUTOML_JOB_DEFAULT_IMAGE='ccr.ccs.tencentyun.com/cube-studio/automl'
|
||||
# automl的tfjob任务默认镜像
|
||||
AUTOML_TFJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0'
|
||||
# automl的pytorchjob任务默认镜像
|
||||
AUTOML_PYTORCHJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0'
|
||||
# 拉取私有仓库镜像默认携带的k8s hubsecret名称
|
||||
HUBSECRET = ['hubsecret']
|
||||
# 私有仓库的组织名,用户在线构建的镜像自动推送这个组织下面
|
||||
@ -805,7 +805,6 @@ K8S_DASHBOARD_CLUSTER = '/k8s/dashboard/cluster/' #
|
||||
K8S_DASHBOARD_PIPELINE = '/k8s/dashboard/pipeline/'
|
||||
|
||||
PIPELINE_URL = '/pipeline/#/'
|
||||
KATIB_URL = '/katib/#'
|
||||
|
||||
# 这两部分功能需要泛化域名。没有泛化域名此部分功能受限。ISTIO_INGRESS_DOMAIN为泛域名后缀
|
||||
ISTIO_INGRESS_DOMAIN = os.getenv('ISTIO_INGRESS_DOMAIN','local.com') # 泛化域名,尾缀,可以和HOST不一致,没有泛化域名对应的功能没法使用
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
for namespace in 'infra' 'kubeflow' 'istio-system' 'pipeline' 'katib' 'jupyter' 'service' 'monitoring' 'logging' 'kube-system'
|
||||
for namespace in 'infra' 'kubeflow' 'istio-system' 'pipeline' 'automl' 'jupyter' 'service' 'monitoring' 'logging' 'kube-system'
|
||||
do
|
||||
kubectl create ns $namespace
|
||||
kubectl delete secret docker-registry hubsecret -n $namespace
|
||||
@ -9,7 +9,6 @@ do
|
||||
# kubectl label namespace $namespace istio-injection=enabled --overwrite
|
||||
done
|
||||
|
||||
kubectl label ns katib katib-metricscollector-injection=enabled --overwrite
|
||||
kubectl label ns service istio-injection-
|
||||
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
# 通过vs代理访问其他的服务,包括katib、pipline,argp,minio,grafana等
|
||||
# 通过vs代理访问其他的服务,包括pipeline,argo,minio,grafana等
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
|
@ -567,7 +567,12 @@ class CeleryConfig(object):
|
||||
'task': 'task.adjust_node_resource', # 定时在多项目组间进行资源均衡
|
||||
# 'schedule': 10.0,
|
||||
'schedule': crontab(minute='*/10'),
|
||||
}
|
||||
},
|
||||
'task_update_aihub': {
|
||||
'task': 'task.update_aihub', # 更新aihub
|
||||
# 'schedule': 10.0,
|
||||
'schedule': crontab(minute='30', hour='4'),
|
||||
},
|
||||
}
|
||||
|
||||
# 帮助文档地址,显示在web导航栏
|
||||
@ -739,8 +744,8 @@ ADMIN_USER='admin'
|
||||
PIPELINE_NAMESPACE = 'pipeline'
|
||||
# 服务pipeline运行的空间,必填service
|
||||
SERVICE_PIPELINE_NAMESPACE='service'
|
||||
# 超参搜索命名空间,必填katib
|
||||
AUTOML_NAMESPACE = 'katib'
|
||||
# 超参搜索命名空间,必填automl
|
||||
AUTOML_NAMESPACE = 'automl'
|
||||
# notebook必填空间,必填jupyter
|
||||
NOTEBOOK_NAMESPACE = 'jupyter'
|
||||
# 内部服务命名空间,必填service
|
||||
@ -748,12 +753,12 @@ SERVICE_NAMESPACE = 'service'
|
||||
# 服务链路追踪地址
|
||||
SERVICE_PIPELINE_ZIPKIN='http://xx.xx.xx.xx:9401'
|
||||
SERVICE_PIPELINE_JAEGER='tracing.service'
|
||||
# katib任务默认镜像
|
||||
KATIB_JOB_DEFAULT_IMAGE='ccr.ccs.tencentyun.com/cube-studio/katib'
|
||||
# katib的tfjob任务默认镜像
|
||||
KATIB_TFJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0'
|
||||
# katib的pytorchjob任务默认镜像
|
||||
KATIB_PYTORCHJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0'
|
||||
# automl任务默认镜像
|
||||
AUTOML_JOB_DEFAULT_IMAGE='ccr.ccs.tencentyun.com/cube-studio/automl'
|
||||
# automl的tfjob任务默认镜像
|
||||
AUTOML_TFJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0'
|
||||
# automl的pytorchjob任务默认镜像
|
||||
AUTOML_PYTORCHJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0'
|
||||
# 拉取私有仓库镜像默认携带的k8s hubsecret名称
|
||||
HUBSECRET = ['hubsecret']
|
||||
# 私有仓库的组织名,用户在线构建的镜像自动推送这个组织下面
|
||||
@ -800,7 +805,6 @@ K8S_DASHBOARD_CLUSTER = '/k8s/dashboard/cluster/' #
|
||||
K8S_DASHBOARD_PIPELINE = '/k8s/dashboard/pipeline/'
|
||||
|
||||
PIPELINE_URL = '/pipeline/#/'
|
||||
KATIB_URL = '/katib/#'
|
||||
|
||||
# 这两部分功能需要泛化域名。没有泛化域名此部分功能受限。ISTIO_INGRESS_DOMAIN为泛域名后缀
|
||||
ISTIO_INGRESS_DOMAIN = os.getenv('ISTIO_INGRESS_DOMAIN','local.com') # 泛化域名,尾缀,可以和HOST不一致,没有泛化域名对应的功能没法使用
|
||||
|
@ -117,8 +117,8 @@ subjects:
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: kubernetes-dashboard-katib
|
||||
namespace: katib # 生效空间
|
||||
name: kubernetes-dashboard-automl
|
||||
namespace: automl # 生效空间
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
|
@ -2,11 +2,11 @@
|
||||
```bash
|
||||
kubectl create serviceaccount frameworkcontroller --namespace service
|
||||
kubectl create serviceaccount frameworkcontroller --namespace pipeline
|
||||
kubectl create serviceaccount frameworkcontroller --namespace katib
|
||||
kubectl create serviceaccount frameworkcontroller --namespace automl
|
||||
kubectl create serviceaccount frameworkcontroller --namespace kubeflow
|
||||
kubectl create clusterrolebinding frameworkcontroller-service --clusterrole=cluster-admin --user=system:serviceaccount:service:frameworkcontroller
|
||||
kubectl create clusterrolebinding frameworkcontroller-pipeline --clusterrole=cluster-admin --user=system:serviceaccount:pipeline:frameworkcontroller
|
||||
kubectl create clusterrolebinding frameworkcontroller-katib --clusterrole=cluster-admin --user=system:serviceaccount:katib:frameworkcontroller
|
||||
kubectl create clusterrolebinding frameworkcontroller-automl --clusterrole=cluster-admin --user=system:serviceaccount:automl:frameworkcontroller
|
||||
kubectl create clusterrolebinding frameworkcontroller-kubeflow --clusterrole=cluster-admin --user=system:serviceaccount:kubeflow:frameworkcontroller
|
||||
|
||||
```
|
||||
@ -18,12 +18,12 @@ kubectl create -f frameworkcontroller-with-default-config.yaml
|
||||
|
||||
kubectl create serviceaccount frameworkbarrier --namespace service
|
||||
kubectl create serviceaccount frameworkbarrier --namespace pipeline
|
||||
kubectl create serviceaccount frameworkbarrier --namespace katib
|
||||
kubectl create serviceaccount frameworkbarrier --namespace automl
|
||||
kubectl create serviceaccount frameworkbarrier --namespace kubeflow
|
||||
kubectl create clusterrole frameworkbarrier --verb=get,list,watch --resource=frameworks
|
||||
kubectl create clusterrolebinding frameworkbarrier-service --clusterrole=frameworkbarrier --user=system:serviceaccount:service:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-pipeline --clusterrole=frameworkbarrier --user=system:serviceaccount:pipeline:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-katib --clusterrole=frameworkbarrier --user=system:serviceaccount:katib:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-automl --clusterrole=frameworkbarrier --user=system:serviceaccount:automl:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-kubeflow --clusterrole=frameworkbarrier --user=system:serviceaccount:kubeflow:frameworkbarrier
|
||||
|
||||
```
|
||||
|
@ -59,10 +59,6 @@ spec:
|
||||
rules:
|
||||
- http:
|
||||
paths:
|
||||
- path: /katib/
|
||||
backend:
|
||||
serviceName: katib-ui
|
||||
servicePort: 80
|
||||
- path: /minio/
|
||||
backend:
|
||||
serviceName: minio-service
|
||||
|
@ -5,9 +5,9 @@
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: katib-kubeflow-user-workspace
|
||||
name: automl-kubeflow-user-workspace
|
||||
labels:
|
||||
katib-pvname: katib-kubeflow-user-workspace
|
||||
automl-pvname: automl-kubeflow-user-workspace
|
||||
spec:
|
||||
capacity:
|
||||
storage: 500Gi
|
||||
@ -16,7 +16,7 @@ spec:
|
||||
persistentVolumeReclaimPolicy: Retain
|
||||
csi:
|
||||
driver: csi.juicefs.com
|
||||
volumeHandle: katib-kubeflow-user-workspace
|
||||
volumeHandle: automl-kubeflow-user-workspace
|
||||
fsType: juicefs
|
||||
nodePublishSecretRef:
|
||||
name: juicefs-sc-secret
|
||||
@ -33,7 +33,7 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: kubeflow-user-workspace
|
||||
namespace: katib
|
||||
namespace: automl
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
@ -42,4 +42,4 @@ spec:
|
||||
storage: 500Gi
|
||||
selector:
|
||||
matchLabels:
|
||||
katib-pvname: katib-kubeflow-user-workspace
|
||||
automl-pvname: automl-kubeflow-user-workspace
|
||||
|
@ -11,7 +11,7 @@ metadata:
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: katib
|
||||
name: automl
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
|
@ -84,7 +84,7 @@
|
||||
"targets": [
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "container_spec_memory_limit_bytes{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}/1000/1000/1000",
|
||||
"expr": "container_spec_memory_limit_bytes{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|automl\"}/1000/1000/1000",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
@ -94,7 +94,7 @@
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "container_memory_working_set_bytes{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}/1000/1000/1000",
|
||||
"expr": "container_memory_working_set_bytes{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|automl\"}/1000/1000/1000",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -105,7 +105,7 @@
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}[2m]))",
|
||||
"expr": "sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|automl\"}[2m]))",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -115,7 +115,7 @@
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "container_spec_cpu_quota{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}/100000",
|
||||
"expr": "container_spec_cpu_quota{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|automl\"}/100000",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
|
@ -5,9 +5,9 @@
|
||||
apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: katib-kubeflow-user-workspace
|
||||
name: automl-kubeflow-user-workspace
|
||||
labels:
|
||||
katib-pvname: katib-kubeflow-user-workspace
|
||||
automl-pvname: automl-kubeflow-user-workspace
|
||||
spec:
|
||||
capacity:
|
||||
storage: 500Gi
|
||||
@ -21,7 +21,7 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: kubeflow-user-workspace
|
||||
namespace: katib
|
||||
namespace: automl
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
@ -30,4 +30,4 @@ spec:
|
||||
storage: 500Gi
|
||||
selector:
|
||||
matchLabels:
|
||||
katib-pvname: katib-kubeflow-user-workspace
|
||||
automl-pvname: automl-kubeflow-user-workspace
|
@ -134,8 +134,8 @@ kubectl create configmap kubernetes-config --from-file=kubeconfig -n infra
|
||||
kubectl delete configmap kubernetes-config -n pipeline
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n pipeline
|
||||
|
||||
kubectl delete configmap kubernetes-config -n katib
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n katib
|
||||
kubectl delete configmap kubernetes-config -n automl
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n automl
|
||||
```
|
||||
|
||||
|
||||
@ -165,7 +165,7 @@ kubectl apply -k cube/overlays
|
||||
```bash
|
||||
kubectl create -f pv-pvc-infra.yaml
|
||||
kubectl create -f pv-pvc-jupyter.yaml
|
||||
kubectl create -f pv-pvc-katib.yaml
|
||||
kubectl create -f pv-pvc-automl.yaml
|
||||
# kubectl create -f pv-pvc-kubeflow.yaml
|
||||
kubectl create -f pv-pvc-pipeline.yaml
|
||||
kubectl create -f pv-pvc-service.yaml
|
||||
|
@ -37,16 +37,16 @@ apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: nni
|
||||
namespace: katib
|
||||
namespace: automl
|
||||
---
|
||||
kind: ClusterRoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: katib-nni-clusterrolebinding
|
||||
name: automl-nni-clusterrolebinding
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: nni
|
||||
namespace: katib
|
||||
namespace: automl
|
||||
roleRef:
|
||||
kind: ClusterRole
|
||||
name: kubeflow-clusterrole
|
||||
|
@ -96,11 +96,11 @@ sleep 5
|
||||
kubectl wait crd/frameworks.frameworkcontroller.microsoft.com --for condition=established --timeout=60s
|
||||
|
||||
kubectl create serviceaccount frameworkbarrier --namespace pipeline
|
||||
kubectl create serviceaccount frameworkbarrier --namespace katib
|
||||
kubectl create serviceaccount frameworkbarrier --namespace automl
|
||||
kubectl create serviceaccount frameworkbarrier --namespace kubeflow
|
||||
kubectl create clusterrole frameworkbarrier --verb=get,list,watch --resource=frameworks
|
||||
kubectl create clusterrolebinding frameworkbarrier-pipeline --clusterrole=frameworkbarrier --user=system:serviceaccount:pipeline:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-katib --clusterrole=frameworkbarrier --user=system:serviceaccount:katib:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-automl --clusterrole=frameworkbarrier --user=system:serviceaccount:automl:frameworkbarrier
|
||||
kubectl create clusterrolebinding frameworkbarrier-kubeflow --clusterrole=frameworkbarrier --user=system:serviceaccount:kubeflow:frameworkbarrier
|
||||
|
||||
# 部署volcano
|
||||
@ -147,12 +147,12 @@ kubectl create configmap kubernetes-config --from-file=kubeconfig -n infra
|
||||
kubectl delete configmap kubernetes-config -n pipeline
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n pipeline
|
||||
|
||||
kubectl delete configmap kubernetes-config -n katib
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n katib
|
||||
kubectl delete configmap kubernetes-config -n automl
|
||||
kubectl create configmap kubernetes-config --from-file=kubeconfig -n automl
|
||||
|
||||
kubectl create -f pv-pvc-infra.yaml
|
||||
kubectl create -f pv-pvc-jupyter.yaml
|
||||
kubectl create -f pv-pvc-katib.yaml
|
||||
kubectl create -f pv-pvc-automl.yaml
|
||||
kubectl create -f pv-pvc-pipeline.yaml
|
||||
kubectl create -f pv-pvc-service.yaml
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
## 通过vs代理访问其他的服务,包括katib、pipline,argp,minio,grafana等
|
||||
## 通过vs代理访问其他的服务,包括pipeline,argo,minio,grafana等
|
||||
#apiVersion: networking.istio.io/v1alpha3
|
||||
#kind: VirtualService
|
||||
#metadata:
|
||||
@ -16,7 +16,7 @@
|
||||
# port:
|
||||
# number: 80
|
||||
#---
|
||||
# 通过vs代理访问其他的服务,包括katib、pipline,argp,minio,grafana等
|
||||
# 通过vs代理访问其他的服务,包括pipeline,argo,minio,grafana等
|
||||
apiVersion: networking.istio.io/v1alpha3
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
@ -47,24 +47,6 @@ spec:
|
||||
allowHeaders:
|
||||
- "*"
|
||||
|
||||
- match:
|
||||
- uri:
|
||||
prefix: /katib/
|
||||
rewrite:
|
||||
uri: /katib/
|
||||
route:
|
||||
- destination:
|
||||
host: katib-ui.kubeflow.svc.cluster.local
|
||||
port:
|
||||
number: 80
|
||||
corsPolicy:
|
||||
allowOrigin:
|
||||
- "*"
|
||||
allowMethods:
|
||||
- POST
|
||||
- GET
|
||||
allowHeaders:
|
||||
- "*"
|
||||
- match:
|
||||
- uri:
|
||||
prefix: /pipeline/
|
||||
|
@ -41,7 +41,7 @@ class NNI(Model,AuditMixinNullable,MyappModelBase):
|
||||
"Project", foreign_keys=[project_id]
|
||||
)
|
||||
name = Column(String(200), unique = True, nullable=False)
|
||||
namespace = Column(String(200), nullable=True,default='katib')
|
||||
namespace = Column(String(200), nullable=True,default='automl')
|
||||
describe = Column(Text)
|
||||
parallel_trial_count = Column(Integer,default=3)
|
||||
maxExecDuration = Column(Integer,default=3600)
|
||||
|
@ -1048,7 +1048,7 @@ def watch_gpu(task):
|
||||
cluster = clusters[cluster_name]
|
||||
k8s_client = K8s(cluster.get('KUBECONFIG',''))
|
||||
|
||||
all_gpu_pods=k8s_client.get_uesd_gpu(namespaces=['pipeline','katib','jupyter','service'])
|
||||
all_gpu_pods=k8s_client.get_uesd_gpu(namespaces=['pipeline','automl','jupyter','service'])
|
||||
|
||||
print(all_gpu_pods)
|
||||
message = ''
|
||||
@ -1089,7 +1089,7 @@ def adjust_node_resource(task):
|
||||
all_node_json[ip]['used_gpu'] = []
|
||||
|
||||
# print(all_node_json)
|
||||
for namespace in ['jupyter', 'pipeline', 'katib', 'service']:
|
||||
for namespace in ['jupyter', 'pipeline', 'automl', 'service']:
|
||||
all_pods = k8s_client.get_pods(namespace=namespace)
|
||||
for pod in all_pods:
|
||||
if pod['host_ip'] not in all_node_json:
|
||||
|
@ -20,7 +20,6 @@ from myapp.models.model_job import (
|
||||
from myapp.utils.celery import session_scope
|
||||
from myapp.project import push_admin,push_message
|
||||
from myapp.models.model_job import Pipeline,Workflow
|
||||
from myapp.models.model_katib import Hyperparameter_Tuning,Experiments
|
||||
import pymysql
|
||||
conf=app.config
|
||||
|
||||
|
@ -20,7 +20,6 @@ from myapp.models.model_job import (
|
||||
from myapp.utils.celery import session_scope
|
||||
from myapp.project import push_admin,push_message
|
||||
from myapp.models.model_job import Pipeline,Workflow
|
||||
from myapp.models.model_katib import Hyperparameter_Tuning,Experiments
|
||||
import pymysql
|
||||
conf=app.config
|
||||
|
||||
|
@ -763,7 +763,7 @@ class Myapp(BaseMyappView):
|
||||
all_node_json[ip]['user'] = []
|
||||
|
||||
# print(all_node_json)
|
||||
for namespace in ['jupyter', 'pipeline', 'katib', 'service']:
|
||||
for namespace in ['jupyter', 'pipeline', 'automl', 'service']:
|
||||
all_pods = k8s_client.get_pods(namespace=namespace)
|
||||
for pod in all_pods:
|
||||
if pod['status'] == 'Running' and pod['host_ip'] in all_node_json:
|
||||
@ -906,7 +906,7 @@ class Myapp(BaseMyappView):
|
||||
# 获取pod的资源占用
|
||||
all_tasks_json[cluster_name]={}
|
||||
# print(all_node_json)
|
||||
for namespace in ['pipeline', 'katib', 'service']:
|
||||
for namespace in ['pipeline', 'automl', 'service']:
|
||||
all_tasks_json[cluster_name][namespace]={}
|
||||
all_pods = k8s_client.get_pods(namespace=namespace)
|
||||
for pod in all_pods:
|
||||
|
@ -351,7 +351,7 @@ class NNI_ModelView_Base():
|
||||
|
||||
from myapp.utils.py.py_k8s import K8s
|
||||
k8s_client = K8s(nni.project.cluster.get('KUBECONFIG',''))
|
||||
namespace = conf.get('AUTOML_NAMESPACE','katib')
|
||||
namespace = conf.get('AUTOML_NAMESPACE','automl')
|
||||
run_id='nni-'+nni.name
|
||||
|
||||
try:
|
||||
@ -546,7 +546,7 @@ tuner:
|
||||
trial:
|
||||
codeDir: {nni.working_dir}
|
||||
frameworkcontrollerConfig:
|
||||
namespace: {conf.get('AUTOML_NAMESPACE','katib')}
|
||||
namespace: {conf.get('AUTOML_NAMESPACE','automl')}
|
||||
storage: pvc
|
||||
configPath: /mnt/{nni.created_by.username}/nni/{nni.name}/trial_template.yaml
|
||||
pvc:
|
||||
|
Loading…
Reference in New Issue
Block a user