diff --git a/install/kubernetes/start.sh b/install/kubernetes/start.sh index ace32acc..d3c12221 100644 --- a/install/kubernetes/start.sh +++ b/install/kubernetes/start.sh @@ -91,7 +91,7 @@ kubectl apply -f gpu/nvidia-device-plugin.yml kubectl apply -f gpu/dcgm-exporter.yaml kubectl apply -f gpu/dcgm-exporter-sm.yaml -# 部署frameworkcontroller +# 部署frameworkcontroller nni超参搜索使用 kubectl create serviceaccount frameworkcontroller --namespace kubeflow kubectl create clusterrolebinding frameworkcontroller-kubeflow --clusterrole=cluster-admin --user=system:serviceaccount:kubeflow:frameworkcontroller kubectl create -f frameworkcontroller/frameworkcontroller-with-default-config.yaml @@ -129,6 +129,11 @@ kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=6 kubectl apply -k env/platform-agnostic cd ../../../../ +# 部署trainjob:tfjob/pytorchjob/mpijob/mxnetjob/xgboostjobs +kubectl apply -k kubeflow/train-operator/manifests/overlays/standalone +# 部署sparkjob +kubectl apply -f spark/install.yaml + # 部署管理平台 kubectl delete configmap kubernetes-config -n infra diff --git a/job-template/job/pytorch_distributed_train_k8s/Dockerfile b/job-template/job/pytorch_distributed_train_k8s/Dockerfile index 58096d9c..d2821bd5 100644 --- a/job-template/job/pytorch_distributed_train_k8s/Dockerfile +++ b/job-template/job/pytorch_distributed_train_k8s/Dockerfile @@ -33,6 +33,6 @@ COPY job/pkgs /app/job/pkgs WORKDIR /app ENV PYTHONPATH=/app:$PYTHONPATH -ENTRYPOINT ["python3", "pytorchjob_launcher.py"] +ENTRYPOINT ["python3", "launcher.py"]