mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2025-02-17 14:40:28 +08:00
更新volcano模板
This commit is contained in:
parent
624cb8e53a
commit
08fe32c753
@ -1,7 +1,5 @@
|
||||
# volcanojob 模板
|
||||
镜像:ccr.ccs.tencentyun.com/cube-studio/volcano:20211001
|
||||
|
||||
挂载:kubernetes-config(configmap):/root/.kube
|
||||
镜像:ccr.ccs.tencentyun.com/cube-studio/volcano:20230601
|
||||
|
||||
环境变量:
|
||||
```bash
|
||||
@ -66,7 +64,7 @@ TASK_RESOURCE_GPU=0
|
||||
"require": 1,
|
||||
"choice": [],
|
||||
"range": "",
|
||||
"default": "ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7-python3.6",
|
||||
"default": "ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda11.8.0-cudnn8-python3.9",
|
||||
"placeholder": "",
|
||||
"describe": "worker镜像,直接运行你代码的环境镜像<a href='https://github.com/tencentmusic/cube-studio/tree/master/images'>基础镜像</a>",
|
||||
"editable": 1,
|
||||
|
@ -58,8 +58,13 @@ print(k8s_volume_mounts)
|
||||
|
||||
GPU_TYPE= os.getenv('KFJ_GPU_TYPE', 'NVIDIA')
|
||||
GPU_RESOURCE= os.getenv('KFJ_TASK_RESOURCE_GPU', '0')
|
||||
print(GPU_TYPE,GPU_RESOURCE)
|
||||
# print(GPU_TYPE,GPU_RESOURCE)
|
||||
gpu_num = GPU_RESOURCE.split(',')[0]
|
||||
if '(' in gpu_num:
|
||||
gpu_type = gpu_num[gpu_num.index('(')+1:gpu_num.index(')')]
|
||||
gpu_num = gpu_num[:gpu_num.index('(')]
|
||||
|
||||
KFJ_TASK_NODE_SELECTOR['gpu-type']=gpu_type
|
||||
|
||||
|
||||
def default_job_name():
|
||||
@ -67,8 +72,6 @@ def default_job_name():
|
||||
return name[0:54]
|
||||
|
||||
|
||||
|
||||
|
||||
import subprocess
|
||||
# @pysnooper.snoop()
|
||||
def run_shell(shell):
|
||||
@ -86,7 +89,7 @@ def run_shell(shell):
|
||||
if status == 0: # 判断子进程是否结束
|
||||
print('shell finish %s'%status,flush=True)
|
||||
break
|
||||
if status==-9 or status==-15 or status==143: # 外界触发kill
|
||||
if status==-9 or status==-15 or status==143 or status==127: # 外界触发kill
|
||||
# if status!=0: # 外界触发kill
|
||||
print('shell finish %s'%status,flush=True)
|
||||
break
|
||||
@ -167,11 +170,11 @@ def make_volcanojob(name,num_workers,image,working_dir,command):
|
||||
"spec": {
|
||||
"restartPolicy": "Never",
|
||||
"volumes": k8s_volumes,
|
||||
# "imagePullSecrets": [
|
||||
# {
|
||||
# "name": "hubsecret"
|
||||
# }
|
||||
# ],
|
||||
"imagePullSecrets": [
|
||||
{
|
||||
"name": "hubsecret"
|
||||
}
|
||||
],
|
||||
"affinity": {
|
||||
"nodeAffinity": {
|
||||
"requiredDuringSchedulingIgnoredDuringExecution": {
|
||||
@ -251,8 +254,9 @@ def make_volcanojob(name,num_workers,image,working_dir,command):
|
||||
|
||||
|
||||
if GPU_TYPE=='NVIDIA' and GPU_RESOURCE:
|
||||
task_spec['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = GPU_RESOURCE.split(',')[0]
|
||||
task_spec['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = GPU_RESOURCE.split(',')[0]
|
||||
|
||||
task_spec['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = gpu_num
|
||||
task_spec['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = gpu_num
|
||||
|
||||
worker_pod_spec = copy.deepcopy(task_spec)
|
||||
worker_pod_spec['replicas']=int(num_workers)-1 # 因为master是其中一个worker
|
||||
@ -278,8 +282,9 @@ def make_volcanojob(name,num_workers,image,working_dir,command):
|
||||
"schedulerName":"volcano",
|
||||
"cleanPodPolicy": "None",
|
||||
"plugins":{
|
||||
"env":[],
|
||||
"svc":[]
|
||||
"env": [],
|
||||
"svc": [],
|
||||
"ssh": []
|
||||
},
|
||||
"queue":"default",
|
||||
"tasks": [
|
||||
|
Loading…
Reference in New Issue
Block a user