cube-studio/myapp/views/view_task.py
import random
from myapp.views.baseSQLA import MyappSQLAInterface as SQLAInterface
from flask_babel import gettext as __
from flask_babel import lazy_gettext as _
import uuid
import pysnooper
from myapp.models.model_job import Job_Template, Task, Pipeline
from flask_appbuilder.forms import GeneralModelConverter
from myapp.utils import core
from myapp import app, appbuilder, db
from wtforms.ext.sqlalchemy.fields import QuerySelectField
from jinja2 import Environment, BaseLoader, DebugUndefined
import os
from wtforms.validators import DataRequired, Length, Regexp
from myapp.exceptions import MyappException
from wtforms import BooleanField, IntegerField, StringField, SelectField, FloatField, DateField, DateTimeField, SelectMultipleField
from flask_appbuilder.fieldwidgets import BS3TextFieldWidget, BS3PasswordFieldWidget, DatePickerWidget, DateTimePickerWidget, Select2ManyWidget, Select2Widget
from myapp.forms import MyBS3TextAreaFieldWidget, MyLineSeparatedListField, MyJSONField, MyBS3TextFieldWidget
from flask_wtf.file import FileField
from .baseApi import (
MyappModelRestApi
)
import logging
from flask import (
flash,
g,
redirect
)
from .base import (
get_user_roles,
MyappModelView,
)
from myapp.views.base import CompactCRUDMixin
from flask_appbuilder import expose
import datetime, time, json
conf = app.config
class Task_ModelView_Base():
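    """Shared task view logic: form fields and validation, args merging, and single-task debug/run/clear/log operations against the pipeline's Kubernetes cluster."""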
label_title = _('任务')
datamodel = SQLAInterface(Task)
check_redirect_list_url = '/pipeline_modelview/edit/'
help_url = conf.get('HELP_URL', {}).get(datamodel.obj.__tablename__, '') if datamodel else ''
list_columns = ['name', 'label', 'pipeline', 'job_template', 'volume_mount', 'resource_memory', 'resource_cpu',
'resource_gpu','resource_rdma', 'timeout', 'retry', 'created_on', 'changed_on', 'monitoring', 'expand']
# list_columns = ['name','label','job_template_url','volume_mount','debug','run','clear','log']
cols_width = {
"name": {"type": "ellip2", "width": 250},
"label": {"type": "ellip2", "width": 200},
"pipeline": {"type": "ellip2", "width": 200},
"job_template": {"type": "ellip2", "width": 200},
"volume_mount": {"type": "ellip2", "width": 600},
"command": {"type": "ellip2", "width": 200},
"args": {"type": "ellip2", "width": 400},
"resource_memory": {"type": "ellip2", "width": 100},
"resource_cpu": {"type": "ellip2", "width": 100},
"resource_gpu": {"type": "ellip2", "width": 100},
"resource_rdma": {"type": "ellip2", "width": 100},
"timeout": {"type": "ellip2", "width": 100},
"retry": {"type": "ellip2", "width": 100},
"created_on": {"type": "ellip2", "width": 300},
"changed_on": {"type": "ellip2", "width": 300},
"monitoring": {"type": "ellip2", "width": 300},
"node_selector": {"type": "ellip2", "width": 200},
"expand": {"type": "ellip2", "width": 300},
}
show_columns = ['name', 'label', 'pipeline', 'job_template', 'volume_mount', 'command', 'overwrite_entrypoint',
'working_dir', 'args_html', 'resource_memory', 'resource_cpu', 'resource_gpu','resource_rdma', 'timeout', 'retry',
'skip', 'created_by', 'changed_by', 'created_on', 'changed_on', 'monitoring_html']
add_columns = ['job_template', 'name', 'label', 'pipeline', 'volume_mount', 'command', 'working_dir', 'skip']
edit_columns = ['name', 'label', 'volume_mount', 'command', 'working_dir', 'skip']
base_order = ('id', 'desc')
order_columns = ['id']
search_columns = ['pipeline', 'name', 'label']
conv = GeneralModelConverter(datamodel)
add_form_extra_fields = {
"args": StringField(
_('启动参数'),
widget=MyBS3TextAreaFieldWidget(rows=10),
),
"pipeline": QuerySelectField(
_('任务流'),
query_factory=lambda: db.session.query(Pipeline),
allow_blank=True,
widget=Select2Widget(extra_classes="readonly"),
),
"job_template": QuerySelectField(
_('任务模板'),
query_factory=lambda: db.session.query(Job_Template),
allow_blank=True,
widget=Select2Widget(),
),
"name": StringField(
label= _('名称'),
description= _('英文名(小写字母、数字、- 组成)最长50个字符'),
widget=BS3TextFieldWidget(),
validators=[Regexp("^[a-z][a-z0-9\-]*[a-z0-9]$"), Length(1, 54), DataRequired()]
),
"label": StringField(
label= _('标签'),
description= _('中文名'),
widget=BS3TextFieldWidget(),
validators=[DataRequired()]
),
"volume_mount": StringField(
label= _('挂载'),
description= _('外部挂载,格式:$pvc_name1(pvc):/$container_path1,$hostpath1(hostpath):/$container_path2,4G(memory):/dev/shm,注意pvc会自动挂载对应目录下的个人username子目录'),
widget=BS3TextFieldWidget(),
default='kubeflow-user-workspace(pvc):/mnt,kubeflow-archives(pvc):/archives'
),
"working_dir": StringField(
label= _('工作目录'),
description= _('工作目录容器启动的初始所在目录不填默认使用Dockerfile内定义的工作目录'),
widget=BS3TextFieldWidget()
),
"command": StringField(
label= _('启动命令'),
description= _('启动命令'),
widget=BS3TextFieldWidget()
),
"overwrite_entrypoint": BooleanField(
label= _('覆盖入口点'),
description= _('启动命令是否覆盖Dockerfile中ENTRYPOINT不覆盖则叠加。')
),
"node_selector": StringField(
label= _('机器选择'),
description= _('运行当前task所在的机器'),
widget=BS3TextFieldWidget(),
default=Task.node_selector.default.arg,
validators=[]
),
'resource_memory': StringField(
label= _('memory'),
default=Task.resource_memory.default.arg,
description= _('内存的资源使用限制示例1G10G 最大100G如需更多联系管理员'),
widget=BS3TextFieldWidget(),
validators=[DataRequired()]
),
'resource_cpu': StringField(
label= _('cpu'),
default=Task.resource_cpu.default.arg,
description= _('cpu的资源使用限制(单位核),示例 0.410最大50核如需更多联系管理员'),
widget=BS3TextFieldWidget(),
validators=[DataRequired()]
),
'timeout': IntegerField(
label= _('超时'),
default=Task.timeout.default.arg,
description= _('task运行时长限制为0表示不限制(单位s)'),
widget=BS3TextFieldWidget()
),
'retry': IntegerField(
label= _('重试'),
default=Task.retry.default.arg,
description= _('task重试次数'),
widget=BS3TextFieldWidget()
),
'outputs': StringField(
label= _('输出'),
default=Task.outputs.default.arg,
description= _('task输出文件支持容器目录文件和minio存储路径'),
widget=MyBS3TextAreaFieldWidget(rows=3)
),
}
add_form_extra_fields['resource_gpu'] = StringField('gpu', default='0', description= _('gpu的资源使用限制(单位卡),示例:12训练任务每个容器独占整卡。申请具体的卡型号可以类似 1(V100)'),widget=BS3TextFieldWidget())
add_form_extra_fields['resource_rdma'] = StringField('rdma', default='0', description= _('RDMA的资源使用限制示例 0110填写方式咨询管理员'), widget=BS3TextFieldWidget())
edit_form_extra_fields = add_form_extra_fields
    # Handle the form request
# @pysnooper.snoop(watch_explode=('form'))
def process_form(self, form, is_created):
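        """Remove the job_describe field so it is not processed with the rest of the form."""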
# from flask_appbuilder.forms import DynamicForm
if 'job_describe' in form._fields:
            del form._fields['job_describe']  # skip this field
    # Check edit permission: only the creator and admins may edit or delete
def check_edit_permission(self, item):
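        """Only admins, or the creator of the task's pipeline, may edit or delete the task."""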
user_roles = [role.name.lower() for role in list(get_user_roles())]
if "admin" in user_roles:
return True
if g.user and g.user.username and item.pipeline and hasattr(item.pipeline, 'created_by'):
if g.user.username == item.pipeline.created_by.username:
return True
flash('just creator can edit/delete ', 'warning')
return False
    # Validate the args field
# @pysnooper.snoop(watch_explode=('item'))
def task_args_check(self, item):
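        """Validate the task name, check that args is valid json matching the job template's arg spec, and sanity-check volume_mount."""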
core.validate_str(item.name, 'name')
core.validate_json(item.args)
task_args = json.loads(item.args)
job_args = json.loads(item.job_template.args)
item.args = json.dumps(core.validate_task_args(task_args, job_args), indent=4, ensure_ascii=False)
if item.volume_mount and ":" not in item.volume_mount:
raise MyappException('volume_mount is not valid, must contain : or null')
# @pysnooper.snoop(watch_explode=('item'))
def merge_args(self, item, action):
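        """Collect form attributes named "args.x.y" into the task's nested args json.

        On update, attributes left as None keep their previously saved value; on add they default to ''.
        """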
logging.info(item)
        # Merge dotted field names into a nested dict
# @pysnooper.snoop()
def nest_once(inp_dict):
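            # flatten dotted keys into nested dicts, e.g. {'args.a.b': 1} -> {'args': {'a': {'b': 1}}}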
out = {}
if isinstance(inp_dict, dict):
for key, val in inp_dict.items():
if '.' in key:
keys = key.split('.')
sub_dict = out
for sub_key_index in range(len(keys)):
sub_key = keys[sub_key_index]
                            # deeper dict levels remain below this key
if sub_key_index != len(keys) - 1:
if sub_key not in sub_dict:
sub_dict[sub_key] = {}
else:
sub_dict[sub_key] = val
sub_dict = sub_dict[sub_key]
else:
out[key] = val
return out
args_json_column = {}
        # Build the args dict from the form fields, using single-level "args.x.y" keys
for arg in item.__dict__:
if arg[:5] == 'args.':
task_attr_value = getattr(item, arg)
                # For add, or when the user made no change (e.g. an untouched file or empty input),
                # the backend keeps the stored value rather than overwriting it
                if task_attr_value == None and action == 'update':  # value is None on update: reuse the saved value
# logging.info(item.args)
                    src_attr = arg[5:].split('.')  # nested sub-attribute path
sub_src_attr = json.loads(item.args)
for sub_key in src_attr:
sub_src_attr = sub_src_attr[sub_key] if sub_key in sub_src_attr else ''
args_json_column[arg] = sub_src_attr
                elif task_attr_value == None and action == 'add':  # value is None on add: default to ''
args_json_column[arg] = ''
else:
args_json_column[arg] = task_attr_value
        # args assembled from individual "args.*" fields
if args_json_column:
            # Convert the single-level dotted keys into a nested json structure
des_merge_args = nest_once(args_json_column)
item.args = json.dumps(des_merge_args.get('args', {}))
        # args supplied directly as complete json; default to empty
elif not item.args:
item.args = '{}'
    # Add a node for the task on the pipeline web canvas
# @pysnooper.snoop()
def post_add(self, task):
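        """After the task is inserted, add a node for it (at a random position) to the pipeline's expand json so it appears on the canvas."""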
pipeline = task.pipeline
expand = json.loads(pipeline.expand) if pipeline.expand else []
for ui_node in expand:
            if str(ui_node.get('id', '')) == str(task.id):
return
expand.append(
{
"id": str(task.id),
"type": 'dataSet',
"position": {
"x": random.randint(50, 1000),
"y": random.randint(50, 600),
},
"data": {
"name": task.name,
"label": task.label
}
}
)
pipeline.expand = json.dumps(expand, ensure_ascii=False, indent=4)
db.session.commit()
pass
# @pysnooper.snoop(watch_explode=('item'))
def pre_add(self, item):
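        """Normalize a new task: sanitize the name, require a job template, merge project and template volume mounts, validate cpu/memory, merge and validate args, and point node_selector at gpu or cpu machines."""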
item.name = item.name.replace('_', '-')[0:54].lower()
if item.job_template is None:
raise MyappException(__("Job Template 为必选"))
        item.volume_mount = item.pipeline.project.volume_mount  # default to the project's mount configuration
if item.job_template.volume_mount and item.job_template.volume_mount not in item.volume_mount:
if item.volume_mount:
item.volume_mount += "," + item.job_template.volume_mount
else:
item.volume_mount = item.job_template.volume_mount
item.resource_memory = core.check_resource_memory(item.resource_memory)
item.resource_cpu = core.check_resource_cpu(item.resource_cpu)
self.merge_args(item, 'add')
self.task_args_check(item)
item.create_datetime = datetime.datetime.now()
item.change_datetime = datetime.datetime.now()
if int(core.get_gpu(item.resource_gpu)[0]):
item.node_selector = item.node_selector.replace('cpu=true', 'gpu=true')
else:
item.node_selector = item.node_selector.replace('gpu=true', 'cpu=true')
# @pysnooper.snoop(watch_explode=('item'))
def pre_update(self, item):
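        """Normalize an updated task: sanitize the name, merge volume mounts, validate resources and json fields, merge and validate args, adjust node_selector, and propagate a renamed task into the pipeline's dag_json and expand."""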
item.name = item.name.replace('_', '-')[0:54].lower()
if item.resource_gpu:
item.resource_gpu = str(item.resource_gpu).upper()
if item.job_template is None:
raise MyappException(__("Job Template 为必选"))
        # When the project group changes, merge the project group's volume mounts back in
all_project_volumes = []
if item.volume_mount:
all_project_volumes = [x.strip() for x in item.volume_mount.split(',') if x.strip()]
if item.job_template.volume_mount:
all_project_volumes += [x.strip() for x in item.job_template.volume_mount.split(',') if x.strip()]
for volume_mount in all_project_volumes:
if ":" in volume_mount:
volume, mount = volume_mount.split(":")[0], volume_mount.split(":")[1]
if mount not in item.volume_mount:
item.volume_mount = item.volume_mount.strip(',') + "," + volume_mount
        # If the edited value is invalid, fall back to the original value
if item.volume_mount and ':' not in item.volume_mount:
item.volume_mount = self.src_item_json.get('volume_mount', '')
        # Normalize the text
if item.volume_mount:
item.volume_mount = ','.join([x.strip() for x in item.volume_mount.split(',') if x.strip()])
if item.outputs:
core.validate_json(item.outputs)
item.outputs = json.dumps(json.loads(item.outputs), indent=4, ensure_ascii=False)
if item.expand:
core.validate_json(item.expand)
item.expand = json.dumps(json.loads(item.expand), indent=4, ensure_ascii=False)
item.resource_memory = core.check_resource_memory(item.resource_memory, self.src_item_json.get('resource_memory', None))
item.resource_cpu = core.check_resource_cpu(item.resource_cpu, self.src_item_json.get('resource_cpu', None))
# item.resource_memory=core.check_resource_memory(item.resource_memory,self.src_resource_memory)
# item.resource_cpu = core.check_resource_cpu(item.resource_cpu,self.src_resource_cpu)
self.merge_args(item, 'update')
self.task_args_check(item)
item.change_datetime = datetime.datetime.now()
if int(core.get_gpu(item.resource_gpu)[0]):
item.node_selector = item.node_selector.replace('cpu=true', 'gpu=true')
else:
item.node_selector = item.node_selector.replace('gpu=true', 'cpu=true')
        # If the task was renamed, rename it in the pipeline's dag_json and expand as well
src_task_name = self.src_item_json.get('name', item.name)
if item.name != src_task_name:
pipeline = item.pipeline
pipeline.dag_json = pipeline.dag_json.replace(f'"{src_task_name}"',f'"{item.name}"')
pipeline.expand = pipeline.expand.replace(f'"{src_task_name}"',f'"{item.name}"')
    # Pipeline info should be refreshed after adding or deleting a task, but doing so raises "is not bound to a Session" (kept commented out below)
# # @pysnooper.snoop()
# def post_add(self, item):
# item.pipeline.pipeline_file = dag_to_pipeline(item.pipeline, db.session)
# pipeline_argo_id, version_id = upload_pipeline(item.pipeline)
# if pipeline_argo_id:
# item.pipeline.pipeline_argo_id = pipeline_argo_id
# if version_id:
# item.pipeline.version_id = version_id
# db.session.commit()
# # @pysnooper.snoop(watch_explode=('item'))
# def post_update(self, item):
# # if type(item)==UnmarshalResult:
# # pass
# item.pipeline.pipeline_file = dag_to_pipeline(item.pipeline, db.session)
# pipeline_argo_id, version_id = upload_pipeline(item.pipeline)
# if pipeline_argo_id:
# item.pipeline.pipeline_argo_id = pipeline_argo_id
# if version_id:
# item.pipeline.version_id = version_id
# # db.session.update(item)
# db.session.commit()
    # Capture the pipeline before deletion, since it cannot be looked up afterwards
def pre_delete(self, item):
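        """Remember the pipeline for the post-delete redirect and clean up any pods the task started."""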
self.check_redirect_list_url = '/pipeline_modelview/edit/' + str(item.pipeline.id)
self.pipeline = item.pipeline
        # Delete all pod instances started by this task
self.delete_task_run(item)
widget_config = {
"int": MyBS3TextFieldWidget,
"float": MyBS3TextFieldWidget,
"bool": None,
"str": MyBS3TextFieldWidget,
"text": MyBS3TextAreaFieldWidget,
"json": MyBS3TextAreaFieldWidget,
"date": DatePickerWidget,
"datetime": DateTimePickerWidget,
"password": BS3PasswordFieldWidget,
"enum": Select2Widget,
"multiple": Select2ManyWidget,
"file": None,
"dict": None,
"list": None
}
    field_config = {
        "int": IntegerField,
        "float": FloatField,
        "bool": BooleanField,
        "str": StringField,
        "text": StringField,
        "json": MyJSONField,  # with a plain text field the backend would receive a doubly-encoded string
"date": DateField,
"datetime": DateTimeField,
"password": StringField,
"enum": SelectField,
"multiple": SelectMultipleField,
"file": FileField,
"dict": None,
"list": MyLineSeparatedListField
}
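    # Mapping presumably used to render job-template arg types into form fields,
    # e.g. "str" -> StringField + MyBS3TextFieldWidget, "json" -> MyJSONField + MyBS3TextAreaFieldWidget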
def run_pod(self, task, k8s_client, run_id, namespace, pod_name, image, working_dir, command, args):
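        """Create a one-off pod for the task: build the env (template env, KFJ_* system vars, pipeline global_env and platform GLOBAL_ENV), render jinja templates in command/args/working_dir, resolve resources, image pull secrets and host aliases, then call k8s_client.create_debug_pod."""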
        # Environment variables from the job template
task_env = task.job_template.env + "\n" if task.job_template.env else ''
HostNetwork = json.loads(task.job_template.expand).get("HostNetwork", False) if task.job_template.expand else False
# hostPort = 40000 + (task.id * 1000) % 10000
byte_string = run_id.encode('utf-8')
import hashlib
        # hash the run_id bytes
hash_object = hashlib.sha256(byte_string)
hash_value = int(hash_object.hexdigest(), 16)
        # map the hash into the host-port range
hostPort = 40000 + 10*(hash_value % 1000)
_, _, resource_name = core.get_gpu(task.resource_gpu)
        # System environment variables
task_env += 'KFJ_TASK_ID=' + str(task.id) + "\n"
task_env += 'KFJ_TASK_NAME=' + str(task.name) + "\n"
task_env += 'KFJ_TASK_NODE_SELECTOR=' + str(task.get_node_selector()) + "\n"
task_env += 'KFJ_TASK_VOLUME_MOUNT=' + str(task.volume_mount) + "\n"
task_env += 'KFJ_TASK_IMAGES=' + str(task.job_template.images) + "\n"
task_env += 'KFJ_TASK_RESOURCE_CPU=' + str(task.resource_cpu) + "\n"
task_env += 'KFJ_TASK_RESOURCE_MEMORY=' + str(task.resource_memory) + "\n"
task_env += 'KFJ_TASK_RESOURCE_GPU=' + str(task.resource_gpu.replace('+', '')) + "\n"
task_env += 'KFJ_TASK_PROJECT_NAME=' + str(task.pipeline.project.name) + "\n"
task_env += 'KFJ_PIPELINE_ID=' + str(task.pipeline_id) + "\n"
task_env += 'KFJ_RUN_ID=' + run_id + "\n"
task_env += 'KFJ_CREATOR=' + str(task.pipeline.created_by.username) + "\n"
task_env += 'KFJ_RUNNER=' + str(g.user.username) + "\n"
task_env += 'KFJ_PIPELINE_NAME=' + str(task.pipeline.name) + "\n"
task_env += 'KFJ_NAMESPACE=pipeline' + "\n"
task_env += f'GPU_RESOURCE_NAME={resource_name}' + "\n"
template_kwargs={}
def template_str(src_str):
rtemplate = Environment(loader=BaseLoader, undefined=DebugUndefined).from_string(src_str)
des_str = rtemplate.render(creator=task.pipeline.created_by.username,
datetime=datetime,
runner=g.user.username if g and g.user and g.user.username else task.pipeline.created_by.username,
uuid=uuid,
pipeline_id=task.pipeline.id,
pipeline_name=task.pipeline.name,
cluster_name=task.pipeline.project.cluster['NAME'],
**template_kwargs
)
return des_str
        # Pipeline-level global environment variables
        pipeline_global_env = template_str(task.pipeline.global_env.strip()) if task.pipeline.global_env else ''  # render first, otherwise values such as dates could become inconsistent
pipeline_global_env = [env.strip() for env in pipeline_global_env.split('\n') if '=' in env.strip()]
for env in pipeline_global_env:
key, value = env[:env.index('=')], env[env.index('=') + 1:]
if key not in task_env:
task_env += key + '=' + value + "\n"
        # Global environment variables may be referenced in the task's arguments
for global_env in pipeline_global_env:
key, value = global_env.split('=')[0], global_env.split('=')[1]
if key not in template_kwargs:
template_kwargs[key] = value
platform_global_envs = json.loads(template_str(json.dumps(conf.get('GLOBAL_ENV', {}), indent=4, ensure_ascii=False)))
for global_env_key in platform_global_envs:
if global_env_key not in task_env:
task_env += global_env_key + '=' + platform_global_envs[global_env_key] + "\n"
new_args = []
if args:
for arg in args:
new_args.append(template_str(arg))
if command:
command = json.loads(template_str(json.dumps(command)))
if working_dir:
working_dir = template_str(working_dir)
volume_mount = task.volume_mount
resource_cpu = task.job_template.get_env('TASK_RESOURCE_CPU') if task.job_template.get_env('TASK_RESOURCE_CPU') else task.resource_cpu
resource_gpu = task.job_template.get_env('TASK_RESOURCE_GPU') if task.job_template.get_env('TASK_RESOURCE_GPU') else task.resource_gpu
resource_memory = task.job_template.get_env('TASK_RESOURCE_MEMORY') if task.job_template.get_env('TASK_RESOURCE_MEMORY') else task.resource_memory
hostAliases=conf.get('HOSTALIASES')
if task.job_template.hostAliases:
hostAliases += "\n" + task.job_template.hostAliases
image_pull_secrets = conf.get('HUBSECRET', [])
from myapp.models.model_job import Repository
user_repositorys = db.session.query(Repository).filter(Repository.created_by_fk == g.user.id).all()
image_pull_secrets = list(set([task.job_template.images.repository.hubsecret]+image_pull_secrets + [rep.hubsecret for rep in user_repositorys]))
if image_pull_secrets:
task_env += 'HUBSECRET='+ ','.join(image_pull_secrets) + "\n"
print(resource_gpu)
k8s_client.create_debug_pod(namespace,
name=pod_name,
labels={"pipeline": task.pipeline.name, 'task': task.name, 'user': g.user.username, 'run-id': run_id, 'pod-type': "task"},
annotations={'project':task.pipeline.project.name},
command=command,
args=new_args,
volume_mount=volume_mount,
working_dir=working_dir,
node_selector=task.get_node_selector(),
resource_memory=resource_memory,
resource_cpu=resource_cpu,
resource_gpu=resource_gpu,
resource_rdma = '0',
image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
image_pull_secrets=image_pull_secrets,
image=image,
hostAliases=hostAliases,
env=task_env,
privileged=task.job_template.privileged,
accounts=task.job_template.accounts, username=task.pipeline.created_by.username,
hostPort=[hostPort+1,hostPort+2] if HostNetwork else []
)
# @event_logger.log_this
@expose("/debug/<task_id>", methods=["GET", "POST"])
def debug(self, task_id):
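        """Start (or reuse) a long-running debug pod for the task and redirect to its in-browser terminal; logical nodes cannot be debugged, and non-customize jobs only by an admin or the template creator."""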
task = db.session.query(Task).filter_by(id=task_id).first()
        # Logical nodes cannot be debugged
if task.job_template.name == conf.get('LOGICAL_JOB'):
message = __('当前任务类型不允许进行调试')
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
        # Apart from customize-job nodes, tasks cannot be debugged individually
if task.job_template.name != conf.get('CUSTOMIZE_JOB'):
            # The template creator (or an admin) may still debug the template
if not g.user.is_admin() and task.job_template.created_by.username != g.user.username:
message = __('仅管理员或当前任务模板创建者可启动debug模式')
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
# return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
from myapp.utils.py.py_k8s import K8s
k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
namespace = conf.get('PIPELINE_NAMESPACE')
pod_name = "debug-" + task.pipeline.name.replace('_', '-') + "-" + task.name.replace('_', '-')
pod_name = pod_name.lower()[:60].strip('-')
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
pod = pod[0]
        # A finished historical pod exists: delete it first
# if pod and (pod['status']!='Running' and pod['status']!='Pending'):
if pod and pod['status'] == 'Succeeded':
k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
time.sleep(2)
pod = None
        # No pod, or it is not running: create a new one
image = task.job_template.images.name
if json.loads(task.args).get('--work_images',''):
image = json.loads(task.args)['--work_images']
if json.loads(task.args).get('--work_image',''):
image = json.loads(task.args)['--work_image']
if json.loads(task.args).get('--images',''):
image = json.loads(task.args)['--images']
if json.loads(task.args).get('--image',''):
image = json.loads(task.args)['--image']
if json.loads(task.args).get('images',''):
image = json.loads(task.args)['images']
working_dir = None
if json.loads(task.args).get('workdir', ''):
working_dir = json.loads(task.args)['workdir']
if json.loads(task.args).get('--workdir', ''):
working_dir = json.loads(task.args)['--workdir']
if json.loads(task.args).get('--working_dir', ''):
working_dir = json.loads(task.args)['--working_dir']
if not pod or pod['status'] != 'Running':
run_id = "debug-" + str(uuid.uuid4().hex)
command=['sh','-c','sleep 7200 && hour=`date +%H` && while [ $hour -ge 06 ];do sleep 3600;hour=`date +%H`;done']
try:
self.run_pod(
task=task,
k8s_client=k8s_client,
run_id=run_id,
namespace=namespace,
pod_name=pod_name,
image=image,
working_dir=working_dir,
command=command,
args=None
)
except Exception as e:
return self.response(400, **{"status": 1, "result": {}, "message": str(e)})
try_num = 30
message = __('启动时间过长,一分钟后刷新此页面')
while (try_num > 0):
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
pod = pod[0]
                # Pod exists: stop waiting once it is running, otherwise collect its events
if pod:
if pod['status'] == 'Running':
break
else:
events = k8s_client.get_pod_event(namespace=namespace, pod_name=pod_name)
# try:
# message = '启动时间过长,一分钟后刷新此页面'+", status:"+pod['status']+", message:"+json.dumps(pod['status_more']['conditions'],indent=4,ensure_ascii=False)
# except Exception as e:
# print(e)
                        try:
                            # show the most recent pod event to the user
                            for event in events:
                                message = f'time: {event["time"]} \ntype: {event["type"]} \nreason: {event["reason"]} \nmessage: {event["message"]}'
                                message = message.replace('\n', '<br>')
except Exception as e:
print(e)
try_num = try_num - 1
time.sleep(2)
if try_num == 0:
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
# return redirect('/pipeline_modelview/web/%s'%str(task.pipeline.id))
return redirect("/k8s/web/debug/%s/%s/%s/%s" % (task.pipeline.project.cluster['NAME'], namespace, pod_name, pod_name))
@expose("/run/<task_id>", methods=["GET", "POST"])
# @pysnooper.snoop(watch_explode=('ops_args',))
def run_task(self, task_id):
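        """Run the task as a standalone pod: tear down any previous run, build command and args from the task's json args, create the pod, and redirect to its log page; logical nodes and tasks that reference upstream outputs are rejected."""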
task = db.session.query(Task).filter_by(id=task_id).first()
        # Logical nodes cannot be run as a single task
if task.job_template.name == conf.get('LOGICAL_JOB'):
message = __('当前任务类型不允许进行运行')
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
        # Tasks whose args reference upstream outputs cannot be run individually
import re
all_templates_vars = re.findall("(\{\{.*?\}\})",task.args)
for var in all_templates_vars:
if '.output' in var:
message = __('包含接收上游输出,不允许单任务运行')
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
from myapp.utils.py.py_k8s import K8s
k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
namespace = conf.get('PIPELINE_NAMESPACE')
pod_name = "run-" + task.pipeline.name.replace('_', '-') + "-" + task.name.replace('_', '-')
pod_name = pod_name.lower()[:60].strip('-')
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
pod = pod[0]
        # A historical pod exists: delete it and its workflow
if pod:
run_id = pod['labels'].get("run-id", '')
if run_id:
k8s_client.delete_workflow(all_crd_info=conf.get('CRD_INFO', {}), namespace=namespace, run_id=run_id)
k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
delete_time = datetime.datetime.now()
while pod:
time.sleep(2)
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
check_date = datetime.datetime.now()
if (check_date - delete_time).total_seconds() > 60:
message = __("超时,请稍后重试")
flash(message, category='warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
# return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
        # No pod remains: create a new one
if not pod:
command = None
if task.job_template.entrypoint and task.job_template.entrypoint.strip():
command = task.job_template.entrypoint.strip()
if task.command and task.command.strip():
command = task.command.strip()
if command:
command = command.split(" ")
command = [com for com in command if com]
ops_args = []
task_args = json.loads(task.args) if task.args else {}
for task_attr_name in task_args:
                # bool: only append the flag name when true
if type(task_args[task_attr_name]) == bool:
if task_args[task_attr_name]:
ops_args.append('%s' % str(task_attr_name))
                elif not task_args[task_attr_name]:  # empty value: skip the argument entirely
pass
                # dict/list: pass the serialized json directly
elif type(task_args[task_attr_name]) == dict or type(task_args[task_attr_name])==list:
ops_args.append('%s' % str(task_attr_name))
args_values = json.dumps(task_args[task_attr_name], ensure_ascii=False)
ops_args.append('%s' % args_values)
                # list values are fine passed as a comma-separated string
                # # alternative: pass list values as repeated arguments
# elif type(task_args[task_attr_name]) == list:
# for args_values in task_args[task_attr_name].split('\n'):
# ops_args.append('%s' % str(task_attr_name))
# # args_values = template_str(args_values) if re.match('\{\{.*\}\}',args_values) else args_values
# ops_args.append('%s' % args_values)
                elif task_attr_name not in ['images','workdir']:  # not passed as args; these override the template image/workdir instead
ops_args.append('%s' % str(task_attr_name))
                    ops_args.append('%s' % str(task_args[task_attr_name]))  # value types could be handled differently here, e.g. bools carry no value
# print(ops_args)
run_id = "run-" + str(task.pipeline.id) + "-" + str(task.id)
if task.job_template.name == conf.get('CUSTOMIZE_JOB'):
command = ['bash','-c',json.loads(task.args)['command']]
if task.job_template.name == conf.get('PYTHON_JOB'):
command = ['python', '-c', json.loads(task.args)['code']]
can_customize_args = [conf.get('CUSTOMIZE_JOB'),conf.get('PYTHON_JOB')]
args=None if task.job_template.name in can_customize_args else ops_args
try:
self.run_pod(
task=task,
k8s_client=k8s_client,
run_id=run_id,
namespace=namespace,
pod_name=pod_name,
image=json.loads(task.args).get('images',task.job_template.images.name),
working_dir=json.loads(task.args).get('workdir',task.job_template.workdir),
command=command,
args=args
)
except Exception as e:
return self.response(400, **{"status": 1, "result": {}, "message": str(e)})
try_num = 5
while (try_num > 0):
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
break
try_num = try_num - 1
time.sleep(2)
if try_num == 0:
message = __('启动时间过长,一分钟后重试')
flash(message, 'warning')
return self.response(400, **{"status": 1, "result": {}, "message": message})
# return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
return redirect("/k8s/web/log/%s/%s/%s" % (task.pipeline.project.cluster['NAME'], namespace, pod_name))
def delete_task_run(self, task):
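        """Delete the task's run and debug pods along with their workflows and related pods/services."""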
from myapp.utils.py.py_k8s import K8s
k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
namespace = conf.get('PIPELINE_NAMESPACE')
        # Delete the run-time container
pod_name = "run-" + task.pipeline.name.replace('_', '-') + "-" + task.name.replace('_', '-')
pod_name = pod_name.lower()[:60].strip('-')
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
pod = pod[0]
        # A historical run pod exists: delete it, its workflow, and related pods/services
if pod:
k8s_client.delete_pods(namespace=namespace, pod_name=pod['name'])
run_id = pod['labels'].get('run-id', '')
if run_id:
k8s_client.delete_workflow(all_crd_info=conf.get("CRD_INFO", {}), namespace=namespace, run_id=run_id)
k8s_client.delete_pods(namespace=namespace, labels={"run-id": run_id})
k8s_client.delete_service(namespace=namespace, labels={"run-id": run_id})
time.sleep(2)
        # Delete the debug container
pod_name = "debug-" + task.pipeline.name.replace('_', '-') + "-" + task.name.replace('_', '-')
pod_name = pod_name.lower()[:60].strip('-')
pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
# print(pod)
if pod:
pod = pod[0]
        # A historical debug pod exists: delete it and its workflow
if pod:
k8s_client.delete_pods(namespace=namespace, pod_name=pod['name'])
run_id = pod['labels'].get('run-id', '')
if run_id:
k8s_client.delete_workflow(all_crd_info=conf.get("CRD_INFO", {}), namespace=namespace, run_id=run_id)
k8s_client.delete_pods(namespace=namespace, labels={"run-id": run_id})
time.sleep(2)
@expose("/clear/<task_id>", methods=["GET", "POST"])
def clear_task(self, task_id):
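        """Remove all pods started by this task, then return to the pipeline editor."""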
task = db.session.query(Task).filter_by(id=task_id).first()
self.delete_task_run(task)
# flash(__("删除完成"), category='success')
# self.update_redirect()
return redirect('/pipeline_modelview/api/web/%s' % str(task.pipeline.id))
@expose("/log/<task_id>", methods=["GET", "POST"])
def log_task(self, task_id):
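        """Redirect to the log page of the task's running pod, or back to the pipeline editor if nothing is running."""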
task = db.session.query(Task).filter_by(id=task_id).first()
from myapp.utils.py.py_k8s import K8s
k8s = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
namespace = conf.get('PIPELINE_NAMESPACE')
running_pod_name = "run-" + task.pipeline.name.replace('_', '-') + "-" + task.name.replace('_', '-')
pod_name = running_pod_name.lower()[:60].strip('-')
pod = k8s.get_pods(namespace=namespace, pod_name=pod_name)
if pod:
pod = pod[0]
return redirect("/k8s/web/log/%s/%s/%s" % (task.pipeline.project.cluster['NAME'], namespace, pod_name))
flash(__("未检测到当前task正在运行的容器"), category='success')
return redirect('/pipeline_modelview/api/web/%s' % str(task.pipeline.id))
#
# class Task_ModelView(Task_ModelView_Base, CompactCRUDMixin, MyappModelView):
# datamodel = SQLAInterface(Task)
#
#
# appbuilder.add_view_no_menu(Task_ModelView)
class Task_ModelView(Task_ModelView_Base, MyappModelRestApi):
datamodel = SQLAInterface(Task)
route_base = '/task_modelview'
appbuilder.add_api(Task_ModelView)
# # Add the API
class Task_ModelView_Api(Task_ModelView_Base, MyappModelRestApi):
datamodel = SQLAInterface(Task)
route_base = '/task_modelview/api'
# list_columns = ['name','label','job_template_url','volume_mount','debug']
list_columns = ['name', 'label', 'pipeline', 'job_template', 'volume_mount', 'node_selector', 'command',
'overwrite_entrypoint', 'working_dir', 'args', 'resource_memory', 'resource_cpu', 'resource_gpu', 'resource_rdma',
'timeout', 'retry', 'created_by', 'changed_by', 'created_on', 'changed_on', 'monitoring', 'expand']
add_columns = ['name', 'label', 'job_template', 'pipeline', 'working_dir', 'command', 'args', 'volume_mount',
'node_selector', 'resource_memory', 'resource_cpu', 'resource_gpu', 'resource_rdma', 'timeout', 'retry', 'skip',
'expand']
edit_columns = ['name', 'label', 'working_dir', 'command', 'args', 'volume_mount', 'resource_memory',
'resource_cpu', 'resource_gpu', 'resource_rdma', 'timeout', 'retry', 'skip', 'expand']
show_columns = ['name', 'label', 'pipeline', 'job_template', 'volume_mount', 'node_selector', 'command',
'overwrite_entrypoint', 'working_dir', 'args', 'resource_memory', 'resource_cpu', 'resource_gpu', 'resource_rdma',
'timeout', 'retry', 'skip', 'created_by', 'changed_by', 'created_on', 'changed_on', 'monitoring',
'expand']
appbuilder.add_api(Task_ModelView_Api)