cube-studio/myapp/tasks/async_task.py

115 lines
6.1 KiB
Python

"""Utility functions used across Myapp"""
import pysnooper
from datetime import datetime
import time
from myapp.utils.py.py_k8s import K8s
from myapp.utils.celery import session_scope
from myapp.project import push_message,push_admin
from myapp.tasks.celery_app import celery_app
# Myapp framework imports
from myapp import app
from myapp.models.model_serving import InferenceService
from myapp.views.view_inferenceserving import InferenceService_ModelView_base
from myapp.models.model_docker import Docker
conf = app.config
@celery_app.task(name="task.check_docker_commit", bind=True) # , soft_time_limit=15
def check_docker_commit(task,docker_id): # 在页面中测试时会自定接收者和id
with session_scope(nullpool=True) as dbsession:
try:
docker = dbsession.query(Docker).filter_by(id=int(docker_id)).first()
pod_name = "docker-commit-%s-%s" % (docker.created_by.username, str(docker.id))
namespace = conf.get('NOTEBOOK_NAMESPACE')
k8s_client = K8s(conf.get('CLUSTERS').get(conf.get('ENVIRONMENT')).get('KUBECONFIG',''))
begin_time=datetime.datetime.now()
now_time=datetime.datetime.now()
while((now_time-begin_time).seconds<1800): # 也就是最多commit push 30分钟
time.sleep(12000)
commit_pods = k8s_client.get_pods(namespace=namespace,pod_name=pod_name)
if commit_pods:
commit_pod=commit_pods[0]
if commit_pod['status']=='Succeeded':
docker.last_image=docker.target_image
dbsession.commit()
break
# 其他异常状态直接报警
if commit_pod['status']!='Running':
push_message(conf.get('ADMIN_USER').split(','),'commit pod %s not running'%commit_pod['name'])
break
else:
break
except Exception as e:
print(e)
@celery_app.task(name="task.upgrade_service", bind=True) # , soft_time_limit=15
@pysnooper.snoop()
def upgrade_service(task,service_id,name,namespace):
# 将旧的在线版本进行下掉,前提是新的服务必须已经就绪
time.sleep(10)
with session_scope(nullpool=True) as dbsession:
try:
service = dbsession.query(InferenceService).filter_by(id=int(service_id)).first()
message = '%s 准备进行服务迭代 %s %s'%(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),service.model_name,service.model_version)
push_admin(message)
push_message([service.created_by.username],message)
k8s_client = K8s(service.project.cluster.get('KUBECONFIG',''))
begin_time = time.time()
while (True):
try:
deployment = k8s_client.AppsV1Api.read_namespaced_deployment(name=name, namespace=namespace)
if deployment:
ready_replicas = deployment.status.ready_replicas
replicas = deployment.status.replicas
message = '%s 服务 %s %s ready副本数:%s 目标副本数:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name,service.model_version, ready_replicas, replicas)
push_admin(message)
push_message([service.created_by.username], message)
# 如果新的dp副本数已全部就绪
if ready_replicas == replicas:
break
else:
message = '%s 没有发现 %s %s 的 deployment' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name,service.model_version)
push_admin(message)
push_message([service.created_by.username], message)
return
except Exception as e:
print(e)
if time.time() - begin_time > 600:
message = '%s 新版本运行状态检查超时,请手动检查和清理旧版本%s %s' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name, service.model_version)
push_admin(message)
push_message([service.created_by.username], message)
return
time.sleep(60)
old_services = dbsession.query(InferenceService)\
.filter(InferenceService.model_status=='online')\
.filter(InferenceService.model_name==service.model_name)\
.filter(InferenceService.name!=service.name)\
.filter(InferenceService.host==service.host).all()
if old_services:
for old_service in old_services:
if old_service.name != service.name:
inference_model_view = InferenceService_ModelView_base()
inference_model_view.delete_old_service(old_service.name, old_service.project.cluster)
old_service.model_status = 'offline'
old_service.deploy_history = service.deploy_history + "\n" + "clear: %s %s" % ('admin', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
dbsession.commit()
message = '%s 新版本服务升级完成,下线旧服务 %s %s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),service.model_name, old_service.model_version)
push_admin(message)
push_message([service.created_by.username],message)
else:
message = '%s %s 没有历史在线版本,%s版本升级完成' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name, service.model_version)
push_admin(message)
push_message([service.created_by.username], message)
except Exception as e:
print(e)
push_admin('部署升级报错 %s %s: %s'%(service.model_name, service.model_version,str(e)))