mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2024-12-27 06:29:10 +08:00
115 lines
6.1 KiB
Python
115 lines
6.1 KiB
Python
|
|
"""Utility functions used across Myapp"""
|
|
import pysnooper
|
|
from datetime import datetime
|
|
import time
|
|
from myapp.utils.py.py_k8s import K8s
|
|
from myapp.utils.celery import session_scope
|
|
from myapp.project import push_message,push_admin
|
|
from myapp.tasks.celery_app import celery_app
|
|
# Myapp framework imports
|
|
from myapp import app
|
|
from myapp.models.model_serving import InferenceService
|
|
from myapp.views.view_inferenceserving import InferenceService_ModelView_base
|
|
from myapp.models.model_docker import Docker
|
|
conf = app.config
|
|
|
|
|
|
@celery_app.task(name="task.check_docker_commit", bind=True) # , soft_time_limit=15
|
|
def check_docker_commit(task,docker_id): # 在页面中测试时会自定接收者和id
|
|
with session_scope(nullpool=True) as dbsession:
|
|
try:
|
|
docker = dbsession.query(Docker).filter_by(id=int(docker_id)).first()
|
|
pod_name = "docker-commit-%s-%s" % (docker.created_by.username, str(docker.id))
|
|
namespace = conf.get('NOTEBOOK_NAMESPACE')
|
|
k8s_client = K8s(conf.get('CLUSTERS').get(conf.get('ENVIRONMENT')).get('KUBECONFIG',''))
|
|
begin_time=datetime.datetime.now()
|
|
now_time=datetime.datetime.now()
|
|
while((now_time-begin_time).seconds<1800): # 也就是最多commit push 30分钟
|
|
time.sleep(12000)
|
|
commit_pods = k8s_client.get_pods(namespace=namespace,pod_name=pod_name)
|
|
if commit_pods:
|
|
commit_pod=commit_pods[0]
|
|
if commit_pod['status']=='Succeeded':
|
|
docker.last_image=docker.target_image
|
|
dbsession.commit()
|
|
break
|
|
# 其他异常状态直接报警
|
|
if commit_pod['status']!='Running':
|
|
push_message(conf.get('ADMIN_USER').split(','),'commit pod %s not running'%commit_pod['name'])
|
|
break
|
|
else:
|
|
break
|
|
|
|
except Exception as e:
|
|
print(e)
|
|
|
|
|
|
@celery_app.task(name="task.upgrade_service", bind=True) # , soft_time_limit=15
|
|
@pysnooper.snoop()
|
|
def upgrade_service(task,service_id,name,namespace):
|
|
# 将旧的在线版本进行下掉,前提是新的服务必须已经就绪
|
|
time.sleep(10)
|
|
with session_scope(nullpool=True) as dbsession:
|
|
try:
|
|
service = dbsession.query(InferenceService).filter_by(id=int(service_id)).first()
|
|
message = '%s 准备进行服务迭代 %s %s'%(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),service.model_name,service.model_version)
|
|
push_admin(message)
|
|
push_message([service.created_by.username],message)
|
|
k8s_client = K8s(service.project.cluster.get('KUBECONFIG',''))
|
|
begin_time = time.time()
|
|
while (True):
|
|
try:
|
|
deployment = k8s_client.AppsV1Api.read_namespaced_deployment(name=name, namespace=namespace)
|
|
if deployment:
|
|
ready_replicas = deployment.status.ready_replicas
|
|
replicas = deployment.status.replicas
|
|
message = '%s 服务 %s %s ready副本数:%s 目标副本数:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name,service.model_version, ready_replicas, replicas)
|
|
push_admin(message)
|
|
push_message([service.created_by.username], message)
|
|
# 如果新的dp副本数已全部就绪
|
|
if ready_replicas == replicas:
|
|
break
|
|
else:
|
|
message = '%s 没有发现 %s %s 的 deployment' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name,service.model_version)
|
|
push_admin(message)
|
|
push_message([service.created_by.username], message)
|
|
return
|
|
|
|
except Exception as e:
|
|
print(e)
|
|
if time.time() - begin_time > 600:
|
|
message = '%s 新版本运行状态检查超时,请手动检查和清理旧版本%s %s' % (
|
|
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name, service.model_version)
|
|
push_admin(message)
|
|
push_message([service.created_by.username], message)
|
|
return
|
|
time.sleep(60)
|
|
|
|
old_services = dbsession.query(InferenceService)\
|
|
.filter(InferenceService.model_status=='online')\
|
|
.filter(InferenceService.model_name==service.model_name)\
|
|
.filter(InferenceService.name!=service.name)\
|
|
.filter(InferenceService.host==service.host).all()
|
|
|
|
if old_services:
|
|
for old_service in old_services:
|
|
if old_service.name != service.name:
|
|
inference_model_view = InferenceService_ModelView_base()
|
|
inference_model_view.delete_old_service(old_service.name, old_service.project.cluster)
|
|
old_service.model_status = 'offline'
|
|
old_service.deploy_history = service.deploy_history + "\n" + "clear: %s %s" % ('admin', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
|
|
dbsession.commit()
|
|
message = '%s 新版本服务升级完成,下线旧服务 %s %s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),service.model_name, old_service.model_version)
|
|
push_admin(message)
|
|
push_message([service.created_by.username],message)
|
|
else:
|
|
message = '%s %s 没有历史在线版本,%s版本升级完成' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), service.model_name, service.model_version)
|
|
push_admin(message)
|
|
push_message([service.created_by.username], message)
|
|
except Exception as e:
|
|
print(e)
|
|
push_admin('部署升级报错 %s %s: %s'%(service.model_name, service.model_version,str(e)))
|
|
|
|
|