cube-studio/myapp/tools/watch_tfjob.py

306 lines
12 KiB
Python
Raw Normal View History

2021-08-17 17:00:34 +08:00
import time,datetime, os
2022-07-24 17:46:23 +08:00
from kubernetes import client
2021-08-17 17:00:34 +08:00
from kubernetes import watch
import json
2022-07-10 21:46:26 +08:00
from myapp.utils.py.py_k8s import check_status_time,K8s
2021-08-17 17:00:34 +08:00
import pysnooper
from myapp import app
2021-08-17 17:00:34 +08:00
from myapp.models.model_job import (
Tfjob,
Task
)
from myapp.utils.celery import session_scope
from myapp.project import push_admin,push_message
from myapp.models.model_job import Pipeline
2021-08-17 17:00:34 +08:00
conf=app.config
from myapp.utils.py.py_prometheus import Prometheus
prometheus = Prometheus(conf.get('PROMETHEUS',''))
2021-09-07 18:10:37 +08:00
cluster=os.getenv('ENVIRONMENT','').lower()
2021-08-17 17:00:34 +08:00
if not cluster:
print('no cluster %s'%cluster)
exit(1)
else:
clusters = conf.get('CLUSTERS',{})
if clusters and cluster in clusters:
2022-06-05 20:38:37 +08:00
kubeconfig = clusters[cluster].get('KUBECONFIG','')
2022-07-24 17:46:23 +08:00
K8s(kubeconfig)
2021-08-17 17:00:34 +08:00
else:
print('no kubeconfig in cluster %s' % cluster)
exit(1)
2022-08-11 10:47:08 +08:00
# 推送消息
2022-02-26 22:36:57 +08:00
# @pysnooper.snoop()
2021-08-17 17:00:34 +08:00
def deliver_message(tfjob):
if not tfjob:
return
receivers = tfjob.username.split(',')
receivers = [receiver.strip() for receiver in receivers]
if not receivers:
print('no receivers')
return
info_json = json.loads(tfjob.info_json)
# print(info_json,experiments.status)
if tfjob.status in info_json['alert_status'] and tfjob.status not in info_json['has_push']:
receivers=list(set(receivers))
# data = {
# "Sender": sender,
# "Rcptto":receivers,
# }
workflow_name = info_json.get('workflow_name','')
hp_name = info_json.get('hp_name', '')
if workflow_name:
message = "tfjob: %s \nworkflow: %s \nnamespace: %s\nstatus: %s \ntime: %s" % (tfjob.name,workflow_name,tfjob.namespace,tfjob.status,datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
elif hp_name:
message = "tfjob: %s \nhp: %s(%s) \nnamespace: %s\nstatus: %s \ntime: %s" % (tfjob.name,info_json.get('hp_name',''),info_json.get('describe',''),tfjob.namespace,tfjob.status,datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
else:
message = "tfjob: %s \nnamespace: %s\nstatus: %s \ntime: %s" % (tfjob.name,tfjob.namespace,tfjob.status,datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
if message:
push_message(receivers,message)
2022-02-26 22:36:57 +08:00
# @pysnooper.snoop()
2021-08-17 17:00:34 +08:00
def check_has_push(crd,dbsession):
# 可能是workflow启动的或者是hp启动的
workflow_name = crd['labels'].get('workflow-name','')
hp_name = crd['labels'].get('hp-name', '')
username = crd['username']
alert_status = ''
# 如果是从workflow中创建的
if workflow_name:
pipeline = dbsession.query(Pipeline).filter_by(name=workflow_name).first()
if pipeline and pipeline.alert_status:
alert_status = pipeline.alert_status
print("tf %s from workflow_name %s,user %s,status %s" % (crd['name'],workflow_name,crd['username'],crd['status']))
# print("%s status %s"%(crd['name'], crd['status']))
alert_status='Pending' # 这里写死就是相当于必须且仅Pending告警
info_json={
"workflow_name":workflow_name,
"hp_name":hp_name,
"alert_status": alert_status,
"has_push":''
}
# print(crd['name'],crd['namespace'])
tfjob = dbsession.query(Tfjob).filter(Tfjob.name==crd['name']).filter(Tfjob.namespace==crd['namespace']).first()
if tfjob:
print('exist tfjob')
if tfjob.info_json:
exist_info_json = json.loads(tfjob.info_json)
info_json['has_push']=exist_info_json.get('has_push','')
tfjob.create_time = crd['create_time']
tfjob.status = crd['status']
tfjob.annotations = json.dumps(crd['annotations'],indent=4,ensure_ascii=False)
tfjob.labels = json.dumps(crd['labels'],indent=4,ensure_ascii=False)
tfjob.spec = json.dumps(crd['spec'],indent=4,ensure_ascii=False),
tfjob.status_more = json.dumps(crd['status_more'],indent=4,ensure_ascii=False)
tfjob.username = crd['username']
tfjob.info_json = json.dumps(info_json,indent=4,ensure_ascii=False)
dbsession.commit()
if crd['status'] in info_json['alert_status'] and crd['status'] not in info_json['has_push']:
return False,tfjob
else:
return True,tfjob
else:
print('new tfjob')
# crd['status_more']={}
# crd['spec']={}
tfjob = Tfjob(name=crd['name'],namespace=crd['namespace'],create_time=crd['create_time'],
status=crd['status'],
annotations=json.dumps(crd['annotations'],indent=4,ensure_ascii=False),
labels=json.dumps(crd['labels'],indent=4,ensure_ascii=False),
spec=json.dumps(crd['spec'],indent=4,ensure_ascii=False),
status_more=json.dumps(crd['status_more'],indent=4,ensure_ascii=False),
username=username,
info_json=json.dumps(info_json,indent=4,ensure_ascii=False))
dbsession.add(tfjob)
dbsession.commit()
return False,tfjob
#
# # 推送修改通知
# @pysnooper.snoop()
# def push_resource_rec(task,dbsession):
# task_monitorings = json.loads(task.monitoring).get('tfjob',[])
# if len(task_monitorings)>9:
# max_cpu = 0
# max_memory=0
# init_message = 'pipeline(%s)中分布式训练%s推荐资源如下自行修改:\n' % (task.pipeline.describe,task.label)
# message = init_message
# # tfjob_src_mem=re.match(task.args.match("memory": "32G",))
# for task_monitoring in task_monitorings:
# if float(task_monitoring.get('cpu',0))>max_cpu:
# max_cpu = float(task_monitoring.get('cpu',0))
# if float(task_monitoring.get('memory', 0)) > max_memory:
# max_memory = float(task_monitoring.get('memory', 0))
# if max_cpu:
# rec_cpu = math.ceil(max_cpu*1.4)
# if rec_cpu>150:
# rec_cpu=150
# if rec_cpu!=int(task.resource_cpu):
# message += "task(%s)原申请cpu:%s近10次最大使用cpu:%s建议申请值:%s\n" % (task.label,task.resource_cpu, max_cpu, rec_cpu)
# task.resource_cpu = str(rec_cpu)
# if max_memory:
# rec_memory = math.ceil(max_memory*1.4)
# if rec_memory>350:
# rec_memory=350
# if rec_memory!=int(task.resource_memory.replace('G','').replace('M','')):
# message += "task(%s)原申请mem:%s近10次最大使用mem:%s(G),建议申请值:%s\n" % (task.label,task.resource_memory, max_memory, str(rec_memory)+"G")
# task.resource_memory = str(rec_memory)+"G"
#
# dbsession.commit()
# if message!=init_message:
# push_message([task.pipeline.created_by.username],message)
2022-02-26 22:36:57 +08:00
# @pysnooper.snoop()
2021-08-17 17:00:34 +08:00
def save_monitoring(tfjob,dbsession):
try:
if tfjob.status=='Succeeded':
task_id = json.loads(tfjob.labels).get('task-id','')
if task_id:
task = dbsession.query(Task).filter_by(id=int(task_id)).first()
2021-09-07 18:10:37 +08:00
metrics = prometheus.get_resource_metric(tfjob.name, namespace='pipeline')
2021-08-17 17:00:34 +08:00
monitoring = json.loads(task.monitoring) if task and task.monitoring else {}
tfjob_monitoring = monitoring.get('tfjob', [])
if metrics:
tfjob_monitoring.append({
"cpu": metrics.get('cpu', ''),
"memory": metrics.get('memory', ''),
"name": tfjob.name,
"update_time": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
})
# 清理监控记录
tfjob_monitoring_new = []
for metric in tfjob_monitoring:
# 采集结果不对的,和采集结果太久远的都清理掉
if float(metric.get('cpu', 0)) > 0.1 and float(metric.get('memory', 0)) > 0.1 and metric['update_time'] > (datetime.datetime.now() - datetime.timedelta(days=30)).strftime('%Y-%m-%d %H:%M:%S'):
tfjob_monitoring_new.append(metric)
if len(tfjob_monitoring_new) > 10:
del tfjob_monitoring_new[0]
monitoring_new = {}
monitoring_new['task'] = monitoring.get('task', [])
monitoring_new['tfjob'] = tfjob_monitoring_new
print(monitoring_new)
if task:
task.monitoring = json.dumps(monitoring_new,ensure_ascii=False,indent=4)
dbsession.commit()
# print(pods)
# push_resource_rec(task, dbsession)
except Exception as e:
print(e)
2022-02-26 22:36:57 +08:00
# @pysnooper.snoop()
2021-08-17 17:00:34 +08:00
def save_history(tfjob,dbsession):
info_json = json.loads(tfjob.info_json)
if info_json['has_push']:
if not tfjob.status in info_json['has_push']:
info_json['has_push'] += ',' + tfjob.status
else:
info_json['has_push'] = tfjob.status
tfjob.info_json = json.dumps(info_json, indent=4, ensure_ascii=False)
dbsession.commit()
2022-02-26 22:36:57 +08:00
# @pysnooper.snoop()
2021-08-17 17:00:34 +08:00
def check_crd_exist(group,version,namespace,plural,name):
2022-07-24 17:46:23 +08:00
exist_crd = client.CustomObjectsApi().get_namespaced_custom_object(group,version,namespace,plural,name)
2021-08-17 17:00:34 +08:00
return exist_crd
@pysnooper.snoop()
def deal_event(event,crd_info,namespace):
with session_scope(nullpool=True) as dbsession:
try:
crd_object = event['object']
exist_crd = check_crd_exist(group=crd_info['group'], version=crd_info["version"], namespace=namespace,
plural=crd_info["plural"], name=crd_object['metadata']['name'])
if not exist_crd:
print('not exist')
return
status = ''
if 'status' in crd_object:
if 'conditions' in crd_object['status']:
if len(crd_object['status']['conditions']) > 0:
if 'type' in crd_object['status']['conditions'][-1]:
status = crd_object['status']['conditions'][-1]['type']
creat_time = crd_object['metadata']['creationTimestamp'].replace('T', ' ').replace('Z', '')
creat_time = (datetime.datetime.strptime(creat_time, '%Y-%m-%d %H:%M:%S') + datetime.timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
back_object = {
'username': '',
"name": crd_object['metadata']['name'],
"namespace": crd_object['metadata']['namespace'] if 'namespace' in crd_object['metadata'] else '',
"annotations": crd_object['metadata'].get('annotations', {}),
"labels": crd_object['metadata'].get('labels', {}),
"spec": crd_object['spec'],
"create_time": creat_time,
"status": status,
"status_more": check_status_time(crd_object['status']) if 'status' in crd_object else {}
}
if 'run-rtx' in back_object['labels']:
back_object['username'] = back_object['labels']['run-rtx']
elif 'upload-rtx' in back_object:
back_object['username'] = back_object['labels']['upload-rtx']
has_push, crd_model = check_has_push(back_object,dbsession)
if not has_push:
try:
deliver_message(crd_model)
except Exception as e1:
print('push fail:', e1)
push_admin(str(e1))
save_history(crd_model,dbsession)
save_monitoring(crd_model,dbsession)
except Exception as e:
print(e)
@pysnooper.snoop()
def listen_crd():
crd_info = conf.get('CRD_INFO')['tfjob']
namespace = conf.get('PIPELINE_NAMESPACE')
w = watch.Watch()
print('begin listen')
while(True):
try:
2022-07-24 17:46:23 +08:00
for event in w.stream(client.CustomObjectsApi().list_namespaced_custom_object, group=crd_info['group'],
2021-08-17 17:00:34 +08:00
version=crd_info['version'],
namespace=namespace, plural=crd_info['plural'], pretty='true'):
if event['type']=='ADDED' or event['type']=='MODIFIED': # ADDED MODIFIED DELETED
deal_event(event,crd_info,namespace)
elif event['type']=='ERROR':
w = watch.Watch()
time.sleep(60)
except Exception as ee:
print(ee)
# 不能使用异步io因为stream会阻塞
if __name__=='__main__':
listen_crd()