mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2024-11-27 05:33:10 +08:00
Merge branch 'master' of https://github.com/tencentmusic/cube-studio
This commit is contained in:
commit
ef4fbcfc6d
25
job-template/job/ray_sklearn/README.md
Normal file
25
job-template/job/ray_sklearn/README.md
Normal file
@ -0,0 +1,25 @@
|
||||
# ray-sklearn 模板
|
||||
描述:基于ray的分布式能力,实现sklearn机器学习模型的分布式训练。
|
||||
|
||||
镜像:ccr.ccs.tencentyun.com/cube-studio/sklearn_estimator:v1
|
||||
|
||||
环境变量:
|
||||
```bash
|
||||
NO_RESOURCE_CHECK=true
|
||||
TASK_RESOURCE_CPU=2
|
||||
TASK_RESOURCE_MEMORY=4G
|
||||
TASK_RESOURCE_GPU=0
|
||||
```
|
||||
|
||||
启动参数:
|
||||
```bash
|
||||
train_csv_file_path: 训练集csv,以|作为分隔符,首行是列名
|
||||
predict_csv_file_path: 预测数据集csv,格式和训练集一致,默认为空,需要predict时填
|
||||
label_name: label的列名,必填
|
||||
model_name: 训练用到的模型名称,如LogisticRegression,必填。常用模型均已支持,如需新增其他模型请联系管理员
|
||||
model_args_dict: 模型参数,json格式,默认为空
|
||||
model_file_path: 模型文件保存文件名,必填
|
||||
predict_result_path: 预测结果保存文件名,默认为空,需要predict时填
|
||||
worker_num: ray worker数量
|
||||
```
|
||||
|
@ -1,117 +1,25 @@
|
||||
# volcanojob 模板
|
||||
镜像:ccr.ccs.tencentyun.com/cube-studio/xgb_train_and_predict:v1
|
||||
# xgboost 模板
|
||||
描述:单机xgb训练,支持训练预测。
|
||||
|
||||
环境变量:
|
||||
镜像:ccr.ccs.tencentyun.com/cube-studio/xgb_train_and_predict:v1
|
||||
|
||||
环境变量:
|
||||
```bash
|
||||
NO_RESOURCE_CHECK=true
|
||||
TASK_RESOURCE_CPU=2
|
||||
TASK_RESOURCE_MEMORY=4G
|
||||
TASK_RESOURCE_GPU=0
|
||||
```
|
||||
账号:kubeflow-pipeline
|
||||
启动参数:
|
||||
|
||||
启动参数:
|
||||
```bash
|
||||
{
|
||||
"shell": {
|
||||
"--working_dir": {
|
||||
"type": "str",
|
||||
"item_type": "str",
|
||||
"label": "启动目录",
|
||||
"require": 1,
|
||||
"choice": [],
|
||||
"range": "",
|
||||
"default": "/mnt/xx",
|
||||
"placeholder": "",
|
||||
"describe": "启动目录",
|
||||
"editable": 1,
|
||||
"condition": "",
|
||||
"sub_args": {}
|
||||
},
|
||||
"--command": {
|
||||
"type": "str",
|
||||
"item_type": "str",
|
||||
"label": "启动命令",
|
||||
"require": 1,
|
||||
"choice": [],
|
||||
"range": "",
|
||||
"default": "echo aa",
|
||||
"placeholder": "",
|
||||
"describe": "启动命令",
|
||||
"editable": 1,
|
||||
"condition": "",
|
||||
"sub_args": {}
|
||||
},
|
||||
"--num_worker": {
|
||||
"type": "str",
|
||||
"item_type": "str",
|
||||
"label": "占用机器个数",
|
||||
"require": 1,
|
||||
"choice": [],
|
||||
"range": "",
|
||||
"default": "3",
|
||||
"placeholder": "",
|
||||
"describe": "占用机器个数",
|
||||
"editable": 1,
|
||||
"condition": "",
|
||||
"sub_args": {}
|
||||
},
|
||||
"--image": {
|
||||
"type": "str",
|
||||
"item_type": "str",
|
||||
"label": "",
|
||||
"require": 1,
|
||||
"choice": [],
|
||||
"range": "",
|
||||
"default": "ccr.ccs.tencentyun.com/cube-studio/ubuntu-gpu:cuda10.1-cudnn7-python3.6",
|
||||
"placeholder": "",
|
||||
"describe": "worker镜像,直接运行你代码的环境镜像<a href='https://docs.qq.com/doc/DU0ptZEpiSmtMY1JT'>基础镜像</a>",
|
||||
"editable": 1,
|
||||
"condition": "",
|
||||
"sub_args": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
sep: 分隔符
|
||||
classifier_or_regressor: 分类还是回归
|
||||
params: xgb参数, json格式,透传给xgboost类构建函数
|
||||
train_csv_file_path: 训练集csv路径,首行是header,首列是label。为空则不做训练,尝试从model_load_path加载模型。
|
||||
model_load_path: 模型加载路径。为空则不加载。
|
||||
predict_csv_file_path: 预测数据集csv路径,格式和训练集一致,顺序保持一致,没有label列。为空则不做predict。
|
||||
predict_result_path: 预测结果保存路径,为空则不做predict。
|
||||
model_save_path: 模型文件保存路径。为空则不保存模型。
|
||||
eval_result_path: 模型评估报告保存路径。默认为空,想看模型评估报告可填。
|
||||
```
|
||||
|
||||
# 用户代码示例
|
||||
|
||||
保留单机的代码,添加识别集群信息的代码(多少个worker,当前worker是第几个),添加分工(只处理归属于当前worker的任务),
|
||||
|
||||
完成。
|
||||
|
||||
worker示例:
|
||||
```
|
||||
import time, datetime, json, requests, io, os
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
import os, random
|
||||
|
||||
WORLD_SIZE = int(os.getenv('VC_WORKER_NUM', '1')) # 总worker的数目
|
||||
RANK = int(os.getenv("VC_TASK_INDEX", '0')) # 当前是第几个worker 从0开始
|
||||
|
||||
print(WORLD_SIZE, RANK)
|
||||
|
||||
|
||||
# 子进程要执行的代码
|
||||
def task(key):
|
||||
print('worker:',RANK,', task:',key,flush=True)
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
input = range(30000) # 所有要处理的数据
|
||||
local_task = [] # 当前worker需要处理的任务
|
||||
for index in input:
|
||||
if index%WORLD_SIZE==RANK:
|
||||
local_task.append(index) # 要处理的数据均匀分配到每个worker
|
||||
|
||||
# 每个worker内部还可以用多进程,线程池之类的并发操作。
|
||||
pool = Pool(10) # 开辟包含指定数目进程的进程池
|
||||
pool.map(partial(task), local_task) # 当前worker,只处理分配给当前worker的任务
|
||||
pool.close()
|
||||
pool.join()
|
||||
```
|
||||
|
||||
# 示例
|
||||
demo.py
|
@ -10,6 +10,7 @@ import { Provider } from 'react-redux';
|
||||
initializeIcons();
|
||||
|
||||
// 全局样式
|
||||
// 样式可替换
|
||||
mergeStyles({
|
||||
':global(body,html,#app)': {
|
||||
margin: 0,
|
||||
|
Loading…
Reference in New Issue
Block a user