cube-studio/job-template/job/horovod
FerdinandWard 0c1968fe45 fix readme
2022-08-18 15:16:10 +08:00
..
build.sh 添加horovod分布式 2022-06-14 13:20:50 +08:00
demo.py 模板demo示例 2022-07-26 20:47:20 +08:00
Dockerfile 模板demo示例 2022-07-26 20:47:20 +08:00
README.md fix readme 2022-08-18 15:16:10 +08:00
start.py upgrade kubernetes version and add /dev/shm volume_mounts default 2022-08-16 11:07:00 +08:00
tensorflow-mnist.yaml 添加horovod分布式 2022-06-14 13:20:50 +08:00

horovod 模板

镜像:ccr.ccs.tencentyun.com/cube-studio/horovod:20210401

k8s账号:kubeflow-pipeline

启动参数:

{
    "参数": {
        "--work_images": {
            "type": "str",
            "item_type": "str",
            "label": "worker的运行镜像,直接运行你代码的环境镜像 <a target='_blank' href='https://github.com/tencentmusic/cube-studio/tree/master/images'>基础镜像</a>",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "ccr.ccs.tencentyun.com/cube-studio/horovod:20210401",
            "placeholder": "",
            "describe": "worker的运行镜像",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--working_dir": {
            "type": "str",
            "item_type": "str",
            "label": "命令的启动目录",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "/mnt/xxx/horovod/",
            "placeholder": "",
            "describe": "命令的启动目录",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--command": {
            "type": "str",
            "item_type": "str",
            "label": "训练启动命令",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "python /mnt/admin/demo.py",
            "placeholder": "",
            "describe": "训练启动命令",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--num_worker": {
            "type": "str",
            "item_type": "str",
            "label": "分布式worker的数目",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "2",
            "placeholder": "",
            "describe": "分布式worker的数目",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        }
    }
}