From fd47961487138b1f952e4ec5aa98ad79afeb03f3 Mon Sep 17 00:00:00 2001
From: cdllp2
Date: Tue, 11 Oct 2022 18:24:45 +0800
Subject: [PATCH] add tensorboard

---
 .../jupyter-notebook/deeplearning/Dockerfile  |  5 ++
 images/jupyter-notebook/readme.md             | 55 +++++++++++++
 .../example/ms_wide_deep_dataparallel.yaml    | 61 +++++++++++++++
 .../example/ms_wide_deep_ps_distribute.yaml   | 77 +++++++++++++++++++
 .../example/ms_wide_deep_ps_standalone.yaml   | 74 ++++++++++++++++++
 5 files changed, 272 insertions(+)
 create mode 100644 images/jupyter-notebook/readme.md
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml

diff --git a/images/jupyter-notebook/deeplearning/Dockerfile b/images/jupyter-notebook/deeplearning/Dockerfile
index 5bb45757..5259e6a2 100644
--- a/images/jupyter-notebook/deeplearning/Dockerfile
+++ b/images/jupyter-notebook/deeplearning/Dockerfile
@@ -15,6 +15,11 @@ RUN pip install numpy pandas StatsModels Matplotlib Seaborn sklearn wheel SciPy
 RUN pip install tensorflow keras torch torchvision keras nltk spacy gensim \
     && rm -rf /tmp/* /var/tmp/* /root/.cache

+# Install JupyterLab plugins (widgets manager, git, TensorBoard). NOTE(review): the `;` before `jupyter serverextension enable` means that command runs, and decides the RUN exit status, even if an earlier step of the && chain failed - confirm this best-effort behaviour is intended.
+RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
+RUN pip install --upgrade jupyterlab-git && jupyter lab build && \
+    pip uninstall -y tensorboard && pip install tensorboard==2.10 jupyterlab-tensorboard-pro && jupyter labextension install jupyterlab_tensorboard_pro ; jupyter serverextension enable jupyterlab_tensorboard_pro --sys-prefix
+
 # 拷贝examples
 COPY examples/* /examples/

diff --git a/images/jupyter-notebook/readme.md b/images/jupyter-notebook/readme.md
new file mode 100644
index 00000000..49f72f28
--- /dev/null
+++ b/images/jupyter-notebook/readme.md
@@ -0,0 +1,55 @@
+# notebook重启问题
+
+关于续期:因为对gpu的占用方式为独占方式,所以对于gpu notebook会定时清理,需要按时续期。
+
+关于清理:可以通过删除config.py中的delete_notebook定时任务,关闭掉定时清理notebook
+
+关于环境:重启后会自动执行/mnt/$USERNAME/init.sh脚本,所以可以将环境写入此脚本,重启后自动安装环境,否则就需要打包到镜像或者离线anaconda文件
+
+# 构建notebook镜像
+
+需要构建新镜像并在生产上替换,才能让用户使用新的notebook镜像。
+
+## 方法1:Dockerfile构建
+
+jupyter镜像的构建在:https://github.com/tencentmusic/cube-studio/tree/master/images/jupyter-notebook
+
+vscode镜像的构建在:https://github.com/tencentmusic/cube-studio/tree/master/images/theia
+
+现在默认使用的镜像为
+```
+# notebook使用的镜像
+NOTEBOOK_IMAGES=[
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base', 'vscode(cpu)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-gpu-base', 'vscode(gpu)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base', 'jupyter(cpu)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-gpu-base','jupyter(gpu)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-bigdata', 'jupyter(bigdata)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-machinelearning', 'jupyter(machinelearning)'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-deeplearning', 'jupyter(deeplearning)'],
+]
+```
+## 方法2,直接commit容器
+
+也可以直接run一个容器,然后安装插件后将容器commit成镜像。
+```
+# 启动jupyter
+docker run --name jupyter -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'
+
+# 启动vscode
+docker run --name vscode -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base node /home/theia/src-gen/backend/main.js /home/project --hostname=0.0.0.0 --port=3000
+
+```
+然后访问 http://xx.xx.xx.xx:3000/ , web界面操作,安装notebook插件,安装pip/apt环境等。环境完整后,再使用如下命令commit成镜像。
+```
+docker commit jupyter ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base-1
+```
+
+# 修改配置文件
+
+config.py中 NOTEBOOK_IMAGES 变量为notebook可选镜像。更新此变量即可。
+
+
+# 其他类型的notebook
+
+所有可提供在线编辑功能的web服务都可以定义为notebook。开发代码已提供对外,需要满足几个条件,可配置url前缀,用来区分不同的notebook。
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
new file mode 100644
index 00000000..f1717f9b
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
@@ -0,0 +1,61 @@
+# MindSpore MSJob example: Wide&Deep data-parallel training with 1 Scheduler and 4 Workers.
+# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-dataparallel
+  namespace: pipeline  # the two ps examples added alongside this file do not set a namespace
+spec:
+  runPolicy:
+    cleanPodPolicy: None
+  successPolicy: AllWorkers
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory; the command below reads the training script and criteo_mindrecord data from it
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --batch_size=16000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/  # same path inside the container as on the host
+                  name: script-data
+              env:
+                - name: GLOG_v
+                  value: "1"
+    Worker:  # NOTE(review): runs with --device_target="GPU" but the nvidia.com/gpu limit at the end of this spec is commented out - confirm
+      replicas: 4
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --batch_size=16000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              env:
+                - name: GLOG_v
+                  value: "1"
+#             resources:
+#               limits:
+#                 nvidia.com/gpu: 1
\ No newline at end of file
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
new file mode 100644
index 00000000..511e5fb1
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
@@ -0,0 +1,77 @@
+# MindSpore MSJob example: Wide&Deep parameter-server distributed training (1 Scheduler, 2 PS, 4 Workers).
+# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-ps-distribute  # no namespace set here, unlike ms_wide_deep_dataparallel.yaml
+spec:
+  runPolicy:
+    cleanPodPolicy: None
+  successPolicy: AllWorkers
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory; the command below reads the training script and criteo_mindrecord data from it
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              env:
+                - name: GLOG_v
+                  value: "1"
+    PS:  # same command as the Scheduler; no GLOG_v env is set for this role
+      replicas: 2
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+    Worker:  # command adds --sparse=False; only role in this file with a GPU limit
+      replicas: 4
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000 --sparse=False
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              resources:
+                limits:
+                  nvidia.com/gpu: 1
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml
new file mode 100644
index 00000000..b5d46530
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml
@@ -0,0 +1,74 @@
+# wide&deep for ps mode which only has one worker training.
+# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-ps-standalone  # no namespace set here, unlike ms_wide_deep_dataparallel.yaml
+spec:
+  runPolicy:
+    cleanPodPolicy: None
+  successPolicy: AllWorkers
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory; the command below reads the training script and criteo_mindrecord data from it
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              env:
+                - name: GLOG_v
+                  value: "1"
+    PS:  # same command as the Scheduler; no GLOG_v env is set for this role
+      replicas: 2
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+    Worker:  # single worker; NOTE(review): no nvidia.com/gpu limit although --device_target="GPU" is passed (the ps_distribute Worker sets one) - confirm
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data