add tensorboard

This commit is contained in:
cdllp2 2022-10-11 18:24:45 +08:00
parent d2e9c51049
commit fd47961487
5 changed files with 272 additions and 0 deletions

View File

@ -15,6 +15,11 @@ RUN pip install numpy pandas StatsModels Matplotlib Seaborn sklearn wheel SciPy
# Install deep-learning / NLP Python packages, then delete /tmp, /var/tmp and /root/.cache
# in the same layer so the removed files do not end up in the image.
# NOTE(review): `keras` is listed twice in the package list — harmless, but probably unintended.
RUN pip install tensorflow keras torch torchvision keras nltk spacy gensim \
&& rm -rf /tmp/* /var/tmp/* /root/.cache
# Install plugins
RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
# Upgrade jupyterlab-git and rebuild JupyterLab, then replace the installed tensorboard with
# 2.10 and install + enable the jupyterlab_tensorboard_pro extension.
# NOTE(review): the `;` before `jupyter serverextension enable` means that command runs even
# when an earlier step of the `&&` chain failed, and only its exit status decides whether this
# layer succeeds. Presumably a deliberate best-effort install — confirm.
RUN pip install --upgrade jupyterlab-git && jupyter lab build && \
pip uninstall -y tensorboard && pip install tensorboard==2.10 jupyterlab-tensorboard-pro && jupyter labextension install jupyterlab_tensorboard_pro ; jupyter serverextension enable jupyterlab_tensorboard_pro --sys-prefix
# Copy examples
COPY examples/* /examples/

View File

@ -0,0 +1,55 @@
# notebook重启问题
关于续期：因为对gpu的占用方式为独占方式，所以gpu notebook会被定时清理，需要按时续期。
关于清理：可以通过删除config.py中的delete_notebook定时任务，关闭掉对notebook的定时清理。
关于环境：重启后会自动执行/mnt/$USERNAME/init.sh脚本，所以可以将环境安装命令写入此脚本，重启后自动安装环境；否则就需要将环境打包到镜像，或者使用离线anaconda文件。
# 构建notebook镜像
需要构建新镜像并在生产上替换才能让用户使用新的notebook镜像。
## 方法1：Dockerfile构建
jupyter镜像的构建在https://github.com/tencentmusic/cube-studio/tree/master/images/jupyter-notebook
vscode镜像的构建在https://github.com/tencentmusic/cube-studio/tree/master/images/theia
现在默认使用的镜像为
```
# notebook使用的镜像
NOTEBOOK_IMAGES=[
['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base', 'vscodecpu'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-gpu-base', 'vscodegpu'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base', 'jupytercpu'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-gpu-base','jupytergpu'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-bigdata', 'jupyterbigdata'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-machinelearning', 'jupytermachinelearning'],
['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-deeplearning', 'jupyterdeeplearning'],
]
```
## 方法2：直接commit容器
也可以直接run一个容器，安装插件后将容器commit成镜像。
```
# 启动jupyter
docker run --name jupyter -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'
# 启动vscode
docker run --name vscode -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base node /home/theia/src-gen/backend/main.js /home/project --hostname=0.0.0.0 --port=3000
```
然后访问 http://xx.xx.xx.xx:3000/ ，在web界面中操作：安装notebook插件、安装pip/apt环境等。环境完整后，再使用如下命令commit成镜像。
```
docker commit jupyter ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base-1
```
# 修改配置文件
config.py中 NOTEBOOK_IMAGES 变量为notebook可选镜像。更新此变量即可。
# 其他类型的notebook
所有可提供在线编辑功能的web服务都可以定义为notebook。开发代码已提供，对外需要满足几个条件：可配置url前缀，用来区分不同的notebook。

View File

@ -0,0 +1,61 @@
# wide&deep dataparallel mode with scheduler and workers
# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
#
# MSJob in namespace "pipeline" with two replica groups:
#   Scheduler - 1 replica
#   Worker    - 4 replicas
# Both groups run train_and_eval_distribute.py from the same container image and
# mount the host directory /home/fzh/wide_and_deep/ at the identical path inside
# the container, so the script and the criteo_mindrecord data must already exist
# at that path on every node that can host a pod.
apiVersion: mindspore.gitee.com/v1
kind: MSJob
metadata:
  name: ms-widedeep-dataparallel
  namespace: pipeline
spec:
  runPolicy:
    # The string "None" (not YAML null).
    cleanPodPolicy: None
  successPolicy: AllWorkers
  msReplicaSpecs:
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --batch_size=16000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
              env:
                # Quoted so the value stays a string, as env values must be.
                - name: GLOG_v
                  value: "1"
    Worker:
      replicas: 4
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --batch_size=16000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
              env:
                - name: GLOG_v
                  value: "1"
              # GPU limit is intentionally left disabled here.
              # NOTE(review): the command passes --device_target="GPU"; confirm
              # whether workers should request a GPU.
              # resources:
              #   limits:
              #     nvidia.com/gpu: 1

View File

@ -0,0 +1,77 @@
# wide&deep ps mode distribute training.
# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
#
# MSJob with three replica groups:
#   Scheduler - 1 replica
#   PS        - 2 replicas
#   Worker    - 4 replicas, each limited to one nvidia.com/gpu
# Every group runs train_and_eval_parameter_server_distribute.py from the same
# image and mounts the host directory /home/fzh/wide_and_deep/ at the identical
# path inside the container. Only the Worker command adds --sparse=False, and
# only the Scheduler sets the GLOG_v environment variable.
apiVersion: mindspore.gitee.com/v1
kind: MSJob
metadata:
  name: ms-widedeep-ps-distribute
spec:
  runPolicy:
    # The string "None" (not YAML null).
    cleanPodPolicy: None
  successPolicy: AllWorkers
  msReplicaSpecs:
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
              env:
                # Quoted so the value stays a string, as env values must be.
                - name: GLOG_v
                  value: "1"
    PS:
      replicas: 2
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
    Worker:
      replicas: 4
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000 --sparse=False
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
              resources:
                limits:
                  nvidia.com/gpu: 1

View File

@ -0,0 +1,74 @@
# wide&deep for ps mode which only has one worker training.
# network : https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
#
# MSJob with three replica groups:
#   Scheduler - 1 replica
#   PS        - 2 replicas
#   Worker    - 1 replica
# Every group runs train_and_eval_parameter_server_standalone.py with the same
# arguments from the same image, and mounts the host directory
# /home/fzh/wide_and_deep/ at the identical path inside the container. Only the
# Scheduler sets the GLOG_v environment variable.
# NOTE(review): no replica group declares a GPU resource limit although the
# command passes --device_target="GPU" — confirm this is intended.
apiVersion: mindspore.gitee.com/v1
kind: MSJob
metadata:
  name: ms-widedeep-ps-standalone
spec:
  runPolicy:
    # The string "None" (not YAML null).
    cleanPodPolicy: None
  successPolicy: AllWorkers
  msReplicaSpecs:
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
              env:
                # Quoted so the value stays a string, as env values must be.
                - name: GLOG_v
                  value: "1"
    PS:
      replicas: 2
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data
    Worker:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          volumes:
            - name: script-data
              hostPath:
                path: /home/fzh/wide_and_deep/
          containers:
            - name: mindspore
              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
              imagePullPolicy: IfNotPresent
              command:
                - /bin/bash
                - -c
                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
              volumeMounts:
                - mountPath: /home/fzh/wide_and_deep/
                  name: script-data