From fd47961487138b1f952e4ec5aa98ad79afeb03f3 Mon Sep 17 00:00:00 2001
From: cdllp2 <luanpengcdlllp2@163.com>
Date: Tue, 11 Oct 2022 18:24:45 +0800
Subject: [PATCH] add tensorboard

---
 .../jupyter-notebook/deeplearning/Dockerfile  |  5 ++
 images/jupyter-notebook/readme.md             | 55 +++++++++++++
 .../example/ms_wide_deep_dataparallel.yaml    | 61 +++++++++++++++
 .../example/ms_wide_deep_ps_distribute.yaml   | 77 +++++++++++++++++++
 .../example/ms_wide_deep_ps_standalone.yaml   | 74 ++++++++++++++++++
 5 files changed, 272 insertions(+)
 create mode 100644 images/jupyter-notebook/readme.md
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
 create mode 100644 install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml

diff --git a/images/jupyter-notebook/deeplearning/Dockerfile b/images/jupyter-notebook/deeplearning/Dockerfile
index 5bb45757..5259e6a2 100644
--- a/images/jupyter-notebook/deeplearning/Dockerfile
+++ b/images/jupyter-notebook/deeplearning/Dockerfile
@@ -15,6 +15,11 @@ RUN pip install numpy pandas StatsModels Matplotlib Seaborn  sklearn wheel SciPy
 RUN pip install tensorflow keras torch torchvision keras nltk spacy gensim \
     && rm -rf /tmp/* /var/tmp/* /root/.cache
 
+# Install JupyterLab extensions (widgets manager, git, TensorBoard pinned to 2.10). Only the tensorboard labextension install is best-effort; any other failing step now aborts the build instead of being masked by ';'.
+RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
+RUN pip install --upgrade jupyterlab-git && jupyter lab build && \
+     pip uninstall -y tensorboard && pip install tensorboard==2.10 jupyterlab-tensorboard-pro && { jupyter labextension install jupyterlab_tensorboard_pro || true; } && jupyter serverextension enable jupyterlab_tensorboard_pro --sys-prefix
+
 # 拷贝examples
 COPY examples/* /examples/
 
diff --git a/images/jupyter-notebook/readme.md b/images/jupyter-notebook/readme.md
new file mode 100644
index 00000000..49f72f28
--- /dev/null
+++ b/images/jupyter-notebook/readme.md
@@ -0,0 +1,55 @@
+# notebook重启问题
+
+关于续期：因为对gpu的占用方式为独占方式，所以对于gpu notebook会定时清理，需要按时续期。
+
+关于清理：可以通过删除config.py中的delete_notebook定时任务，关闭掉定时清理notebook
+
+关于环境：重启后会自动执行/mnt/$USERNAME/init.sh脚本，所以可以将环境写入此脚本，重启后自动安装环境，否则就需要打包到镜像或者离线anaconda文件
+
+# 构建notebook镜像
+
+需要构建新镜像并在生产上替换，才能让用户使用新的notebook镜像。
+
+## 方法1：Dockerfile构建
+
+jupyter镜像的构建在：https://github.com/tencentmusic/cube-studio/tree/master/images/jupyter-notebook
+
+vscode镜像的构建在：https://github.com/tencentmusic/cube-studio/tree/master/images/theia
+
+现在默认使用的镜像为
+```
+# notebook使用的镜像
+NOTEBOOK_IMAGES=[
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base', 'vscode（cpu）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-gpu-base', 'vscode（gpu）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base', 'jupyter（cpu）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-gpu-base','jupyter（gpu）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-bigdata', 'jupyter（bigdata）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-machinelearning', 'jupyter（machinelearning）'],
+    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-deeplearning', 'jupyter（deeplearning）'],
+]
+```
+## 方法2：直接commit容器
+
+也可以直接run一个容器，然后安装插件后将容器commit成镜像。
+```
+# 启动jupyter
+docker run --name jupyter -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'
+
+# 启动vscode
+docker run --name vscode -p 3000:3000 -d ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base node /home/theia/src-gen/backend/main.js /home/project --hostname=0.0.0.0 --port=3000
+
+```
+然后访问 http://xx.xx.xx.xx:3000/ ， web界面操作，安装notebook插件，安装pip/apt环境等。环境完整后，再使用如下命令commit成镜像。
+```
+docker commit jupyter ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base-1
+```
+
+# 修改配置文件
+
+config.py中 NOTEBOOK_IMAGES 变量为notebook可选镜像。更新此变量即可。
+
+
+# 其他类型的notebook
+
+所有可提供在线编辑功能的web服务都可以定义为notebook。相关开发代码已对外提供；接入的服务需要满足几个条件，例如可配置url前缀，用来区分不同的notebook。
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
new file mode 100644
index 00000000..f1717f9b
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_dataparallel.yaml
@@ -0,0 +1,61 @@
+# MSJob example: Wide&Deep data-parallel training with 1 Scheduler and 4 Workers.
+# Network source: https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-dataparallel
+  namespace: pipeline
+spec:
+  runPolicy:
+    cleanPodPolicy: None  # presumably keeps the pods after the job ends - confirm against the MSJob operator
+  successPolicy: AllWorkers  # presumably the job succeeds only when every Worker succeeds - confirm against the MSJob operator
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory holding the training script and criteo_mindrecord data; must exist on every node that runs a pod
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord  --batch_size=16000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/  # same path as on the host, so the absolute paths in the command resolve
+                  name: script-data
+              env:
+                - name: GLOG_v  # MindSpore log verbosity; "1" presumably means INFO - confirm in the MindSpore docs
+                  value: "1"
+    Worker:
+      replicas: 4
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # same node-local directory as the Scheduler
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:  # same script and arguments as the Scheduler
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --batch_size=16000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              env:
+                - name: GLOG_v
+                  value: "1"
+#              resources:  # NOTE(review): GPU limit is disabled although the Worker runs with --device_target="GPU"; uncomment these three lines to request one GPU per Worker
+#                limits:
+#                  nvidia.com/gpu: 1
\ No newline at end of file
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
new file mode 100644
index 00000000..511e5fb1
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_ps_distribute.yaml
@@ -0,0 +1,77 @@
+# MSJob example: Wide&Deep parameter-server distributed training (1 Scheduler, 2 PS, 4 Workers).
+# Network source: https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-ps-distribute
+spec:
+  runPolicy:
+    cleanPodPolicy: None  # presumably keeps the pods after the job ends - confirm against the MSJob operator
+  successPolicy: AllWorkers  # presumably the job succeeds only when every Worker succeeds - confirm against the MSJob operator
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory holding the training script and criteo_mindrecord data; must exist on every node that runs a pod
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/  # same path as on the host, so the absolute paths in the command resolve
+                  name: script-data
+              env:
+                - name: GLOG_v  # MindSpore log verbosity; "1" presumably means INFO - confirm in the MindSpore docs
+                  value: "1"
+    PS:
+      replicas: 2
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # same node-local directory as the Scheduler
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:  # same script and arguments as the Scheduler
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=300000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+    Worker:
+      replicas: 4
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # same node-local directory as the Scheduler
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:  # same script as Scheduler/PS, with the extra flag --sparse=False
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_distribute.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1  --vocab_cache_size=300000  --sparse=False
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+              resources:
+                limits:
+                  nvidia.com/gpu: 1  # one GPU per Worker pod; Scheduler and PS declare no GPU limit
diff --git a/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml b/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml
new file mode 100644
index 00000000..b5d46530
--- /dev/null
+++ b/install/kubernetes/mindspore/example/ms_wide_deep_ps_standalone.yaml
@@ -0,0 +1,74 @@
+# MSJob example: Wide&Deep parameter-server mode with a single training Worker (1 Scheduler, 2 PS, 1 Worker).
+# Network source: https://gitee.com/mindspore/models/tree/master/official/recommend/wide_and_deep
+apiVersion: mindspore.gitee.com/v1
+kind: MSJob
+metadata:
+  name: ms-widedeep-ps-standalone
+spec:
+  runPolicy:
+    cleanPodPolicy: None  # presumably keeps the pods after the job ends - confirm against the MSJob operator
+  successPolicy: AllWorkers  # presumably the job succeeds only when every Worker succeeds - confirm against the MSJob operator
+  msReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # node-local directory holding the training script and criteo_mindrecord data; must exist on every node that runs a pod
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/  # same path as on the host, so the absolute paths in the command resolve
+                  name: script-data
+              env:
+                - name: GLOG_v  # MindSpore log verbosity; "1" presumably means INFO - confirm in the MindSpore docs
+                  value: "1"
+    PS:
+      replicas: 2
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # same node-local directory as the Scheduler
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:  # same script and arguments as the Scheduler
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1 --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data
+    Worker:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        spec:
+          volumes:
+            - name: script-data
+              hostPath:
+                path: /home/fzh/wide_and_deep/  # same node-local directory as the Scheduler
+          containers:
+            - name: mindspore
+              image: swr.cn-south-1.myhuaweicloud.com/mindspore-ci/mindspore-gpu:1.7.0-20220327121541
+              imagePullPolicy: IfNotPresent
+              command:  # NOTE(review): runs with --device_target="GPU" but this container declares no nvidia.com/gpu limit - confirm this is intended
+                - /bin/bash
+                - -c
+                - python -s /home/fzh/wide_and_deep/train_and_eval_parameter_server_standalone.py --device_target="GPU" --epochs=1 --data_path=/home/fzh/wide_and_deep/criteo_mindrecord --parameter_server=1  --vocab_cache_size=200000
+              volumeMounts:
+                - mountPath: /home/fzh/wide_and_deep/
+                  name: script-data