mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2025-02-23 14:51:43 +08:00
完善gpu部署监控采集和gpu机器部署流程
This commit is contained in:
parent
5fa24a9baa
commit
c1578dd5c7
@ -1,21 +0,0 @@
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: "dcgm-exporter"
|
||||
labels:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
app.kubernetes.io/version: "2.4.0"
|
||||
k8s-app: dcgm-exporter
|
||||
namespace: monitoring
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
app.kubernetes.io/version: "2.4.0"
|
||||
# jobLabel: k8s-app
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- monitoring
|
||||
endpoints:
|
||||
- port: "metrics"
|
||||
path: "/metrics"
|
@ -20,6 +20,7 @@ spec:
|
||||
app.kubernetes.io/version: "2.4.0"
|
||||
name: "dcgm-exporter"
|
||||
spec:
|
||||
hostNetwork: true
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
@ -43,11 +44,11 @@ spec:
|
||||
- name: "DCGM_EXPORTER_KUBERNETES"
|
||||
value: "true"
|
||||
name: "dcgm-exporter"
|
||||
# hostNetwork: true
|
||||
ports:
|
||||
- name: "metrics"
|
||||
containerPort: 9400
|
||||
securityContext:
|
||||
privileged: true
|
||||
runAsNonRoot: false
|
||||
runAsUser: 0
|
||||
volumeMounts:
|
||||
@ -69,6 +70,10 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
app.kubernetes.io/version: "2.4.0"
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: '9400'
|
||||
prometheus.io/path: /metrics
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: "dcgm-exporter"
|
||||
|
@ -49,6 +49,9 @@ spec:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
# env:
|
||||
# - name: NVIDIA_VISIBLE_DEVICES
|
||||
# value: "0,1,2"
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
|
@ -1,5 +1,14 @@
|
||||
|
||||
# 1、gpu机器环境的准备
|
||||
|
||||
安装gpu驱动
|
||||
https://www.nvidia.cn/Download/index.aspx?lang=cn
|
||||
安装cuda
|
||||
https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_network
|
||||
|
||||
安装nvidia-docker2
|
||||
|
||||
|
||||
首先需要找运维同学安装机器gpu卡对应的驱动,然后需要让你的docker能识别并应用gpu驱动。
|
||||
|
||||
- 如果你的docker是19.03及以后的版本,并且只在docker中使用而不在k8s中使用,可以只安装nvidia-container-runtime 或者 只安装nvidia-container-toolkit,然后重启docker,就可以在docker run时通过添加参数--gpus 来应用gpu卡了。
|
||||
@ -40,9 +49,22 @@ notebook=true 用于开发
|
||||
```
|
||||
|
||||
# 3、部署k8s gpu插件(vgpu)
|
||||
|
||||
daemonset kube-system/nvidia-device-plugin,会在机器上部署pod,用于scheduler识别该机器可用的gpu算力。
|
||||
|
||||
daemonset kube-system/vgpu-nvidia-device-plugin,会在gpu上虚拟化出多张卡。不同plugin采用的虚拟化方式不同,对gpu的占用方式也可能不同;此处使用的虚拟化方式不影响调用方式。
|
||||
daemonset kube-system/gpu-manager,会在gpu上虚拟化出多张卡。不同plugin采用的虚拟化方式不同,对gpu的占用方式也可能不同;此处使用的虚拟化方式不影响调用方式。
|
||||
|
||||
|
||||
使用vgpu时需要添加的挂载:
|
||||
```bash
|
||||
/var/lib/kubelet/device-plugins:/var/lib/kubelet/device-plugins
|
||||
/etc/gpu-manager/vm:/etc/gpu-manager/vm
|
||||
/etc/gpu-manager/vdriver:/etc/gpu-manager/vdriver
|
||||
/var/run/docker.sock:/var/run/docker.sock
|
||||
/sys/fs/cgroup:/sys/fs/cgroup
|
||||
/usr:/usr
|
||||
```
|
||||
|
||||
|
||||
# 4、部署k8s监控组件
|
||||
daemonset monitoring/dcgm-exporter,会在机器上部署pod,用于监控gpu的使用率。
|
||||
|
Loading…
Reference in New Issue
Block a user