to tke vgpu

2025-02-11 14:34:22 +08:00 · 2022-12-06 10:47:50 +08:00 · 2022-12-06 10:47:50 +08:00 · 51b31448d6
commit 51b31448d6
parent 9643bd7bd8
2 changed files with 152 additions and 14 deletions
--- a/install/kubernetes/gpu/4paradigm-vgpu-nvidia-device-plugin.yml
+++ b/install/kubernetes/gpu/4paradigm-vgpu-nvidia-device-plugin.yml
@ -1,16 +1,3 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

 apiVersion: apps/v1
 kind: DaemonSet
@ -59,7 +46,7 @@ spec:
        # - image: m7-ieg-pico-test01:5000/k8s-device-plugin-test:v0.9.0-ubuntu20.04
        imagePullPolicy: Always
        name: nvidia-device-plugin-ctr
-        args: ["--fail-on-init-error=false", "--device-split-count=3", "--device-memory-scaling=3", "--device-cores-scaling=3"]
+        args: ["--fail-on-init-error=false", "--device-split-count=4", "--device-memory-scaling=4", "--device-cores-scaling=4"]
        env:
        - name: PCIBUSFILE
          value: "/usr/local/vgpu/pciinfo.vgpu"
--- a/install/kubernetes/gpu/tke-gpu-manager.yaml
+++ b/install/kubernetes/gpu/tke-gpu-manager.yaml
@ -0,0 +1,151 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpu-manager
+  namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gpu-manager-role
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+subjects:
+- kind: ServiceAccount
+  name: gpu-manager
+  namespace: kube-system
+
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: gpu-manager-daemonset
+  namespace: kube-system
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      name: gpu-manager-ds
+  template:
+    metadata:
+      # This annotation is deprecated. Kept here for backward compatibility
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: gpu-manager-ds
+    spec:
+      serviceAccount: gpu-manager
+      tolerations:
+        # This toleration is deprecated. Kept here for backward compatibility
+        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - key: tencent.com/vcuda-core
+          operator: Exists
+          effect: NoSchedule
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      # Only run on nodes that have a GPU device.
+      nodeSelector:
+        vgpu: 'true'
+      hostPID: true
+      containers:
+        - image: tkestack/gpu-manager:1.0.3
+          imagePullPolicy: Always
+          name: gpu-manager
+          securityContext:
+            privileged: true
+          ports:
+            - containerPort: 5678
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+            - name: vdriver
+              mountPath: /etc/gpu-manager/vdriver
+            - name: vmdata
+              mountPath: /etc/gpu-manager/vm
+            - name: log
+              mountPath: /var/log/gpu-manager
+            - name: checkpoint
+              mountPath: /etc/gpu-manager/checkpoint
+            - name: run-dir
+              mountPath: /var/run
+            - name: cgroup
+              mountPath: /sys/fs/cgroup
+              readOnly: true
+            - name: usr-directory
+              mountPath: /usr/local/host
+              readOnly: true
+          env:
+            - name: LOG_LEVEL
+              value: "4"
+            - name: EXTRA_FLAGS
+              value: "--logtostderr=false"
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+      volumes:
+        - name: device-plugin
+          hostPath:
+            type: Directory
+            path: /var/lib/kubelet/device-plugins
+        - name: vmdata
+          hostPath:
+            type: DirectoryOrCreate
+            path: /etc/gpu-manager/vm
+        - name: vdriver
+          hostPath:
+            type: DirectoryOrCreate
+            path: /etc/gpu-manager/vdriver
+        - name: log
+          hostPath:
+            type: DirectoryOrCreate
+            path: /etc/gpu-manager/log
+        - name: checkpoint
+          hostPath:
+            type: DirectoryOrCreate
+            path: /etc/gpu-manager/checkpoint
+        # We have to mount the whole /var/run directory into the container, because a bind mount of
+        # docker.sock alone breaks when its inode changes after the host Docker daemon is restarted.
+        - name: run-dir
+          hostPath:
+            type: Directory
+            path: /var/run
+        - name: cgroup
+          hostPath:
+            type: Directory
+            path: /sys/fs/cgroup
+        # We have to mount the whole /usr directory instead of a specific library path, because
+        # that path may not exist on every distro.
+        - name: usr-directory
+          hostPath:
+            type: Directory
+            path: /usr
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-manager-metric
+  namespace: kube-system
+  annotations:
+    prometheus.io/scrape: "true"
+  labels:
+    kubernetes.io/cluster-service: "true"
+spec:
+  clusterIP: None
+  ports:
+    - name: metrics
+      port: 5678
+      protocol: TCP
+      targetPort: 5678
+  selector:
+    name: gpu-manager-ds