Mirror of https://github.com/tencentmusic/cube-studio.git (synced 2024-11-27 05:33:10 +08:00)

Add Spark serverless support (添加spark serverless支持)

commit efee7b76a5, parent b37be5a4da
@@ -660,6 +660,13 @@ CRD_INFO={
        'kind': 'Job',
        "plural": "jobs",
        "timeout": 60 * 60 * 24 * 2
    },
    "sparkjob": {
        "group": "sparkoperator.k8s.io",
        "version": "v1beta2",
        'kind': 'SparkApplication',
        "plural": "sparkapplications",
        "timeout": 60 * 60 * 24 * 2
    }
}
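The new `sparkjob` entry follows the same shape as the other CRD_INFO records, so the platform's generic CRD helpers (and the launcher added below) can address SparkApplication objects purely by group/version/plural. As a minimal sketch of the kind of call this enables with the official kubernetes Python client — the namespace and run-id label are taken from the demo manifest and are illustrative only, not part of this commit:

```python
# Minimal sketch: address SparkApplication objects via the CRD coordinates above.
# Assumes kubeconfig (or in-cluster) access; namespace and label are illustrative.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside the cluster
api = client.CustomObjectsApi()

group, version, plural = "sparkoperator.k8s.io", "v1beta2", "sparkapplications"

# List the SparkApplications created for one pipeline run (selected by label).
apps = api.list_namespaced_custom_object(
    group, version, "pipeline", plural, label_selector="run-id=run-2-7"
)
for item in apps.get("items", []):
    name = item["metadata"]["name"]
    # Delete each matching SparkApplication, as the platform does before a relaunch.
    api.delete_namespaced_custom_object(group, version, "pipeline", plural, name)
```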
@@ -4,8 +4,8 @@ set -ex

rm -rf /home/myapp/myapp/static/assets
ln -s /home/myapp/myapp/assets /home/myapp/myapp/static/
rm -rf /home/myapp/myapp/static/appbuilder/mnt
ln -s /data/k8s/kubeflow/global/static /home/myapp/myapp/static/appbuilder/mnt
rm -rf /home/myapp/myapp/static/mnt
ln -s /data/k8s/kubeflow/pipeline/workspace /home/myapp/myapp/static/mnt

export FLASK_APP=myapp:app
python myapp/create_db.py
@@ -4,12 +4,12 @@ metadata:
  name: spark-pi
  namespace: pipeline
spec:
  type: Scala
  type: Scala    # Scala Python
  mode: cluster
  image: "ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1"
  image: "ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1"    # "gcr.io/spark-operator/spark-py:v3.1.1" #
  imagePullPolicy: Always
  mainClass: org.apache.spark.examples.SparkPi
  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar"
  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar"    # local:///opt/spark/examples/src/main/python/pi.py # "local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar"
  sparkVersion: "3.1.1"
  restartPolicy:
    type: Never
@@ -4517,7 +4517,7 @@ spec:
        {}
      containers:
      - name: spark-operator
        image: ccr.ccs.tencentyun.com/cube-studio/spark-operator:v1beta2-1.3.7-3.1.1
        image: gcr.io/spark-operator/spark-operator:v1beta2-1.3.0-3.1.1
        imagePullPolicy: IfNotPresent
        securityContext:
          {}
@@ -1 +1,2 @@
helm template --set sparkJobNamespace=pipeline spark-operator spark-operator/spark-operator > install.yaml
@@ -1,9 +1,5 @@
FROM ubuntu:18.04

## Switch to internal mirror sources
#COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list
#COPY job/pkgs/config/pip.conf /root/.pip/pip.conf

# Install ops tools
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt install -y --no-install-recommends vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl locales zip tzdata
@@ -1,8 +1,5 @@
FROM ubuntu:18.04

#https://docker-76009.sz.gfp.tencent-cloud.com/tencent/ubuntu-sources.list
#COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list

# Install ops tools
RUN apt-get update && apt install -y --force-yes --no-install-recommends vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl locales zip
# Install Python
@@ -1,5 +1,4 @@
FROM ubuntu:18.04
COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list
RUN apt-get update && apt-get -y install gcc g++ libjpeg-dev zlib1g-dev cmake

# Install ops tools
@@ -27,8 +26,6 @@ RUN echo "alias ll='ls -alF'" >> /root/.bashrc && \

RUN pip install kubernetes==12.0.1 pysnooper psutil
COPY job/pytorch_distributed_train_k8s/* /app/
#COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list
COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
COPY job/pkgs /app/job/pkgs
WORKDIR /app
ENV PYTHONPATH=/app:$PYTHONPATH
@@ -2,8 +2,6 @@
FROM ccr.ccs.tencentyun.com/cube-studio/ray:nightly
USER root

COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list

# Install debugging tools
RUN apt update && apt install -y --force-yes --no-install-recommends vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl mysql-client locales zip software-properties-common
@@ -2,9 +2,6 @@ FROM ccr.ccs.tencentyun.com/cube-studio/ray:gpu

USER root

COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list


# Install debugging tools
RUN apt update && apt install -y --force-yes --no-install-recommends vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl mysql-client locales zip software-properties-common
@@ -2,8 +2,6 @@
FROM ccr.ccs.tencentyun.com/cube-studio/ray:gpu
USER root

#COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
#COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list

# Install debugging tools
RUN apt update && apt install -y --force-yes --no-install-recommends vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl mysql-client locales zip software-properties-common
job-template/job/spark/Dockerfile (new file, 35 lines)
@@ -0,0 +1,35 @@
FROM ubuntu:20.04

RUN apt-get update ; apt-get -y --fix-missing install gcc g++ libjpeg-dev zlib1g-dev cmake

# Install ops tools
RUN apt install -y --no-install-recommends --fix-missing software-properties-common vim apt-transport-https gnupg2 ca-certificates-java rsync jq wget git dnsutils iputils-ping net-tools curl locales zip unzip

RUN add-apt-repository -y ppa:deadsnakes/ppa && apt update && apt install -y libsasl2-dev libpq-dev python3-pip python3-distutils
RUN set -x; rm -rf /usr/bin/python; apt install -y --fix-missing python3.8 python3.8-dev && ln -s /usr/bin/python3.8 /usr/bin/python

RUN bash -c "wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py --ignore-installed" \
    && rm -rf /usr/bin/pip && ln -s /usr/bin/pip3 /usr/bin/pip

# Install Chinese locale and fonts
RUN apt install -y --force-yes --no-install-recommends locales ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy && locale-gen zh_CN && locale-gen zh_CN.utf8
ENV LANG zh_CN.UTF-8
ENV LC_ALL zh_CN.UTF-8
ENV LANGUAGE zh_CN.UTF-8

# Convenience aliases
RUN echo "alias ll='ls -alF'" >> /root/.bashrc && \
    echo "alias la='ls -A'" >> /root/.bashrc && \
    echo "alias vi='vim'" >> /root/.bashrc && \
    /bin/bash -c "source /root/.bashrc"

RUN pip install kubernetes pysnooper psutil

COPY job/spark/* /app/
COPY job/pkgs /app/job/pkgs
WORKDIR /app
ENV PYTHONPATH=/app:$PYTHONPATH

ENTRYPOINT ["python3", "launcher.py"]
job-template/job/spark/README.md (new file, 126 lines)
@@ -0,0 +1,126 @@
Image: ccr.ccs.tencentyun.com/cube-studio/spark:20221010

Account: kubeflow-pipeline

Parameters
```json
{
    "shell": {
        "--image": {
            "type": "str",
            "item_type": "str",
            "label": "Image to run",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1",
            "placeholder": "",
            "describe": "Image to run",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--num_worker": {
            "type": "str",
            "item_type": "str",
            "label": "Number of executors",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "3",
            "placeholder": "",
            "describe": "Number of executors",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--code_type": {
            "type": "str",
            "item_type": "str",
            "label": "Language type",
            "require": 1,
            "choice": [
                "Java",
                "Python",
                "Scala",
                "R"
            ],
            "range": "",
            "default": "Python",
            "placeholder": "",
            "describe": "Language type",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--code_class": {
            "type": "str",
            "item_type": "str",
            "label": "Java/Scala class name",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "",
            "placeholder": "",
            "describe": "Java/Scala class name",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--code_file": {
            "type": "str",
            "item_type": "str",
            "label": "Code file location",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "local:///opt/spark/examples/src/main/python/pi.py",
            "placeholder": "",
            "describe": "Code file location; supports local://, http://, hdfs://, s3a://, gcs://",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--code_arguments": {
            "type": "str",
            "item_type": "str",
            "label": "Code arguments",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "",
            "placeholder": "",
            "describe": "Code arguments",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--sparkConf": {
            "type": "text",
            "item_type": "str",
            "label": "Spark configuration",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "",
            "placeholder": "",
            "describe": "Spark configuration, one entry per line, xx=yy",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        },
        "--hadoopConf": {
            "type": "text",
            "item_type": "str",
            "label": "Hadoop configuration, one entry per line, xx=yy",
            "require": 1,
            "choice": [],
            "range": "",
            "default": "",
            "placeholder": "",
            "describe": "Hadoop configuration",
            "editable": 1,
            "condition": "",
            "sub_args": {}
        }
    }
}
```
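The `--sparkConf` and `--hadoopConf` fields expect one `key=value` pair per line, which the launcher turns into the `sparkConf`/`hadoopConf` dicts of the SparkApplication spec. A minimal sketch of that per-line parsing (the spark.* keys below are examples, not defaults shipped with the template):

```python
# Minimal sketch: turn a one-pair-per-line "xx=yy" text field into a dict,
# following the same convention the launcher applies.
conf_text = """spark.executor.memoryOverhead=1g
spark.sql.shuffle.partitions=200"""

spark_conf = {
    line.split("=", 1)[0].strip(): line.split("=", 1)[1].strip()
    for line in conf_text.splitlines()
    if "=" in line
}
print(spark_conf)
# {'spark.executor.memoryOverhead': '1g', 'spark.sql.shuffle.partitions': '200'}
```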
job-template/job/spark/build.sh (new file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash

set -ex

docker build -t ccr.ccs.tencentyun.com/cube-studio/spark:20221010 -f job/spark/Dockerfile .
docker push ccr.ccs.tencentyun.com/cube-studio/spark:20221010
job-template/job/spark/demo.yaml (new file, 104 lines)
@@ -0,0 +1,104 @@
# If you find this useful, please bookmark or share this site
apiVersion: sparkoperator.k8s.io/v1beta2
kind: SparkApplication
metadata:
  namespace: pipeline
  name: sparkjob-pipeline1-2d4c
  labels:
    run-id: run-2-7
    run-rtx: admin
    pipeline-rtx: admin
    pipeline-id: '2'
    pipeline-name: pipeline1
    task-id: '7'
    task-name: spark-1658152032310
spec:
  type: Python
  mode: cluster
  proxyUser: admin
  image: 'ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1'
  imagePullPolicy: Always
  mainClass: ''
  mainApplicationFile: 'local:///opt/spark/examples/src/main/python/pi.py'
  arguments: []
  sparkConf: {}
  hadoopConf: {}
  nodeSelector:
    org: public
    cpu: 'true'
    train: 'true'
  sparkVersion: 3.1.1
  pythonVersion: '3'
  batchScheduler: kube-batch
  restartPolicy:
    type: Never
  timeToLiveSeconds: 172800
  volumes:
    - name: kubeflow-user-workspace
      persistentVolumeClaim:
        claimName: kubeflow-user-workspace
    - name: kubeflow-archives
      persistentVolumeClaim:
        claimName: kubeflow-archives
    - name: tz-config
      hostPath:
        path: /usr/share/zoneinfo/Asia/Shanghai
    - name: spark-local-dir-1
      hostPath:
        path: /data/k8s/kubeflow/pipeline/workspace/admin

  driver:
    cores: 2
    coreLimit: '2'
    memory: 2G
    # memoryLimit: 2G
    labels:
      run-id: run-2-7
      run-rtx: admin
      pipeline-rtx: admin
      pipeline-id: '2'
      pipeline-name: pipeline1
      task-id: '7'
      task-name: spark-1658152032310
    serviceAccount: spark
    volumeMounts:
      - name: kubeflow-user-workspace
        mountPath: /mnt/admin
        subPath: admin
      - name: kubeflow-archives
        mountPath: /archives/admin
        subPath: admin
      - name: tz-config
        mountPath: /etc/localtime
  executor:
    instances: 3
    cores: 2
    coreLimit: '2'
    memory: 2G
    # memoryLimit: 2G
    labels:
      run-id: run-2-7
      run-rtx: admin
      pipeline-rtx: admin
      pipeline-id: '2'
      pipeline-name: pipeline1
      task-id: '7'
      task-name: spark-1658152032310
    volumeMounts:
      - name: kubeflow-user-workspace
        mountPath: /mnt/admin
        subPath: admin
      - name: kubeflow-archives
        mountPath: /archives/admin
        subPath: admin
      - name: tz-config
        mountPath: /etc/localtime
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 5
            podAffinityTerm:
              topologyKey: kubernetes.io/hostname
              labelSelector:
                matchLabels:
                  task-name: spark-1658152032310
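demo.yaml is a complete SparkApplication manifest, so it can be applied with kubectl or submitted programmatically. A minimal sketch using the official kubernetes Python client — the kubeconfig handling and file path are assumptions for illustration, not part of the commit:

```python
# Minimal sketch: submit the demo SparkApplication from its YAML manifest.
# Assumes the spark-operator is installed and the target namespace exists.
import yaml
from kubernetes import client, config

config.load_kube_config()

with open("job-template/job/spark/demo.yaml") as f:  # path as laid out in this commit
    manifest = yaml.safe_load(f)

client.CustomObjectsApi().create_namespaced_custom_object(
    group="sparkoperator.k8s.io",
    version="v1beta2",
    namespace=manifest["metadata"]["namespace"],
    plural="sparkapplications",
    body=manifest,
)
```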
job-template/job/spark/demo/pi.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import sys
from random import random
from operator import add

from pyspark.sql import SparkSession


if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """
    spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x ** 2 + y ** 2 <= 1 else 0

    count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))

    spark.stop()
job-template/job/spark/launcher.py (new file, 212 lines)
@@ -0,0 +1,212 @@
import os, sys
base_dir = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(base_dir)

import argparse
import datetime
import json
import time
import uuid
import os
import pysnooper
import os, sys
import re
import threading
import psutil
import copy

from kubernetes import client

# print(os.environ)
from job.pkgs.k8s.py_k8s import K8s
k8s_client = K8s()

KFJ_NAMESPACE = os.getenv('KFJ_NAMESPACE', '')
KFJ_TASK_ID = os.getenv('KFJ_TASK_ID', '')
KFJ_TASK_NAME = os.getenv('KFJ_TASK_NAME', '')
task_node_selectors = re.split(',|;|\n|\t', os.getenv('KFJ_TASK_NODE_SELECTOR', 'cpu=true,train=true'))
KFJ_TASK_NODE_SELECTOR = {}
for task_node_selector in task_node_selectors:
    KFJ_TASK_NODE_SELECTOR[task_node_selector.split('=')[0]] = task_node_selector.split('=')[1]

KFJ_PIPELINE_ID = os.getenv('KFJ_PIPELINE_ID', '')
KFJ_RUN_ID = os.getenv('KFJ_RUN_ID', '')
KFJ_CREATOR = os.getenv('KFJ_CREATOR', '')
KFJ_RUNNER = os.getenv('KFJ_RUNNER', '')
KFJ_PIPELINE_NAME = os.getenv('KFJ_PIPELINE_NAME', '')
KFJ_TASK_IMAGES = os.getenv('KFJ_TASK_IMAGES', '')
KFJ_TASK_VOLUME_MOUNT = os.getenv('KFJ_TASK_VOLUME_MOUNT', '')
KFJ_TASK_TIMEOUT = int(os.getenv('KFJ_TASK_TIMEOUT', 60 * 60 * 24 * 2))
KFJ_TASK_RESOURCE_CPU = os.getenv('KFJ_TASK_RESOURCE_CPU', '')
KFJ_TASK_RESOURCE_MEMORY = os.getenv('KFJ_TASK_RESOURCE_MEMORY', '')

INIT_FILE = ''
crd_info = {
    "group": "sparkoperator.k8s.io",
    "version": "v1beta2",
    'kind': 'SparkApplication',
    "plural": "sparkapplications",
    "timeout": 60 * 60 * 24 * 2
}

k8s_volumes, k8s_volume_mounts = k8s_client.get_volume_mounts(KFJ_TASK_VOLUME_MOUNT, KFJ_CREATOR)

print(k8s_volumes)
print(k8s_volume_mounts)

GPU_TYPE = os.getenv('KFJ_GPU_TYPE', 'NVIDIA')
GPU_RESOURCE = os.getenv('KFJ_TASK_RESOURCE_GPU', '0')
print(GPU_TYPE, GPU_RESOURCE)


def default_job_name():
    name = "sparkjob-" + KFJ_PIPELINE_NAME.replace('_', '-') + "-" + uuid.uuid4().hex[:4]
    return name[0:54]


# @pysnooper.snoop()
def make_sparkjob(name, **kwargs):
    label = {
        "run-id": KFJ_RUN_ID,
        "run-rtx": KFJ_RUNNER,
        "pipeline-rtx": KFJ_CREATOR,
        "pipeline-id": KFJ_PIPELINE_ID,
        "pipeline-name": KFJ_PIPELINE_NAME,
        "task-id": KFJ_TASK_ID,
        "task-name": KFJ_TASK_NAME,
    }

    spark_deploy = {
        "apiVersion": "sparkoperator.k8s.io/v1beta2",
        "kind": "SparkApplication",
        "metadata": {
            "namespace": KFJ_NAMESPACE,
            "name": name,
            "labels": label
        },
        "spec": {
            "type": kwargs['code_type'],   # Java Python R Scala
            "mode": "cluster",             # client cluster in-cluster-client
            "proxyUser": KFJ_CREATOR,
            "image": kwargs['image'],
            "imagePullPolicy": "Always",
            "mainClass": kwargs['code_class'],           # Java/Scala
            "mainApplicationFile": kwargs['code_file'],  # JAR, Python, or R file
            "arguments": kwargs['code_arguments'],
            "sparkConf": kwargs['sparkConf'],
            "hadoopConf": kwargs['hadoopConf'],
            "nodeSelector": KFJ_TASK_NODE_SELECTOR,
            "sparkVersion": "3.1.1",
            "pythonVersion": "3",
            "batchScheduler": "kube-batch",
            "restartPolicy": {
                "type": "Never"
            },
            "timeToLiveSeconds": KFJ_TASK_TIMEOUT,
            "volumes": k8s_volumes,
            "driver": {
                "cores": int(KFJ_TASK_RESOURCE_CPU),
                "coreLimit": str(KFJ_TASK_RESOURCE_CPU),
                "memory": KFJ_TASK_RESOURCE_MEMORY,
                # "memoryLimit": KFJ_TASK_RESOURCE_MEMORY,
                "labels": label,
                "serviceAccount": "spark",
                "volumeMounts": k8s_volume_mounts
            },
            "executor": {
                "instances": int(kwargs['num_worker']),
                "cores": int(KFJ_TASK_RESOURCE_CPU),
                "coreLimit": str(KFJ_TASK_RESOURCE_CPU),
                "memory": KFJ_TASK_RESOURCE_MEMORY,
                # "memoryLimit": KFJ_TASK_RESOURCE_MEMORY,
                "labels": label,
                "volumeMounts": k8s_volume_mounts,
                "affinity": {
                    "podAntiAffinity": {
                        "preferredDuringSchedulingIgnoredDuringExecution": [
                            {
                                "weight": 5,
                                "podAffinityTerm": {
                                    "topologyKey": "kubernetes.io/hostname",
                                    "labelSelector": {
                                        "matchLabels": {
                                            "task-name": KFJ_TASK_NAME,
                                        }
                                    }
                                }
                            }
                        ]
                    }
                }
            }
        }
    }
    print(spark_deploy)
    return spark_deploy


# @pysnooper.snoop()
def launch_sparkjob(name, **kwargs):
    if KFJ_RUN_ID:
        print('delete old spark, run-id %s' % KFJ_RUN_ID, flush=True)
        k8s_client.delete_crd(group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'], namespace=KFJ_NAMESPACE, labels={"run-id": KFJ_RUN_ID})
        time.sleep(10)
    # Delete any existing SparkApplication with the same name
    k8s_client.delete_crd(group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'], namespace=KFJ_NAMESPACE, name=name)
    time.sleep(10)
    # Create the new SparkApplication
    sparkjob_json = make_sparkjob(name=name, **kwargs)
    print('create new spark %s' % name, flush=True)
    k8s_client.create_crd(group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'], namespace=KFJ_NAMESPACE, body=sparkjob_json)
    time.sleep(10)
    while True:
        time.sleep(10)
        sparkjob = k8s_client.get_one_crd(group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'], namespace=KFJ_NAMESPACE, name=name)
        if sparkjob and (sparkjob['status'] == "Succeeded" or sparkjob['status'] == "Failed"):
            break

    sparkjob = k8s_client.get_one_crd(group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'], namespace=KFJ_NAMESPACE, name=name)
    print("sparkjob %s finished, status %s" % (name, sparkjob['status']))

    if sparkjob['status'] != 'Succeeded':
        exit(1)


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser("sparkjob launcher")
    arg_parser.add_argument('--image', type=str, help="image to run the job in", default='ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1')
    arg_parser.add_argument('--num_worker', type=int, help="number of executors", default=3)
    arg_parser.add_argument('--code_type', type=str, help="language type", default='')  # Java Python R Scala
    arg_parser.add_argument('--code_class', type=str, help="Java/Scala main class", default='')
    arg_parser.add_argument('--code_file', type=str, help="code file location", default='')  # local://,http://,hdfs://,s3a://,gcs://
    arg_parser.add_argument('--code_arguments', type=str, help="code arguments", default='')
    arg_parser.add_argument('--sparkConf', type=str, help="spark configuration", default='')
    arg_parser.add_argument('--hadoopConf', type=str, help="hadoop configuration", default='')
    # arg_parser.add_argument('--driver_memory', type=str, help="driver memory", default='2g')
    # arg_parser.add_argument('--executor_memory', type=str, help="executor memory", default='2g')
    # arg_parser.add_argument('--driver_cpu', type=str, help="driver cpu", default='2')
    # arg_parser.add_argument('--executor_cpu', type=str, help="executor cpu", default='2')

    args = arg_parser.parse_args().__dict__

    print("{} args: {}".format(__file__, args))

    # Parse the one-pair-per-line "xx=yy" text of --sparkConf / --hadoopConf into dicts
    sparkConf = [[x.split('=')[0], x.split('=')[1]] for x in args['sparkConf'].split('\n') if '=' in x]
    args['sparkConf'] = dict(zip([x[0] for x in sparkConf], [x[1] for x in sparkConf]))
    args['sparkConf']['spark.driver.bindAddress'] = '0.0.0.0'

    hadoopConf = [[x.split('=')[0], x.split('=')[1]] for x in args['hadoopConf'].split('\n') if '=' in x]
    args['hadoopConf'] = dict(zip([x[0] for x in hadoopConf], [x[1] for x in hadoopConf]))
    args['code_arguments'] = [x.strip() for x in args['code_arguments'].split(' ') if x.strip()]

    launch_sparkjob(name=default_job_name(), **args)
@@ -19,8 +19,6 @@ ENV RAY_CLIENT_SERVER_MAX_THREADS=1000
USER root
COPY job/video-audio /app
COPY job/pkgs /app/job/pkgs
# COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
# COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list

WORKDIR /app
ENTRYPOINT ["python", "start_download.py"]
@@ -25,8 +25,6 @@ RUN echo "alias ll='ls -alF'" >> /root/.bashrc && \

RUN pip install kubernetes==12.0.1 pysnooper psutil
COPY job/volcano/* /app/
#COPY job/pkgs/config/ubuntu-sources.list /etc/apt/sources.list
#COPY job/pkgs/config/pip.conf /root/.pip/pip.conf
COPY job/pkgs /app/job/pkgs
WORKDIR /app
ENV PYTHONPATH=/app:$PYTHONPATH
@@ -264,6 +264,142 @@
                }
            }
        },
        "sparkjob": {
            "project_name": "Data processing",
            "image_name": "ccr.ccs.tencentyun.com/cube-studio/spark:20221010",
            "image_describe": "Spark serverless distributed task",
            "job_template_name": "sparkjob",
            "job_template_describe": "Spark serverless distributed task",
            "job_template_command": "",
            "job_template_volume": "",
            "job_template_account": "kubeflow-pipeline",
            "job_template_expand": {
                "index": 3,
                "help_url": "https://github.com/tencentmusic/cube-studio/tree/master/job-template/job/spark"
            },
            "job_template_env": "NO_RESOURCE_CHECK=true\nTASK_RESOURCE_CPU=2\nTASK_RESOURCE_MEMORY=4G\nTASK_RESOURCE_GPU=0",
            "job_template_args": {
                "Parameters": {
                    "--image": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Image to run",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "ccr.ccs.tencentyun.com/cube-studio/spark-operator:spark-v3.1.1",
                        "placeholder": "",
                        "describe": "Image to run",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--num_worker": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Number of executors",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "3",
                        "placeholder": "",
                        "describe": "Number of executors",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--code_type": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Language type",
                        "require": 1,
                        "choice": [
                            "Java",
                            "Python",
                            "Scala",
                            "R"
                        ],
                        "range": "",
                        "default": "Python",
                        "placeholder": "",
                        "describe": "Language type",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--code_class": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Java/Scala class name",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "",
                        "placeholder": "",
                        "describe": "Java/Scala class name; leave empty for other languages",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--code_file": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Code file location",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "local:///opt/spark/examples/src/main/python/pi.py",
                        "placeholder": "",
                        "describe": "Code file location; supports local://, http://, hdfs://, s3a://, gcs://",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--code_arguments": {
                        "type": "str",
                        "item_type": "str",
                        "label": "Code arguments",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "",
                        "placeholder": "",
                        "describe": "Code arguments",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--sparkConf": {
                        "type": "text",
                        "item_type": "str",
                        "label": "Spark configuration",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "",
                        "placeholder": "",
                        "describe": "Spark configuration, one entry per line, xx=yy",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    },
                    "--hadoopConf": {
                        "type": "text",
                        "item_type": "str",
                        "label": "Hadoop configuration, one entry per line, xx=yy",
                        "require": 1,
                        "choice": [],
                        "range": "",
                        "default": "",
                        "placeholder": "",
                        "describe": "Hadoop configuration",
                        "editable": 1,
                        "condition": "",
                        "sub_args": {}
                    }
                }
            }
        },
        "ray-sklearn": {
            "project_name": "Machine learning",
            "image_name": "ccr.ccs.tencentyun.com/cube-studio/sklearn_estimator:v1",
@@ -478,6 +478,15 @@ class K8s():
        except Exception as e:
            print(e)

        # Delete sparkjob
        try:
            crd_info = all_crd_info['sparkjob']
            crd_names = self.delete_crd(
                group=crd_info['group'], version=crd_info['version'], plural=crd_info['plural'],
                namespace=namespace, labels={'run-id': run_id}
            )
        except Exception as e:
            print(e)