mirror of
https://github.com/tencentmusic/cube-studio.git
synced 2024-11-21 01:16:33 +08:00
Jupyter notebook image supports Apache Spark and comes with examples
This commit is contained in:
parent 448531cbc2
commit b5320985c7
@@ -4,9 +4,11 @@ base_image=ubuntu:18.04
docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
docker push $hubhost/notebook:jupyter-ubuntu-cpu-base

# Build the hadoop image
docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-hadoop -f hadoop/Dockerfile-ubuntu-hadoop .
docker push $hubhost/notebook:jupyter-ubuntu-cpu-hadoop


base_image=nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
docker build -t $hubhost/notebook:jupyter-ubuntu-gpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
docker push $hubhost/notebook:jupyter-ubuntu-gpu-base
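Note: $hubhost is defined elsewhere in this build script, not in the hunk above. Judging from the FROM line of the new hadoop Dockerfile below, it plausibly resolves to the cube-studio registry namespace; a minimal sketch of that assumption:

# Assumed value of $hubhost, inferred from the hadoop Dockerfile's FROM image
# below (not confirmed by this hunk):
hubhost=ccr.ccs.tencentyun.com/cube-studio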
28 images/jupyter-notebook/hadoop/Dockerfile-ubuntu-hadoop Normal file
@@ -0,0 +1,28 @@
FROM ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base

MAINTAINER hamawhite

COPY hadoop/run-jupyter.sh /root/run-jupyter.sh
# Copy the examples
COPY hadoop/examples/spark/* /examples/

RUN apt install -y lsof

# Update the python3 symlink
RUN cd /usr/bin \
    && rm -rf python3 \
    && ln -s python3.8* python3

# Download the Apache Spark distribution
RUN mkdir -p /opt/third/hadoop/etc/hadoop \
    && cd /opt/third \
    && wget http://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz \
    && tar -xvzf spark-3.1.3-bin-hadoop3.2.tgz \
    && ln -s spark-3.1.3-bin-hadoop3.2 spark \
    && rm -rf spark-3.1.3-bin-hadoop3.2.tgz

# Create spark-defaults.conf
RUN cd /opt/third/spark/conf \
    && mv spark-defaults.conf.template spark-defaults.conf

ENTRYPOINT ["bash","/root/run-jupyter.sh"]
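A minimal sketch of building and starting this image, assuming the cpu-base image from the hunk above is already available; the tag, port mappings, and host IP are illustrative, chosen to match the ports and the HOST_IP argument that run-jupyter.sh (below) writes into spark-defaults.conf:

# Build the hadoop notebook image (run from images/jupyter-notebook/):
docker build -t notebook:jupyter-ubuntu-cpu-hadoop -f hadoop/Dockerfile-ubuntu-hadoop .
# Start it: the trailing argument becomes HOST_IP in run-jupyter.sh, and the
# published ports match Jupyter's 3000 plus spark.driver.port/spark.blockManager.port.
docker run -d -p 3000:3000 -p 32788:32788 -p 32789:32789 \
    notebook:jupyter-ubuntu-cpu-hadoop 192.168.1.10   # illustrative host-reachable IP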
55 images/jupyter-notebook/hadoop/examples/spark/pyspark_local.ipynb Executable file
@@ -0,0 +1,55 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import random\n",
    "from operator import add\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    spark = SparkSession\\\n",
    "        .builder\\\n",
    "        .appName(\"PythonPi-Local\")\\\n",
    "        .master(\"local\")\\\n",
    "        .getOrCreate()\n",
    "\n",
    "    n = 100000 * 2\n",
    "\n",
    "    def f(_):\n",
    "        x = random() * 2 - 1\n",
    "        y = random() * 2 - 1\n",
    "        return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
    "\n",
    "    count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
    "    print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
    "\n",
    "    spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
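A quick way to smoke-test this bundled example headlessly, assuming nbconvert ships with the Jupyter install in the base image (a sketch, not part of the commit):

# Execute the local-mode example in place; exits non-zero on any cell error:
jupyter nbconvert --to notebook --execute --inplace /examples/pyspark_local.ipynb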
46 images/jupyter-notebook/hadoop/examples/spark/pyspark_local_hive.ipynb Executable file
@@ -0,0 +1,46 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    spark = SparkSession.builder \\\n",
    "        .appName('spark-hive-demo') \\\n",
    "        .config(\"hive.metastore.uris\", \"thrift://xxx.xxx.xxx.xxx:9083\") \\\n",
    "        .enableHiveSupport() \\\n",
    "        .getOrCreate()\n",
    "\n",
    "    spark.sql(\"create table if not exists demo(id bigint,name String)\")\n",
    "\n",
    "    spark.sql(\"insert overwrite demo values (1,'hamawhite'),(2,'song.bs')\")\n",
    "    spark.sql(\"select * from demo\").show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
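The thrift://xxx.xxx.xxx.xxx:9083 URI above is a placeholder and must be replaced with a real Hive metastore address. A sketch of a reachability check from inside the container, using bash's built-in /dev/tcp and keeping the host as a placeholder:

# Probe the metastore port before running the notebook (replace the placeholder host):
timeout 3 bash -c 'cat < /dev/null > /dev/tcp/xxx.xxx.xxx.xxx/9083' \
    && echo "metastore reachable" || echo "metastore unreachable"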
56 images/jupyter-notebook/hadoop/examples/spark/pyspark_yarn.ipynb Executable file
@@ -0,0 +1,56 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import random\n",
    "from operator import add\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    spark = SparkSession\\\n",
    "        .builder\\\n",
    "        .appName(\"PythonPi-Yarn-Client-Dockerfile\")\\\n",
    "        .master(\"yarn\")\\\n",
    "        .config(\"spark.submit.deployMode\", \"client\")\\\n",
    "        .getOrCreate()\n",
    "\n",
    "    n = 100000 * 2\n",
    "\n",
    "    def f(_):\n",
    "        x = random() * 2 - 1\n",
    "        y = random() * 2 - 1\n",
    "        return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
    "\n",
    "    count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
    "    print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
    "\n",
    "    spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
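The yarn client-mode example only works if the cluster's Hadoop configuration is present under the HADOOP_CONF_DIR that run-jupyter.sh (below) exports; the Dockerfile above creates /opt/third/hadoop/etc/hadoop but leaves it empty, so presumably the config is mounted in at run time. A sketch with illustrative paths:

# Mount the cluster's *-site.xml files into the directory HADOOP_CONF_DIR points at:
docker run -d -p 3000:3000 -p 32788:32788 -p 32789:32789 \
    -v /etc/hadoop/conf:/opt/third/hadoop/etc/hadoop:ro \
    notebook:jupyter-ubuntu-cpu-hadoop 192.168.1.10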
29 images/jupyter-notebook/hadoop/run-jupyter.sh Normal file
@@ -0,0 +1,29 @@
#!/bin/bash

HOST_IP=$1

# Environment variables for the Hadoop ecosystem are collected in the /opt/third/hadoop-env file.

# Set the Hadoop environment variables
echo "export HADOOP_CONF_DIR=/opt/third/hadoop/etc/hadoop" >> /opt/third/hadoop-env

SPARK_HOME="/opt/third/spark"

# Set the Spark environment variables
echo "export SPARK_HOME=${SPARK_HOME}" >> /opt/third/hadoop-env
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> /opt/third/hadoop-env
echo 'export PYTHONPATH=${SPARK_HOME}/python:$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH' >> /opt/third/hadoop-env


# Configure spark-defaults.conf
echo "spark.ui.enabled=false" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.port=32788" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.blockManager.port=32789" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.bindAddress=0.0.0.0" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.host=${HOST_IP}" >> ${SPARK_HOME}/conf/spark-defaults.conf


source /opt/third/hadoop-env

# Launch JupyterLab, serving the filesystem root on port 3000
jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' 2>&1
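The script sources /opt/third/hadoop-env before launching Jupyter, so notebooks inherit the Spark environment; a shell entered later via docker exec does not, and would need to source the file itself. A sketch using the Pi example that ships with the Spark 3.1.3 distribution:

# In a fresh shell inside the container:
source /opt/third/hadoop-env
# Run the bundled Pi example in local mode to verify the Spark install:
spark-submit --master 'local[2]' $SPARK_HOME/examples/src/main/python/pi.py 10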