Jupyter notebook image supports Apache Spark and ships with examples

This commit is contained in:
白松 2022-08-28 01:28:04 +08:00
parent 448531cbc2
commit b5320985c7
6 changed files with 220 additions and 4 deletions

View File

@@ -4,9 +4,10 @@ base_image=ubuntu:18.04
docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
docker push $hubhost/notebook:jupyter-ubuntu-cpu-base
# Build the hadoop image
docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-hadoop -f hadoop/Dockerfile-ubuntu-hadoop .
docker push $hubhost/notebook:jupyter-ubuntu-cpu-hadoop
base_image=nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
docker build -t $hubhost/notebook:jupyter-ubuntu-gpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
docker push $hubhost/notebook:jupyter-ubuntu-gpu-base

View File hadoop/Dockerfile-ubuntu-hadoop

@@ -0,0 +1,28 @@
FROM ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base
LABEL maintainer="hamawhite"
COPY hadoop/run-jupyter.sh /root/run-jupyter.sh
# Copy the examples
COPY hadoop/examples/spark/* /examples/
RUN apt-get update && apt-get install -y lsof
# Point the python3 symlink at python3.8
RUN cd /usr/bin \
&& rm -f python3 \
&& ln -sf python3.8 python3
# Download and unpack the Apache Spark distribution
RUN mkdir -p /opt/third/hadoop/etc/hadoop \
&& cd /opt/third \
&& wget http://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz \
&& tar -xvzf spark-3.1.3-bin-hadoop3.2.tgz \
&& ln -s spark-3.1.3-bin-hadoop3.2 spark \
&& rm -rf spark-3.1.3-bin-hadoop3.2.tgz
# Create spark-defaults.conf from the bundled template
RUN cd /opt/third/spark/conf \
&& mv spark-defaults.conf.template spark-defaults.conf
ENTRYPOINT ["bash","/root/run-jupyter.sh"]
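
Once this image is built, a quick sanity check from a notebook cell might look like the following sketch (an illustration, not part of this commit; it assumes run-jupyter.sh below has already sourced /opt/third/hadoop-env so that pyspark from /opt/third/spark/python is importable):

import os
from pyspark.sql import SparkSession

# /opt/third/spark is the symlink created in the Dockerfile above
assert os.path.isdir("/opt/third/spark/conf")

spark = SparkSession.builder.master("local[1]").appName("smoke-test").getOrCreate()
print(spark.version)  # expected: 3.1.3, the release downloaded above
spark.stop()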

View File

@@ -0,0 +1,55 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from random import random\n",
"from operator import add\n",
"from pyspark.sql import SparkSession\n",
"\n",
"if __name__ == \"__main__\":\n",
" spark = SparkSession\\\n",
" .builder\\\n",
" .appName(\"PythonPi-Local\")\\\n",
" .master(\"local\")\\\n",
" .getOrCreate()\n",
"\n",
" n = 100000 * 2\n",
"\n",
" def f(_):\n",
" x = random() * 2 - 1\n",
" y = random() * 2 - 1\n",
" return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
"\n",
" count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
" print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
"\n",
" spark.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"if __name__ == \"__main__\":\n",
" spark = SparkSession.builder \\\n",
" .appName('spark-hive-demo') \\\n",
" .config(\"hive.metastore.uris\", \"thrift://xxx.xxx.xxx.xxx:9083\") \\\n",
" .enableHiveSupport() \\\n",
" .getOrCreate()\n",
"\n",
" spark.sql(\"create table if not exists demo(id bigint,name String)\")\n",
"\n",
" spark.sql(\"insert overwrite demo values (1,'hamawhite'),(2,'song.bs')\")\n",
" spark.sql(\"select * from demo\").show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
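
As a follow-up sketch (not part of this commit), the same Hive-enabled session can read the table back through the DataFrame and catalog APIs; the metastore address stays elided exactly as in the notebook above:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("spark-hive-demo-readback") \
    .config("hive.metastore.uris", "thrift://xxx.xxx.xxx.xxx:9083") \
    .enableHiveSupport() \
    .getOrCreate()

spark.table("demo").show()  # the rows written by the notebook above
print([t.name for t in spark.catalog.listTables()])  # 'demo' should be listed
spark.stop()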

View File

@@ -0,0 +1,56 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from random import random\n",
"from operator import add\n",
"from pyspark.sql import SparkSession\n",
"\n",
"if __name__ == \"__main__\":\n",
" spark = SparkSession\\\n",
" .builder\\\n",
" .appName(\"PythonPi-Yarn-Client-Dockerfile\")\\\n",
" .master(\"yarn\")\\\n",
" .config(\"spark.submit.deployMode\", \"client\")\\\n",
" .getOrCreate()\n",
"\n",
" n = 100000 * 2\n",
"\n",
" def f(_):\n",
" x = random() * 2 - 1\n",
" y = random() * 2 - 1\n",
" return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
"\n",
" count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
" print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
"\n",
" spark.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File hadoop/run-jupyter.sh

@@ -0,0 +1,29 @@
#!/bin/bash
# The host IP is passed as the first argument and used below as spark.driver.host.
HOST_IP=$1
# Environment variables for the Hadoop ecosystem are collected in /opt/third/hadoop-env.
# Set the Hadoop environment variables
echo "export HADOOP_CONF_DIR=/opt/third/hadoop/etc/hadoop" >> /opt/third/hadoop-env
SPARK_HOME="/opt/third/spark"
# Set the Spark environment variables
echo "export SPARK_HOME=${SPARK_HOME}" >> /opt/third/hadoop-env
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> /opt/third/hadoop-env
echo 'export PYTHONPATH=${SPARK_HOME}/python:$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH' >> /opt/third/hadoop-env
# Configure spark-defaults.conf
echo "spark.ui.enabled=false" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.port=32788" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.blockManager.port=32789" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.bindAddress=0.0.0.0" >> ${SPARK_HOME}/conf/spark-defaults.conf
echo "spark.driver.host=${HOST_IP}" >>${SPARK_HOME}/conf/spark-defaults.conf
source /opt/third/hadoop-env
# Start JupyterLab, serving the filesystem root as the notebook directory
jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' 2>&1
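
Since pyspark started from a notebook kernel launches its JVM through ${SPARK_HOME}/bin/spark-submit, sessions created in this container pick up the defaults written above. A minimal check (an illustrative sketch, assuming this script has run so SPARK_HOME is set and hadoop-env has been sourced):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("conf-check").getOrCreate()
conf = spark.sparkContext.getConf()
print(conf.get("spark.driver.port"))  # 32788, fixed so remote executors can reach the driver
print(conf.get("spark.driver.host"))  # the HOST_IP argument passed to this script
print(conf.get("spark.ui.enabled"))   # false
spark.stop()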