From b5320985c7be1002507e6040dd709d6ca2ef6a42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=99=BD=E6=9D=BE?=
Date: Sun, 28 Aug 2022 01:28:04 +0800
Subject: [PATCH] Jupyter notebook image supports apache spark and comes with
 examples

---
 images/jupyter-notebook/build.sh              | 10 ++--
 .../hadoop/Dockerfile-ubuntu-hadoop           | 28 ++++++++++
 .../hadoop/examples/spark/pyspark_local.ipynb | 55 ++++++++++++++++++
 .../examples/spark/pyspark_local_hive.ipynb   | 46 +++++++++++++++
 .../hadoop/examples/spark/pyspark_yarn.ipynb  | 56 +++++++++++++++++++
 images/jupyter-notebook/hadoop/run-jupyter.sh | 29 ++++++++++
 6 files changed, 220 insertions(+), 4 deletions(-)
 create mode 100644 images/jupyter-notebook/hadoop/Dockerfile-ubuntu-hadoop
 create mode 100755 images/jupyter-notebook/hadoop/examples/spark/pyspark_local.ipynb
 create mode 100755 images/jupyter-notebook/hadoop/examples/spark/pyspark_local_hive.ipynb
 create mode 100755 images/jupyter-notebook/hadoop/examples/spark/pyspark_yarn.ipynb
 create mode 100644 images/jupyter-notebook/hadoop/run-jupyter.sh

diff --git a/images/jupyter-notebook/build.sh b/images/jupyter-notebook/build.sh
index dc2a5ac0..c5100063 100644
--- a/images/jupyter-notebook/build.sh
+++ b/images/jupyter-notebook/build.sh
@@ -4,9 +4,11 @@
 base_image=ubuntu:18.04
 docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
 docker push $hubhost/notebook:jupyter-ubuntu-cpu-base
+# Build the hadoop-enabled notebook image
+docker build -t $hubhost/notebook:jupyter-ubuntu-cpu-hadoop -f hadoop/Dockerfile-ubuntu-hadoop .
+docker push $hubhost/notebook:jupyter-ubuntu-cpu-hadoop
+
+
 base_image=nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
 docker build -t $hubhost/notebook:jupyter-ubuntu-gpu-base --build-arg FROM_IMAGES=$base_image -f Dockerfile-ubuntu-base .
-docker push $hubhost/notebook:jupyter-ubuntu-gpu-base
-
-
-
+docker push $hubhost/notebook:jupyter-ubuntu-gpu-base
\ No newline at end of file
diff --git a/images/jupyter-notebook/hadoop/Dockerfile-ubuntu-hadoop b/images/jupyter-notebook/hadoop/Dockerfile-ubuntu-hadoop
new file mode 100644
index 00000000..70db7a91
--- /dev/null
+++ b/images/jupyter-notebook/hadoop/Dockerfile-ubuntu-hadoop
@@ -0,0 +1,28 @@
+FROM ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base
+
+LABEL maintainer="hamawhite"
+
+COPY hadoop/run-jupyter.sh /root/run-jupyter.sh
+# Copy the example notebooks
+COPY hadoop/examples/spark/* /examples/
+
+RUN apt-get update && apt-get install -y --no-install-recommends lsof && rm -rf /var/lib/apt/lists/*
+
+# Point the python3 symlink at python3.8
+RUN cd /usr/bin \
+    && rm -rf python3 \
+    && ln -s python3.8* python3
+
+# Download the Apache Spark distribution
+RUN mkdir -p /opt/third/hadoop/etc/hadoop \
+    && cd /opt/third \
+    && wget https://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop3.2.tgz \
+    && tar -xvzf spark-3.1.3-bin-hadoop3.2.tgz \
+    && ln -s spark-3.1.3-bin-hadoop3.2 spark \
+    && rm -rf spark-3.1.3-bin-hadoop3.2.tgz
+
+# Create spark-defaults.conf from the bundled template
+RUN cd /opt/third/spark/conf \
+    && mv spark-defaults.conf.template spark-defaults.conf
+
+ENTRYPOINT ["bash","/root/run-jupyter.sh"]
\ No newline at end of file
diff --git a/images/jupyter-notebook/hadoop/examples/spark/pyspark_local.ipynb b/images/jupyter-notebook/hadoop/examples/spark/pyspark_local.ipynb
new file mode 100755
index 00000000..8985b001
--- /dev/null
+++ b/images/jupyter-notebook/hadoop/examples/spark/pyspark_local.ipynb
@@ -0,0 +1,55 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from random import random\n",
+    "from operator import add\n",
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    spark = SparkSession\\\n",
+    "        .builder\\\n",
+    "        .appName(\"PythonPi-Local\")\\\n",
+    "        .master(\"local\")\\\n",
+    "        .getOrCreate()\n",
+    "\n",
+    "    n = 100000 * 2\n",
+    "\n",
+    "    def f(_):\n",
+    "        x 
= random() * 2 - 1\n",
+    "        y = random() * 2 - 1\n",
+    "        return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
+    "\n",
+    "    count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
+    "    print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
+    "\n",
+    "    spark.stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3.0
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/images/jupyter-notebook/hadoop/examples/spark/pyspark_local_hive.ipynb b/images/jupyter-notebook/hadoop/examples/spark/pyspark_local_hive.ipynb
new file mode 100755
index 00000000..f085535b
--- /dev/null
+++ b/images/jupyter-notebook/hadoop/examples/spark/pyspark_local_hive.ipynb
@@ -0,0 +1,46 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    spark = SparkSession.builder \\\n",
+    "        .appName('spark-hive-demo') \\\n",
+    "        .config(\"hive.metastore.uris\", \"thrift://xxx.xxx.xxx.xxx:9083\") \\\n",
+    "        .enableHiveSupport() \\\n",
+    "        .getOrCreate()\n",
+    "\n",
+    "    spark.sql(\"create table if not exists demo(id bigint,name String)\")\n",
+    "\n",
+    "    spark.sql(\"insert overwrite demo values (1,'hamawhite'),(2,'song.bs')\")\n",
+    "    spark.sql(\"select * from demo\").show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3.0
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": 
"python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/images/jupyter-notebook/hadoop/examples/spark/pyspark_yarn.ipynb b/images/jupyter-notebook/hadoop/examples/spark/pyspark_yarn.ipynb
new file mode 100755
index 00000000..ee8fd9cd
--- /dev/null
+++ b/images/jupyter-notebook/hadoop/examples/spark/pyspark_yarn.ipynb
@@ -0,0 +1,56 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from random import random\n",
+    "from operator import add\n",
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    spark = SparkSession\\\n",
+    "        .builder\\\n",
+    "        .appName(\"PythonPi-Yarn-Client-Dockerfile\")\\\n",
+    "        .master(\"yarn\")\\\n",
+    "        .config(\"spark.submit.deployMode\", \"client\")\\\n",
+    "        .getOrCreate()\n",
+    "\n",
+    "    n = 100000 * 2\n",
+    "\n",
+    "    def f(_):\n",
+    "        x = random() * 2 - 1\n",
+    "        y = random() * 2 - 1\n",
+    "        return 1 if x ** 2 + y ** 2 <= 1 else 0\n",
+    "\n",
+    "    count = spark.sparkContext.parallelize(range(1, n + 1), 2).map(f).reduce(add)\n",
+    "    print(\"Pi is roughly %f\" % (4.0 * count / n))\n",
+    "\n",
+    "    spark.stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3.0
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/images/jupyter-notebook/hadoop/run-jupyter.sh b/images/jupyter-notebook/hadoop/run-jupyter.sh
new file mode 100644
index 00000000..00386202
--- /dev/null
+++ b/images/jupyter-notebook/hadoop/run-jupyter.sh
@@ -0,0 +1,29 
@@
+#!/bin/bash
+
+HOST_IP=$1
+
+# Environment variables for the Hadoop ecosystem are collected in /opt/third/hadoop-env.
+
+# Hadoop environment variables
+echo "export HADOOP_CONF_DIR=/opt/third/hadoop/etc/hadoop" >> /opt/third/hadoop-env
+
+SPARK_HOME="/opt/third/spark"
+
+# Spark environment variables
+echo "export SPARK_HOME=${SPARK_HOME}" >> /opt/third/hadoop-env
+echo 'export PATH=$PATH:$SPARK_HOME/bin' >> /opt/third/hadoop-env
+echo 'export PYTHONPATH=${SPARK_HOME}/python:$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH' >> /opt/third/hadoop-env
+
+
+# Populate spark-defaults.conf (fixed driver/blockManager ports so they can be exposed)
+echo "spark.ui.enabled=false" >> ${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.driver.port=32788" >> ${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.blockManager.port=32789" >> ${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.driver.bindAddress=0.0.0.0" >> ${SPARK_HOME}/conf/spark-defaults.conf
+echo "spark.driver.host=${HOST_IP}" >> ${SPARK_HOME}/conf/spark-defaults.conf
+
+
+source /opt/third/hadoop-env
+
+# Start JupyterLab rooted at the filesystem root so /examples is visible
+jupyter lab --notebook-dir=/ --ip=0.0.0.0 --no-browser --allow-root --port=3000 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*' 2>&1
\ No newline at end of file