Install
First, you need the Java 8 JDK. Download it from:
http://www.oracle.com/technetwork/java/javase/downloads/index.html
$ java -version
java version "1.8.0_131"
Homebrew Version
$ brew update
$ brew install maven apache-spark
Pre-built Version
$ mkdir -p /usr/local/share/apache-spark && \
cd /usr/local/share/apache-spark && \
wget https://archive.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz && \
tar -xvzf spark-2.2.0-bin-hadoop2.7.tgz
ref:
http://spark.apache.org/downloads.html
Build Version
This is the recommended way, since building from source lets you enable native BLAS support for MLlib via the -Pnetlib-lgpl profile.
$ brew install [email protected]
$ export PATH="/usr/local/opt/[email protected]/bin:$PATH"
$ scala -version
Scala code runner version 2.11.8 -- Copyright 2002-2016, LAMP/EPFL
$ mkdir -p /usr/local/share/apache-spark && \
cd /usr/local/share/apache-spark && \
wget https://archive.apache.org/dist/spark/spark-2.2.0/spark-2.2.0.tgz && \
tar -xvzf spark-2.2.0.tgz && \
cd spark-2.2.0
$ ./build/mvn -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -DskipTests -T 4C package
# or
$ ./build/mvn -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -DskipTests clean package
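Once the build finishes, a quick sanity check is to ask the freshly built binaries for their version:
$ ./bin/spark-submit --version
# prints the Spark version and the Scala version it was built with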
Verify that Spark picks up the native BLAS implementation:
$ spark-shell --packages "com.github.fommil.netlib:all:1.1.2"
scala> import com.github.fommil.netlib.BLAS
import com.github.fommil.netlib.BLAS
scala> BLAS.getInstance().getClass().getName()
res1: String = com.github.fommil.netlib.NativeSystemBLAS
ref:
http://spark.apache.org/downloads.html
http://spark.apache.org/docs/latest/building-spark.html
http://spark.apache.org/docs/latest/ml-guide.html#dependencies
Configurations
in .zshrc
if which java > /dev/null; then
export JAVA_HOME="$(/usr/libexec/java_home -v 1.8)"
export PATH="$JAVA_HOME/bin:$PATH"
fi
export PATH="/usr/local/opt/[email protected]/bin:$PATH"
# pick ONE of the following three SPARK_HOME blocks, depending on how you installed Spark

# homebrew version
export SPARK_HOME="/usr/local/Cellar/apache-spark/2.2.0/libexec"
export PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH"
export PYSPARK_DRIVER_PYTHON="ipython"
# pre-built version
export SPARK_HOME="/usr/local/share/apache-spark/spark-2.2.0-bin-hadoop2.7"
export PATH="$SPARK_HOME/bin:$PATH"
export PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH"
# build version
export SPARK_HOME="/usr/local/share/apache-spark/spark-2.2.0"
export PATH="$SPARK_HOME/bin:$PATH"
export PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH"
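With SPARK_HOME and PYTHONPATH set, pyspark becomes importable from a plain Python interpreter. A quick sanity check:
$ python -c "import pyspark; print(pyspark.__version__)"
# should print 2.2.0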
ref:
https://spark.apache.org/docs/latest/programming-guide.html
https://spark.apache.org/docs/latest/configuration.html
$ cd $SPARK_HOME
$ cp conf/spark-defaults.conf.template conf/spark-defaults.conf
in conf/spark-defaults.conf
spark.driver.memory 4g
spark.executor.memory 4g
spark.jars.packages com.github.fommil.netlib:all:1.1.2,mysql:mysql-connector-java:5.1.41
spark.serializer org.apache.spark.serializer.KryoSerializer
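You can confirm these defaults are picked up from inside any Spark shell, e.g. pyspark; both calls should echo the values configured above:
$ pyspark
>>> spark.conf.get("spark.serializer")
'org.apache.spark.serializer.KryoSerializer'
>>> spark.sparkContext.getConf().get("spark.driver.memory")
'4g'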
$ cp conf/spark-env.sh.template conf/spark-env.sh
in conf/spark-env.sh
# pin hash() so Python 3 workers hash values consistently across executors
export PYTHONHASHSEED=42
$ cp conf/log4j.properties.template conf/log4j.properties
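The template logs at INFO, which is chatty in an interactive shell. A common tweak is to lower the root logger in conf/log4j.properties:
log4j.rootCategory=WARN, console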
ref:
https://spark.apache.org/docs/latest/configuration.html
Commands
Local Mode
$ spark-shell
$ export PYSPARK_DRIVER_PYTHON="jupyter" && \
export PYSPARK_DRIVER_PYTHON_OPTS="notebook --ip 0.0.0.0" && \
pyspark \
--packages "com.github.fommil.netlib:all:1.1.2,mysql:mysql-connector-java:5.1.41" \
--driver-memory 4g \
--executor-memory 4g \
--master "local[*]"
# local-cluster[N, C, M] simulates a cluster in one JVM: N workers with C cores and M MB of memory each
$ spark-shell \
--packages "com.github.fommil.netlib:all:1.1.2,mysql:mysql-connector-java:5.1.41" \
--master "local-cluster[3, 1, 4096]"
# Spark Application UI on the driver
$ open http://localhost:4040/
ref:
https://spark.apache.org/docs/latest/programming-guide.html
Standalone Mode
There are two deploy modes for Spark Standalone. In client mode, the driver is launched in the same process as the client that submits the application. In cluster mode, the driver is launched inside one of the Worker processes in the cluster. You choose with spark-submit's --deploy-mode flag; client is the default.
$ ./sbin/start-master.sh -h localhost
$ ./sbin/start-slave.sh spark://localhost:7077
# Spark Web UI on the cluster manager
$ open http://localhost:8080/
$ pyspark \
--driver-memory 4g \
--executor-memory 4g \
--master spark://localhost:7077
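A quick way to confirm the shell is attached to the standalone master rather than running locally:
>>> spark.sparkContext.master
'spark://localhost:7077'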
$ spark-submit \
--master spark://localhost:7077 \
examples/src/main/python/pi.py 10
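pi.py ships with Spark; your own applications are submitted the same way. A minimal self-contained PySpark script (my_app.py is a hypothetical filename):
# my_app.py
from operator import add

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("MyApp").getOrCreate()
    sc = spark.sparkContext
    # distribute 1..100 across the cluster and sum them on the executors
    total = sc.parallelize(range(1, 101)).reduce(add)
    print("sum(1..100) = %d" % total)
    spark.stop()

$ spark-submit --master spark://localhost:7077 my_app.py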
$ spark-submit \
--driver-memory 2g \
--driver-java-options "-XX:ThreadStackSize=81920" \
--total-executor-cores 3 \
--executor-cores 3 \
--executor-memory 12g \
--conf "spark.executor.extraJavaOptions=-XX:ThreadStackSize=81920" \
--master spark://localhost:7077 \
--packages "mysql:mysql-connector-java:5.1.41,com.hankcs:hanlp:portable-1.3.4,edu.stanford.nlp:stanford-corenlp:3.7.0" \
--jars "/Users/vinta/Projects/albedo/spark-data/stanford-corenlp-3.8.0-models.jar" \
--class ws.vinta.albedo.LogisticRegressionRanker \
target/albedo-1.0.0-SNAPSHOT.jar
# Spark Application UI on the driver
$ open http://localhost:4040/
ref:
https://spark.apache.org/docs/latest/spark-standalone.html
https://spark.apache.org/docs/latest/submitting-applications.html
https://spark.apache.org/docs/latest/configuration.html