Refactoring infrastructure #27

Open
wants to merge 3 commits into master
6 changes: 4 additions & 2 deletions docker-compose.yml
@@ -1,7 +1,7 @@
 version: "3.7"
 services:
   spark-master:
-    image: spydernaz/spark-master:latest
+    image: spark-master:latest
     ports:
       - "9090:8080"
       - "7077:7077"
@@ -11,7 +11,7 @@ services:
     environment:
       - "SPARK_LOCAL_IP=spark-master"
   spark-worker:
-    image: spydernaz/spark-worker:latest
+    image: spark-worker:latest
     depends_on:
      - spark-master
     environment:
@@ -23,3 +23,5 @@ services:
     volumes:
       - ./apps:/opt/spark-apps
       - ./data:/opt/spark-data
+    ports:
+      - "8081"
87 changes: 32 additions & 55 deletions docker/base/Dockerfile
@@ -1,57 +1,34 @@
-FROM openjdk:8u212-b04-jdk-stretch
-LABEL author="Nathaniel Vala" email="[email protected]"
-LABEL version="0.2"

-ENV DAEMON_RUN=true
-ENV SPARK_VERSION=2.4.3
-ENV HADOOP_VERSION=2.7
-ENV SCALA_VERSION=2.12.4
-ENV SCALA_HOME=/usr/share/scala
-ENV SPARK_HOME=/spark


-RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates jq

-# apt update && apt -y upgrade \
-# apt install -y wget ca-certificates && \
-# apt install -y curl bash jq && \
-RUN cd "/tmp" && \
-    wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
-    tar xzf "scala-${SCALA_VERSION}.tgz" && \
-    mkdir "${SCALA_HOME}" && \
-    rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
-    mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
-    ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
-    rm -rf "/tmp/"*


-# RUN apk add --no-cache --virtual=.build-dependencies wget ca-certificates && \
-# apk add --no-cache bash curl jq && \
-# cd "/tmp" && \
-# wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
-# tar xzf "scala-${SCALA_VERSION}.tgz" && \
-# mkdir "${SCALA_HOME}" && \
-# rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
-# mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
-# ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
-# apk del .build-dependencies && \
-# rm -rf "/tmp/"*
+FROM oraclelinux:8-slim

+ARG JAVA_VERSION=8.0.232-open
+ARG SCALA_VERSION=2.12.10
+ARG SBT_VERSION=1.3.7
+ARG SPARK_VERSION=2.4.4
+ARG HADOOP_VERSION=2.7

-# Add Dependencies for PySpark
-RUN apt-get install -y python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy
-RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1


-#Scala instalation
-RUN export PATH="/usr/local/sbt/bin:$PATH" && apt update && apt install ca-certificates wget tar && mkdir -p "/usr/local/sbt" && wget -qO - --no-check-certificate "https://github.com/sbt/sbt/releases/download/v1.2.8/sbt-1.2.8.tgz" | tar xz -C /usr/local/sbt --strip-components=1 && sbt sbtVersion

-RUN wget --no-verbose http://apache.mirror.iphh.net/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
-    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
-    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz



-# Fix the value of PYTHONHASHSEED
-# Note: this is needed when you use Python 3.3 or greater
-ENV PYTHONHASHSEED 1
+ENV DAEMON_RUN=true
+ENV PYTHONHASHSEED=1
+ENV SDKMAN_DIR=/usr/local/sdkman
+ENV JAVA_HOME=${SDKMAN_DIR}/candidates/java/current/
+ENV SCALA_HOME=${SDKMAN_DIR}/candidates/scala/current/
+ENV SBT_HOME=${SDKMAN_DIR}/candidates/sbt/current/
+ENV SPARK_HOME=/opt/spark

+RUN touch /etc/dnf/dnf.conf && microdnf update -y
+RUN microdnf install bash curl ca-certificates unzip zip tar gzip bzip2 which findutils python3 python3-pip -y && microdnf clean all -y
+RUN curl -s "https://get.sdkman.io" | bash
+RUN set -x \
+    && echo "sdkman_auto_answer=true" > $SDKMAN_DIR/etc/config \
+    && echo "sdkman_auto_selfupdate=false" >> $SDKMAN_DIR/etc/config \
+    && echo "sdkman_insecure_ssl=false" >> $SDKMAN_DIR/etc/config
+RUN bash -c "source $SDKMAN_DIR/bin/sdkman-init.sh && \
+    sdk update && sdk upgrade && sdk selfupdate && \
+    sdk install java ${JAVA_VERSION} && sdk default java ${JAVA_VERSION} && \
+    sdk install scala ${SCALA_VERSION} && sdk default scala ${SCALA_VERSION} && \
+    sdk install sbt ${SBT_VERSION} && sdk default sbt ${SBT_VERSION} && \
+    sdk flush archives && sdk flush temp && sdk flush broadcast"
+RUN pip3 --no-cache-dir install matplotlib pandas simpy numpy scipy
+RUN curl -o /tmp/spark.tgz https://www-us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && tar -xvzf /tmp/spark.tgz --one-top-level=spark --strip-components 1 -C /opt/ && rm -f /tmp/spark.tgz
+RUN alternatives --set python /usr/bin/python3

+WORKDIR ${SPARK_HOME}
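
Because the rewritten base image takes its tool versions as build arguments, the pinned versions can be overridden at build time. A hedged sketch (the spark-base:latest tag is an assumption taken from the FROM lines of the role images below; the --build-arg values shown simply repeat the defaults above):

# Build the base image with the defaults declared in the Dockerfile
docker build -t spark-base:latest ./docker/base
# Or override the pinned versions without editing the Dockerfile
docker build -t spark-base:latest \
  --build-arg SPARK_VERSION=2.4.4 \
  --build-arg SCALA_VERSION=2.12.10 \
  ./docker/base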
8 changes: 4 additions & 4 deletions docker/spark-master/Dockerfile
@@ -1,10 +1,10 @@
-FROM spydernaz/spark-base:latest
+FROM spark-base:latest

 COPY start-master.sh /

-ENV SPARK_MASTER_PORT 7077
-ENV SPARK_MASTER_WEBUI_PORT 8080
-ENV SPARK_MASTER_LOG /spark/logs
+ENV SPARK_MASTER_PORT=7077
+ENV SPARK_MASTER_WEBUI_PORT=8080
+ENV SPARK_MASTER_LOG=${SPARK_HOME}/logs

 EXPOSE 8080 7077 6066

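The master image (like the worker and submit images below) now builds FROM the local spark-base:latest, so the base must be built first. A sketch of the expected build order once spark-base:latest exists (the spark-submit tag is an assumed name; docker-compose.yml only references the master and worker images):

docker build -t spark-master:latest ./docker/spark-master
docker build -t spark-worker:latest ./docker/spark-worker
docker build -t spark-submit:latest ./docker/spark-submit
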
10 changes: 4 additions & 6 deletions docker/spark-master/start-master.sh
@@ -1,15 +1,13 @@
 #!/bin/bash

-export SPARK_MASTER_HOST=`hostname`
+export SPARK_MASTER_HOST=`cat /etc/hostname`

-. "/spark/sbin/spark-config.sh"
+. "$SPARK_HOME/sbin/spark-config.sh"

-. "/spark/bin/load-spark-env.sh"
+. "$SPARK_HOME/bin/load-spark-env.sh"

 mkdir -p $SPARK_MASTER_LOG

-export SPARK_HOME=/spark
-
 ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out

-cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
+cd $SPARK_HOME/bin && $SPARK_HOME/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
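
start-master.sh now derives all paths from SPARK_HOME (set to /opt/spark in the base image) instead of the hard-coded /spark, and reads the host name from /etc/hostname. A quick standalone check outside of compose, as a sketch: the network and container names are illustrative, and it assumes start-master.sh is the image's command (the CMD line sits outside the visible diff).

# A user-defined bridge network lets workers resolve the master by name
docker network create spark-net
docker run -d --name spark-master --network spark-net \
  -p 9090:8080 -p 7077:7077 spark-master:latest
# Master output is symlinked to /dev/stdout, so it should show up in the container logs
docker logs -f spark-master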
2 changes: 1 addition & 1 deletion docker/spark-submit/Dockerfile
@@ -1,4 +1,4 @@
-FROM spydernaz/spark-base:latest
+FROM spark-base:latest

 COPY spark-submit.sh /

2 changes: 1 addition & 1 deletion docker/spark-submit/spark-submit.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-/spark/bin/spark-submit \
+$SPARK_HOME/bin/spark-submit \
     --class ${SPARK_APPLICATION_MAIN_CLASS} \
     --master ${SPARK_MASTER_URL} \
     --deploy-mode cluster \
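spark-submit.sh reads its main class and master URL from the environment; the rest of the script (outside the visible diff) is assumed to locate the application jar. A hedged invocation sketch: org.example.MyApp is a placeholder class, spark-submit:latest and spark-net reuse the illustrative names from the sketches above, and the jar is mounted under /opt/spark-apps as docker-compose.yml does.

docker run --rm --network spark-net \
  -e SPARK_APPLICATION_MAIN_CLASS=org.example.MyApp \
  -e SPARK_MASTER_URL=spark://spark-master:7077 \
  -v "$(pwd)/apps:/opt/spark-apps" \
  spark-submit:latest
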
8 changes: 4 additions & 4 deletions docker/spark-worker/Dockerfile
@@ -1,10 +1,10 @@
-FROM spydernaz/spark-base:latest
+FROM spark-base:latest

 COPY start-worker.sh /

-ENV SPARK_WORKER_WEBUI_PORT 8081
-ENV SPARK_WORKER_LOG /spark/logs
-ENV SPARK_MASTER "spark://spark-master:7077"
+ENV SPARK_WORKER_WEBUI_PORT=8081
+ENV SPARK_WORKER_LOG=${SPARK_HOME}/logs
+ENV SPARK_MASTER="spark://spark-master:7077"

 EXPOSE 8081

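With the worker's defaults (web UI on 8081, master URL spark://spark-master:7077), a worker started on the same network as the master sketched earlier should register without extra configuration. A one-off check, reusing the illustrative names from above:

docker run -d --name spark-worker-1 --network spark-net spark-worker:latest
# The worker should then appear under "Workers" in the master UI at http://localhost:9090
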
8 changes: 3 additions & 5 deletions docker/spark-worker/start-worker.sh
@@ -1,12 +1,10 @@
 #!/bin/bash

-. "/spark/sbin/spark-config.sh"
-. "/spark/bin/load-spark-env.sh"
+. "$SPARK_HOME/sbin/spark-config.sh"
+. "$SPARK_HOME/bin/load-spark-env.sh"

 mkdir -p $SPARK_WORKER_LOG

-export SPARK_HOME=/spark
-
 ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out

-/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
+$SPARK_HOME/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
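
Together with the compose change that publishes the worker UI on an ephemeral host port ("8081" with no fixed host side), workers can be scaled without host port collisions. A sketch:

# Scale to three workers; Docker assigns each one a free host port for 8081
docker-compose up -d --scale spark-worker=3
# Show which host port each worker's 8081 was mapped to
docker-compose ps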