Commit 8f81a21d by Abdul Mannan

Add dockerfiles for analytics pipeline

parent 2f5002b7
Dockerfile for the analytics_pipeline image:
FROM edxops/xenial-common:latest
MAINTAINER edxops
USER root
ENV BOTO_CONFIG=/dev/null \
JDK_URL=http://download.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz \
JDK_DIST_FILE=jdk-8u131-linux-x64.tar.gz \
JAVA_HOME=/usr/lib/jvm/java-8-oracle \
HADOOP_URL=https://archive.apache.org/dist/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz \
HADOOP_DIST_FILE=hadoop-2.7.2.tar.gz \
HADOOP_HOME=/edx/app/hadoop/hadoop \
HADOOP_PREFIX=/edx/app/hadoop/hadoop \
HIVE_URL=https://archive.apache.org/dist/hive/hive-2.1.1/apache-hive-2.1.1-bin.tar.gz \
HIVE_DIST_FILE=apache-hive-2.1.1-bin.tar.gz \
HIVE_HOME=/edx/app/hadoop/hive \
SQOOP_URL=http://archive.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz \
SQOOP_DIST_FILE=sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz \
SQOOP_MYSQL_CONNECTOR_URL=http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.29.tar.gz \
SQOOP_MYSQL_CONNECTOR_FILE=mysql-connector-java-5.1.29 \
SQOOP_HOME=/edx/app/hadoop/sqoop \
SQOOP_LIB=/edx/app/hadoop/sqoop/lib \
SPARK_URL=https://archive.apache.org/dist/spark/spark-2.1.0/spark-2.1.0-bin-hadoop2.7.tgz \
SPARK_DIST_FILE=spark-2.1.0-bin-hadoop2.7.tgz \
SPARK_HOME=/edx/app/hadoop/spark \
LUIGI_CONFIG_PATH=/edx/app/analytics_pipeline/analytics_pipeline/config/luigi_docker.cfg \
ANALYTICS_PIPELINE_VENV=/edx/app/analytics_pipeline/venvs \
BOOTSTRAP=/etc/bootstrap.sh \
COMMON_BASE_DIR=/edx \
COMMON_PIP_PACKAGES_PIP='pip==9.0.3' \
COMMON_PIP_PACKAGES_SETUPTOOLS='setuptools==39.0.1' \
COMMON_PIP_PACKAGES_VIRTUALENV='virtualenv==15.2.0' \
COMMON_PIP_PACKAGES_VIRTUALENVWRAPPER='virtualenvwrapper==4.8.2' \
COMMON_MYSQL_READ_ONLY_USER='read_only' \
COMMON_MYSQL_READ_ONLY_PASS='password' \
ANALYTICS_PIPELINE_OUTPUT_DATABASE_USER='pipeline001' \
ANALYTICS_PIPELINE_OUTPUT_DATABASE_PASSWORD='password' \
EDX_PPA_KEY_SERVER='keyserver.ubuntu.com' \
EDX_PPA_KEY_ID='69464050'
ENV PATH="$PATH:/edx/app/analytics_pipeline/venvs/analytics_pipeline/bin:${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${SQOOP_HOME}/bin" \
COMMON_DATA_DIR=$COMMON_BASE_DIR/var \
COMMON_APP_DIR=$COMMON_BASE_DIR/app \
COMMON_LOG_DIR=$COMMON_BASE_DIR/var/log \
COMMON_BIN_DIR=$COMMON_BASE_DIR/bin \
COMMON_CFG_DIR=$COMMON_BASE_DIR/etc
# add custom PPAs & install packages
RUN apt-get update -y && apt-get install -y software-properties-common \
&& apt-key adv --keyserver $EDX_PPA_KEY_SERVER --recv-keys $EDX_PPA_KEY_ID \
&& add-apt-repository -y 'deb http://ppa.edx.org xenial main' \
&& apt-get update -y \
&& apt-get install --no-install-recommends -y \
python2.7 python2.7-dev python-pip python-apt python-yaml python-jinja2 libmysqlclient-dev libffi-dev libssl-dev \
libatlas-base-dev libblas-dev liblapack-dev libpq-dev sudo make build-essential git-core \
openssh-server openssh-client rsync software-properties-common vim net-tools curl netcat mysql-client-5.6 \
apt-transport-https ntp acl lynx-cur logrotate rsyslog unzip \
ack-grep mosh tree screen tmux dnsutils inetutils-telnet \
&& rm -rf /var/lib/apt/lists/*
# creating directory structure
RUN mkdir -p $HADOOP_HOME $JAVA_HOME $ANALYTICS_PIPELINE_VENV /edx/app/hadoop/lib $HIVE_HOME /etc/luigi \
$SPARK_HOME $SQOOP_HOME $COMMON_DATA_DIR $COMMON_APP_DIR $COMMON_LOG_DIR $COMMON_BIN_DIR $COMMON_CFG_DIR/edx-analytics-pipeline
# create user & group for hadoop
RUN groupadd hadoop
RUN useradd -ms /bin/bash hadoop -g hadoop -d /edx/app/hadoop
RUN echo '%hadoop ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
# JAVA
RUN curl -fSL --header "Cookie:oraclelicense=accept-securebackup-cookie" "$JDK_URL" -o /var/tmp/$JDK_DIST_FILE \
&& tar -xzf /var/tmp/$JDK_DIST_FILE -C $JAVA_HOME --strip-components=1 \
&& rm -f /var/tmp/$JDK_DIST_FILE
# HADOOP
RUN curl -fSL "$HADOOP_URL" -o /var/tmp/$HADOOP_DIST_FILE \
&& tar -xzf /var/tmp/$HADOOP_DIST_FILE -C $HADOOP_HOME --strip-components=1 \
&& sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_PREFIX=/edx/app/hadoop/hadoop\nexport HADOOP_HOME=/edx/app/hadoop/hadoop\n:' $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
&& sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/edx/app/hadoop/hadoop/etc/hadoop/:' $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
&& sed -i 's#<configuration>#<configuration><property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property>#' $HADOOP_HOME/etc/hadoop/core-site.xml \
&& sed 's#<configuration>#<configuration><property><name>mapreduce.framework.name</name><value>yarn</value></property>#' $HADOOP_HOME/etc/hadoop/mapred-site.xml.template > $HADOOP_HOME/etc/hadoop/mapred-site.xml \
&& sed -i 's#<configuration>#<configuration><property><name>yarn.resourcemanager.hostname</name><value>resourcemanager</value></property>#' $HADOOP_HOME/etc/hadoop/yarn-site.xml \
&& rm -f /var/tmp/$HADOOP_DIST_FILE
# HIVE
RUN curl -fSL "$HIVE_URL" -o /var/tmp/$HIVE_DIST_FILE \
&& tar -xzf /var/tmp/$HIVE_DIST_FILE -C $HIVE_HOME --strip-components=1 \
&& rm -f /var/tmp/$HIVE_DIST_FILE
ADD docker/build/analytics_pipeline/hive-site.xml.template $HIVE_HOME/conf/hive-site.xml
# SPARK
RUN curl -fSL "$SPARK_URL" -o /var/tmp/$SPARK_DIST_FILE \
&& tar -xzf /var/tmp/$SPARK_DIST_FILE -C $SPARK_HOME --strip-components=1 \
&& echo 'spark.master spark://sparkmaster:7077\nspark.eventLog.enabled true\nspark.eventLog.dir hdfs://namenode:8020/tmp/spark-events\nspark.history.fs.logDirectory hdfs://namenode:8020/tmp/spark-events\nspark.sql.warehouse.dir hdfs://namenode:8020/spark-warehouse' > $SPARK_HOME/conf/spark-defaults.conf \
&& rm -f /var/tmp/$SPARK_DIST_FILE
# SQOOP
RUN curl -fSL "$SQOOP_URL" -o /var/tmp/$SQOOP_DIST_FILE \
&& curl -fSL "$SQOOP_MYSQL_CONNECTOR_URL" -o /var/tmp/$SQOOP_MYSQL_CONNECTOR_FILE.tar.gz \
&& tar -xzf /var/tmp/$SQOOP_DIST_FILE -C $SQOOP_HOME --strip-components=1 \
&& tar -xzf /var/tmp/$SQOOP_MYSQL_CONNECTOR_FILE.tar.gz -C /var/tmp/ \
&& cp /var/tmp/$SQOOP_MYSQL_CONNECTOR_FILE/$SQOOP_MYSQL_CONNECTOR_FILE-bin.jar $SQOOP_LIB \
&& cp /var/tmp/$SQOOP_MYSQL_CONNECTOR_FILE/$SQOOP_MYSQL_CONNECTOR_FILE-bin.jar $HIVE_HOME/lib/ \
&& rm -rf /var/tmp/$SQOOP_DIST_FILE /var/tmp/$SQOOP_MYSQL_CONNECTOR_FILE*
WORKDIR /var/tmp
# Edx Hadoop Util Library
RUN git clone https://github.com/edx/edx-analytics-hadoop-util \
&& cd /var/tmp/edx-analytics-hadoop-util \
&& $JAVA_HOME/bin/javac -cp `/edx/app/hadoop/hadoop/bin/hadoop classpath` org/edx/hadoop/input/ManifestTextInputFormat.java \
&& $JAVA_HOME/bin/jar cf /edx/app/hadoop/lib/edx-analytics-hadoop-util.jar org/edx/hadoop/input/ManifestTextInputFormat.class
# configure bootstrap scripts for container
ADD docker/build/analytics_pipeline/bootstrap.sh /etc/bootstrap.sh
RUN chown hadoop:hadoop /etc/bootstrap.sh \
&& chmod 700 /etc/bootstrap.sh \
&& chown -R hadoop:hadoop /edx/app/hadoop
# Analytics pipeline
RUN git clone https://github.com/edx/edx-analytics-pipeline \
&& cp /var/tmp/edx-analytics-pipeline/Makefile /var/tmp/Makefile \
&& cp -r /var/tmp/edx-analytics-pipeline/requirements /var/tmp/requirements \
&& rm -rf /var/tmp/edx-analytics-pipeline
RUN pip install $COMMON_PIP_PACKAGES_PIP $COMMON_PIP_PACKAGES_SETUPTOOLS $COMMON_PIP_PACKAGES_VIRTUALENV $COMMON_PIP_PACKAGES_VIRTUALENVWRAPPER \
&& virtualenv $ANALYTICS_PIPELINE_VENV/analytics_pipeline/ \
&& chown -R hadoop:hadoop $ANALYTICS_PIPELINE_VENV/analytics_pipeline/ \
&& echo '[hadoop]\nversion: cdh4\ncommand: /edx/app/hadoop/hadoop/bin/hadoop\nstreaming-jar: /edx/app/hadoop/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar' > /etc/luigi/client.cfg
RUN apt-get update && make system-requirements
USER hadoop
RUN touch /edx/app/hadoop/.bashrc \
&& echo 'export JAVA_HOME=/usr/lib/jvm/java-8-oracle\nexport HADOOP_HOME=/edx/app/hadoop/hadoop\nexport HIVE_HOME=/edx/app/hadoop/hive\nexport SQOOP_HOME=/edx/app/hadoop/sqoop\nexport SPARK_HOME=/edx/app/hadoop/spark\nexport PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/edx/app/analytics_pipeline/venvs/analytics_pipeline/bin:/usr/lib/jvm/java-8-oracle/bin:/edx/app/hadoop/hadoop/bin:/edx/app/hadoop/hadoop/sbin:/edx/app/hadoop/hive/bin:/edx/app/hadoop/spark/bin:/edx/app/hadoop/spark/sbin:/edx/app/hadoop/sqoop/bin"' > /edx/app/hadoop/.bashrc \
&& . $ANALYTICS_PIPELINE_VENV/analytics_pipeline/bin/activate \
&& make test-requirements requirements
RUN sudo chown hadoop:hadoop $COMMON_CFG_DIR/edx-analytics-pipeline/ \
&& echo "{\"username\": \"$COMMON_MYSQL_READ_ONLY_USER\", \"host\": \"mysql\", \"password\": \"$COMMON_MYSQL_READ_ONLY_PASS\", \"port\": 3306}" > $COMMON_CFG_DIR/edx-analytics-pipeline/input.json \
&& echo "{\"username\": \"$ANALYTICS_PIPELINE_OUTPUT_DATABASE_USER\", \"host\": \"mysql\", \"password\": \"$ANALYTICS_PIPELINE_OUTPUT_DATABASE_PASSWORD\", \"port\": 3306}" > $COMMON_CFG_DIR/edx-analytics-pipeline/output.json \
&& echo "{\"username\": \"dbadmin\", \"host\": \"vertica\", \"password\": \"\", \"port\": 5433}" > $COMMON_CFG_DIR/edx-analytics-pipeline/warehouse.json
ADD docker/build/analytics_pipeline/acceptance.json $COMMON_CFG_DIR/edx-analytics-pipeline/acceptance.json
WORKDIR /edx/app/analytics_pipeline/analytics_pipeline
CMD ["/etc/bootstrap.sh", "-d"]
docker/build/analytics_pipeline/acceptance.json:
{
    "connection_user": "hadoop",
    "credentials_file_url": "/edx/etc/edx-analytics-pipeline/output.json",
    "exporter_output_bucket": "",
    "geolocation_data": "hdfs://namenode:8020/edx-analytics-pipeline/geo.dat",
    "hive_user": "hadoop",
    "host": "analyticspipeline",
    "identifier": "local-devstack",
    "manifest_input_format": "org.edx.hadoop.input.ManifestTextInputFormat",
    "oddjob_jar": "hdfs://namenode:8020/edx-analytics-pipeline/packages/edx-analytics-hadoop-util.jar",
    "tasks_branch": "origin/HEAD",
    "tasks_log_path": "/tmp/acceptance/",
    "tasks_repo": "/edx/app/analytics_pipeline/analytics_pipeline",
    "tasks_output_url": "hdfs://namenode:8020/tmp/acceptance-test-output/",
    "vertica_creds_url": "/edx/etc/edx-analytics-pipeline/warehouse.json",
    "elasticsearch_host": "http://elasticsearch:9200/",
    "is_remote": "false"
}
docker/build/analytics_pipeline/bootstrap.sh:
#!/bin/bash
: ${HADOOP_HOME:=/edx/app/hadoop/hadoop}
bash $HADOOP_HOME/etc/hadoop/hadoop-env.sh
. /edx/app/analytics_pipeline/venvs/analytics_pipeline/bin/activate && make develop-local
# install extra libraries, if any (resource URLs are passed as a comma-separated list in the ACP environment variable)
cd $HADOOP_HOME/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
if [[ $1 == "-d" ]]; then
while true; do sleep 30; done
fi
if [[ $1 == "-bash" ]]; then
/bin/bash
fi
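A usage sketch for the bootstrap, reusing the hypothetical tag from the build sketch above: the image's default CMD passes -d, so after activating the virtualenv and running make develop-local the container parks in the sleep loop, while -bash ends in an interactive shell instead.
# default CMD: bootstrap, then idle so the container stays up
docker run -d --name analyticspipeline edxops/analytics_pipeline:latest
# override the CMD to bootstrap and drop into a shell
docker run -it edxops/analytics_pipeline:latest /etc/bootstrap.sh -bash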
docker/build/analytics_pipeline/hive-site.xml.template:
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://mysql/edx_hive_metastore</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>edx_hive</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>edx</value>
    </property>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>hdfs://namenode:8020/edx-analytics-pipeline/warehouse/</value>
        <description>location of default database for the warehouse</description>
    </property>
    <property>
        <name>datanucleus.autoCreateSchema</name>
        <value>false</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>true</value>
    </property>
</configuration>
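Because datanucleus.autoCreateSchema is false and hive.metastore.schema.verification is true, the edx_hive_metastore schema must exist in MySQL before Hive runs; a one-time initialization sketch using Hive's schematool and the connection settings from this template (whether the project initializes the schema this way is an assumption):
# run inside the analytics_pipeline container; the JDBC URL comes from hive-site.xml
# and the MySQL connector jar copied into $HIVE_HOME/lib is needed on the classpath
$HIVE_HOME/bin/schematool -dbType mysql -initSchema -userName edx_hive -passWord edx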
Dockerfile for the analytics_pipeline_hadoop_datanode image:
FROM uhopper/hadoop:2.7.2
MAINTAINER edxops
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data \
MYSQL_VERSION=5.6 \
DEBIAN_FRONTEND=noninteractive
WORKDIR /tmp
RUN \
echo "deb http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" > /etc/apt/sources.list && \
echo "deb-src http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" >> /etc/apt/sources.list && \
echo "deb http://security.debian.org/ stretch/updates main contrib non-free\n" >> /etc/apt/sources.list && \
echo "deb-src http://security.debian.org/ stretch/updates main contrib non-free" >> /etc/apt/sources.list
RUN apt-get -y update
RUN apt-get -yqq install apt-transport-https lsb-release ca-certificates gnupg2
RUN ( apt-key adv --keyserver ha.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkps://hkps.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 )
RUN echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-${MYSQL_VERSION}" > /etc/apt/sources.list.d/mysql.list
RUN apt-get update && apt-get install -y mysql-community-client && rm -rf /var/lib/apt/lists/*
WORKDIR /
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data
ADD docker/build/analytics_pipeline_hadoop_datanode/datanode.sh /run.sh
RUN chmod a+x /run.sh
CMD ["/run.sh"]
docker/build/analytics_pipeline_hadoop_datanode/datanode.sh:
#!/bin/bash
datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
if [ ! -d $datadir ]; then
  echo "Datanode data directory not found: $datadir"
  exit 2
fi
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode
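A run sketch for the datanode image, with a hypothetical tag and volume name, assuming that, as in the uhopper/hadoop base image, Hadoop settings are injected through CORE_CONF_*/HDFS_CONF_* environment variables; the defaultFS value matches the core-site.xml entry in the analytics_pipeline Dockerfile:
docker run -d --name datanode \
    -e CORE_CONF_fs_defaultFS=hdfs://namenode:8020 \
    -v datanode_data:/hadoop/dfs/data \
    edxops/analytics_pipeline_hadoop_datanode:latest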
Dockerfile for the analytics_pipeline_hadoop_namenode image:
FROM uhopper/hadoop:2.7.2
MAINTAINER edxops
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name \
MYSQL_VERSION=5.6 \
DEBIAN_FRONTEND=noninteractive
WORKDIR /tmp
RUN \
echo "deb http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" > /etc/apt/sources.list && \
echo "deb-src http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" >> /etc/apt/sources.list && \
echo "deb http://security.debian.org/ stretch/updates main contrib non-free\n" >> /etc/apt/sources.list && \
echo "deb-src http://security.debian.org/ stretch/updates main contrib non-free" >> /etc/apt/sources.list
RUN apt-get -y update
RUN apt-get -yqq install apt-transport-https lsb-release ca-certificates gnupg2
RUN ( apt-key adv --keyserver ha.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkps://hkps.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 )
RUN echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-${MYSQL_VERSION}" > /etc/apt/sources.list.d/mysql.list
RUN apt-get update && apt-get install -y mysql-community-client && rm -rf /var/lib/apt/lists/*
WORKDIR /
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name
ADD docker/build/analytics_pipeline_hadoop_namenode/namenode.sh /run.sh
RUN chmod a+x /run.sh
CMD ["/run.sh"]
docker/build/analytics_pipeline_hadoop_namenode/namenode.sh:
#!/bin/bash
namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
if [ ! -d $namedir ]; then
  echo "Namenode name directory not found: $namedir"
  exit 2
fi
if [ -z "$CLUSTER_NAME" ]; then
  echo "Cluster name not specified"
  exit 2
fi
if [ "`ls -A $namedir`" == "" ]; then
  echo "Formatting namenode name directory: $namedir"
  $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
fi
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode
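The script above exits with status 2 unless CLUSTER_NAME is set, and formats the name directory only when it is empty; a matching run sketch with a hypothetical tag, volume name, and cluster name:
# CLUSTER_NAME, the volume name, and the image tag are hypothetical values
docker run -d --name namenode \
    -e CLUSTER_NAME=edx_analytics \
    -v namenode_data:/hadoop/dfs/name \
    edxops/analytics_pipeline_hadoop_namenode:latest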
Dockerfile for the analytics_pipeline_hadoop_nodemanager image:
FROM uhopper/hadoop:2.7.2
MAINTAINER edxops
ENV MYSQL_VERSION=5.6 DEBIAN_FRONTEND=noninteractive
WORKDIR /tmp
RUN \
echo "deb http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" > /etc/apt/sources.list && \
echo "deb-src http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" >> /etc/apt/sources.list && \
echo "deb http://security.debian.org/ stretch/updates main contrib non-free\n" >> /etc/apt/sources.list && \
echo "deb-src http://security.debian.org/ stretch/updates main contrib non-free" >> /etc/apt/sources.list
RUN apt-get -y update
RUN apt-get -yqq install apt-transport-https lsb-release ca-certificates gnupg2
RUN ( apt-key adv --keyserver ha.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkps://hkps.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 )
RUN echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-${MYSQL_VERSION}" > /etc/apt/sources.list.d/mysql.list
RUN apt-get update && apt-get install -y mysql-community-client && rm -rf /var/lib/apt/lists/*
WORKDIR /
ADD docker/build/analytics_pipeline_hadoop_nodemanager/nodemanager.sh /run.sh
RUN chmod a+x /run.sh
CMD ["/run.sh"]
docker/build/analytics_pipeline_hadoop_nodemanager/nodemanager.sh:
#!/bin/bash
$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR nodemanager
Dockerfile for the analytics_pipeline_hadoop_resourcemanager image:
FROM uhopper/hadoop:2.7.2
MAINTAINER edxops
ENV MYSQL_VERSION=5.6 DEBIAN_FRONTEND=noninteractive
WORKDIR /tmp
RUN \
echo "deb http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" > /etc/apt/sources.list && \
echo "deb-src http://ftp.de.debian.org/debian/ stretch main non-free contrib\n" >> /etc/apt/sources.list && \
echo "deb http://security.debian.org/ stretch/updates main contrib non-free\n" >> /etc/apt/sources.list && \
echo "deb-src http://security.debian.org/ stretch/updates main contrib non-free" >> /etc/apt/sources.list
RUN apt-get -y update
RUN apt-get -yqq install apt-transport-https lsb-release ca-certificates gnupg2
RUN ( apt-key adv --keyserver ha.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 \
|| apt-key adv --keyserver hkps://hkps.pool.sks-keyservers.net --recv-keys A4A9406876FCBD3C456770C88C718D3B5072E1F5 )
RUN echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-${MYSQL_VERSION}" > /etc/apt/sources.list.d/mysql.list
RUN apt-get update && apt-get install -y mysql-community-client && rm -rf /var/lib/apt/lists/*
WORKDIR /
ADD docker/build/analytics_pipeline_hadoop_resourcemanager/resourcemanager.sh /run.sh
RUN chmod a+x /run.sh
CMD ["/run.sh"]
docker/build/analytics_pipeline_hadoop_resourcemanager/resourcemanager.sh:
#!/bin/bash
$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR resourcemanager
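The nodemanager and resourcemanager images differ only in the YARN daemon they launch; a run sketch for the pair, with hypothetical tags, assuming the uhopper/hadoop convention of mapping YARN_CONF_* environment variables into yarn-site.xml (the hostname matches the yarn-site.xml entry set in the analytics_pipeline Dockerfile):
# container names match the hostnames the other configs expect
docker run -d --name resourcemanager edxops/analytics_pipeline_hadoop_resourcemanager:latest
docker run -d --name nodemanager \
    -e YARN_CONF_yarn_resourcemanager_hostname=resourcemanager \
    edxops/analytics_pipeline_hadoop_nodemanager:latest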
Dockerfile for the analytics_pipeline_spark_master image:
FROM bde2020/spark-base:2.1.0-hadoop2.7
MAINTAINER edxops
ADD docker/build/analytics_pipeline_spark_master/master.sh /
ENV SPARK_MASTER_PORT=7077 \
SPARK_MASTER_WEBUI_PORT=8080 \
SPARK_MASTER_LOG=/spark/logs \
HADOOP_USER_NAME=hadoop \
SPARK_HOME=/spark \
PATH=$PATH:/spark/bin
RUN apt-get -y update && apt-get -y install --reinstall python-pkg-resources \
&& echo 'spark.master spark://sparkmaster:7077\nspark.eventLog.enabled true\nspark.eventLog.dir hdfs://namenode:8020/tmp/spark-events\nspark.history.fs.logDirectory hdfs://namenode:8020/tmp/spark-events' > /spark/conf/spark-defaults.conf
CMD ["/bin/bash", "/master.sh"]
# 18080: spark history server port
# 4040: spark UI port
# 6066: spark api port
EXPOSE 8080 7077 6066 18080 4040
docker/build/analytics_pipeline_spark_master/master.sh:
#!/bin/bash
export SPARK_MASTER_HOST=`hostname`
. "/spark/sbin/spark-config.sh"
. "/spark/bin/load-spark-env.sh"
mkdir -p $SPARK_MASTER_LOG
setsid /spark/sbin/start-history-server.sh >/dev/null 2>&1 < /dev/null &
cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \
--ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
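With spark.master set to spark://sparkmaster:7077 in both spark-defaults.conf files, jobs can be submitted against the standalone master; a smoke-test sketch, assuming the stock Spark examples are present in the image:
# run from any container with /spark installed; pi.py ships with the Spark 2.1.0 distribution
/spark/bin/spark-submit --master spark://sparkmaster:7077 /spark/examples/src/main/python/pi.py 10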
Dockerfile for the analytics_pipeline_spark_worker image:
FROM bde2020/spark-base:2.1.0-hadoop2.7
MAINTAINER edxops
ADD docker/build/analytics_pipeline_spark_worker/worker.sh /
ENV SPARK_WORKER_WEBUI_PORT=8081 \
SPARK_WORKER_LOG=/spark/logs \
SPARK_MASTER="spark://sparkmaster:7077" \
SPARK_HOME=/spark
RUN apt-get -y update && apt-get -y install --reinstall python-pkg-resources
CMD ["/bin/bash", "/worker.sh"]
EXPOSE 8081
docker/build/analytics_pipeline_spark_worker/worker.sh:
#!/bin/bash
. "/spark/sbin/spark-config.sh"
. "/spark/bin/load-spark-env.sh"
mkdir -p $SPARK_WORKER_LOG
/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \
--webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
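A run sketch for attaching a worker to that master; SPARK_MASTER already defaults to spark://sparkmaster:7077, so the override below is only needed for a differently named master (the image tag is an assumption):
docker run -d --name sparkworker \
    -e SPARK_MASTER=spark://sparkmaster:7077 \
    edxops/analytics_pipeline_spark_worker:latest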
@@ -32,3 +32,11 @@ weights:
- mongo: 1
- devpi: 1
- jenkins_build: 8
- analytics_pipeline: 8
- analytics_pipeline_hadoop_datanode: 2
- analytics_pipeline_hadoop_namenode: 3
- analytics_pipeline_hadoop_nodemanager: 3
- analytics_pipeline_hadoop_resourcemanager: 2
- analytics_pipeline_spark_master: 1
- analytics_pipeline_spark_worker: 1