Commit 9851c3bd by Gabe Mulley

create an analytics pipeline container

parent 3ade5c9f
FROM edxoperations/precise-common:v2
MAINTAINER edxops
ENV CONFIG_BRANCH hack2015/gabe/analytics-hadoop
USER docker
WORKDIR /edx/app/edx_ansible/edx_ansible
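# Bring the configuration checkout up to date on the analytics pipeline branch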
RUN sudo git fetch --all
RUN sudo git checkout $CONFIG_BRANCH
RUN sudo git reset --hard origin/$CONFIG_BRANCH
RUN sudo git pull
WORKDIR /edx/app/edx_ansible/edx_ansible/docker/plays
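# Provision the analytics pipeline stack inside the image by running its play locally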
RUN sudo ansible-playbook analytics_pipeline.yml -c local
USER root
CMD ["/edx/bin/analytics-pipeline-start.sh"]
- name: Deploy all dependencies needed to run edx-analytics-pipeline
  hosts: all
  sudo: True
  gather_facts: True
  roles:
    - analytics_pipeline
@@ -117,60 +117,43 @@
    - install
    - install:app-requirements

- name: ensure hdfs services are started
  service: >
    name=hdfs
    state=started
  tags:
    - manage
    - manage:start

- name: ensure map reduce services are started
  service: >
    name=yarn
    state=started
  tags:
    - manage
    - manage:start

- name: ensure package dir exists in HDFS
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p /edx-analytics-pipeline/packages/
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:app-requirements

- name: ensure util library is in HDFS
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -put -f {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar /edx-analytics-pipeline/packages/
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:app-requirements

- name: ensure the data directory exists
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:base

- name: ensure tracking log file can be read
  file: >
    path={{ COMMON_LOG_DIR }}/tracking/tracking.log
    mode=0644
  ignore_errors: yes
  tags:
    - install
    - install:configuration

- name: pipeline start script file installed
  template: >
    src=analytics-pipeline-start.sh.j2
    dest={{ COMMON_BIN_DIR }}/analytics-pipeline-start.sh
    owner={{ hadoop_common_user }} group={{ hadoop_common_group }} mode=755

- name: cron job syncs tracking log file to hdfs
  cron: >
    user={{ hadoop_common_user }}
    name="Sync tracking log to HDFS"
    job="{{ HADOOP_COMMON_HOME }}/bin/hdfs dfs -put -f {{ COMMON_LOG_DIR }}/tracking/tracking.log {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}/tracking.log"
  tags:
    - install
    - install:configuration
#- name: ensure hdfs services are started
#  shell: >
#    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && {{ HADOOP_COMMON_HOME }}/sbin/start-dfs.sh
#
#- name: ensure hdfs services are started
#  shell: >
#    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && {{ HADOOP_COMMON_HOME }}/sbin/start-yarn.sh
#
#- name: ensure package dir exists in HDFS
#  shell: >
#    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p /edx-analytics-pipeline/packages/
#  sudo_user: "{{ hadoop_common_user }}"
#
#- name: ensure util library is in HDFS
#  shell: >
#    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -put -f {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar /edx-analytics-pipeline/packages/
#  sudo_user: "{{ hadoop_common_user }}"
#
#- name: ensure the data directory exists
#  shell: >
#    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}
#  sudo_user: "{{ hadoop_common_user }}"
#
#- name: ensure tracking log file can be read
#  file: >
#    path={{ COMMON_LOG_DIR }}/tracking/tracking.log
#    mode=0644
#  ignore_errors: yes
#
#- name: cron job syncs tracking log file to hdfs
#  cron: >
#    user={{ hadoop_common_user }}
#    name="Sync tracking log to HDFS"
#    job="{{ HADOOP_COMMON_HOME }}/bin/hdfs dfs -put -f {{ COMMON_LOG_DIR }}/tracking/tracking.log {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}/tracking.log"
#!/bin/bash
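
# If GITHUB_USER is set, allow that GitHub user's keys to be used to SSH in as the "docker" user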
if [ -n "$GITHUB_USER" ]
then
    sudo usermod docker -s /bin/bash
    su docker -c "mkdir -p /home/docker/.ssh"
    echo >> /home/docker/.ssh/authorized_keys
    curl https://github.com/$GITHUB_USER.keys >> /home/docker/.ssh/authorized_keys
fi
# remote-task will be running ansible, and this directory will be owned by root, just clean it up.
sudo rm -rf /home/docker/.ansible
/usr/sbin/sshd
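
# Start HDFS and YARN as the hadoop user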
HADOOP_HOME={{ HADOOP_COMMON_HOME }}
su hadoop -c "$HADOOP_HOME/sbin/start-dfs.sh"
su hadoop -c "$HADOOP_HOME/sbin/start-yarn.sh"
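
# Stage the analytics pipeline utility jar and data directory in HDFS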
su hadoop -c "$HADOOP_HOME/bin/hdfs dfs -mkdir -p /edx-analytics-pipeline/packages/"
su hadoop -c "$HADOOP_HOME/bin/hdfs dfs -put -f {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar /edx-analytics-pipeline/packages/"
su hadoop -c "$HADOOP_HOME/bin/hdfs dfs -mkdir -p {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}"
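
# Keep the container in the foreground by following the Hadoop logs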
tail -f $HADOOP_HOME/logs/*.log
\ No newline at end of file
@@ -55,3 +55,8 @@ edxlocal_database_users:
      user: "{{ HIVE_METASTORE_DATABASE.user | default(None) }}",
      pass: "{{ HIVE_METASTORE_DATABASE.password | default(None) }}"
    }
  - {
      db: "{{ HIVE_METASTORE_DATABASE.name | default(None) }}",
      user: "{{ HIVE_METASTORE_DATABASE.user | default(None) }}",
      pass: "{{ HIVE_METASTORE_DATABASE.password | default(None) }}"
    }
@@ -58,5 +58,6 @@ hadoop_common_debian_pkgs:
  - libtool
  - zlib1g-dev
  - maven
  - openssh-server

hadoop_common_redhat_pkgs: []
@@ -193,3 +193,12 @@
    path={{ HADOOP_COMMON_SERVICES_DIR }}
    mode=0750 owner={{ hadoop_common_user }} group={{ hadoop_common_group }}
    state=directory

- name: ssh pid directory exists
  file: >
    path=/var/run/sshd
    mode=0755 owner=root group=root
    state=directory

- name: sshd service started
  shell: /usr/sbin/sshd
@@ -75,7 +75,6 @@
    mode=0640 owner={{ hadoop_common_user }} group={{ hadoop_common_group }}
  with_items:
    - hive-env.sh
    - hive-site.xml

- name: env vars sourced in hadoop env
  lineinfile: >
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
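    <!-- Hive metastore connection settings, rendered from HIVE_METASTORE_DATABASE -->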
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://{{ HIVE_METASTORE_DATABASE.host }}:{{ HIVE_METASTORE_DATABASE.port }}/{{ HIVE_METASTORE_DATABASE.name }}</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>{{ HIVE_METASTORE_DATABASE.user }}</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>{{ HIVE_METASTORE_DATABASE.password }}</value>
    </property>
    <property>
        <name>datanucleus.autoCreateSchema</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>true</value>
    </property>
</configuration>