---
#
# edX Configuration
#
# github:     https://github.com/edx/configuration
# wiki:       https://github.com/edx/configuration/wiki
# code style: https://github.com/edx/configuration/wiki/Ansible-Coding-Conventions
# license:    https://github.com/edx/configuration/blob/master/LICENSE.TXT
#
#
#
# Tasks for role analytics_pipeline
#
# Overview:
#
# Prepare the machine to run the edX Analytics Data Pipeline. The pipeline currently "installs itself"
# via an ansible playbook that is not included in the edx/configuration repo. However, in order to
# run the pipeline in a devstack environment, some configuration needs to be performed. In a production
# environment, many of these config files are stored on S3.
#
# Dependencies:
#
#   common: some of the variables from the common role are used here
#   hadoop_master: ensures hadoop services are installed
#   hive: the pipeline makes extensive use of hive, so that needs to be installed as well
#   sqoop: like hive, this tool is used extensively by the pipeline
#
# Example play:
#
#   - name: Deploy all dependencies of edx-analytics-pipeline to the node
#     hosts: all
#     sudo: True
#     gather_facts: True
#     roles:
#       - analytics_pipeline
#
#   ansible-playbook -i 'localhost,' ./analytics_pipeline.yml -e@/ansible/vars/deployment.yml -e@/ansible/vars/env-deployment.yml
#

- name: create config directory
  file: >
    path="{{ ANALYTICS_PIPELINE_CONFIG_DIR }}"
    mode=0755
    owner={{ hadoop_common_user }}
    group={{ hadoop_common_group }}
    state=directory
  tags:
    - install
    - install:configuration

- name: store output database credentials for analytics pipeline
  copy: >
    content="{{ ANALYTICS_PIPELINE_OUTPUT_DATABASE | to_json }}"
    dest={{ COMMON_CFG_DIR }}/edx-analytics-pipeline/output.json
    mode=0644
    owner={{ hadoop_common_user }}
    group={{ hadoop_common_group }}
  tags:
    - install
    - install:configuration

- name: store input database credentials for analytics pipeline
  copy: >
    content="{{ ANALYTICS_PIPELINE_INPUT_DATABASE | to_json }}"
    dest={{ COMMON_CFG_DIR }}/edx-analytics-pipeline/input.json
    mode=0644
    owner={{ hadoop_common_user }}
    group={{ hadoop_common_group }}
  tags:
    - install
    - install:configuration

- name: luigi configuration directory created
  file: >
    path=/etc/luigi
    state=directory
    mode=0755
  tags:
    - install
    - install:configuration

- name: luigi configuration file written
  template: >
    src=client.cfg.j2
    dest=/etc/luigi/client.cfg
    mode=0644
  tags:
    - install
    - install:configuration

- name: util library source checked out
  git: >
    dest={{ analytics_pipeline_util_library.path }}
    repo={{ analytics_pipeline_util_library.repo }}
    version={{ analytics_pipeline_util_library.version }}
  tags:
    - install
    - install:code

- name: lib directory created
  file: >
    path={{ HADOOP_COMMON_USER_HOME }}/lib
    owner={{ hadoop_common_user }}
    group={{ hadoop_common_group }}
    state=directory
  tags:
    - install
    - install:app-requirements

- name: check if the util library needs to be built
  stat: >
    path={{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar
  register: util_lib_built
  tags:
    - install
    - install:app-requirements

- name: util library built
  shell: >
    chdir={{ analytics_pipeline_util_library.path }}
    {{ hadoop_common_java_home }}/bin/javac -cp `{{ HADOOP_COMMON_HOME }}/bin/hadoop classpath` org/edx/hadoop/input/ManifestTextInputFormat.java &&
    {{ hadoop_common_java_home }}/bin/jar cf {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar org/edx/hadoop/input/ManifestTextInputFormat.class &&
    chown {{ hadoop_common_user }}:{{ hadoop_common_group }} {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar
  when: not util_lib_built.stat.exists
  tags:
    - install
    - install:app-requirements

- name: ensure hdfs services are started
  service: >
    name=hdfs
    state=started
  tags:
    - manage
    - manage:start

- name: ensure map reduce services are started
  service: >
    name=yarn
    state=started
  tags:
    - manage
    - manage:start

- name: ensure package dir exists in HDFS
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p /edx-analytics-pipeline/packages/
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:app-requirements

- name: ensure util library is in HDFS
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -put -f {{ HADOOP_COMMON_USER_HOME }}/lib/edx-analytics-hadoop-util.jar /edx-analytics-pipeline/packages/
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:app-requirements

- name: ensure the data directory exists
  shell: >
    . {{ HADOOP_COMMON_CONF_DIR }}/hadoop-env.sh && hdfs dfs -mkdir -p {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}
  sudo_user: "{{ hadoop_common_user }}"
  tags:
    - install
    - install:base

- name: ensure tracking log file can be read
  file: >
    path={{ COMMON_LOG_DIR }}/tracking/tracking.log
    mode=0644
  ignore_errors: yes
  tags:
    - install
    - install:configuration

- name: cron job syncs tracking log file to hdfs
  cron: >
    user={{ hadoop_common_user }}
    name="Sync tracking log to HDFS"
    job="{{ HADOOP_COMMON_HOME }}/bin/hdfs dfs -put -f {{ COMMON_LOG_DIR }}/tracking/tracking.log {{ ANALYTICS_PIPELINE_HDFS_DATA_DIR }}/tracking.log"
  tags:
    - install
    - install:configuration
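
# Every task above carries tags, so individual phases of this role can be re-run in isolation
# after the initial deploy. The invocation below is an illustrative sketch only: it assumes the
# same playbook and extra-vars files shown in the header comment, and simply adds a --tags filter
# so that only the configuration tasks are re-applied:
#
#   ansible-playbook -i 'localhost,' ./analytics_pipeline.yml \
#     -e@/ansible/vars/deployment.yml -e@/ansible/vars/env-deployment.yml \
#     --tags "install:configuration"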