Commit a05b70db by Joe Blaylock

Stanford-oriented Datadog updates

* Datadog explicitly sets hostname in style Stanford uses (relies on
  /etc/hostname being accurate, requires reboot)
* Enables nginx status check module integration for datadog
* Installs nginx access log processor that counts HTTP response codes
* Updates to lots of other places to make sure that their logs appear
  where we expect
* Has sample code showing how to emit events from log contents which make it
  into the Datadog event stream
* Separation of stage and prod datadog configurations
parent eea60f9f
......@@ -24,7 +24,7 @@
- "{{ secure_dir }}/vars/users.yml"
- "{{ secure_dir }}/vars/edxapp_prod_users.yml"
- "{{ secure_dir }}/vars/shib_prod_vars.yml"
- "{{ secure_dir }}/vars/datadog_stage.yml"
- "{{ secure_dir }}/vars/datadog_prod.yml"
roles:
- common
- role: nginx
......
......@@ -2,7 +2,7 @@
sudo: True
vars_files:
- "{{ secure_dir }}/vars/users_jumpbox.yml"
- "{{ secure_dir }}/vars/datadog_stage.yml"
- "{{ secure_dir }}/vars/datadog_prod.yml"
vars:
secure_dir: '../../../configuration-secure/ansible'
local_dir: '../../../configuration-secure/ansible/local'
......
......@@ -15,7 +15,7 @@
- "{{ secure_dir }}/vars/ora_prod_vars.yml"
- "{{ secure_dir }}/vars/users.yml"
- "{{ secure_dir }}/vars/edxapp_prod_users.yml"
- "{{ secure_dir }}/vars/datadog_stage.yml"
- "{{ secure_dir }}/vars/datadog_prod.yml"
roles:
- common
- role: nginx
......
......@@ -14,7 +14,7 @@
- "{{ secure_dir }}/vars/users.yml"
- "{{ secure_dir }}/vars/edxapp_prod_users.yml"
- "{{ secure_dir }}/vars/shib_prod_vars.yml"
- "{{ secure_dir }}/vars/datadog_stage.yml"
- "{{ secure_dir }}/vars/datadog_prod.yml"
roles:
- common
- { role: 'edxapp', celery_worker: True }
......
# this gets all running prod webservers
#- hosts: tag_environment_prod:&tag_function_xqueue
# or we can get subsets of them by name
- hosts: ~tag_Name_xqueue11_prod
- hosts: ~tag_Name_xqueue10_prod
#- hosts: security_group_edx-prod-EdxappServerSecurityGroup-NSKCQTMZIPQB
sudo: True
vars:
......@@ -13,7 +13,7 @@
- "{{ secure_dir }}/vars/xqueue_prod_vars.yml"
- "{{ secure_dir }}/vars/users.yml"
- "{{ secure_dir }}/vars/edxapp_prod_users.yml"
- "{{ secure_dir }}/vars/datadog_stage.yml"
- "{{ secure_dir }}/vars/datadog_prod.yml"
roles:
- common
- role: nginx
......
"""Consumes nginx access logs and emits counts by response code
Run from datadog's 'dogstreams' functionality.
"""
import re
import statsd
#SHORT_HOSTNAME = '.'.join(socket.gethostname().split('.')[:2])
MONTHS_LOOKUP = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
RETURN_RE = re.compile(r'^(?:.+?) \[(?P<day>\d\d)\/(?P<month>\w\w\w)\/(?P<year>\d\d\d\d):(?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d) (?:[-+]\d+)\] "(?P<url>.+?)" (?P<value>\d\d\d) (?:.+)')
def count_event_example(logger, line):
    """Sample dogstreams parser that would turn an access-log line into an event.

    Do not use this function as-is; it emits events too fast. Events are
    supposed to be special things that happen periodically, rather than common
    things we want to count a lot of. The code is kept only as a
    straightforward illustration of building an event from the contents of a
    log file.

    Args:
        logger: logger object handed in by the caller; not used here.
        line: one line of the nginx access log.

    Returns:
        Always None -- see the unconditional return below.
    """
    # Deliberately disabled: bail out before any parsing so no events are ever
    # emitted. Everything after this return is unreachable sample code.
    return None

    # The imports stay local (and unreachable) so the disabled example adds no
    # import-time cost to the module.
    import time
    from datetime import datetime

    match = RETURN_RE.match(line)
    # Unparseable lines and heartbeat requests produce no event.
    if not match or match.group('url') == 'GET /heartbeat HTTP/1.1':
        return None

    status = match.group('value')
    # RETURN_RE does not capture the UTC offset, so this is a naive datetime;
    # time.mktime() below interprets it in the host's local timezone.
    logged_at = datetime(
        year=int(match.group('year')),
        month=MONTHS_LOOKUP[match.group('month')],
        day=int(match.group('day')),
        hour=int(match.group('hour')),
        minute=int(match.group('minute')),
        second=int(match.group('second')))
    return {
        'msg_text': line,
        'priority': 'low',
        'timestamp': int(time.mktime(logged_at.timetuple())),
        'msg_title': 'nginx HTTP ' + status,
        'event_type': 'http.' + status,
    }
def count(logger, line):
    """Count one access-log line by HTTP response code and by response class.

    Works purely by side effect. Inspired by
    http://docs.datadoghq.com/guides/logs/ - this doesn't do normal log event
    processing, but instead uses the dogstats API to do simple counting. Doing
    it this way folds the counting into the agent's daemonization, so we don't
    have to write our own daemon or rely on cron.

    Args:
        logger: logger object handed in by the caller; not used here.
        line: one line of the nginx access log.

    Returns:
        Always None, so no log event is produced for the line.
    """
    match = RETURN_RE.match(line)
    if match is None:
        # Not a line RETURN_RE recognises: nothing to count.
        return None

    status = match.group('value')
    # One counter for the exact code (e.g. 404) ...
    exact_metric = 'nginx.response.' + status + '.count'
    # ... and one for its class, keyed by the first digit (e.g. 4xx).
    class_metric = 'nginx.response.' + status[0] + 'xx.count'
    statsd.statsd.increment(exact_metric, 1)
    statsd.statsd.increment(class_metric, 1)
    return None
### Inspired by http://docs.datadoghq.com/guides/logs/
def test():
    """Self-check for this module; prints 'test passes' when every case holds.

    For each sample access-log line this verifies that RETURN_RE extracts the
    expected named groups (the 'value' group is what count() turns into
    metric names) and that count_event_example() stays disabled, i.e. returns
    None. Expectations are plain regex captures, so the result does not
    depend on the timezone of the machine running the test.

    Raises:
        AssertionError: if a line fails to match, a captured group differs
            from the expectation, or the example parser returns an event.
    """
    # (raw access-log line, named groups RETURN_RE must capture from it)
    test_suite = [
        ('''10.0.0.65 - - [12/Nov/2013:06:30:53 +0000] "-" 400 0 "-" "-"''',
         {'day': '12', 'month': 'Nov', 'year': '2013',
          'hour': '06', 'minute': '30', 'second': '53',
          'url': '-',
          'value': '400', }),
        ('''10.0.0.65 - - [12/Nov/2013:06:30:54 +0000] "GET /heartbeat HTTP/1.1" 200 10097 "-" "ELB-HealthChecker/1.0"''',
         {'day': '12', 'month': 'Nov', 'year': '2013',
          'hour': '06', 'minute': '30', 'second': '54',
          'url': 'GET /heartbeat HTTP/1.1',
          'value': '200', }),
        ('''10.0.0.65 - stanford [12/Nov/2013:19:35:30 +0000] "GET /courses/Education/EDUC115N/How_to_Learn_Math/courseware/627b094444a1487db5c1b3caaef096cf/d8d40562b7ec40789315e2ccefa5cc5b/ HTTP/1.1" 200 11219 "https://stage.class.stanford.edu/courses/Education/EDUC115N/How_to_Learn_Math/courseware/627b094444a1487db5c1b3caaef096cf/d8d40562b7ec40789315e2ccefa5cc5b/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/30.0.1599.114 Chrome/30.0.1599.114 Safari/537.36"''',
         {'day': '12', 'month': 'Nov', 'year': '2013',
          'hour': '19', 'minute': '35', 'second': '30',
          'url': 'GET /courses/Education/EDUC115N/How_to_Learn_Math/courseware/627b094444a1487db5c1b3caaef096cf/d8d40562b7ec40789315e2ccefa5cc5b/ HTTP/1.1',
          'value': '200', }),
        ('''10.0.0.65 - stanford [12/Nov/2013:19:36:54 +0000] "GET /jsi18n/ HTTP/1.1" 200 2170 "https://stage.class.stanford.edu/courses/Education/EDUC115N/How_to_Learn_Math/courseware/b5c2c03d98274010bdb655afa2eaed31/1e8b3bccf4c34f79b2e43ae64cd1f54c/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/30.0.1599.114 Chrome/30.0.1599.114 Safari/537.36"''',
         {'day': '12', 'month': 'Nov', 'year': '2013',
          'hour': '19', 'minute': '36', 'second': '54',
          'url': 'GET /jsi18n/ HTTP/1.1',
          'value': '200', }),
    ]

    # Set up the test logger
    import logging
    logging.basicConfig(level=logging.DEBUG)

    for line, expected in test_suite:
        m = RETURN_RE.match(line)
        assert m is not None, "no match for %r" % (line,)
        actual = m.groupdict()
        assert expected == actual, "%s != %s" % (expected, actual)

        # The example event parser is intentionally switched off (it would
        # emit events too fast), so it must not produce anything.
        event = count_event_example(logging, line)
        assert event is None, "expected no event, got %s" % (event,)

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('test passes')
if __name__ == '__main__':
    # For local testing, callable as
    # "python /path/to/nginx_log_http_response_counter.py"
    test()
---
- name: datadog | restart the datadog service
service: name=datadog-agent state=restarted
\ No newline at end of file
service: name=datadog-agent state=restarted
......@@ -45,11 +45,73 @@
- ubuntu
when: ansible_distribution in common_debian_variants
- name: datadog | datadog user to syslog group for log reading
user: append=yes groups=syslog name=dd-agent state=present
tags:
- datadog
- ubuntu
- name: datadog | install dogstatsd library
pip: name=dogstatsd-python state=present
tags:
- datadog
- name: datadog | install log watcher scripts
copy:
src=usr/share/datadog/agent/nginx_log_http_response_counter.py
dest=/usr/share/datadog/agent/nginx_log_http_response_counter.py
owner=root group=root mode=0644
tags:
- datadog
# Seed datadog.conf from the packaged example on first run only; `creates`
# skips the copy once the file exists. `command` is used instead of `shell`
# because the cp needs no shell features.
- name: datadog | bootstrap config
  command: cp /etc/dd-agent/datadog.conf.example /etc/dd-agent/datadog.conf creates=/etc/dd-agent/datadog.conf
  tags:
    - datadog
- name: datadog | set config permissions
file: path=/etc/dd-agent/datadog.conf state=file owner=dd-agent group=root mode=0640
tags:
- datadog
- name: datadog | enable nginx module
template: dest=/etc/dd-agent/conf.d/nginx.yaml src=nginx.yaml.j2 owner=root group=root mode=0644
notify:
- datadog | restart the datadog service
when: nginx_sites is defined
tags:
- datadog
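# Quoting is intentional, and so is the missing space after line=hostname:
# (ansible wasn't handling the double quoted yaml properly otherwise).
# Note: the regexp also matches an uncommented hostname: line, so an
# already-set hostname is replaced too.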
- name: datadog | set hostname if unset
lineinfile:
dest="/etc/dd-agent/datadog.conf"
"line=hostname:{{ansible_hostname}}.{{environment_tag}}"
state=present
"regexp=^\#?\s*hostname:.+$"
notify:
- datadog | restart the datadog service
tags:
- datadog
# Quoting is intentional, and so is the missing space after line=dogstreams:
# (ansible wasn't handling the double quoted yaml properly otherwise).
- name: datadog | update dogstreams
lineinfile:
dest="/etc/dd-agent/datadog.conf"
"line=dogstreams:{{log_base_dir}}/nginx/access.log:nginx_log_http_response_counter:count"
state=present
insertafter=EOF
"regexp=^\#?\s*dogstreams:.+$"
when: nginx_sites is defined
notify:
- datadog | restart the datadog service
tags:
- datadog
# quoting intentional, missing space after line=api_key: also
# ansible wasn't handling the double quoted yaml properly
# otherwise.
......@@ -62,3 +124,4 @@
- datadog | restart the datadog service
tags:
- datadog
init_config:
instances:
# For every instance, you have an `nginx_status_url` and (optionally)
# a list of tags.
- nginx_status_url: http://127.0.0.1/nginx_status/
tags:
- instance:{{ansible_hostname}}.{{environment_tag}}-nginx
#- nginx_status_url: http://example2.com:1234/nginx_status/
# tags:
# - instance:bar
......@@ -71,7 +71,7 @@
- logging
- update
- name: nginx | Set up nginx access log rotation
- name: nginx | Set up nginx error log rotation
template: dest=/etc/logrotate.d/nginx-error src=edx_logrotate_nginx_error.j2 owner=root group=root mode=644
tags:
- logging
......
......@@ -7,6 +7,9 @@ upstream app_server {
server {
listen {{ ORA_NGINX_PORT }} default_server;
access_log {{log_base_dir}}/nginx/access.log;
error_log {{log_base_dir}}/nginx/error.log error;
location / {
{% if ORA_BASIC_AUTH %}
......@@ -43,4 +46,13 @@ server {
proxy_redirect off;
proxy_pass http://app_server;
}
# Monitoring support for datadog.
location /nginx_status {
stub_status on;
access_log off;
allow 127.0.0.1/32;
deny all;
}
}
......@@ -7,6 +7,9 @@ upstream xqueue_app_server {
server {
listen {{ XQUEUE_NGINX_PORT }} default_server;
access_log {{log_base_dir}}/nginx/access.log;
error_log {{log_base_dir}}/nginx/error.log error;
location / {
try_files $uri @proxy_to_app;
}
......@@ -16,6 +19,14 @@ server {
try_files $uri @proxy_to_app;
}
# Monitoring support for datadog.
location /nginx_status {
stub_status on;
access_log off;
allow 127.0.0.1/32;
deny all;
}
location @proxy_to_app {
proxy_set_header X-Forwarded-Proto $http_x_forwarded_proto;
proxy_set_header X-Forwarded-Port $http_x_forwarded_port;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment