Commit 82204925 by Clinton Blackburn

Merge pull request #8 from edx/better-fake-data

Updated Fake Data Generator
parents bd4e6fc6 10621b44
[run] [run]
omit = analyticsdataserver/settings* omit = analyticsdataserver/settings*
*wsgi.py *wsgi.py
analytics_data_api/management/commands/generate_fake_enrollment_data.py analytics_data_api/management/commands/generate_fake_course_data.py
[report] [report]
# Regexes for lines to exclude from consideration # Regexes for lines to exclude from consideration
......
...@@ -51,7 +51,7 @@ syncdb: ...@@ -51,7 +51,7 @@ syncdb:
loaddata: syncdb loaddata: syncdb
python manage.py loaddata education_levels single_course_activity problem_response_answer_distribution --database=analytics python manage.py loaddata education_levels single_course_activity problem_response_answer_distribution --database=analytics
python manage.py generate_fake_enrollment_data python manage.py generate_fake_course_data
demo: clean requirements loaddata demo: clean requirements loaddata
python manage.py set_api_key edx edx python manage.py set_api_key edx edx
# pylint: disable=line-too-long # pylint: disable=line-too-long,invalid-name
import datetime import datetime
import logging
import random import random
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.utils import timezone
from analytics_data_api.v0 import models from analytics_data_api.v0 import models
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# http://stackoverflow.com/a/3590105 # http://stackoverflow.com/a/3590105
def constrained_sum_sample_pos(num_values, total): def constrained_sum_sample_pos(num_values, total):
"""Return a randomly chosen list of n positive integers summing to total. """Return a randomly chosen list of n positive integers summing to total.
...@@ -22,19 +28,15 @@ def get_count(start): ...@@ -22,19 +28,15 @@ def get_count(start):
class Command(BaseCommand): class Command(BaseCommand):
def handle(self, *args, **options): def generate_daily_data(self, course_id, start_date, end_date):
# Use the preset ratios below to generate data in the specified demographics
days = 120 gender_ratios = {
course_id = 'edX/DemoX/Demo_Course'
start_date = datetime.date(year=2014, month=1, day=1)
genders = {
'm': 0.6107, 'm': 0.6107,
'f': 0.3870, 'f': 0.3870,
'o': 0.23 'o': 0.23
} }
education_level_ratios = {
education_levels = {
'associates': 0.058, 'associates': 0.058,
'bachelors': 0.3355, 'bachelors': 0.3355,
'primary': 0.0046, 'primary': 0.0046,
...@@ -45,8 +47,7 @@ class Command(BaseCommand): ...@@ -45,8 +47,7 @@ class Command(BaseCommand):
'other': 0.0271, 'other': 0.0271,
'doctorate': 0.0470 'doctorate': 0.0470
} }
country_ratios = {
countries = {
'US': 0.34, 'US': 0.34,
'GH': 0.12, 'GH': 0.12,
'IN': 0.10, 'IN': 0.10,
...@@ -55,6 +56,7 @@ class Command(BaseCommand): ...@@ -55,6 +56,7 @@ class Command(BaseCommand):
'DE': 0.08 'DE': 0.08
} }
# Generate birth year ratios
birth_years = range(1960, 2005) birth_years = range(1960, 2005)
ratios = [n / 1000.0 for n in constrained_sum_sample_pos(len(birth_years), 1000)] ratios = [n / 1000.0 for n in constrained_sum_sample_pos(len(birth_years), 1000)]
birth_years = dict(zip(birth_years, ratios)) birth_years = dict(zip(birth_years, ratios))
...@@ -67,24 +69,29 @@ class Command(BaseCommand): ...@@ -67,24 +69,29 @@ class Command(BaseCommand):
models.CourseEnrollmentByCountry]: models.CourseEnrollmentByCountry]:
model.objects.all().delete() model.objects.all().delete()
logger.info("Deleted all daily course enrollment data.")
logger.info("Generating new daily course enrollment data...")
# Create new data # Create new data
daily_total = 1500 daily_total = 1500
for i in range(days): date = start_date
while date <= end_date:
daily_total = get_count(daily_total) daily_total = get_count(daily_total)
date = start_date + datetime.timedelta(days=i)
models.CourseEnrollmentDaily.objects.create(course_id=course_id, date=date, count=daily_total) models.CourseEnrollmentDaily.objects.create(course_id=course_id, date=date, count=daily_total)
for gender, ratio in genders.iteritems(): for gender, ratio in gender_ratios.iteritems():
count = int(ratio * daily_total) count = int(ratio * daily_total)
models.CourseEnrollmentByGender.objects.create(course_id=course_id, date=date, count=count, gender=gender) models.CourseEnrollmentByGender.objects.create(course_id=course_id, date=date, count=count,
gender=gender)
for short_name, ratio in education_levels.iteritems(): for short_name, ratio in education_level_ratios.iteritems():
education_level = models.EducationLevel.objects.get(short_name=short_name) education_level = models.EducationLevel.objects.get(short_name=short_name)
count = int(ratio * daily_total) count = int(ratio * daily_total)
models.CourseEnrollmentByEducation.objects.create(course_id=course_id, date=date, count=count, models.CourseEnrollmentByEducation.objects.create(course_id=course_id, date=date, count=count,
education_level=education_level) education_level=education_level)
for country_code, ratio in countries.iteritems(): for country_code, ratio in country_ratios.iteritems():
count = int(ratio * daily_total) count = int(ratio * daily_total)
models.CourseEnrollmentByCountry.objects.create(course_id=course_id, date=date, count=count, models.CourseEnrollmentByCountry.objects.create(course_id=course_id, date=date, count=count,
country_code=country_code) country_code=country_code)
...@@ -93,3 +100,41 @@ class Command(BaseCommand): ...@@ -93,3 +100,41 @@ class Command(BaseCommand):
count = int(ratio * daily_total) count = int(ratio * daily_total)
models.CourseEnrollmentByBirthYear.objects.create(course_id=course_id, date=date, count=count, models.CourseEnrollmentByBirthYear.objects.create(course_id=course_id, date=date, count=count,
birth_year=birth_year) birth_year=birth_year)
date = date + datetime.timedelta(days=1)
logger.info("Done!")
def generate_weekly_data(self, course_id, start_date, end_date):
    """Replace all weekly course activity rows with freshly randomized data.

    Every existing ``CourseActivityByWeek`` row is deleted first. The span
    from ``start_date`` up to ``end_date`` is then walked in intervals of at
    most one week; the final interval is clipped so it never extends past
    ``end_date``. For each interval a random number of active students
    (between 100 and 4000) is drawn and split into positive per-activity
    counts, and an additional ``'any'`` row records the undivided total.
    """
    one_week = datetime.timedelta(weeks=1)
    activity_types = ['played_video', 'attempted_problem', 'posted_forum']
    activity_model = models.CourseActivityByWeek

    activity_model.objects.all().delete()
    logger.info("Deleted all weekly course activity.")

    logger.info("Generating new weekly course activity data...")
    interval_start = start_date
    while interval_start < end_date:
        active_students = random.randint(100, 4000)
        # Clip the last interval so it ends exactly at end_date.
        interval_end = min(interval_start + one_week, end_date)

        # Per-activity rows come first, followed by the 'any' roll-up row.
        split_counts = constrained_sum_sample_pos(len(activity_types), active_students)
        rows = list(zip(activity_types, split_counts))
        rows.append(('any', active_students))

        for activity_type, count in rows:
            activity_model.objects.create(
                course_id=course_id,
                activity_type=activity_type,
                count=count,
                interval_start=interval_start,
                interval_end=interval_end,
            )

        interval_start = interval_end

    logger.info("Done!")
def handle(self, *args, **options):
    """Regenerate the fake weekly and daily data for the demo course.

    The generated data covers the period from 1 January 2014 (UTC) up to
    the current time, truncated to whole seconds. Weekly activity data is
    generated first, then daily enrollment data.
    """
    course_id = 'edX/DemoX/Demo_Course'
    start_date = datetime.datetime(2014, 1, 1, tzinfo=timezone.utc)
    # Drop sub-second precision from the end of the range.
    end_date = timezone.now().replace(microsecond=0)

    logger.info("Generating data for %s...", course_id)
    for generate in (self.generate_weekly_data, self.generate_daily_data):
        generate(course_id, start_date, end_date)
...@@ -96,7 +96,7 @@ class ProblemResponseAnswerDistribution(models.Model): ...@@ -96,7 +96,7 @@ class ProblemResponseAnswerDistribution(models.Model):
created = models.DateTimeField(auto_now_add=True, db_column='created') created = models.DateTimeField(auto_now_add=True, db_column='created')
Country = namedtuple('Country', 'name, code') Country = namedtuple('Country', 'name code')
class CourseEnrollmentByCountry(BaseCourseEnrollment): class CourseEnrollmentByCountry(BaseCourseEnrollment):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment