Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
edx-analytics-data-api
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
edx-analytics-data-api
Commits
68abaf6e
Commit
68abaf6e
authored
Feb 19, 2014
by
Brian Wilson
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add history to WeeklyAllUsersAndEnrollments report.
Change-Id: Iccd7c7ec7970375fd8e5151320575277674e949a
parent
a7128612
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
100 additions
and
40 deletions
+100
-40
edx/analytics/tasks/reports/tests/test_total_enrollments.py
+66
-17
edx/analytics/tasks/reports/total_enrollments.py
+34
-23
No files found.
edx/analytics/tasks/reports/tests/test_total_enrollments.py
View file @
68abaf6e
...
...
@@ -41,9 +41,22 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
# Mock the input and output targets
def reformat(string):
    """Turn an indented, space-separated block of text into TSV text.

    Common leading indentation and surrounding whitespace are removed,
    then every single space is replaced by a tab character.
    """
    dedented = textwrap.dedent(string)
    trimmed = dedented.strip()
    return trimmed.replace(' ', '\t')
if
source
is
None
:
source
=
"""
course_1 2013-03-01 1
course_1 2013-03-30 2
course_2 2013-03-07 1
course_2 2013-03-08 1
course_2 2013-03-10 1
course_2 2013-03-13 1
course_3 2013-03-15 1
course_3 2013-03-18 1
course_3 2013-03-19 1
"""
input_targets
=
{
'source'
:
FakeTarget
(
reformat
(
source
)),
}
...
...
@@ -98,8 +111,8 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
"""
res
=
self
.
run_task
(
source
,
'2013-01-21'
,
4
)
weeks
=
set
([
'2012-12-31'
,
'2013-01-07'
,
'2013-01-14'
,
'2013-01-21'
])
self
.
assertEqual
(
weeks
,
set
(
str
(
w
)
for
w
in
res
.
columns
))
total_enrollment
=
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
]
self
.
assertEqual
(
weeks
,
set
(
str
(
w
)
for
w
in
res
.
columns
))
# pylint: disable=maybe-no-member
total_enrollment
=
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
]
# pylint: disable=maybe-no-member
self
.
assertTrue
(
isnan
(
total_enrollment
[
'2012-12-31'
]))
# no data
self
.
assertEqual
(
total_enrollment
[
'2013-01-07'
],
10
)
self
.
assertEqual
(
total_enrollment
[
'2013-01-14'
],
20
)
...
...
@@ -118,29 +131,60 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
course_2 2013-02-15 -2
"""
res
=
self
.
run_task
(
source
,
'2013-02-18'
,
2
)
total_enrollment
=
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
]
total_enrollment
=
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
]
# pylint: disable=maybe-no-member
self
.
assertEqual
(
total_enrollment
[
'2013-02-11'
],
13
)
self
.
assertEqual
(
total_enrollment
[
'2013-02-18'
],
24
)
def test_offsets(self):
    """Check weekly totals when per-course offsets are added to the counts.

    `None` is passed as the source, so `run_task` falls back to its own
    default source data; no local copy of that data is kept here.
    """
    offset = """
    course_2 2013-03-07 8
    course_3 2013-03-15 6
    """
    res = self.run_task(None, '2013-03-28', 6, offset=offset)
    total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]  # pylint: disable=maybe-no-member
    self.assertTrue(isnan(total_enrollment['2013-02-21']))  # no data
    self.assertTrue(isnan(total_enrollment['2013-02-28']))  # no data
    self.assertEqual(total_enrollment['2013-03-07'], 10)
    self.assertEqual(total_enrollment['2013-03-14'], 13)
    self.assertEqual(total_enrollment['2013-03-21'], 22)
    self.assertEqual(total_enrollment['2013-03-28'], 22)
def test_non_overlapping_history(self):
    """Check weekly totals when history ends before the source data begins.

    The two weeks that precede the source data are expected to carry the
    values given in the history; later weeks come from source and offsets.
    """
    offset = """
    course_2 2013-03-07 8
    course_3 2013-03-15 6
    """
    # Choose history so that it ends right before
    # source data begins (on 3/1).
    history = """
    2013-02-21 4
    2013-02-28 10
    """
    res = self.run_task(None, '2013-03-28', 6, offset=offset, history=history)
    total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]  # pylint: disable=maybe-no-member
    self.assertEqual(total_enrollment['2013-02-21'], 4)
    self.assertEqual(total_enrollment['2013-02-28'], 10)
    self.assertEqual(total_enrollment['2013-03-07'], 10)
    self.assertEqual(total_enrollment['2013-03-14'], 13)
    self.assertEqual(total_enrollment['2013-03-21'], 22)
    self.assertEqual(total_enrollment['2013-03-28'], 22)
def test_overlapping_history(self):
    """Check weekly totals when history overlaps the start of the source data.

    The history has only two entries, one before and one after the source
    data begins, so the expected values for 2/21 and 2/28 lie between them.
    """
    offset = """
    course_2 2013-03-07 8
    course_3 2013-03-15 6
    """
    # Choose history so that it overlaps
    # with when source data begins (on 3/1).
    history = """
    2013-02-18 4
    2013-03-21 22
    """
    res = self.run_task(None, '2013-03-28', 6, offset=offset, history=history)
    total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]  # pylint: disable=maybe-no-member
    self.assertEqual(total_enrollment['2013-02-21'], 5)
    self.assertEqual(total_enrollment['2013-02-28'], 9)
    self.assertEqual(total_enrollment['2013-03-07'], 10)
    self.assertEqual(total_enrollment['2013-03-14'], 13)
    self.assertEqual(total_enrollment['2013-03-21'], 22)
...
...
@@ -156,7 +200,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
res
=
self
.
run_task
(
source
.
encode
(
'utf-8'
),
'2013-04-02'
,
1
)
self
.
assertEqual
(
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
][
'2013-04-02'
],
2
)
self
.
assertEqual
(
res
.
loc
[
TOTAL_ENROLLMENT_ROWNAME
][
'2013-04-02'
],
2
)
# pylint: disable=maybe-no-member
def
test_task_urls
(
self
):
date
=
datetime
.
date
(
2013
,
01
,
20
)
...
...
@@ -164,6 +208,7 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
task
=
WeeklyAllUsersAndEnrollments
(
source
=
's3://bucket/path/'
,
offsets
=
's3://bucket/file.txt'
,
destination
=
'file://path/file.txt'
,
history
=
'file://path/history/file.gz'
,
date
=
date
)
requires
=
task
.
requires
()
...
...
@@ -176,5 +221,9 @@ class TestWeeklyAllUsersAndEnrollments(unittest.TestCase):
self
.
assertIsInstance
(
offsets
,
luigi
.
hdfs
.
HdfsTarget
)
self
.
assertEqual
(
offsets
.
format
,
luigi
.
hdfs
.
Plain
)
offsets
=
requires
[
'history'
]
.
output
()
self
.
assertIsInstance
(
offsets
,
luigi
.
File
)
self
.
assertEqual
(
offsets
.
format
,
luigi
.
format
.
Gzip
)
destination
=
task
.
output
()
self
.
assertIsInstance
(
destination
,
luigi
.
File
)
edx/analytics/tasks/reports/total_enrollments.py
View file @
68abaf6e
"""Total Enrollment related reports"""
import
csv
from
datetime
import
timedelta
import
luigi
import
luigi.hdfs
...
...
@@ -29,8 +30,8 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
names
=
[
'date'
,
'count'
]
data
=
read_tsv
(
input_file
,
names
)
data
.
date
=
pandas
.
to_datetime
(
data
.
date
)
data
=
data
.
set_index
(
'date'
)
data
.
date
=
pandas
.
to_datetime
(
data
.
date
)
# pylint: disable=maybe-no-member
data
=
data
.
set_index
(
'date'
)
# pylint: disable=maybe-no-member
date_range
=
pandas
.
date_range
(
min
(
data
.
index
),
max
(
data
.
index
))
data
=
data
.
reindex
(
date_range
)
...
...
@@ -40,18 +41,27 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
def read_incremental_count_tsv(self, input_file):
    """
    Read TSV containing dates and incremental counts.

    Args:
        input_file: TSV file with dates and incremental counts.

    Returns:
        pandas Series containing daily counts.  Counts for missing days are set to zero.
    """
    daily_counts = self.read_date_count_tsv(input_file)
    # A day with no entry means no change on that day, hence zero.
    return daily_counts.fillna(0)
def read_total_count_tsv(self, input_file):
    """
    Read TSV containing dates and total counts.

    Args:
        input_file: TSV file with dates and total counts.

    Returns:
        pandas Series containing daily counts.  Counts for missing days are interpolated.
    """
    # Totals, unlike increments, cannot be zero-filled: gaps between known
    # dates are interpolated, weighted by the time between index entries.
    return self.read_date_count_tsv(input_file).interpolate(method='time')
def
filter_duplicate_courses
(
self
,
daily_enrollment_totals
):
# TODO: implement this for real. (This is just a placeholder.)
...
...
@@ -85,6 +95,7 @@ class AllCourseEnrollmentCountMixin(CourseEnrollmentCountMixin):
writer
.
writerow
(
dict
((
k
,
k
)
for
k
in
fieldnames
))
# Write header
def format_counts(counts_dict):
    """Replace NaN with dashes.

    Yields one (key, value) pair per entry of `counts_dict`: the value is
    '-' when the count is NaN, and the count converted to int otherwise.
    """
    # items() rather than iteritems(): the latter exists only on Python 2 dicts.
    for key, count in counts_dict.items():
        yield key, '-' if numpy.isnan(count) else int(count)
...
...
@@ -160,18 +171,18 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
# Sum per-course counts to create a single series
# of total enrollment counts per day.
daily_overall_enrollment
=
daily_enrollment_totals
.
sum
(
axis
=
1
)
daily_overall_enrollment
.
name
=
TOTAL_ENROLLMENT_ROWNAME
# Prepend total enrollment history.
overall_enrollment_history
=
self
.
read_history
()
if
overall_enrollment_history
is
not
None
:
self
.
prepend_history
(
daily_overall_enrollment
,
overall_enrollment_history
)
daily_overall_enrollment
=
self
.
prepend_history
(
daily_overall_enrollment
,
overall_enrollment_history
)
# TODO: get user counts, as another series.
# TODO: Combine the two series into a single DataFrame, indexed by date.
# For now, put the single series into a data frame, so that
# it can be sampled and output in a consistent way.
daily_overall_enrollment
.
name
=
TOTAL_ENROLLMENT_ROWNAME
total_counts_by_day
=
pandas
.
DataFrame
(
daily_overall_enrollment
)
# Select values from DataFrame to display per-week.
...
...
@@ -181,7 +192,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
self
.
weeks
,
)
with
self
.
output
()
.
open
(
'w'
)
as
output_file
:
with
self
.
output
()
.
open
(
'w'
)
as
output_file
:
# pylint: disable=maybe-no-member
self
.
save_output
(
total_counts_by_week
,
output_file
)
def
read_source
(
self
):
...
...
@@ -211,7 +222,7 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
"""
data
=
None
if
self
.
input
()
.
get
(
'offsets'
):
with
self
.
input
()[
'offsets'
]
.
open
(
'r'
)
as
offset_file
:
with
self
.
input
()[
'offsets'
]
.
open
(
'r'
)
as
offset_file
:
# pylint: disable=maybe-no-member
data
=
self
.
read_course_date_count_tsv
(
offset_file
)
return
data
...
...
@@ -226,12 +237,11 @@ class WeeklyAllUsersAndEnrollments(luigi.Task, AllCourseEnrollmentCountMixin):
Returns None if no history was specified.
"""
# TODO: implement this for real. (This is just a placeholder.)
data
=
None
if
self
.
input
()
.
get
(
'history'
):
with
self
.
input
()[
'history'
]
.
open
(
'r'
)
as
history_file
:
# TODO: read input file and convert to a Series.
pass
with
self
.
input
()[
'history'
]
.
open
(
'r'
)
as
history_file
:
# pylint: disable=maybe-no-member
data
=
self
.
read_total_count_tsv
(
history_file
)
return
data
def prepend_history(self, count_by_day, history):
    """
    Add historical counts to the front of a series of daily counts.

    Only history dated strictly before the first day of `count_by_day` is
    kept, so wherever the two overlap the values in `count_by_day` win.

    Args:
        count_by_day: pandas Series of counts indexed by date.
        history: pandas Series, also of counts indexed by date.

    Returns:
        A new pandas Series holding the retained history followed by
        `count_by_day`.  Neither argument is modified.

    Raises:
        ValueError: if the combined index contains duplicate dates.
    """
    if len(count_by_day.index) == 0:
        # Nothing to overlap with, so all of the history applies.
        return history

    # Get history dates that are not in the regular count data so there is no overlap.
    # NOTE(review): assumes both indexes are sorted by date, so index[0] is the earliest day.
    last_day_of_history = count_by_day.index[0] - timedelta(1)
    truncated_history = history[:last_day_of_history]

    # History goes first so the result stays in chronological order.
    return pandas.concat([truncated_history, count_by_day], verify_integrity=True)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment