Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
edx-analytics-data-api
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
edx-analytics-data-api
Commits
d0307a38
Commit
d0307a38
authored
Feb 12, 2014
by
Brian Wilson
Committed by
Gerrit Code Review
Feb 12, 2014
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Implement skeleton version of Total Enrollment and Users report."
parents
8b555648
2365e97e
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
558 additions
and
91 deletions
+558
-91
edx/analytics/tasks/reports/enrollments.py
+124
-91
edx/analytics/tasks/reports/tests/test_total_enrollments.py
+198
-0
edx/analytics/tasks/reports/total_enrollments.py
+213
-0
edx/analytics/tasks/util/tsv.py
+23
-0
No files found.
edx/analytics/tasks/reports/enrollments.py
View file @
d0307a38
This diff is collapsed.
Click to expand it.
edx/analytics/tasks/reports/tests/test_total_enrollments.py
0 → 100644
View file @
d0307a38
"""Tests for Total Users and Enrollment report."""
from
contextlib
import
contextmanager
import
datetime
import
textwrap
from
StringIO
import
StringIO
from
unittest
import
TestCase
import
luigi
import
luigi.hdfs
from
mock
import
MagicMock
from
numpy
import
isnan
import
pandas
from
edx.analytics.tasks.reports.total_enrollments
import
TotalUsersAndEnrollmentsByWeek
,
TOTAL_ENROLLMENT_ROWNAME
class
FakeTarget
(
object
):
"""
Fake luigi like target that saves data in memory, using a
StringIO buffer.
"""
def
__init__
(
self
,
value
=
''
):
self
.
buffer
=
StringIO
(
value
)
# Rewind the buffer head so the value can be read
self
.
buffer
.
seek
(
0
)
@contextmanager
def
open
(
self
,
*
args
,
**
kwargs
):
yield
self
.
buffer
# Rewind the head for easy reading
self
.
buffer
.
seek
(
0
)
class TestTotalUsersAndEnrollmentsByWeek(TestCase):
    """Tests for TotalUsersAndEnrollmentsByWeek class."""

    def run_task(self, source, date, weeks, offset=None, history=None):
        """
        Run task with fake targets.

        Args:
            source: whitespace-separated course_id/date/count rows; single
                spaces are converted to tabs before being fed to the task.
            date: end date of the last requested week, as 'YYYY-MM-DD'.
            weeks: number of weeks to request.
            offset: optional offsets input, same format as `source`.
            history: optional history input; mocked only when provided.

        Returns:
            the task output as a pandas dataframe.
        """
        parsed_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # Make offsets None if it was not specified.
        task = TotalUsersAndEnrollmentsByWeek(
            source='fake_source',
            offsets='fake_offsets' if offset else None,
            history='fake_history' if history else None,
            destination='fake_destination',
            date=parsed_date,
            weeks=weeks
        )

        # Mock the input and output targets

        def reformat(string):
            # Reformat string to make it like a hadoop tsv: dedent the
            # triple-quoted literal and turn single spaces into tabs.
            return textwrap.dedent(string).strip().replace(' ', '\t')

        input_targets = {
            'source': FakeTarget(reformat(source)),
        }

        # Mock offsets only if specified.
        if offset:
            input_targets.update({'offsets': FakeTarget(reformat(offset))})

        # Mock history only if specified.
        if history:
            input_targets.update({'history': FakeTarget(reformat(history))})

        # Replace luigi's input()/output() plumbing with the fakes above.
        task.input = MagicMock(return_value=input_targets)

        output_target = FakeTarget()
        task.output = MagicMock(return_value=output_target)

        # Run the task and parse the output into a pandas dataframe.
        # '-' cells (written for NaN counts) are read back as NaN.
        task.run()

        data = output_target.buffer.read()
        result = pandas.read_csv(StringIO(data),
                                 na_values=['-'],
                                 index_col='name')

        return result

    def test_parse_source(self):
        """Per-course counts accumulate into weekly running totals."""
        source = """
            course_1 2013-01-01 10
            course_1 2013-01-02 10
            course_1 2013-01-03 10
            course_1 2013-01-09 10
            course_1 2013-01-17 10
            course_2 2013-01-01 10
            course_3 2013-01-01 10
            """
        res = self.run_task(source, '2013-01-17', 3)

        # self.assertEqual(set(['name']), set(res.index))
        # Columns are the requested week-ending dates.
        self.assertEqual(set(['2013-01-03', '2013-01-10', '2013-01-17']),
                         set(res.columns))
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-03'], 50)
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-10'], 60)
        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-01-17'], 70)

    def test_week_grouping(self):
        """Weeks with no data before/after the samples report NaN."""
        source = """
            course_1 2013-01-06 10
            course_1 2013-01-14 10
            """
        res = self.run_task(source, '2013-01-21', 4)

        weeks = set(['2012-12-31', '2013-01-07', '2013-01-14', '2013-01-21'])
        self.assertEqual(weeks, set(str(w) for w in res.columns))

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertTrue(isnan(total_enrollment['2012-12-31']))  # no data
        self.assertEqual(total_enrollment['2013-01-07'], 10)
        self.assertEqual(total_enrollment['2013-01-14'], 20)
        self.assertTrue(isnan(total_enrollment['2013-01-21']))  # no data

    def test_cumulative(self):
        """Positive and negative daily changes sum cumulatively across courses."""
        source = """
            course_1 2013-02-01 4
            course_1 2013-02-04 4
            course_1 2013-02-08 5
            course_1 2013-02-12 -4
            course_1 2013-02-16 6
            course_1 2013-02-18 6
            course_2 2013-02-12 2
            course_2 2013-02-14 3
            course_2 2013-02-15 -2
            """
        res = self.run_task(source, '2013-02-18', 2)

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertEqual(total_enrollment['2013-02-11'], 13)
        self.assertEqual(total_enrollment['2013-02-18'], 24)

    def test_offsets(self):
        """Per-course offsets seed the totals from their offset date onward."""
        source = """
            course_1 2013-03-01 1
            course_1 2013-03-30 2
            course_2 2013-03-07 1
            course_2 2013-03-08 1
            course_2 2013-03-10 1
            course_2 2013-03-13 1
            course_3 2013-03-15 1
            course_3 2013-03-18 1
            course_3 2013-03-19 1
            """
        offset = """
            course_2 2013-03-07 8
            course_3 2013-03-15 6
            """
        res = self.run_task(source, '2013-03-28', 4, offset=offset)

        total_enrollment = res.loc[TOTAL_ENROLLMENT_ROWNAME]
        self.assertEqual(total_enrollment['2013-03-07'], 10)
        self.assertEqual(total_enrollment['2013-03-14'], 13)
        self.assertEqual(total_enrollment['2013-03-21'], 22)
        self.assertEqual(total_enrollment['2013-03-28'], 22)

    def test_unicode(self):
        """Non-ASCII course ids survive a round trip through the task."""
        course_id = u'course_\u2603'

        source = u"""
            {course_id} 2013-04-01 1
            {course_id} 2013-04-02 1
            """.format(course_id=course_id)

        # Input is fed as UTF-8 encoded bytes, matching the hadoop TSV source.
        res = self.run_task(source.encode('utf-8'), '2013-04-02', 1)

        self.assertEqual(res.loc[TOTAL_ENROLLMENT_ROWNAME]['2013-04-02'], 2)

    def test_task_urls(self):
        """URL parameters map to the expected luigi target types."""
        # NOTE(review): the leading-zero literal `01` is Python 2 only
        # syntax; it would need to become `1` under Python 3.
        date = datetime.date(2013, 01, 20)

        task = TotalUsersAndEnrollmentsByWeek(source='s3://bucket/path/',
                                              offsets='s3://bucket/file.txt',
                                              destination='file://path/file.txt',
                                              date=date)

        requires = task.requires()

        # The source directory is read as a whole (PlainDir format).
        source = requires['source'].output()
        self.assertIsInstance(source, luigi.hdfs.HdfsTarget)
        self.assertEqual(source.format, luigi.hdfs.PlainDir)

        # The offsets input is a single file (Plain format).
        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        # A file:// destination resolves to a local file target.
        destination = task.output()
        self.assertIsInstance(destination, luigi.File)
edx/analytics/tasks/reports/total_enrollments.py
0 → 100644
View file @
d0307a38
"""Total Enrollment related reports"""
import
csv
import
luigi
import
luigi.hdfs
import
numpy
import
pandas
from
edx.analytics.tasks.url
import
ExternalURL
,
get_target_from_url
from
edx.analytics.tasks.reports.enrollments
import
CourseEnrollmentCountMixin
# Header for the column that names each report row.
ROWNAME_HEADER = 'name'
# Row title used for the total-enrollment series in the output CSV.
TOTAL_ENROLLMENT_ROWNAME = 'Total Enrollment'
class TotalUsersAndEnrollmentsByWeek(luigi.Task, CourseEnrollmentCountMixin):
    """
    Calculates total users and enrollments across all (known) courses per week.

    Parameters:
        source: Location of daily enrollments per date. The format is a
            TSV file, with fields course_id, date and count.
        destination: Location of the resulting report. The output format is an
            excel-compatible CSV file.
        history: Location of historical values for total course enrollment.
            The format is a TSV file, with fields "date" and "enrollments".
        offsets: Location of seed values for each course. The format is a
            Hadoop TSV file, with fields "course_id", "date" and "offset".
        date: End date of the last week requested.
        weeks: Number of weeks from the end date to request.

    Output:
        Excel-compatible CSV file with a header row and two non-header
        rows. The first column is a title for the row, and subsequent
        columns are the total counts for each week requested. The
        first non-header row contains the total users at the end of
        each week. The second row contains the total course
        enrollments at the end of each week.
    """
    # TODO: add the first (total users) row later, when we have access to total
    # user counts (e.g. queried from and reconstructed from a production database).

    source = luigi.Parameter()
    destination = luigi.Parameter()
    # Offsets and history are optional inputs; requires() only wires
    # them up when a location was provided.
    offsets = luigi.Parameter(default=None)
    history = luigi.Parameter(default=None)
    date = luigi.DateParameter()
    weeks = luigi.IntParameter(default=52)

    def requires(self):
        """Declare upstream targets; optional inputs only when configured."""
        results = {'source': ExternalURL(self.source)}
        if self.offsets:
            results.update({'offsets': ExternalURL(self.offsets)})
        if self.history:
            results.update({'history': ExternalURL(self.history)})
        return results

    def output(self):
        """Return the report target derived from the destination URL."""
        return get_target_from_url(self.destination)

    def run(self):
        """Build the weekly total-enrollment report and write it as CSV."""
        # Load the explicit enrollment data into a pandas dataframe.
        daily_enrollment_changes = self.read_source()

        # Add enrollment offsets to allow totals to be calculated
        # for explicit enrollments.
        offsets = self.read_offsets()
        # calculate_total_enrollment is provided by CourseEnrollmentCountMixin.
        daily_enrollment_totals = self.calculate_total_enrollment(daily_enrollment_changes, offsets)

        # Remove (or merge or whatever) data for courses that
        # would otherwise result in duplicate counts.
        self.filter_duplicate_courses(daily_enrollment_totals)

        # Sum per-course counts to create a single series
        # of total enrollment counts per day.
        daily_overall_enrollment = daily_enrollment_totals.sum(axis=1)
        daily_overall_enrollment.name = TOTAL_ENROLLMENT_ROWNAME

        # Prepend total enrollment history.
        overall_enrollment_history = self.read_history()
        if overall_enrollment_history is not None:
            self.prepend_history(daily_overall_enrollment, overall_enrollment_history)

        # TODO: get user counts, as another series.
        # TODO: Combine the two series into a single DataFrame, indexed by date.
        # For now, put the single series into a data frame, so that
        # it can be sampled and output in a consistent way.
        total_counts_by_day = pandas.DataFrame(daily_overall_enrollment)

        # Select values from DataFrame to display per-week.
        total_counts_by_week = self.select_weekly_values(
            total_counts_by_day,
            self.date,
            self.weeks,
        )

        with self.output().open('w') as output_file:
            self.save_output(total_counts_by_week, output_file)

    def read_source(self):
        """
        Read source into a pandas DataFrame.

        Returns:
            Pandas dataframe with one column per course_id. Indexed
            for the time interval available in the source data.
        """
        with self.input()['source'].open('r') as input_file:
            course_date_count_data = self.read_course_date_count_tsv(input_file)

        data = self.initialize_daily_count(course_date_count_data)
        return data

    def read_offsets(self):
        """
        Read offsets into a pandas DataFrame.

        Returns:
            Pandas dataframe with one row per course_id and
            columns for the date and count of the offset.

            Returns None if no offset was specified.
        """
        data = None
        if self.input().get('offsets'):
            with self.input()['offsets'].open('r') as offset_file:
                data = self.read_course_date_count_tsv(offset_file)
        return data

    def read_history(self):
        """
        Read course total enrollment history into a pandas DataFrame.

        Returns:
            Pandas Series, indexed by date, containing total
            enrollment counts by date.

            Returns None if no history was specified.
        """
        # TODO: implement this for real. (This is just a placeholder:
        # it opens the input but always returns None.)
        data = None
        if self.input().get('history'):
            with self.input()['history'].open('r') as history_file:
                # TODO: read input file and convert to a Series.
                pass
        return data

    def prepend_history(self, count_by_day, history):
        """
        Add history to a series in-place.

        Args:
            count_by_day: pandas Series
            history: pandas Series, also of counts indexed by date.
        """
        # TODO: implement this for real. (This is just a placeholder.)
        # Check that entry doesn't already exist in count_by_day
        # before adding value from history.
        # For gaps in history, values should be extrapolated.
        # Also may to need to reindex, since new dates are being added.
        pass

    def filter_duplicate_courses(self, daily_enrollment_totals):
        """Placeholder: drop/merge courses that would be double-counted."""
        # TODO: implement this for real. (This is just a placeholder.)
        # At this point we should remove data for courses that are
        # represented by other courses, because the students have been
        # moved to the new course. Perhaps this should actually
        # perform a merge of the two courses, since we would want the
        # history of one before the move date, and the history of the
        # second after that date.
        # Note that this is not the same filtering that would be applied
        # to the EnrollmentsByWeek report.
        pass

    def save_output(self, results, output_file):
        """
        Write output to CSV file.

        Args:
            results: a pandas DataFrame object containing series data
                per row to be output.
            output_file: open, writable file object for the CSV report.
        """
        # transpose the dataframe so that weeks are columns, and output:
        results = results.transpose()

        # List of fieldnames for the report
        fieldnames = [ROWNAME_HEADER] + list(results.columns)

        writer = csv.DictWriter(output_file, fieldnames)
        # Write header row by mapping each fieldname to itself.
        writer.writerow(dict((k, k) for k in fieldnames))

        def format_counts(counts_dict):
            # Render NaN counts as '-' and everything else as an int.
            # NOTE(review): dict.iteritems is Python 2 only; under
            # Python 3 this would need to be items().
            for k, v in counts_dict.iteritems():
                yield k, '-' if numpy.isnan(v) else int(v)

        for series_name, series in results.iterrows():
            values = {ROWNAME_HEADER: series_name, }
            by_week_values = format_counts(series.to_dict())
            values.update(by_week_values)
            writer.writerow(values)
edx/analytics/tasks/util/tsv.py
0 → 100644
View file @
d0307a38
"""Helpers for reading TSV files."""
import
csv
import
pandas
def read_tsv(input_file, names):
    """
    Load a tab-separated file into a pandas DataFrame.

    Args:
        input_file: path or file-like object holding the TSV data.
        names (list): The names of the columns in the input file.

    Returns:
        A pandas DataFrame built from the contents of the file.
    """
    # Quoting is disabled so embedded quote characters pass through
    # untouched, matching Hadoop-style TSV output.
    options = {
        'names': names,
        'quoting': csv.QUOTE_NONE,
        'encoding': None,
        'delimiter': '\t',
    }
    return pandas.read_csv(input_file, **options)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment