psychoanalyze.py 11.2 KB
Newer Older
ichuang committed
1 2 3
#
# File:   psychometrics/psychoanalyze.py
#
ichuang committed
4
# generate pyschometrics plots from PsychometricData
ichuang committed
5 6 7 8 9 10 11 12

from __future__ import division

import datetime
import logging
import json
import math
import numpy as np
13
from opaque_keys.edx.locator import BlockUsageLocator
ichuang committed
14 15
from scipy.optimize import curve_fit

16
from django.conf import settings
ichuang committed
17
from django.db.models import Sum, Max
18 19
from psychometrics.models import PsychometricData
from courseware.models import StudentModule
20
from pytz import UTC
ichuang committed
21

22
log = logging.getLogger("edx.psychometrics")
ichuang committed
23

ichuang committed
24
#db = "ocwtutor"        # for debugging
25 26
#db = "default"

ichuang committed
27
db = getattr(settings, 'DATABASE_FOR_PSYCHOMETRICS', 'default')
ichuang committed
28 29 30 31

#-----------------------------------------------------------------------------
# fit functions

ichuang committed
32 33

def func_2pl(x, a, b):
ichuang committed
34 35 36 37
    """
    2-parameter logistic function
    """
    D = 1.7
ichuang committed
38 39
    edax = np.exp(D * a * (x - b))
    return edax / (1 + edax)
ichuang committed
40 41 42 43

#-----------------------------------------------------------------------------
# statistics class

ichuang committed
44

ichuang committed
45 46 47 48
class StatVar(object):
    """
    Simple statistics on floating point numbers: avg, sdv, var, min, max
    """
ichuang committed
49
    def __init__(self, unit=1):
ichuang committed
50 51 52 53 54 55
        self.sum = 0
        self.sum2 = 0
        self.cnt = 0
        self.unit = unit
        self.min = None
        self.max = None
ichuang committed
56 57

    def add(self, x):
ichuang committed
58 59 60 61 62
        if x is None:
            return
        if self.min is None:
            self.min = x
        else:
ichuang committed
63
            if x < self.min:
ichuang committed
64 65 66 67
                self.min = x
        if self.max is None:
            self.max = x
        else:
ichuang committed
68
            if x > self.max:
ichuang committed
69 70
                self.max = x
        self.sum += x
Calen Pennington committed
71
        self.sum2 += x ** 2
ichuang committed
72
        self.cnt += 1
ichuang committed
73

ichuang committed
74 75 76 77
    def avg(self):
        if self.cnt is None:
            return 0
        return self.sum / 1.0 / self.cnt / self.unit
ichuang committed
78

ichuang committed
79 80 81
    def var(self):
        if self.cnt is None:
            return 0
Calen Pennington committed
82
        return (self.sum2 / 1.0 / self.cnt / (self.unit ** 2)) - (self.avg() ** 2)
ichuang committed
83

ichuang committed
84 85
    def sdv(self):
        v = self.var()
Calen Pennington committed
86
        if v > 0:
ichuang committed
87 88 89
            return math.sqrt(v)
        else:
            return 0
ichuang committed
90

ichuang committed
91
    def __str__(self):
ichuang committed
92 93 94
        return 'cnt=%d, avg=%f, sdv=%f' % (self.cnt, self.avg(), self.sdv())

    def __add__(self, x):
ichuang committed
95 96 97 98 99 100
        self.add(x)
        return self

#-----------------------------------------------------------------------------
# histogram generator

ichuang committed
101 102

def make_histogram(ydata, bins=None):
ichuang committed
103 104 105
    '''
    Generate histogram of ydata using bins provided, or by default bins
    from 0 to 100 by 10.  bins should be ordered in increasing order.
ichuang committed
106

ichuang committed
107 108 109 110
    returns dict with keys being bins, and values being counts.
    special: hist['bins'] = bins
    '''
    if bins is None:
ichuang committed
111 112
        bins = range(0, 100, 10)

ichuang committed
113
    nbins = len(bins)
ichuang committed
114
    hist = dict(zip(bins, [0] * nbins))
ichuang committed
115
    for y in ydata:
116
        for b in bins[::-1]:  # in reverse order
Calen Pennington committed
117
            if y > b:
ichuang committed
118 119 120 121
                hist[b] += 1
                break
    # hist['bins'] = bins
    return hist
ichuang committed
122

ichuang committed
123 124
#-----------------------------------------------------------------------------

ichuang committed
125

ichuang committed
126 127 128 129 130
def problems_with_psychometric_data(course_id):
    '''
    Return dict of {problems (location urls): count} for which psychometric data is available.
    Does this for a given course_id.
    '''
131
    pmdset = PsychometricData.objects.using(db).filter(studentmodule__course_id=course_id)
ichuang committed
132
    plist = [p['studentmodule__module_state_key'] for p in pmdset.values('studentmodule__module_state_key').distinct()]
133 134 135 136 137 138 139 140
    problems = dict(
        (
            p,
            pmdset.filter(
                studentmodule__module_state_key=BlockUsageLocator.from_string(p)
            ).count()
        ) for p in plist
    )
ichuang committed
141 142 143 144 145

    return problems

#-----------------------------------------------------------------------------

ichuang committed
146

ichuang committed
147
def generate_plots_for_problem(problem):
ichuang committed
148

149 150 151
    pmdset = PsychometricData.objects.using(db).filter(
        studentmodule__module_state_key=BlockUsageLocator.from_string(problem)
    )
ichuang committed
152 153 154 155 156
    nstudents = pmdset.count()
    msg = ""
    plots = []

    if nstudents < 2:
ichuang committed
157
        msg += "%s nstudents=%d --> skipping, too few" % (problem, nstudents)
ichuang committed
158 159 160 161 162 163
        return msg, plots

    max_grade = pmdset[0].studentmodule.max_grade

    agdat = pmdset.aggregate(Sum('attempts'), Max('attempts'))
    max_attempts = agdat['attempts__max']
164
    total_attempts = agdat['attempts__sum']  # not used yet
ichuang committed
165 166 167

    msg += "max attempts = %d" % max_attempts

ichuang committed
168
    xdat = range(1, max_attempts + 1)
ichuang committed
169 170
    dataset = {'xdat': xdat}

171
    # compute grade statistics
ichuang committed
172
    grades = [pmd.studentmodule.grade for pmd in pmdset]
173 174 175 176 177
    gsv = StatVar()
    for g in grades:
        gsv += g
    msg += "<br><p><font color='blue'>Grade distribution: %s</font></p>" % gsv

ichuang committed
178 179 180 181 182 183 184 185 186 187 188 189 190
    # generate grade histogram
    ghist = []

    axisopts = """{
        xaxes: [{
            axisLabel: 'Grade'
        }],
        yaxes: [{
            position: 'left',
            axisLabel: 'Count'
         }]
         }"""

191 192 193 194
    if gsv.max > max_grade:
        msg += "<br/><p><font color='red'>Something is wrong: max_grade=%s, but max(grades)=%s</font></p>" % (max_grade, gsv.max)
        max_grade = gsv.max

ichuang committed
195
    if max_grade > 1:
ichuang committed
196
        ghist = make_histogram(grades, np.linspace(0, max_grade, max_grade + 1))
ichuang committed
197 198 199 200 201 202
        ghist_json = json.dumps(ghist.items())

        plot = {'title': "Grade histogram for %s" % problem,
                'id': 'histogram',
                'info': '',
                'data': "var dhist = %s;\n" % ghist_json,
ichuang committed
203
                'cmd': '[ {data: dhist, bars: { show: true, align: "center" }} ], %s' % axisopts,
ichuang committed
204 205 206 207 208 209 210
                }
        plots.append(plot)
    else:
        msg += "<br/>Not generating histogram: max_grade=%s" % max_grade

    # histogram of time differences between checks
    # Warning: this is inefficient - doesn't scale to large numbers of students
ichuang committed
211
    dtset = []  # time differences in minutes
ichuang committed
212 213 214
    dtsv = StatVar()
    for pmd in pmdset:
        try:
215
            checktimes = eval(pmd.checktimes)  # update log of attempt timestamps
ichuang committed
216 217
        except:
            continue
ichuang committed
218
        if len(checktimes) < 2:
ichuang committed
219 220 221
            continue
        ct0 = checktimes[0]
        for ct in checktimes[1:]:
ichuang committed
222
            dt = (ct - ct0).total_seconds() / 60.0
223
            if dt < 20:  # ignore if dt too long
ichuang committed
224 225 226 227
                dtset.append(dt)
                dtsv += dt
            ct0 = ct
    if dtsv.cnt > 2:
228
        msg += "<br/><p><font color='brown'>Time differences between checks: %s</font></p>" % dtsv
ichuang committed
229 230 231
        bins = np.linspace(0, 1.5 * dtsv.sdv(), 30)
        dbar = bins[1] - bins[0]
        thist = make_histogram(dtset, bins)
ichuang committed
232 233 234 235 236 237 238 239
        thist_json = json.dumps(sorted(thist.items(), key=lambda(x): x[0]))

        axisopts = """{ xaxes: [{ axisLabel: 'Time (min)'}], yaxes: [{position: 'left',axisLabel: 'Count'}]}"""

        plot = {'title': "Histogram of time differences between checks",
                'id': 'thistogram',
                'info': '',
                'data': "var thist = %s;\n" % thist_json,
ichuang committed
240
                'cmd': '[ {data: thist, bars: { show: true, align: "center", barWidth:%f }} ], %s' % (dbar, axisopts),
ichuang committed
241 242 243 244
                }
        plots.append(plot)

    # one IRT plot curve for each grade received (TODO: this assumes integer grades)
ichuang committed
245
    for grade in range(1, int(max_grade) + 1):
ichuang committed
246 247 248
        yset = {}
        gset = pmdset.filter(studentmodule__grade=grade)
        ngset = gset.count()
ichuang committed
249
        if ngset == 0:
ichuang committed
250 251 252 253
            continue
        ydat = []
        ylast = 0
        for x in xdat:
ichuang committed
254
            y = gset.filter(attempts=x).count() / ngset
Calen Pennington committed
255
            ydat.append(y + ylast)
ichuang committed
256 257 258
            ylast = y + ylast
        yset['ydat'] = ydat

259
        if len(ydat) > 3:  # try to fit to logistic function if enough data points
260 261 262 263 264 265 266 267 268 269
            try:
                cfp = curve_fit(func_2pl, xdat, ydat, [1.0, max_attempts / 2.0])
                yset['fitparam'] = cfp
                yset['fitpts'] = func_2pl(np.array(xdat), *cfp[0])
                yset['fiterr'] = [yd - yf for (yd, yf) in zip(ydat, yset['fitpts'])]
                fitx = np.linspace(xdat[0], xdat[-1], 100)
                yset['fitx'] = fitx
                yset['fity'] = func_2pl(np.array(fitx), *cfp[0])
            except Exception as err:
                log.debug('Error in psychoanalyze curve fitting: %s' % err)
ichuang committed
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284

        dataset['grade_%d' % grade] = yset

    axisopts = """{
        xaxes: [{
            axisLabel: 'Number of Attempts'
        }],
        yaxes: [{
            max:1.0,
            position: 'left',
            axisLabel: 'Probability of correctness'
         }]
         }"""

    # generate points for flot plot
ichuang committed
285
    for grade in range(1, int(max_grade) + 1):
ichuang committed
286 287 288 289 290
        jsdata = ""
        jsplots = []
        gkey = 'grade_%d' % grade
        if gkey in dataset:
            yset = dataset[gkey]
ichuang committed
291
            jsdata += "var d%d = %s;\n" % (grade, json.dumps(zip(xdat, yset['ydat'])))
ichuang committed
292 293
            jsplots.append('{ data: d%d, lines: { show: false }, points: { show: true}, color: "red" }' % grade)
            if 'fitpts' in yset:
ichuang committed
294
                jsdata += 'var fit = %s;\n' % (json.dumps(zip(yset['fitx'], yset['fity'])))
ichuang committed
295
                jsplots.append('{ data: fit,  lines: { show: true }, color: "blue" }')
ichuang committed
296 297
                (a, b) = yset['fitparam'][0]
                irtinfo = "(2PL: D=1.7, a=%6.3f, b=%6.3f)" % (a, b)
ichuang committed
298 299 300
            else:
                irtinfo = ""

ichuang committed
301
            plots.append({'title': 'IRT Plot for grade=%s %s' % (grade, irtinfo),
ichuang committed
302 303 304
                          'id': "irt%s" % grade,
                          'info': '',
                          'data': jsdata,
ichuang committed
305
                          'cmd': '[%s], %s' % (','.join(jsplots), axisopts),
306
                          })
ichuang committed
307 308 309 310 311 312

    #log.debug('plots = %s' % plots)
    return msg, plots

#-----------------------------------------------------------------------------

ichuang committed
313

314
def make_psychometrics_data_update_handler(course_id, user, module_state_key):
ichuang committed
315 316
    """
    Construct and return a procedure which may be called to update
317
    the PsychometricData instance for the given StudentModule instance.
ichuang committed
318
    """
319
    sm, status = StudentModule.objects.get_or_create(
320
        course_id=course_id,
321 322 323 324
        student=user,
        module_state_key=module_state_key,
        defaults={'state': '{}', 'module_type': 'problem'},
    )
325

ichuang committed
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
    try:
        pmd = PsychometricData.objects.using(db).get(studentmodule=sm)
    except PsychometricData.DoesNotExist:
        pmd = PsychometricData(studentmodule=sm)

    def psychometrics_data_update_handler(state):
        """
        This function may be called each time a problem is successfully checked
        (eg on save_problem_check events in capa_module).

        state = instance state (a nice, uniform way to interface - for more future psychometric feature extraction)
        """
        try:
            state = json.loads(sm.state)
            done = state['done']
        except:
ichuang committed
342
            log.exception("Oops, failed to eval state for %s (state=%s)" % (sm, sm.state))
ichuang committed
343 344 345
            return

        pmd.done = done
346
        try:
347
            pmd.attempts = state.get('attempts', 0)
348
        except:
349
            log.exception("no attempts for %s (state=%s)" % (sm, sm.state))
350

ichuang committed
351
        try:
352
            checktimes = eval(pmd.checktimes)  # update log of attempt timestamps
ichuang committed
353 354
        except:
            checktimes = []
355
        checktimes.append(datetime.datetime.now(UTC))
ichuang committed
356 357 358 359 360 361 362
        pmd.checktimes = checktimes
        try:
            pmd.save()
        except:
            log.exception("Error in updating psychometrics data for %s" % sm)

    return psychometrics_data_update_handler