"""
WE'RE USING MIGRATIONS!

If you make changes to this model, be sure to create an appropriate migration
file and check it in at the same time as your model changes. To do that,

1. Go to the edx-platform dir
2. ./manage.py schemamigration instructor_task --auto description_of_your_change
3. Add the migration file created in edx-platform/lms/djangoapps/instructor_task/migrations/


ASSUMPTIONS: modules have unique IDs, even across different module_types

"""
import csv
import hashlib
import json
import os.path
import urllib
from cStringIO import StringIO
from gzip import GzipFile
from uuid import uuid4

from boto.s3.connection import S3Connection
from boto.s3.key import Key
from django.conf import settings
from django.contrib.auth.models import User
from django.db import models, transaction

from xmodule_django.models import CourseKeyField


# Custom task-state values stored on InstructorTask entries, used in
# addition to the standard states reported by Celery itself.
QUEUING = 'QUEUING'
PROGRESS = 'PROGRESS'


class InstructorTask(models.Model):
    """
    Stores information about background tasks that have been submitted to
    perform work by an instructor (or course staff).
    Examples include grading and rescoring.

    `task_type` identifies the kind of task being performed, e.g. rescoring.
    `course_id` uses the course run's unique id to identify the course.
    `task_key` stores relevant input arguments encoded into key value for testing to see
           if the task is already running (together with task_type and course_id).
    `task_input` stores input arguments as JSON-serialized dict, for reporting purposes.
        Examples include url of problem being rescored, id of student if only one student being rescored.

    `task_id` stores the id used by celery for the background task.
    `task_state` stores the last known state of the celery task
    `task_output` stores the output of the celery task.
        Format is a JSON-serialized dict.  Content varies by task_type and task_state.

    `requester` stores id of user who submitted the task
    `created` stores date that entry was first created
    `updated` stores date that entry was last modified
    """
    task_type = models.CharField(max_length=50, db_index=True)
    course_id = CourseKeyField(max_length=255, db_index=True)
    task_key = models.CharField(max_length=255, db_index=True)
    task_input = models.CharField(max_length=255)
    # max_length on task_id and task_state mirrors celery_taskmeta's columns:
    task_id = models.CharField(max_length=255, db_index=True)
    task_state = models.CharField(max_length=50, null=True, db_index=True)
    task_output = models.CharField(max_length=1024, null=True)
    requester = models.ForeignKey(User, db_index=True)
    created = models.DateTimeField(auto_now_add=True, null=True)
    updated = models.DateTimeField(auto_now=True)
    # JSON dictionary tracking any subtasks spawned by this task:
    subtasks = models.TextField(blank=True)

    def __repr__(self):
        attrs = {
            'task_type': self.task_type,
            'course_id': self.course_id,
            'task_input': self.task_input,
            'task_id': self.task_id,
            'task_state': self.task_state,
            'task_output': self.task_output,
        }
        return 'InstructorTask<%r>' % (attrs,)

    def __unicode__(self):
        return unicode(repr(self))

    @classmethod
    def create(cls, course_id, task_type, task_key, task_input, requester):
        """
        Create and persist an instance of InstructorTask.

        The InstructorTask.save_now method makes sure the InstructorTask entry is committed.
        When called from any view that is wrapped by TransactionMiddleware,
        and thus in a "commit-on-success" transaction, an autocommit buried within here
        will cause any pending transaction to be committed by a successful
        save here.  Any future database operations will take place in a
        separate transaction.

        Raises ValueError if the JSON-serialized `task_input` will not fit
        in the 255-character `task_input` column.
        """
        # Generate the task_id here (rather than letting celery assign one)
        # so it can be recorded before the task is actually queued:
        task_id = str(uuid4())

        serialized_input = json.dumps(task_input)

        # The task_input column is capped at 255 characters; refuse anything longer:
        if len(serialized_input) > 255:
            fmt = 'Task input longer than 255: "{input}" for "{task}" of "{course}"'
            raise ValueError(fmt.format(input=serialized_input, task=task_type, course=course_id))

        new_entry = cls(
            course_id=course_id,
            task_type=task_type,
            task_id=task_id,
            task_key=task_key,
            task_input=serialized_input,
            task_state=QUEUING,
            requester=requester,
        )
        new_entry.save_now()
        return new_entry

    @transaction.autocommit
    def save_now(self):
        """
        Writes InstructorTask immediately, ensuring the transaction is committed.

        Autocommit annotation makes sure the database entry is committed.
        When called from any view that is wrapped by TransactionMiddleware,
        and thus in a "commit-on-success" transaction, this autocommit here
        will cause any pending transaction to be committed by a successful
        save here.  Any future database operations will take place in a
        separate transaction.
        """
        self.save()

    @staticmethod
    def create_output_for_success(returned_result):
        """
        Converts successful result to JSON output format.

        Raises a ValueError exception if the output is too long to fit
        in the task_output column.
        """
        # In future, there should be a check here that the resulting JSON
        # will fit in the column.  In the meantime, just return an exception.
        serialized = json.dumps(returned_result)
        if len(serialized) > 1023:
            raise ValueError("Length of task output is too long: {0}".format(serialized))
        return serialized

    @staticmethod
    def create_output_for_failure(exception, traceback_string):
        """
        Converts failed result information to JSON output format.

        Traceback information is truncated or not included if it would result in an output string
        that would not fit in the database.  If the output is still too long, then the
        exception message is also truncated.

        Truncation is indicated by adding "..." to the end of the value.
        """
        marker = '...'
        output = {
            'exception': type(exception).__name__,
            'message': unicode(exception.message),
        }
        if traceback_string is not None:
            # truncate any traceback that goes into the InstructorTask model:
            output['traceback'] = traceback_string
        serialized = json.dumps(output)

        # Determine how many characters over the 1023 limit we are; if we fit,
        # we're done.  Otherwise shorten first the traceback, then the message.
        overflow = len(serialized) - 1023
        if overflow <= 0:
            return serialized

        if traceback_string is not None:
            if overflow >= len(traceback_string) - len(marker):
                # remove the traceback entry entirely (so no key or value)
                del output['traceback']
                overflow -= (len(traceback_string) + len('traceback'))
            else:
                # truncate the traceback:
                output['traceback'] = traceback_string[:-(overflow + len(marker))] + marker
                overflow = 0
        if overflow > 0:
            # we still need to shorten the message:
            output['message'] = output['message'][:-(overflow + len(marker))] + marker
        return json.dumps(output)

    @staticmethod
    def create_output_for_revoked():
        """Creates standard message to store in output format for revoked tasks."""
        return json.dumps({'message': 'Task revoked before running'})
193
class ReportStore(object):
    """
    Simple abstraction layer that can fetch and store CSV files for reports
    download. Should probably refactor later to create a ReportFile object that
    can simply be appended to for the sake of memory efficiency, rather than
    passing in the whole dataset. Doing that for now just because it's simpler.
    """
    @classmethod
    def from_config(cls):
        """
        Return one of the ReportStore subclasses depending on django
        configuration. Look at subclasses for expected configuration.

        Raises ValueError if `GRADES_DOWNLOAD` names a storage type that is
        not recognized.  (Previously an unknown or missing STORAGE_TYPE
        either fell through and silently returned None, or crashed with an
        obscure AttributeError on `None.lower()`.)
        """
        # Default to "" so a missing STORAGE_TYPE raises our ValueError below
        # instead of AttributeError on None.lower():
        storage_type = settings.GRADES_DOWNLOAD.get("STORAGE_TYPE", "").lower()
        if storage_type == "s3":
            return S3ReportStore.from_config()
        elif storage_type == "localfs":
            return LocalFSReportStore.from_config()
        raise ValueError("Unrecognized GRADES_DOWNLOAD storage type: {}".format(storage_type))

    def _get_utf8_encoded_rows(self, rows):
        """
        Given a list of `rows` containing unicode strings, return a
        new list of rows with those strings encoded as utf-8 for CSV
        compatibility.
        """
        for row in rows:
            yield [unicode(item).encode('utf-8') for item in row]
class S3ReportStore(ReportStore):
    """
    Reports store backed by S3. The directory structure we use to store things
    is::

        `{bucket}/{root_path}/{sha1 hash of course_id}/filename`

    We might later use subdirectories or metadata to do more intelligent
    grouping and querying, but right now it simply depends on its own
    conventions on where files are stored to know what to display. Clients using
    this class can name the final file whatever they want.
    """
    def __init__(self, bucket_name, root_path):
        self.root_path = root_path
        connection = S3Connection(
            settings.AWS_ACCESS_KEY_ID,
            settings.AWS_SECRET_ACCESS_KEY
        )
        self.bucket = connection.get_bucket(bucket_name)

    @classmethod
    def from_config(cls):
        """
        The expected configuration for an `S3ReportStore` is to have a
        `GRADES_DOWNLOAD` dict in settings with the following fields::

            STORAGE_TYPE : "s3"
            BUCKET : Your bucket name, e.g. "reports-bucket"
            ROOT_PATH : The path you want to store all course files under. Do not
                        use a leading or trailing slash. e.g. "staging" or
                        "staging/2013", not "/staging", or "/staging/"

        Since S3 access relies on boto, you must also define `AWS_ACCESS_KEY_ID`
        and `AWS_SECRET_ACCESS_KEY` in settings.
        """
        return cls(
            settings.GRADES_DOWNLOAD['BUCKET'],
            settings.GRADES_DOWNLOAD['ROOT_PATH']
        )

    def key_for(self, course_id, filename):
        """Return the S3 key we would use to store and retrieve the data for the
        given filename."""
        # Hash the course id so the key is a predictable, path-safe name:
        course_hash = hashlib.sha1(course_id.to_deprecated_string()).hexdigest()
        key = Key(self.bucket)
        key.key = "{}/{}/{}".format(self.root_path, course_hash, filename)
        return key

    def store(self, course_id, filename, buff):
        """
        Store the contents of `buff` in a directory determined by hashing
        `course_id`, and name the file `filename`. `buff` is typically a
        `StringIO`, but can be anything that implements `.getvalue()`.

        This method assumes that the contents of `buff` are gzip-encoded (it
        will add the appropriate headers to S3 to make the decompression
        transparent via the browser). Filenames should end in whatever
        suffix makes sense for the original file, so `.txt` instead of `.gz`
        """
        key = self.key_for(course_id, filename)
        contents = buff.getvalue()
        key.size = len(contents)
        key.content_encoding = "gzip"
        key.content_type = "text/csv"

        # Just setting the content encoding and type above should work
        # according to the docs, but when experimenting, this was necessary for
        # it to actually take.
        key.set_contents_from_string(
            contents,
            headers={
                "Content-Encoding": "gzip",
                "Content-Length": len(contents),
                "Content-Type": "text/csv",
            }
        )

    def store_rows(self, course_id, filename, rows):
        """
        Given a `course_id`, `filename`, and `rows` (each row is an iterable of
        strings), create a buffer that is a gzip'd csv file, and then `store()`
        that buffer.

        Even though we store it in gzip format, browsers will transparently
        download and decompress it. Filenames should end in `.csv`, not `.gz`.
        """
        buff = StringIO()
        gzipped = GzipFile(fileobj=buff, mode="wb")
        writer = csv.writer(gzipped)
        writer.writerows(self._get_utf8_encoded_rows(rows))
        # Close the gzip stream so its trailer is flushed into the buffer:
        gzipped.close()
        self.store(course_id, filename, buff)

    def links_for(self, course_id):
        """
        For a given `course_id`, return a list of `(filename, url)` tuples. `url`
        can be plugged straight into an href
        """
        prefix = self.key_for(course_id, '').key
        # Newest files first, via each key's last_modified timestamp:
        keys = sorted(self.bucket.list(prefix=prefix), key=lambda k: k.last_modified, reverse=True)
        return [
            (key.key.split("/")[-1], key.generate_url(expires_in=300))
            for key in keys
        ]
class LocalFSReportStore(ReportStore):
    """
    LocalFS implementation of a ReportStore. This is meant for debugging
    purposes and is *absolutely not for production use*. Use S3ReportStore for
    that. We use this in tests and for local development. When it generates
    links, it will make file:/// style links. That means you actually have to
    copy them and open them in a separate browser window, for security reasons.
    This lets us do the cheap thing locally for debugging without having to open
    up a separate URL that would only be used to send files in dev.
    """
    def __init__(self, root_path):
        """
        Initialize with root_path where we're going to store our files. We
        will build a directory structure under this for each course.
        """
        self.root_path = root_path
        if not os.path.exists(root_path):
            os.makedirs(root_path)

    @classmethod
    def from_config(cls):
        """
        Generate an instance of this object from Django settings. It assumes
        that there is a dict in settings named GRADES_DOWNLOAD and that it has
        a ROOT_PATH that maps to an absolute file path that the web app has
        write permissions to. `LocalFSReportStore` will create any intermediate
        directories as needed. Example::

            STORAGE_TYPE : "localfs"
            ROOT_PATH : /tmp/edx/report-downloads/
        """
        return cls(settings.GRADES_DOWNLOAD['ROOT_PATH'])

    def path_to(self, course_id, filename):
        """Return the full path to a given file for a given course."""
        # Quote the course id so it is safe to use as a directory name:
        quoted_course = urllib.quote(course_id.to_deprecated_string(), safe='')
        return os.path.join(self.root_path, quoted_course, filename)

    def store(self, course_id, filename, buff):
        """
        Given the `course_id` and `filename`, store the contents of `buff` in
        that file. Overwrite anything that was there previously. `buff` is
        assumed to be a StringIO object (or anything that can flush its contents
        to string using `.getvalue()`).
        """
        target_path = self.path_to(course_id, filename)
        target_dir = os.path.dirname(target_path)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        with open(target_path, "wb") as output_file:
            output_file.write(buff.getvalue())

    def store_rows(self, course_id, filename, rows):
        """
        Given a course_id, filename, and rows (each row is an iterable of strings),
        write this data out.
        """
        buff = StringIO()
        writer = csv.writer(buff)
        writer.writerows(self._get_utf8_encoded_rows(rows))
        self.store(course_id, filename, buff)

    def links_for(self, course_id):
        """
        For a given `course_id`, return a list of `(filename, url)` tuples. `url`
        can be plugged straight into an href. Note that `LocalFSReportStore`
        will generate `file://` type URLs, so you'll need to copy the URL and
        open it in a new browser window. Again, this class is only meant for
        local development.
        """
        course_dir = self.path_to(course_id, '')
        if not os.path.exists(course_dir):
            return []

        entries = [
            (name, os.path.join(course_dir, name))
            for name in os.listdir(course_dir)
        ]
        # Newest files first, by filesystem modification time:
        entries.sort(key=lambda entry: os.path.getmtime(entry[1]), reverse=True)

        return [
            (name, ("file://" + urllib.quote(full_path)))
            for name, full_path in entries
        ]