Commit 41ea1383 authored by David Ormsbee and committed by GitHub

Merge pull request #14399 from edx/ormsbee/csm_crawler_locking

Disable student state writes for crawlers.
parents 1c8cf285 608fceb0
......@@ -687,7 +687,7 @@ class FieldDataCache(object):
A cache of django model objects needed to supply the data
for a module and its descendants
"""
def __init__(self, descriptors, course_id, user, select_for_update=False, asides=None):
def __init__(self, descriptors, course_id, user, asides=None, read_only=False):
"""
Find any courseware.models objects that are needed by any descriptor
in descriptors. Attempts to minimize the number of queries to the database.
......@@ -698,8 +698,8 @@ class FieldDataCache(object):
descriptors: A list of XModuleDescriptors.
course_id: The id of the current course
user: The user for which to cache data
select_for_update: Ignored
asides: The list of aside types to load, or None to prefetch no asides.
read_only: If True, do not perform writes (they silently become no-ops).
"""
if asides is None:
self.asides = []
......@@ -709,6 +709,7 @@ class FieldDataCache(object):
assert isinstance(course_id, CourseKey)
self.course_id = course_id
self.user = user
self.read_only = read_only
self.cache = {
Scope.user_state: UserStateCache(
......@@ -783,7 +784,7 @@ class FieldDataCache(object):
@classmethod
def cache_for_descriptor_descendents(cls, course_id, user, descriptor, depth=None,
descriptor_filter=lambda descriptor: True,
select_for_update=False, asides=None):
asides=None, read_only=False):
"""
course_id: the course in the context of which we want StudentModules.
user: the django user for whom to load modules.
......@@ -792,9 +793,8 @@ class FieldDataCache(object):
the supplied descriptor. If depth is None, load all descendant StudentModules
descriptor_filter is a function that accepts a descriptor and returns whether the field data
should be cached
select_for_update: Ignored
"""
cache = FieldDataCache([], course_id, user, select_for_update, asides=asides)
cache = FieldDataCache([], course_id, user, asides=asides, read_only=read_only)
cache.add_descriptor_descendents(descriptor, depth, descriptor_filter)
return cache
......@@ -840,6 +840,8 @@ class FieldDataCache(object):
kv_dict (dict): dict mapping from `DjangoKeyValueStore.Key`s to field values
Raises: DatabaseError if any fields fail to save
"""
if self.read_only:
return
saved_fields = []
by_scope = defaultdict(dict)
......@@ -875,6 +877,8 @@ class FieldDataCache(object):
Raises: KeyError if key isn't found in the cache
"""
if self.read_only:
return
if key.scope.user == UserScope.ONE and not self.user.is_anonymous():
# If we're getting user data, we expect that the key matches the
......
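Taken together, the intent is for view code to pass the crawler check straight through to the cache constructor. A minimal sketch of that call pattern follows (the variable names here are placeholders; the real call sites are in the module_render.py and views/index.py hunks below):

# Hypothetical caller: build a field data cache whose writes are silently
# dropped when the request comes from a known crawler.
field_data_cache = FieldDataCache.cache_for_descriptor_descendents(
    course_key,                                    # CourseKey of the course being rendered
    request.user,                                  # user whose state would normally be persisted
    course_descriptor,                             # root descriptor whose descendants are prefetched
    depth=CONTENT_DEPTH,                           # prefetch depth, as in the index view below
    read_only=CrawlersConfig.is_crawler(request),  # no-op writes for crawlers
)
# With read_only=True, set_many() and delete() return immediately, so rendering
# for a crawler never takes row locks on courseware_studentmodule.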
......@@ -49,6 +49,7 @@ from lms.djangoapps.lms_xblock.models import XBlockAsidesConfig
from lms.djangoapps.lms_xblock.runtime import LmsModuleSystem
from lms.djangoapps.verify_student.services import VerificationService, ReverificationService
from openedx.core.djangoapps.bookmarks.services import BookmarksService
from openedx.core.djangoapps.crawlers.models import CrawlersConfig
from openedx.core.djangoapps.credit.services import CreditService
from openedx.core.djangoapps.util.user_utils import SystemUser
from openedx.core.lib.xblock_utils import (
......@@ -917,7 +918,8 @@ def get_module_by_usage_id(request, course_id, usage_id, disable_staff_debug_inf
field_data_cache = FieldDataCache.cache_for_descriptor_descendents(
course_id,
user,
descriptor
descriptor,
read_only=CrawlersConfig.is_crawler(request),
)
instance = get_module_for_descriptor(
user,
......
......@@ -50,6 +50,7 @@ from lms.djangoapps.commerce.utils import EcommerceService # pylint: disable=im
from milestones.tests.utils import MilestonesTestCaseMixin
from openedx.core.djangoapps.self_paced.models import SelfPacedConfiguration
from openedx.core.lib.gating import api as gating_api
from openedx.core.djangoapps.crawlers.models import CrawlersConfig
from student.models import CourseEnrollment
from student.tests.factories import AdminFactory, UserFactory, CourseEnrollmentFactory
from util.tests.test_date_utils import fake_ugettext, fake_pgettext
......@@ -58,7 +59,7 @@ from util.views import ensure_valid_course_key
from xmodule.modulestore import ModuleStoreEnum
from xmodule.modulestore.django import modulestore
from xmodule.modulestore.tests.django_utils import TEST_DATA_MIXED_MODULESTORE
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase, SharedModuleStoreTestCase
from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory, check_mongo_calls
from openedx.core.djangoapps.credit.api import set_credit_requirements
from openedx.core.djangoapps.credit.models import CreditCourse, CreditProvider
......@@ -2065,3 +2066,83 @@ class TestRenderXBlockSelfPaced(TestRenderXBlock):
def course_options(self):
return {'self_paced': True}
class TestIndexViewCrawlerStudentStateWrites(SharedModuleStoreTestCase):
"""
Ensure that courseware index requests from known crawlers do not trigger student
state writes, while ordinary browser requests still do.
This is to prevent locking issues that have caused latency spikes in the
courseware_studentmodule table when concurrent requests each try to update
the same rows for sequence, section, and course positions.
"""
@classmethod
def setUpClass(cls):
"""Set up the simplest course possible."""
# setUpClassAndTestData() already calls setUpClass on SharedModuleStoreTestCase
# pylint: disable=super-method-not-called
with super(TestIndexViewCrawlerStudentStateWrites, cls).setUpClassAndTestData():
cls.course = CourseFactory.create()
with cls.store.bulk_operations(cls.course.id):
cls.chapter = ItemFactory.create(category='chapter', parent_location=cls.course.location)
cls.section = ItemFactory.create(category='sequential', parent_location=cls.chapter.location)
cls.vertical = ItemFactory.create(category='vertical', parent_location=cls.section.location)
@classmethod
def setUpTestData(cls):
"""Set up and enroll our fake user in the course."""
cls.password = 'test'
cls.user = UserFactory(password=cls.password)
CourseEnrollment.enroll(cls.user, cls.course.id)
def setUp(self):
"""Do the client login."""
super(TestIndexViewCrawlerStudentStateWrites, self).setUp()
self.client.login(username=self.user.username, password=self.password)
def test_write_by_default(self):
"""By default, always write student state, regardless of user agent."""
with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many:
# Simulate someone using Chrome
self._load_courseware('Mozilla/5.0 AppleWebKit/537.36')
self.assertTrue(patched_state_client_set_many.called)
patched_state_client_set_many.reset_mock()
# Common crawler user agent
self._load_courseware('edX-downloader/0.1')
self.assertTrue(patched_state_client_set_many.called)
def test_writes_with_config(self):
"""Test state writes (or lack thereof) based on config values."""
CrawlersConfig.objects.create(known_user_agents='edX-downloader,crawler_foo', enabled=True)
with patch('courseware.model_data.UserStateCache.set_many') as patched_state_client_set_many:
# Exact matching of crawler user agent
self._load_courseware('crawler_foo')
self.assertFalse(patched_state_client_set_many.called)
# Partial matching of crawler user agent
self._load_courseware('edX-downloader/0.1')
self.assertFalse(patched_state_client_set_many.called)
# Simulate an actual browser hitting it (we should write)
self._load_courseware('Mozilla/5.0 AppleWebKit/537.36')
self.assertTrue(patched_state_client_set_many.called)
# Disabling the crawlers config should revert us to default behavior
CrawlersConfig.objects.create(enabled=False)
self.test_write_by_default()
def _load_courseware(self, user_agent):
"""Helper to load the actual courseware page."""
url = reverse(
'courseware_section',
kwargs={
'course_id': unicode(self.course.id),
'chapter': unicode(self.chapter.location.name),
'section': unicode(self.section.location.name),
}
)
response = self.client.get(url, HTTP_USER_AGENT=user_agent)
# Make sure we get back an actual 200, and aren't redirected because we
# messed up the setup somehow (e.g. didn't enroll properly)
self.assertEqual(response.status_code, 200)
......@@ -26,6 +26,7 @@ from xblock.fragment import Fragment
from opaque_keys.edx.keys import CourseKey
from openedx.core.djangoapps.lang_pref import LANGUAGE_KEY
from openedx.core.djangoapps.user_api.preferences.api import get_user_preference
from openedx.core.djangoapps.crawlers.models import CrawlersConfig
from shoppingcart.models import CourseRegistrationCode
from student.models import CourseEnrollment
from student.views import is_course_blocked
......@@ -101,7 +102,7 @@ class CoursewareIndex(View):
self.course = get_course_with_access(request.user, 'load', self.course_key, depth=CONTENT_DEPTH)
self.is_staff = has_access(request.user, 'staff', self.course)
self._setup_masquerade_for_effective_user()
return self._get()
return self._get(request)
except Redirect as redirect_error:
return redirect(redirect_error.url)
except UnicodeEncodeError:
......@@ -127,12 +128,12 @@ class CoursewareIndex(View):
# Set the user in the request to the effective user.
self.request.user = self.effective_user
def _get(self):
def _get(self, request):
"""
Render the index page.
"""
self._redirect_if_needed_to_access_course()
self._prefetch_and_bind_course()
self._prefetch_and_bind_course(request)
if self.course.has_children_at_depth(CONTENT_DEPTH):
self._reset_section_to_exam_if_required()
......@@ -324,13 +325,17 @@ class CoursewareIndex(View):
if self.chapter:
return self._find_block(self.chapter, self.section_url_name, 'section')
def _prefetch_and_bind_course(self):
def _prefetch_and_bind_course(self, request):
"""
Prefetches all descendant data for the requested section and
sets up the runtime, which binds the request user to the section.
"""
self.field_data_cache = FieldDataCache.cache_for_descriptor_descendents(
self.course_key, self.effective_user, self.course, depth=CONTENT_DEPTH,
self.course_key,
self.effective_user,
self.course,
depth=CONTENT_DEPTH,
read_only=CrawlersConfig.is_crawler(request),
)
self.course = get_module_for_descriptor(
......
......@@ -2160,6 +2160,9 @@ INSTALLED_APPS = (
# Customized celery tasks, including persisting failed tasks so they can
# be retried
'openedx.core.djangoapps.celery_utils',
# Ability to detect and special-case crawler behavior
'openedx.core.djangoapps.crawlers',
)
# Migrations which are not in the standard module "migrations"
......
"""Admin panel for configuring which user agents we consider to be Crawlers."""
from django.contrib import admin
from config_models.admin import ConfigurationModelAdmin
from .models import CrawlersConfig
admin.site.register(CrawlersConfig, ConfigurationModelAdmin)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
from django.conf import settings
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='CrawlersConfig',
fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('change_date', models.DateTimeField(auto_now_add=True, verbose_name='Change date')),
('enabled', models.BooleanField(default=False, verbose_name='Enabled')),
('known_user_agents', models.TextField(default=b'edX-downloader', help_text=b'A comma-separated list of known crawler user agents.', blank=True)),
('changed_by', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, editable=False, to=settings.AUTH_USER_MODEL, null=True, verbose_name='Changed by')),
],
options={
'ordering': ('-change_date',),
'abstract': False,
},
),
]
"""
This module detects crawlers so that other parts of the code can treat their
requests appropriately.
"""
from django.db import models
from config_models.models import ConfigurationModel
class CrawlersConfig(ConfigurationModel):
"""Configuration for the crawlers django app."""
known_user_agents = models.TextField(
blank=True,
help_text="A comma-separated list of known crawler user agents.",
default='edX-downloader',
)
def __unicode__(self):
return u'CrawlersConfig("{}")'.format(self.known_user_agents)
@classmethod
def is_crawler(cls, request):
"""Determine if the request came from a crawler or not.
This method is simplistic and only looks at the user agent header at the
moment, but could later be improved to be smarter about detection.
"""
current = cls.current()
if not current.enabled:
return False
req_user_agent = request.META.get('HTTP_USER_AGENT')
# Strip whitespace and drop empty entries so that a blank setting does not
# match every request ("".split(",") yields ['']).
crawler_agents = [agent.strip() for agent in current.known_user_agents.split(",") if agent.strip()]
# If there was no user agent detected or no crawler agents configured,
# then just return False.
if (not req_user_agent) or (not crawler_agents):
return False
# We perform prefix matching of the crawler agent here so that we don't
# have to worry about version bumps.
return any(
req_user_agent.startswith(crawler_agent)
for crawler_agent in crawler_agents
)
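For illustration, a rough sketch of how the new model is exercised end to end, assuming Django's RequestFactory and the migration above having been applied (the user-agent strings mirror the ones used in the tests):

from django.test import RequestFactory
from openedx.core.djangoapps.crawlers.models import CrawlersConfig

# Enable detection for two agents. CrawlersConfig is a ConfigurationModel,
# so the most recently saved row is the one returned by current().
CrawlersConfig.objects.create(enabled=True, known_user_agents='edX-downloader,crawler_foo')

factory = RequestFactory()

# Prefix matching: a versioned crawler agent still counts as a crawler.
assert CrawlersConfig.is_crawler(factory.get('/', HTTP_USER_AGENT='edX-downloader/0.1'))

# An ordinary browser agent is not treated as a crawler.
assert not CrawlersConfig.is_crawler(factory.get('/', HTTP_USER_AGENT='Mozilla/5.0 AppleWebKit/537.36'))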