Commit cfa873cc by zubair-arbi

ignore MAC meta files on import and also remove any such files from the course

STUD-1725
parent d53a6669
"""
Script for removing all redundant Mac OS metadata files (with filename ".DS_Store"
or with filename which starts with "._") for all courses
"""
import logging
from django.core.management.base import BaseCommand
from xmodule.contentstore.django import contentstore
log = logging.getLogger(__name__)
class Command(BaseCommand):
    """
    Remove all Mac OS related redundant files for all courses in contentstore
    """
    help = 'Remove all Mac OS related redundant file/files for all courses in contentstore'

    def handle(self, *args, **options):
        """
        Execute the command.

        Asks the contentstore to remove the redundant assets of all courses and logs
        how many assets were deleted. A failure is logged (with its traceback) instead
        of being raised, so the command does not propagate the error to the caller.
        """
        content_store = contentstore()

        log.info(u"-" * 80)
        log.info(u"Cleaning up assets for all courses")
        try:
            # Remove all redundant Mac OS metadata files
            assets_deleted = content_store.remove_redundant_content_for_courses()
        except Exception:  # pylint: disable=broad-except
            # Deliberately best-effort: report the failure rather than crash, but do it
            # at ERROR level and with the traceback so it is not lost among INFO output.
            log.exception(u"=" * 30 + u"> failed to cleanup")
        else:
            # Only reached when the cleanup call returned normally.
            log.info(u"=" * 80)
            log.info(u"Total number of assets deleted: {0}".format(assets_deleted))
"""
Test for assets cleanup of courses for Mac OS metadata files (with filename ".DS_Store"
or with filename which starts with "._")
"""
from django.core.management import call_command
from opaque_keys.edx.locations import SlashSeparatedCourseKey
from xmodule.contentstore.content import XASSET_LOCATION_TAG
from xmodule.contentstore.django import contentstore
from xmodule.modulestore.django import modulestore
from xmodule.modulestore.mongo.base import location_to_query
from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
from xmodule.modulestore.xml_importer import import_from_xml
class ExportAllCourses(ModuleStoreTestCase):
    """
    Tests that redundant Mac OS metadata assets are cleaned up for all courses.
    """
    def setUp(self):
        """ Keep handles on the content store and module store used by the test. """
        self.content_store = contentstore()
        self.module_store = modulestore()

    def _assert_course_asset_names(self, course_key, expected_names):
        """
        Assert that the content store reports `len(expected_names)` assets for
        `course_key` and that their names match `expected_names` position by position.

        Returns the list of assets fetched from the content store.
        """
        assets, total = self.content_store.get_all_content_for_course(course_key)
        self.assertEqual(total, len(expected_names))
        for position, expected_name in enumerate(expected_names):
            self.assertEqual(assets[position]['_id']['name'], expected_name)
        return assets

    def test_export_all_courses(self):
        """
        Import a course containing Mac metadata files ('._example.txt', '.DS_Store'),
        check that they were not imported, add such files by hand and check that the
        `cleanup_assets` command removes them again.
        """
        import_from_xml(
            self.module_store,
            '**replace_user**',
            'common/test/data/',
            ['dot-underscore'],
            static_content_store=self.content_store,
            do_import_static=True,
            verbose=True
        )

        course = self.module_store.get_course(SlashSeparatedCourseKey('edX', 'dot-underscore', '2014_Fall'))
        self.assertIsNotNone(course)

        proper_names = [u'.example.txt', u'example.txt']
        redundant_names = [u'._example_test.txt', u'.DS_Store']

        # right after the import only the two proper assets are in the contentstore
        imported_assets = self._assert_course_asset_names(course.id, proper_names)

        # manually add redundant assets (file ".DS_Store" and filename starts with "._")
        # by re-inserting the document of an existing asset under the redundant names
        asset_filter = course.id.make_asset_key("asset", None)
        lookup = location_to_query(asset_filter, wildcard=True, tag=XASSET_LOCATION_TAG)
        lookup['_id.name'] = imported_assets[0]['_id']['name']
        template_doc = self.content_store.fs_files.find_one(lookup)
        for redundant_name in redundant_names:
            template_doc['_id']['name'] = redundant_name
            self.content_store.fs_files.insert(template_doc)

        # the course now has four assets
        self._assert_course_asset_names(course.id, proper_names + redundant_names)

        # after the cleanup command only the two proper assets are left
        call_command('cleanup_assets')
        self._assert_course_asset_names(course.id, proper_names)
......@@ -151,6 +151,8 @@ TECH_SUPPORT_EMAIL = ENV_TOKENS.get('TECH_SUPPORT_EMAIL', TECH_SUPPORT_EMAIL)
COURSES_WITH_UNSAFE_CODE = ENV_TOKENS.get("COURSES_WITH_UNSAFE_CODE", [])
ASSET_IGNORE_REGEX = ENV_TOKENS.get('ASSET_IGNORE_REGEX', ASSET_IGNORE_REGEX)
# Theme overrides
THEME_NAME = ENV_TOKENS.get('THEME_NAME', None)
......
......@@ -31,7 +31,7 @@ import lms.envs.common
# Although this module itself may not use these imported variables, other dependent modules may.
from lms.envs.common import (
USE_TZ, TECH_SUPPORT_EMAIL, PLATFORM_NAME, BUGS_EMAIL, DOC_STORE_CONFIG, ALL_LANGUAGES, WIKI_ENABLED, MODULESTORE,
update_module_store_settings
update_module_store_settings, ASSET_IGNORE_REGEX
)
from path import path
from warnings import simplefilter
......
......@@ -13,6 +13,7 @@ import os
import json
from bson.son import SON
from opaque_keys.edx.keys import AssetKey
from xmodule.modulestore.django import ASSET_IGNORE_REGEX
class MongoContentStore(ContentStore):
......@@ -170,6 +171,26 @@ class MongoContentStore(ContentStore):
course_key, start=start, maxresults=maxresults, get_thumbnails=False, sort=sort
)
def remove_redundant_content_for_courses(self):
    """
    Finds and removes all redundant files (asset names matching ASSET_IGNORE_REGEX,
    e.g. Mac OS metadata files named ".DS_Store" or starting with "._") for all courses.

    Asset documents are looked up twice: once with the location fields stored under
    `_id` and once with them stored under `content_son`.

    Returns:
        The number of asset file documents that were deleted.
    """
    assets_to_delete = 0
    for prefix in ['_id', 'content_son']:
        query = SON([
            ('{}.tag'.format(prefix), XASSET_LOCATION_TAG),
            ('{}.category'.format(prefix), 'asset'),
            ('{}.name'.format(prefix), {'$regex': ASSET_IGNORE_REGEX}),
        ])
        for asset in self.fs_files.find(query):
            # GridFS deletes a file (and its chunks) by the file document's `_id`.
            # `asset[prefix]` is only that id on the '_id' pass; on the 'content_son'
            # pass it is the location sub-document, so always use `asset['_id']`.
            self.fs.delete(asset['_id'])
            assets_to_delete += 1

        # Safety net: drop any matching file documents that are still present.
        self.fs_files.remove(query)
    return assets_to_delete
def _get_all_content_for_course(self, course_key, get_thumbnails=False, start=0, maxresults=-1, sort=None):
'''
Returns a list of all static assets for a course. The return format is a list of asset data dictionary elements.
......
......@@ -8,6 +8,8 @@ from __future__ import absolute_import
from importlib import import_module
from django.conf import settings
if not settings.configured:
settings.configure()
from django.core.cache import get_cache, InvalidCacheBackendError
import django.utils
......@@ -25,6 +27,8 @@ try:
except ImportError:
HAS_REQUEST_CACHE = False
ASSET_IGNORE_REGEX = getattr(settings, "ASSET_IGNORE_REGEX", r"(^\._.*$)|(^\.DS_Store$)|(^.*~$)")
def load_function(path):
"""
......
......@@ -3,6 +3,7 @@ import os
import mimetypes
from path import path
import json
import re
from .xml import XMLModuleStore, ImportSystem, ParentTracker
from xblock.runtime import KvsFieldData, DictKeyValueStore
......@@ -15,6 +16,7 @@ from xmodule.errortracker import make_error_tracker
from .store_utilities import rewrite_nonportable_content_links
import xblock
from xmodule.tabs import CourseTabList
from xmodule.modulestore.django import ASSET_IGNORE_REGEX
from xmodule.modulestore.exceptions import InvalidLocationError
from xmodule.modulestore.mongo.base import MongoRevisionKey
from xmodule.modulestore import ModuleStoreEnum
......@@ -49,7 +51,7 @@ def import_static_content(
content_path = os.path.join(dirname, filename)
if filename.endswith('~'):
if re.match(ASSET_IGNORE_REGEX, filename):
if verbose:
log.debug('skipping static content %s...', content_path)
continue
......
......@@ -21,3 +21,21 @@ class IgnoredFilesTestCase(unittest.TestCase):
self.assertIn("example.txt", name_val)
self.assertNotIn("example.txt~", name_val)
self.assertIn("GREEN", name_val["example.txt"])
def test_ignore_dot_underscore_static_files(self):
    """
    Static content import must skip Mac OS metadata files ("._*" and ".DS_Store")
    while still saving regular files, including ones that merely start with a dot.
    """
    store = Mock()
    store.generate_thumbnail.return_value = ("content", "location")
    course_key = SlashSeparatedCourseKey("edX", "dot-underscore", "2014_Fall")

    import_static_content(DATA_DIR / "dot-underscore", store, course_key)

    # The first positional argument of every `save` call is the saved static content.
    saved_items = [save_call[0][0] for save_call in store.save.call_args_list]
    data_by_name = {}
    for item in saved_items:
        data_by_name[item.name] = item.data

    self.assertIn("example.txt", data_by_name)
    self.assertIn(".example.txt", data_by_name)
    self.assertNotIn("._example.txt", data_by_name)
    self.assertNotIn(".DS_Store", data_by_name)
    self.assertIn("GREEN", data_by_name["example.txt"])
    self.assertIn("BLUE", data_by_name[".example.txt"])
IGNORE MAC METADATA FILES
This course simulates an import of a course from Mac OS that has some unnecessary
metadata files (filenames starting with ._) in assets (static/._example.txt). These
files do not belong with the content, so they are skipped on import, and a
cleanup is also performed for any such assets that were already added.
<course org="edX" course="dot-underscore" slug="2014_Fall"/>
......@@ -253,6 +253,8 @@ for name, value in ENV_TOKENS.get("CODE_JAIL", {}).items():
COURSES_WITH_UNSAFE_CODE = ENV_TOKENS.get("COURSES_WITH_UNSAFE_CODE", [])
ASSET_IGNORE_REGEX = ENV_TOKENS.get('ASSET_IGNORE_REGEX', ASSET_IGNORE_REGEX)
# Event Tracking
if "TRACKING_IGNORE_URL_PATTERNS" in ENV_TOKENS:
TRACKING_IGNORE_URL_PATTERNS = ENV_TOKENS.get("TRACKING_IGNORE_URL_PATTERNS")
......
......@@ -266,6 +266,9 @@ FEATURES = {
}
# Ignore static asset files on import which match this pattern
ASSET_IGNORE_REGEX = r"(^\._.*$)|(^\.DS_Store$)|(^.*~$)"
# Used for A/B testing
DEFAULT_GROUPS = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment