Commit f00b42c9 by Jay Zoldak

Merge pull request #88 from edx/zoldak/revert-search-push

Revert accidental push of changes to master
parents b5ba7b82 51b70e42
{
    "analyzer": {
        "transcript_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["protected", "asciifolding", "custom_word_delimiter", "lowercase", "custom_stemmer", "shingle"],
            "char_filter": ["custom_mapping"]
        }
    },
    "filter": {
        "custom_word_delimiter": {
            "type": "word_delimiter",
            "preserve_original": "true"
        },
        "custom_stemmer": {
            "type": "stemmer",
            "name": "english"
        },
        "protected": {
            "type": "keyword_marker",
            "keywords_path": "protectedWords.txt"
        }
    },
    "char_filter": {
        "custom_mapping": {
            "type": "mapping",
            "mappings": ["\n=>-"]
        }
    }
}
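For context, a minimal sketch of how an analysis block like the one above gets registered at index-creation time. Elasticsearch expects it nested under settings.analysis; the node URL, index name, and analyzer.json file name are assumptions for illustration, not part of this commit:

# Illustrative sketch only: register the analyzer above when creating an index.
import json
import requests

with open('analyzer.json', 'rb') as analysis_file:
    analysis = json.loads(analysis_file.read())
body = json.dumps({"settings": {"analysis": analysis}})
response = requests.put('http://localhost:9200/transcript-index/', data=body)
print response.status_code  # expect 200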
import requests
import json


class ElasticDatabase:
    def __init__(self, url, index_settings_file, *args):
        """
        Initializes an Elasticsearch object with any indices specified by args.

        The url should be of the form `http://localhost:9200`; importantly, do
        not include a trailing slash. args should be a list of JSON strings,
        each specifying a mapping to be used for a specific type.

        Example dictionary:
        {"index": "transcript", "type": "6-002x", "mapping":
            {
                "properties": {
                    "searchable_text": {
                        "type": "string",
                        "store": "yes",
                        "index": "analyzed"
                    }
                }
            }
        }

        Eventually we will support different configuration files for different
        indices, but since this is only indexing transcripts right now that
        seems excessive.
        """
        self.url = url
        self.args = args
        with open(index_settings_file, 'rb') as settings_file:
            self.index_settings = settings_file.read()

    def parse_args(self):
        for raw_mapping in self.args:
            try:
                json_mapping = json.loads(raw_mapping)
            except ValueError:
                print "Badly formed JSON args, please check your mappings file"
                break
            try:
                index = json_mapping['index']
                type_ = json_mapping['type']
                mapping = json_mapping['mapping']
            except KeyError:
                print "Could not find needed keys. Keys found:"
                print json_mapping.keys()
                continue
            self.setup_index(index)
            self.setup_type(index, type_, mapping)

    def setup_type(self, index, type_, json_mapping):
        """
        json_mapping should be a dictionary starting at the properties level of a
        mapping. The type level will be added, so including it will break things.
        The purpose of this is to encourage loose coupling between types and
        mappings for better code.
        """
        full_url = "/".join([self.url, index, type_, "_mapping"])
        json_put_body = {type_: json_mapping}
        return requests.put(full_url, data=json.dumps(json_put_body))

    def has_index(self, index):
        """
        Checks to see if a given index exists in the database; returns a boolean.
        Anything other than a 200 or a 404 means something is wrong, so we error.
        """
        full_url = "/".join([self.url, index])
        status = requests.head(full_url).status_code
        if status == 200:
            return True
        elif status == 404:
            return False
        else:
            raise RuntimeError("Got an unexpected response code: " + str(status))

    def setup_index(self, index):
        """Creates a new elasticsearch index; returns the response it gets"""
        full_url = "/".join([self.url, index]) + "/"
        return requests.put(full_url, data=self.index_settings)

    def index_data(self, index, type_, id_, data):
        """Data should be passed in as a dictionary; assumes it matches the given mapping"""
        full_url = "/".join([self.url, index, type_, id_])
        response = requests.put(full_url, data=json.dumps(data))
        return json.loads(response.content)['ok']

    def get_index_settings(self, index):
        """Returns the current settings of the given index"""
        full_url = "/".join([self.url, index, "_settings"])
        return json.loads(requests.get(full_url).content)

    def get_type_mapping(self, index, type_):
        """Returns the current mapping of the given type within the index"""
        full_url = "/".join([self.url, index, type_, "_mapping"])
        return json.loads(requests.get(full_url).content)

    def index_json_data(self, index, type_, id_, json_data):
        """Like index_data, but takes a pre-serialized JSON string"""
        full_url = "/".join([self.url, index, type_, id_])
        return requests.put(full_url, data=json_data)
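A minimal usage sketch of ElasticDatabase, assuming a local node at http://localhost:9200; the settings.json file name is hypothetical, and the mapping argument follows the example dictionary in the __init__ docstring:

# Sketch only: wire the pieces above together.
mapping_arg = json.dumps({
    "index": "transcript",
    "type": "6-002x",
    "mapping": {
        "properties": {
            "searchable_text": {"type": "string", "store": "yes", "index": "analyzed"}
        }
    }
})
elastic_db = ElasticDatabase("http://localhost:9200", "settings.json", mapping_arg)
elastic_db.parse_args()  # creates the index and registers the type mapping
print elastic_db.has_index("transcript")  # expect True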
import os
import os.path as pt
import json
import re
import string
from pyes import *
import nltk.stem.snowball as snowball
import fuzzy


def grab_transcripts(sjson_directory):
    """Returns references to all of the files contained within a subs directory"""
    all_children = os.listdir(sjson_directory)
    all_transcripts = [child for child in all_children if pt.isfile(pt.join(sjson_directory, child))]
    # . is not a valid character for a youtube id, so it can be reliably used to pick up
    # the start of the file extension
    uuids = [transcript_id[:transcript_id.find(".")] for transcript_id in all_transcripts]
    parsed_transcripts = [open(pt.join(sjson_directory, transcript)).read() for transcript in all_transcripts]
    return zip([clean_transcript(transcript) for transcript in parsed_transcripts], uuids)


def clean_transcript(transcript_string):
    """Tries to parse and clean a raw transcript. Errors on invalid sjson"""
    transcript_list = filter(None, json.loads(transcript_string)['text'])
    relevant_text = " ".join([phrase.encode('utf-8').strip() for phrase in transcript_list])
    relevant_text = relevant_text.lower().translate(None, string.punctuation)
    cleaned_text = re.sub('\n', " ", relevant_text)
    return cleaned_text
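A hedged sketch of what clean_transcript produces for a fabricated sjson document (the sample data is invented for illustration; it relies on the imports above):

# Sketch: a minimal sjson-style document and its cleaned form.
sample_sjson = json.dumps({"text": ["Hello, world!", "", "Second\nline."]})
print clean_transcript(sample_sjson)  # expect "hello world second line"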


def phonetic_transcript(cleaned_transcript, stemmer):
    return " ".join([phoneticize(word, stemmer) for word in cleaned_transcript.split(" ")])


def phoneticize(word, stemmer):
    encode = lambda word: word.decode('utf-8').encode('ascii', 'ignore')
    phonetic = lambda word: fuzzy.nysiis(stemmer.stem(encode(word)))
    return phonetic(word)
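For illustration, a hedged sketch of how phoneticize collapses similar-sounding spellings; the exact NYSIIS codes depend on the fuzzy library version, and "gausian" is a deliberately misspelled example:

# Sketch: different spellings of the same-sounding word should share a code.
stemmer = snowball.EnglishStemmer()
print phoneticize("gaussian", stemmer)
print phoneticize("gausian", stemmer)  # expected to print the same code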


def initialize_transcripts(database, mapping):
    database.indices.create_index("transcript-index")


def index_course(database, sjson_directory, course_name, mapping):
    stemmer = snowball.EnglishStemmer()
    database.put_mapping(course_name, {'properties': mapping}, "transcript-index")
    all_transcripts = grab_transcripts(sjson_directory)
    video_counter = 0
    for transcript_tuple in all_transcripts:
        data_map = {"searchable_text": transcript_tuple[0], "uuid": transcript_tuple[1]}
        data_map['phonetic_text'] = phonetic_transcript(transcript_tuple[0], stemmer)
        database.index(data_map, "transcript-index", course_name)
        video_counter += 1
    database.indices.refresh("transcript-index")


def fuzzy_search(database, query, course_name):
    search_query = FuzzyLikeThisFieldQuery("searchable_text", query)
    return database.search(query=search_query, indices="transcript-index")


def phonetic_search(database, query, course_name):
    stemmer = snowball.EnglishStemmer()
    search_query = TextQuery("phonetic_text", phoneticize(query, stemmer))
    return database.search(query=search_query, indices="transcript-index")


data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/'
mapping_file = 'mapping.json'
database = ES('127.0.0.1:9200')
mapping = json.loads(open(mapping_file, 'rb').read())
#initialize_transcripts(database, mapping)
#index_course(database, data_directory, "test-course", mapping)
fuzzy_results = fuzzy_search(database, "gaussian", "test-course")
phonetic_results = phonetic_search(database, "gaussian", "test-course")
for r in fuzzy_results:
    print "Fuzzy: " + r['uuid']
for r in phonetic_results:
    print "Phonetic: " + r['uuid']
{
    "searchable_text": {
        "boost": 1.0,
        "index": "analyzed",
        "store": "yes",
        "type": "string",
        "term_vector": "with_positions_offsets",
        "analyzer": "transcript_analyzer"
    }
}
"gauss",
"stokes",
"navier",
"einstein",
"goddard",
"oppenheimer",
"bloch",
"hawkings",
"newton",
"bohr",
"darwin",
"planck",
"rontgen",
"tesla",
"franklin"
{
    "settings": {
        "index": {
            "number_of_replicas": 2,
            "number_of_shards": 3
        }
    }
}
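These are the index settings ElasticDatabase reads at construction time. A hedged sketch of verifying they took effect, reusing the helper class from this commit (the settings.json file name and index name are hypothetical):

# Sketch: after setup_index, the replica/shard counts above should be
# visible in the settings Elasticsearch reports back.
elastic_db = ElasticDatabase("http://localhost:9200", "settings.json")
elastic_db.setup_index("transcript-index")
print elastic_db.get_index_settings("transcript-index")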
from django.http import HttpResponse
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from django.contrib.staticfiles import finders
from courseware.courses import get_courses
from courseware.model_data import ModelDataCache
from courseware.module_render import get_module_for_descriptor
from courseware.views import registered_for_course
#import logging
import lxml.html
import re
import posixpath
import urllib
from os import listdir
from os.path import isfile, join


def test(request):
    user = User.objects.prefetch_related("groups").get(id=request.user.id)
    request.user = user
    course_list = get_courses(user, request.META.get('HTTP_HOST'))
    all_modules = [get_module(request, user, course) for course in course_list if registered_for_course(course, user)]
    child_modules = []
    for module in all_modules:
        child_modules.extend(module.get_children())
    bottom_modules = []
    for module in child_modules:
        bottom_modules.extend(module.get_children())
    # Hard-coded to a single module while this view is exploratory scaffolding
    asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html()))
    strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs]
    search_template = get_template('search.html')
    html = search_template.render(Context({'course_list': strings}))
    return HttpResponse(html)


def get_children(course):
    """Returns the children of a given course"""
    attributes = [child.location for child in course._child_instances]
    return attributes


def convert_to_valid_html(html):
    """Unescapes the HTML entities that module rendering escapes"""
    replacement = {"&lt;": "<", "&gt;": ">", "&#34;": "\"", "&#39;": "'"}
    for i, j in replacement.iteritems():
        html = html.replace(i, j)
    return html


def get_asset_div(html_page):
    """Returns all elements with the video class from a page of HTML"""
    return lxml.html.fromstring(html_page).find_class("video")


def get_module(request, user, course):
    model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2)
    course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id)
    return course_module


def get_youtube_code(module_html):
    """Pulls the youtube id for the 1.0 playback speed out of rendered video module HTML"""
    youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html)
    sliced_youtube_code = youtube_snippet[:youtube_snippet.find('\n')]
    return sliced_youtube_code


def get_transcript_directory(module_html):
    """Pulls the caption asset path out of rendered video module HTML"""
    directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html)
    sliced_directory = directory_snippet[:directory_snippet.find('\n')]
    return resolve_to_absolute_path(sliced_directory)


def resolve_to_absolute_path(transcript_directory):
    normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/')
    return all_transcript_files(normalized_path)


def all_transcript_files(normalized_path):
    files = [transcript for transcript in listdir(normalized_path) if isfile(join(normalized_path, transcript))]
    return files
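For illustration, a hedged sketch of the regex extraction in get_youtube_code; the attribute layout in the sample is assumed from the regex itself, and the video ids are made up:

# Sketch: pull the 1.0-speed youtube id out of a fabricated snippet of
# video module HTML.
sample_html = 'data-streams="0.75:AbCdEfGhIj0,1.0:AbCdEfGhIj1,1.25:AbCdEfGhIj2"\n<div>...</div>'
print get_youtube_code(sample_html)  # expect AbCdEfGhIj1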