Commit f00b42c9 by Jay Zoldak

Merge pull request #88 from edx/zoldak/revert-search-push

Revert accidental push of changes to master
parents b5ba7b82 51b70e42
{
    "analyzer": {
        "transcript_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["protected", "asciifolding", "custom_word_delimiter", "lowercase", "custom_stemmer", "shingle"],
            "char_filter": ["custom_mapping"]
        }
    },
    "filter": {
        "custom_word_delimiter": {
            "type": "word_delimiter",
            "preserve_original": "true"
        },
        "custom_stemmer": {
            "type": "stemmer",
            "name": "english"
        },
        "protected": {
            "type": "keyword_marker",
            "keywords_path": "protectedWords.txt"
        }
    },
    "char_filter": {
        "custom_mapping": {
            "type": "mapping",
            "mappings": ["\n=>-"]
        }
    }
}
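For context, a minimal sketch of how an analysis block like the one above gets registered at index-creation time. Elasticsearch expects it nested under settings.analysis; the node URL, index name, and analyzer.json file name are assumptions for illustration, not part of this commit:

# Illustrative sketch only: register the analyzer above when creating an index.
import json
import requests

with open('analyzer.json', 'rb') as analysis_file:
    analysis = json.loads(analysis_file.read())
body = json.dumps({"settings": {"analysis": analysis}})
response = requests.put('http://localhost:9200/transcript-index/', data=body)
print response.status_code  # expect 200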
import requests
import json


class ElasticDatabase:
    def __init__(self, url, index_settings_file, *args):
        """
        Initializes an Elasticsearch object with any indices specified by args.

        The url should be of the form `http://localhost:9200`; importantly, do
        not include a trailing slash. args should be a list of JSON strings,
        each specifying a mapping to be used for a specific type.

        Example dictionary:
        {"index": "transcript", "type": "6-002x", "mapping":
            {
                "properties": {
                    "searchable_text": {
                        "type": "string",
                        "store": "yes",
                        "index": "analyzed"
                    }
                }
            }
        }

        Eventually we will support different configuration files for different
        indices, but since this is only indexing transcripts right now that
        seems excessive.
        """
        self.url = url
        self.args = args
        with open(index_settings_file, 'rb') as settings_file:
            self.index_settings = settings_file.read()

    def parse_args(self):
        for raw_mapping in self.args:
            try:
                json_mapping = json.loads(raw_mapping)
            except ValueError:
                print "Badly formed JSON args, please check your mappings file"
                break
            try:
                index = json_mapping['index']
                type_ = json_mapping['type']
                mapping = json_mapping['mapping']
            except KeyError:
                print "Could not find needed keys. Keys found:"
                print json_mapping.keys()
                continue
            self.setup_index(index)
            self.setup_type(index, type_, mapping)

    def setup_type(self, index, type_, json_mapping):
        """
        json_mapping should be a dictionary starting at the properties level of a
        mapping. The type level will be added, so including it will break things.
        The purpose of this is to encourage loose coupling between types and
        mappings for better code.
        """
        full_url = "/".join([self.url, index, type_, "_mapping"])
        json_put_body = {type_: json_mapping}
        return requests.put(full_url, data=json.dumps(json_put_body))

    def has_index(self, index):
        """
        Checks to see if a given index exists in the database; returns a boolean.
        Anything other than a 200 or a 404 means something is wrong, so we error.
        """
        full_url = "/".join([self.url, index])
        status = requests.head(full_url).status_code
        if status == 200:
            return True
        elif status == 404:
            return False
        else:
            raise RuntimeError("Got an unexpected response code: " + str(status))

    def setup_index(self, index):
        """Creates a new elasticsearch index; returns the response it gets"""
        full_url = "/".join([self.url, index]) + "/"
        return requests.put(full_url, data=self.index_settings)

    def index_data(self, index, type_, id_, data):
        """Data should be passed in as a dictionary; assumes it matches the given mapping"""
        full_url = "/".join([self.url, index, type_, id_])
        response = requests.put(full_url, data=json.dumps(data))
        return json.loads(response.content)['ok']

    def get_index_settings(self, index):
        """Returns the current settings of the given index"""
        full_url = "/".join([self.url, index, "_settings"])
        return json.loads(requests.get(full_url).content)

    def get_type_mapping(self, index, type_):
        """Returns the current mapping of the given type within the index"""
        full_url = "/".join([self.url, index, type_, "_mapping"])
        return json.loads(requests.get(full_url).content)

    def index_json_data(self, index, type_, id_, json_data):
        """Like index_data, but takes a pre-serialized JSON string"""
        full_url = "/".join([self.url, index, type_, id_])
        return requests.put(full_url, data=json_data)
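A minimal usage sketch of ElasticDatabase, assuming a local node at http://localhost:9200; the settings.json file name is hypothetical, and the mapping argument follows the example dictionary in the __init__ docstring:

# Sketch only: wire the pieces above together.
mapping_arg = json.dumps({
    "index": "transcript",
    "type": "6-002x",
    "mapping": {
        "properties": {
            "searchable_text": {"type": "string", "store": "yes", "index": "analyzed"}
        }
    }
})
elastic_db = ElasticDatabase("http://localhost:9200", "settings.json", mapping_arg)
elastic_db.parse_args()  # creates the index and registers the type mapping
print elastic_db.has_index("transcript")  # expect True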
import os
import os.path as pt
import json
import re
import string
from pyes import *
import nltk.stem.snowball as snowball
import fuzzy


def grab_transcripts(sjson_directory):
    """Returns references to all of the files contained within a subs directory"""
    all_children = os.listdir(sjson_directory)
    all_transcripts = [child for child in all_children if pt.isfile(pt.join(sjson_directory, child))]
    # . is not a valid character for a youtube id, so it can be reliably used to pick up
    # the start of the file extension
    uuids = [transcript_id[:transcript_id.find(".")] for transcript_id in all_transcripts]
    parsed_transcripts = [open(pt.join(sjson_directory, transcript)).read() for transcript in all_transcripts]
    return zip([clean_transcript(transcript) for transcript in parsed_transcripts], uuids)


def clean_transcript(transcript_string):
    """Tries to parse and clean a raw transcript. Errors on invalid sjson"""
    transcript_list = filter(None, json.loads(transcript_string)['text'])
    relevant_text = " ".join([phrase.encode('utf-8').strip() for phrase in transcript_list])
    relevant_text = relevant_text.lower().translate(None, string.punctuation)
    cleaned_text = re.sub('\n', " ", relevant_text)
    return cleaned_text
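A hedged sketch of what clean_transcript produces for a fabricated sjson document (the sample data is invented for illustration; it relies on the imports above):

# Sketch: a minimal sjson-style document and its cleaned form.
sample_sjson = json.dumps({"text": ["Hello, world!", "", "Second\nline."]})
print clean_transcript(sample_sjson)  # expect "hello world second line"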


def phonetic_transcript(cleaned_transcript, stemmer):
    return " ".join([phoneticize(word, stemmer) for word in cleaned_transcript.split(" ")])


def phoneticize(word, stemmer):
    encode = lambda word: word.decode('utf-8').encode('ascii', 'ignore')
    phonetic = lambda word: fuzzy.nysiis(stemmer.stem(encode(word)))
    return phonetic(word)
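For illustration, a hedged sketch of how phoneticize collapses similar-sounding spellings; the exact NYSIIS codes depend on the fuzzy library version, and "gausian" is a deliberately misspelled example:

# Sketch: different spellings of the same-sounding word should share a code.
stemmer = snowball.EnglishStemmer()
print phoneticize("gaussian", stemmer)
print phoneticize("gausian", stemmer)  # expected to print the same code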


def initialize_transcripts(database, mapping):
    database.indices.create_index("transcript-index")


def index_course(database, sjson_directory, course_name, mapping):
    stemmer = snowball.EnglishStemmer()
    database.put_mapping(course_name, {'properties': mapping}, "transcript-index")
    all_transcripts = grab_transcripts(sjson_directory)
    video_counter = 0
    for transcript_tuple in all_transcripts:
        data_map = {"searchable_text": transcript_tuple[0], "uuid": transcript_tuple[1]}
        data_map['phonetic_text'] = phonetic_transcript(transcript_tuple[0], stemmer)
        database.index(data_map, "transcript-index", course_name)
        video_counter += 1
    database.indices.refresh("transcript-index")


def fuzzy_search(database, query, course_name):
    search_query = FuzzyLikeThisFieldQuery("searchable_text", query)
    return database.search(query=search_query, indices="transcript-index")


def phonetic_search(database, query, course_name):
    stemmer = snowball.EnglishStemmer()
    search_query = TextQuery("phonetic_text", phoneticize(query, stemmer))
    return database.search(query=search_query, indices="transcript-index")


data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/'
mapping_file = 'mapping.json'
database = ES('127.0.0.1:9200')
mapping = json.loads(open(mapping_file, 'rb').read())
#initialize_transcripts(database, mapping)
#index_course(database, data_directory, "test-course", mapping)
fuzzy_results = fuzzy_search(database, "gaussian", "test-course")
phonetic_results = phonetic_search(database, "gaussian", "test-course")
for r in fuzzy_results:
    print "Fuzzy: " + r['uuid']
for r in phonetic_results:
    print "Phonetic: " + r['uuid']
{
    "searchable_text": {
        "boost": 1.0,
        "index": "analyzed",
        "store": "yes",
        "type": "string",
        "term_vector": "with_positions_offsets",
        "analyzer": "transcript_analyzer"
    }
}
"gauss",
"stokes",
"navier",
"einstein",
"goddard",
"oppenheimer",
"bloch",
"hawkings",
"newton",
"bohr",
"darwin",
"planck",
"rontgen",
"tesla",
"franklin"
{
    "settings": {
        "index": {
            "number_of_replicas": 2,
            "number_of_shards": 3
        }
    }
}
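These are the index settings ElasticDatabase reads at construction time. A hedged sketch of verifying they took effect, reusing the helper class from this commit (the settings.json file name and index name are hypothetical):

# Sketch: after setup_index, the replica/shard counts above should be
# visible in the settings Elasticsearch reports back.
elastic_db = ElasticDatabase("http://localhost:9200", "settings.json")
elastic_db.setup_index("transcript-index")
print elastic_db.get_index_settings("transcript-index")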
from django.http import HttpResponse
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from django.contrib.staticfiles import finders
from courseware.courses import get_courses
from courseware.model_data import ModelDataCache
from courseware.module_render import get_module_for_descriptor
from courseware.views import registered_for_course
#import logging
import lxml.html
import re
import posixpath
import urllib
from os import listdir
from os.path import isfile, join


def test(request):
    user = User.objects.prefetch_related("groups").get(id=request.user.id)
    request.user = user
    course_list = get_courses(user, request.META.get('HTTP_HOST'))
    all_modules = [get_module(request, user, course) for course in course_list if registered_for_course(course, user)]
    child_modules = []
    for module in all_modules:
        child_modules.extend(module.get_children())
    bottom_modules = []
    for module in child_modules:
        bottom_modules.extend(module.get_children())
    # Hard-coded to a single module while this view is exploratory scaffolding
    asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html()))
    strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs]
    search_template = get_template('search.html')
    html = search_template.render(Context({'course_list': strings}))
    return HttpResponse(html)


def get_children(course):
    """Returns the children of a given course"""
    attributes = [child.location for child in course._child_instances]
    return attributes


def convert_to_valid_html(html):
    """Unescapes the HTML entities that module rendering escapes"""
    replacement = {"&lt;": "<", "&gt;": ">", "&#34;": "\"", "&#39;": "'"}
    for i, j in replacement.iteritems():
        html = html.replace(i, j)
    return html


def get_asset_div(html_page):
    """Returns all elements with the video class from a page of HTML"""
    return lxml.html.fromstring(html_page).find_class("video")


def get_module(request, user, course):
    model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2)
    course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id)
    return course_module


def get_youtube_code(module_html):
    """Pulls the youtube id for the 1.0 playback speed out of rendered video module HTML"""
    youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html)
    sliced_youtube_code = youtube_snippet[:youtube_snippet.find('\n')]
    return sliced_youtube_code


def get_transcript_directory(module_html):
    """Pulls the caption asset path out of rendered video module HTML"""
    directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html)
    sliced_directory = directory_snippet[:directory_snippet.find('\n')]
    return resolve_to_absolute_path(sliced_directory)


def resolve_to_absolute_path(transcript_directory):
    normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/')
    return all_transcript_files(normalized_path)


def all_transcript_files(normalized_path):
    files = [transcript for transcript in listdir(normalized_path) if isfile(join(normalized_path, transcript))]
    return files
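For illustration, a hedged sketch of the regex extraction in get_youtube_code; the attribute layout in the sample is assumed from the regex itself, and the video ids are made up:

# Sketch: pull the 1.0-speed youtube id out of a fabricated snippet of
# video module HTML.
sample_html = 'data-streams="0.75:AbCdEfGhIj0,1.0:AbCdEfGhIj1,1.25:AbCdEfGhIj2"\n<div>...</div>'
print get_youtube_code(sample_html)  # expect AbCdEfGhIj1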