Commit 1ac6e123 by Slater-Victoroff

Pyes working, considering switch to raw requests, phonetic and fuzzy search both working

parent c865641f
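
The commit message floats a switch from pyes to raw requests. For comparison, here is a minimal sketch of what fuzzy_search below could look like as a raw HTTP call, assuming the same local node and the fuzzy_like_this_field query DSL that pyes's FuzzyLikeThisFieldQuery wraps (this sketch is not part of the commit):

import json
import requests

def raw_fuzzy_search(query, host="http://127.0.0.1:9200"):
    # Same query the pyes code builds, expressed directly in the ES query DSL.
    body = {"query": {"fuzzy_like_this_field": {"searchable_text": {"like_text": query}}}}
    response = requests.post("%s/transcript-index/_search" % host, data=json.dumps(body))
    return response.json()
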

import os
import os.path as pt
import json
import re
import string

from pyes import *
import nltk.stem.snowball as snowball
import fuzzy


def grab_transcripts(sjson_directory):
    """Returns references to all of the transcript files contained within a subs directory."""
    all_children = os.listdir(sjson_directory)
    all_transcripts = [child for child in all_children if pt.isfile(pt.join(sjson_directory, child))]
    # . is not a valid character for a youtube id, so it can be reliably used to pick up the start
    # of the file extension
    uuids = [transcript_id[:transcript_id.find(".")] for transcript_id in all_transcripts]
    parsed_transcripts = [open(pt.join(sjson_directory, transcript)).read() for transcript in all_transcripts]
    # Pairs each cleaned transcript with its youtube uuid: [(cleaned_text, uuid), ...]
    return zip([clean_transcript(transcript) for transcript in parsed_transcripts], uuids)


def clean_transcript(transcript_string):
    """Tries to parse and clean a raw transcript. Raises for invalid sjson."""
    # sjson transcripts carry their caption phrases in the "text" field; drop empty entries.
    transcript_list = filter(None, json.loads(transcript_string)['text'])
    relevant_text = " ".join([phrase.encode('utf-8').strip() for phrase in transcript_list])
    relevant_text = relevant_text.lower().translate(None, string.punctuation)
    cleaned_text = re.sub('\n', " ", relevant_text)
    return cleaned_text


def phonetic_transcript(cleaned_transcript, stemmer):
    """Converts a cleaned transcript into a space-separated string of phonetic keys."""
    return " ".join([phoneticize(word, stemmer) for word in cleaned_transcript.split(" ")])


def phoneticize(word, stemmer):
    """Strips a word down to ascii, stems it, and encodes the stem with NYSIIS."""
    encode = lambda word: word.decode('utf-8').encode('ascii', 'ignore')
    phonetic = lambda word: fuzzy.nysiis(stemmer.stem(encode(word)))
    return phonetic(word)
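
# Illustrative only (not part of the original commit): the point of the NYSIIS pass
# is that spelling variants should collapse to the same phonetic key after stemming,
# so a query like "gaussian" can match a transcript that spelled it "gausian":
#
#   stemmer = snowball.EnglishStemmer()
#   phoneticize("gaussian", stemmer) == phoneticize("gausian", stemmer)  # expected True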


def initialize_transcripts(database, mapping):
    # The mapping itself is applied per-course in index_course via put_mapping.
    database.indices.create_index("transcript-index")


def index_course(database, sjson_directory, course_name, mapping):
    stemmer = snowball.EnglishStemmer()
    database.put_mapping(course_name, {'properties': mapping}, "transcript-index")
    all_transcripts = grab_transcripts(sjson_directory)
    for searchable_text, uuid in all_transcripts:
        data_map = {"searchable_text": searchable_text, "uuid": uuid}
        data_map['phonetic_text'] = phonetic_transcript(searchable_text, stemmer)
        database.index(data_map, "transcript-index", course_name)
    # Refresh so the newly indexed documents become searchable immediately.
    database.indices.refresh("transcript-index")


def fuzzy_search(database, query, course_name):
    search_query = FuzzyLikeThisFieldQuery("searchable_text", query)
    # Scope results to the given course's document type.
    return database.search(query=search_query, indices="transcript-index", doc_types=course_name)


def phonetic_search(database, query, course_name):
    stemmer = snowball.EnglishStemmer()
    search_query = TextQuery("phonetic_text", phoneticize(query, stemmer))
    return database.search(query=search_query, indices="transcript-index", doc_types=course_name)


data_directory = '/Users/climatologist/edx_all/data/content-mit-6002x/static/subs/'
mapping_file = 'mapping.json'
database = ES('127.0.0.1:9200')
mapping = json.loads(open(mapping_file, 'rb').read())
#initialize_transcripts(database, mapping)
#index_course(database, data_directory, "test-course", mapping)
fuzzy_results = fuzzy_search(database, "gaussian", "test-course")
phonetic_results = phonetic_search(database, "gaussian", "test-course")
for r in fuzzy_results:
    print "Fuzzy: " + r['uuid']
for r in phonetic_results:
    print "Phonetic: " + r['uuid']

{
    "searchable_text": {
        "boost": 1.0,
        "index": "analyzed",
        "store": "yes",
        "type": "string",
        "term_vector": "with_positions_offsets"
    },
    "phonetic_text": {
        "boost": 1.0,
        "index": "analyzed",
        "store": "yes",
        "type": "string",
        "term_vector": "with_positions_offsets"
    },
    "uuid": {
        "index": "not_analyzed",
        "store": "yes",
        "type": "string"
    }
}
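
The commit also includes a Django view that walks the user's registered courses, digs the video modules out of each course tree, and resolves their transcript directories: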

from django.http import HttpResponse
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User
from django.contrib.staticfiles import finders

from courseware.courses import get_courses
from courseware.model_data import ModelDataCache
from courseware.module_render import get_module_for_descriptor
from courseware.views import registered_for_course

#import logging
import lxml.html
import re
import posixpath
import urllib
from os import listdir
from os.path import isfile, join


def test(request):
    user = User.objects.prefetch_related("groups").get(id=request.user.id)
    request.user = user
    course_list = get_courses(user, request.META.get('HTTP_HOST'))
    all_modules = [get_module(request, user, course) for course in course_list if registered_for_course(course, user)]
    child_modules = []
    for module in all_modules:
        child_modules.extend(module.get_children())
    bottom_modules = []
    for module in child_modules:
        bottom_modules.extend(module.get_children())
    # Hard-coded probe of a single module while this is still a test view.
    asset_divs = get_asset_div(convert_to_valid_html(bottom_modules[2].get_html()))
    strings = [get_transcript_directory(lxml.html.tostring(div)) for div in asset_divs]
    search_template = get_template('search.html')
    html = search_template.render(Context({'course_list': strings}))
    return HttpResponse(html)


def get_children(course):
    """Returns the locations of a given course's children"""
    attributes = [child.location for child in course._child_instances]
    return attributes


def convert_to_valid_html(html):
    """Unescapes the handful of HTML entities that module rendering produces."""
    replacement = {"&lt;": "<", "&gt;": ">", "&#34;": "\"", "&#39;": "'"}
    for i, j in replacement.iteritems():
        html = html.replace(i, j)
    return html


def get_asset_div(html_page):
    """Finds every element marked with the "video" class."""
    return lxml.html.find_class(html_page, "video")


def get_module(request, user, course):
    model_data_cache = ModelDataCache.cache_for_descriptor_descendents(course.id, user, course, depth=2)
    course_module = get_module_for_descriptor(user, request, course, model_data_cache, course.id)
    return course_module


def get_youtube_code(module_html):
    # The module html embeds a youtube speed map (e.g. "0.75:id,1.0:id,1.25:id,...");
    # keep only the id that follows the 1.0 speed.
    youtube_snippet = re.sub(r'(.*?)(1\.0:)(.*?)(,1\.25)(.*)', r'\3', module_html)
    sliced_youtube_code = youtube_snippet[:youtube_snippet.find('\n')]
    return sliced_youtube_code
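
# Illustrative only: given single-line module html containing
#   ... youtube="0.75:abc123,1.0:XyZ456,1.25:def789,1.50:ghi012" ...
# get_youtube_code() would yield "XyZ456" (assuming a 1.0 entry is present).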


def get_transcript_directory(module_html):
    # Pull the value of the data-caption-asset-path attribute out of the module html.
    directory_snippet = re.sub(r'(.*?)(data-caption-asset-path=\")(.*?)(\">.*)', r'\3', module_html)
    sliced_directory = directory_snippet[:directory_snippet.find('\n')]
    return resolve_to_absolute_path(sliced_directory)
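
# Illustrative only: given markup like <div data-caption-asset-path="/static/subs/">,
# the substitution keeps only the attribute value, so get_transcript_directory()
# resolves "/static/subs/" into a normalized directory listing.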


def resolve_to_absolute_path(transcript_directory):
    """Normalizes a quoted asset path and returns the transcript files inside it."""
    normalized_path = posixpath.normpath(urllib.unquote(transcript_directory)).lstrip('/')
    return all_transcript_files(normalized_path)


def all_transcript_files(normalized_path):
    files = [transcript for transcript in listdir(normalized_path) if isfile(join(normalized_path, transcript))]
    return files
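
The resolved paths point at the same static/subs layout that grab_transcripts in the indexing script consumes, so the view's file listing lines up with the transcripts the search index covers.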