Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
C
course-discovery
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
course-discovery
Commits
6332bcbf
Commit
6332bcbf
authored
Dec 06, 2016
by
Matthew Piatetsky
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Use ngrams for autocomplete and set up index settings/mappings
ECOM-4738
parent
d9ace3d8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
134 additions
and
23 deletions
+134
-23
course_discovery/apps/api/v1/tests/test_views/test_search.py
+5
-11
course_discovery/apps/api/v1/views/search.py
+19
-6
course_discovery/apps/core/tests/mixins.py
+10
-3
course_discovery/apps/core/utils.py
+4
-1
course_discovery/apps/course_metadata/search_indexes.py
+2
-0
course_discovery/apps/edx_haystack_extensions/backends.py
+30
-1
course_discovery/apps/edx_haystack_extensions/management/commands/update_index.py
+3
-1
course_discovery/settings/base.py
+61
-0
No files found.
course_discovery/apps/api/v1/tests/test_views/test_search.py
View file @
6332bcbf
...
...
@@ -298,7 +298,7 @@ class AggregateSearchViewSet(DefaultPartnerMixin, SerializationMixin, LoginMixin
[
self
.
serialize_course_run
(
course_run
),
self
.
serialize_program
(
program
)])
class
TypeaheadSearchViewTests
(
TypeaheadSerializationMixin
,
LoginMixin
,
APITestCase
):
class
TypeaheadSearchViewTests
(
TypeaheadSerializationMixin
,
LoginMixin
,
ElasticsearchTestMixin
,
APITestCase
):
path
=
reverse
(
'api:v1:search-typeahead'
)
function_score
=
{
'functions'
:
[
...
...
@@ -307,12 +307,6 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
{
'linear'
:
{
'start'
:
{
'origin'
:
'now'
,
'scale'
:
'1d'
,
'decay'
:
0.95
}},
'weight'
:
5.0
}
],
'boost'
:
1.0
,
'score_mode'
:
'sum'
,
'boost_mode'
:
'sum'
,
'query'
:
{
'query_string'
:
{
'auto_generate_phrase_queries'
:
True
,
'analyze_wildcard'
:
True
,
'query'
:
'((title:*pytho* OR course_key:*pytho*) AND status:(active))'
}
}
}
def
get_typeahead_response
(
self
,
query
=
None
):
...
...
@@ -323,7 +317,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
url
=
'{path}?{qs}'
.
format
(
path
=
self
.
path
,
qs
=
qs
)
config
=
ElasticsearchBoostConfig
.
get_solo
()
config
.
function_score
=
self
.
function_score
config
.
function_score
.
update
(
self
.
function_score
)
config
.
save
()
return
self
.
client
.
get
(
url
)
...
...
@@ -405,7 +399,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def
test_micromasters_boosting
(
self
):
""" Verify micromasters are boosted over xseries."""
title
=
"
test_micromasters_boosting
"
title
=
"
micromasters
"
ProgramFactory
(
title
=
title
+
"1"
,
status
=
ProgramStatus
.
Active
,
...
...
@@ -420,7 +414,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def
test_start_date_boosting
(
self
):
""" Verify upcoming courses are boosted over past courses."""
title
=
"
test_start_date_boosting
"
title
=
"
start
"
now
=
datetime
.
datetime
.
utcnow
()
CourseRunFactory
(
title
=
title
+
"1"
,
start
=
now
-
datetime
.
timedelta
(
weeks
=
10
))
CourseRunFactory
(
title
=
title
+
"2"
,
start
=
now
+
datetime
.
timedelta
(
weeks
=
1
))
...
...
@@ -431,7 +425,7 @@ class TypeaheadSearchViewTests(TypeaheadSerializationMixin, LoginMixin, APITestC
def
test_self_paced_boosting
(
self
):
""" Verify that self paced courses are boosted over instructor led courses."""
title
=
"
test_self_paced_boosting
"
title
=
"
paced
"
CourseRunFactory
(
title
=
title
+
"1"
,
pacing_type
=
'instructor_paced'
)
CourseRunFactory
(
title
=
title
+
"2"
,
pacing_type
=
'self_paced'
)
response
=
self
.
get_typeahead_response
(
title
)
...
...
course_discovery/apps/api/v1/views/search.py
View file @
6332bcbf
...
...
@@ -121,23 +121,36 @@ class AggregateSearchViewSet(BaseHaystackViewSet):
class
TypeaheadSearchView
(
APIView
):
"""
Typeahead for courses and programs.
"""
""" Typeahead for courses and programs. """
RESULT_COUNT
=
3
permission_classes
=
(
IsAuthenticated
,)
def
get_results
(
self
,
query
):
query
=
'(title:*{query}* OR course_key:*{query}*)'
.
format
(
query
=
query
.
lower
())
course_runs
=
SearchQuerySet
()
.
models
(
CourseRun
)
.
raw_search
(
query
)
course_runs
=
SearchQuerySet
()
.
models
(
CourseRun
)
.
filter
(
SQ
(
title_autocomplete
=
query
)
|
SQ
(
course_key
=
query
))
course_runs
=
course_runs
.
filter
(
published
=
True
)
.
exclude
(
hidden
=
True
)
course_runs
=
course_runs
[:
self
.
RESULT_COUNT
]
programs
=
SearchQuerySet
()
.
models
(
Program
)
.
raw_search
(
query
)
programs
=
SearchQuerySet
()
.
models
(
Program
)
.
filter
(
SQ
(
title_autocomplete
=
query
)
)
programs
=
programs
.
filter
(
status
=
ProgramStatus
.
Active
)
programs
=
programs
[:
self
.
RESULT_COUNT
]
return
course_runs
,
programs
def
get
(
self
,
request
,
*
args
,
**
kwargs
):
"""
Typeahead uses the ngram_analyzer as the index_analyzer to generate ngrams of the title during indexing.
i.e. Data Science -> da, dat, at, ata, data, etc...
Typeahead uses the lowercase analyzer as the search_analyzer.
The ngram_analyzer uses the lowercase filter as well, which makes typeahead case insensitive.
Available analyzers are defined in index _settings and field level analyzers are defined in the index _mapping.
NGrams are used rather than EdgeNgrams because NGrams allow partial searches across white space:
i.e. data sci - > data science, but not data analysis or scientific method
---
parameters:
- name: q
description: "Search text"
paramType: query
required: true
type: string
"""
query
=
request
.
query_params
.
get
(
'q'
)
if
not
query
:
raise
ParseError
(
"The 'q' querystring parameter is required for searching."
)
...
...
course_discovery/apps/core/tests/mixins.py
View file @
6332bcbf
import
logging
from
django.conf
import
settings
from
elasticsearch
import
Elasticsearch
from
haystack
import
connections
as
haystack_connections
from
course_discovery.apps.core.utils
import
ElasticsearchUtils
...
...
@@ -12,12 +12,19 @@ class ElasticsearchTestMixin(object):
@classmethod
def
setUpClass
(
cls
):
super
(
ElasticsearchTestMixin
,
cls
)
.
setUpClass
()
host
=
settings
.
HAYSTACK_CONNECTIONS
[
'default'
][
'URL'
]
cls
.
index
=
settings
.
HAYSTACK_CONNECTIONS
[
'default'
][
'INDEX_NAME'
]
cls
.
es
=
Elasticsearch
(
host
)
# Make use of the changes in our custom ES backend
# This is required for typeahead autocomplete to work in the tests
connection
=
haystack_connections
[
'default'
]
cls
.
backend
=
connection
.
get_backend
()
# Without this line, haystack doesn't fully recreate the connection
# The first test using this backend succeeds, but the following tests
# do not set the Elasticsearch _mapping
def setUp(self):
    """ Prepare the search backend and index before each test.

    Marks the shared backend as not set up, points self.es at the backend's
    own connection, then calls reset_index() and refresh_index().
    """
    super(ElasticsearchTestMixin, self).setUp()

    search_backend = self.backend
    # NOTE(review): presumably this makes haystack redo its backend setup so the
    # Elasticsearch _mapping is applied again for every test - confirm against haystack.
    search_backend.setup_complete = False
    # Use the backend's own connection as the Elasticsearch client for the test.
    self.es = search_backend.conn

    self.reset_index()
    self.refresh_index()
...
...
course_discovery/apps/core/utils.py
View file @
6332bcbf
import
datetime
import
logging
from
django.conf
import
settings
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -16,7 +18,8 @@ class ElasticsearchUtils(object):
# Create an index with a unique (timestamped) name
timestamp
=
datetime
.
datetime
.
utcnow
()
.
strftime
(
"
%
Y
%
m
%
d
%
H
%
M
%
S"
)
index
=
'{alias}_{timestamp}'
.
format
(
alias
=
alias
,
timestamp
=
timestamp
)
es
.
indices
.
create
(
index
=
index
)
index_settings
=
settings
.
ELASTICSEARCH_INDEX_SETTINGS
es
.
indices
.
create
(
index
=
index
,
body
=
index_settings
)
logger
.
info
(
'...index [
%
s] created.'
,
index
)
# Point the alias to the new index
...
...
course_discovery/apps/course_metadata/search_indexes.py
View file @
6332bcbf
...
...
@@ -59,6 +59,7 @@ class BaseIndex(indexes.SearchIndex):
class
BaseCourseIndex
(
OrganizationsMixin
,
BaseIndex
):
key
=
indexes
.
CharField
(
model_attr
=
'key'
,
stored
=
True
)
title
=
indexes
.
CharField
(
model_attr
=
'title'
,
boost
=
TITLE_FIELD_BOOST
)
title_autocomplete
=
indexes
.
NgramField
(
model_attr
=
'title'
,
boost
=
TITLE_FIELD_BOOST
)
short_description
=
indexes
.
CharField
(
model_attr
=
'short_description'
,
null
=
True
)
full_description
=
indexes
.
CharField
(
model_attr
=
'full_description'
,
null
=
True
)
subjects
=
indexes
.
MultiValueField
(
faceted
=
True
)
...
...
@@ -181,6 +182,7 @@ class ProgramIndex(BaseIndex, indexes.Indexable, OrganizationsMixin):
uuid
=
indexes
.
CharField
(
model_attr
=
'uuid'
)
title
=
indexes
.
CharField
(
model_attr
=
'title'
,
boost
=
TITLE_FIELD_BOOST
)
title_autocomplete
=
indexes
.
NgramField
(
model_attr
=
'title'
,
boost
=
TITLE_FIELD_BOOST
)
subtitle
=
indexes
.
CharField
(
model_attr
=
'subtitle'
)
type
=
indexes
.
CharField
(
model_attr
=
'type__name'
,
faceted
=
True
)
marketing_url
=
indexes
.
CharField
(
null
=
True
)
...
...
course_discovery/apps/edx_haystack_extensions/backends.py
View file @
6332bcbf
...
...
@@ -74,8 +74,37 @@ class NonClearingSearchBackendMixin(object):
# pylint: disable=abstract-method
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
    """ Elasticsearch backend whose schema gives the title_autocomplete field separate analyzers. """

    def specify_analyzers(self, mapping, field, index_analyzer, search_analyzer):
        """ Specify separate index and search analyzers for the given field.

        Args:
            mapping (dict): /_mapping attribute on index (maps analyzers to fields)
            field (str): which field to modify
            index_analyzer (str): name of the index_analyzer (should be defined in the /_settings attribute)
            search_analyzer (str): name of the search_analyzer (should be defined in the /_settings attribute)

        Raises:
            KeyError: if `field` has no entry in `mapping`.
        """
        field_mapping = mapping[field]

        # The generic analyzer is used for both if index_analyzer and search_analyzer are not specified,
        # so it is removed here. A default is passed to pop() so that a field mapping without a generic
        # analyzer (for example, one this method has already processed) does not raise KeyError.
        field_mapping.pop('analyzer', None)
        field_mapping.update({
            'index_analyzer': index_analyzer,
            'search_analyzer': search_analyzer,
        })

    def build_schema(self, fields):
        """ Build the schema via the parent backend, then override the title_autocomplete analyzers.

        Args:
            fields: passed through unchanged to the parent backend's build_schema.

        Returns:
            tuple: (content_field_name, mapping) as returned by the parent backend, with the
                title_autocomplete entry of mapping modified in place.
        """
        content_field_name, mapping = super().build_schema(fields)

        # Use the ngram analyzer as the index_analyzer and the lowercase analyzer as the search_analyzer
        # This is necessary to support partial searches/typeahead
        # If we used ngram analyzer for both, then 'running' would get split into ngrams like "ing"
        # and all words containing ing would come back in typeahead.
        self.specify_analyzers(
            mapping=mapping,
            field='title_autocomplete',
            index_analyzer='ngram_analyzer',
            search_analyzer='lowercase',
        )

        return (content_field_name, mapping)
# pylint: disable=abstract-method
class
EdxElasticsearchSearchBackend
(
SimpleQuerySearchBackendMixin
,
NonClearingSearchBackendMixin
,
ElasticsearchSearch
Backend
):
ConfigurableElastic
Backend
):
pass
...
...
course_discovery/apps/edx_haystack_extensions/management/commands/update_index.py
View file @
6332bcbf
import
datetime
import
logging
from
django.conf
import
settings
from
haystack
import
connections
as
haystack_connections
from
haystack.management.commands.update_index
import
Command
as
HaystackCommand
...
...
@@ -84,5 +85,6 @@ class Command(HaystackCommand):
"""
timestamp
=
datetime
.
datetime
.
utcnow
()
.
strftime
(
'
%
Y
%
m
%
d_
%
H
%
M
%
S'
)
index_name
=
'{alias}_{timestamp}'
.
format
(
alias
=
prefix
,
timestamp
=
timestamp
)
backend
.
conn
.
indices
.
create
(
index
=
index_name
)
index_settings
=
settings
.
ELASTICSEARCH_INDEX_SETTINGS
backend
.
conn
.
indices
.
create
(
index
=
index_name
,
body
=
index_settings
)
return
index_name
course_discovery/settings/base.py
View file @
6332bcbf
...
...
@@ -350,6 +350,67 @@ SWAGGER_SETTINGS = {
'permission_denied_handler'
:
'course_discovery.apps.api.views.api_docs_permission_denied_handler'
}
# Elasticsearch uses index settings to specify available analyzers.
# We are adding the lowercase analyzer and tweaking the ngram analyzers here,
# so we need to use these settings rather than the index defaults.
# We are making these changes to enable autocomplete for the typeahead endpoint.
ELASTICSEARCH_INDEX_SETTINGS = {
    'settings': {
        'analysis': {
            # Tokenizers: edge n-grams (anchored at the front) and plain n-grams, 2-15 characters.
            'tokenizer': {
                'haystack_edgengram_tokenizer': {
                    'type': 'edgeNGram',
                    'side': 'front',
                    'min_gram': 2,
                    'max_gram': 15
                },
                'haystack_ngram_tokenizer': {
                    'type': 'nGram',
                    'min_gram': 2,
                    'max_gram': 15
                }
            },
            # Analyzers available to field mappings.
            'analyzer': {
                # Keyword tokenizer followed by the lowercase filter.
                'lowercase': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': ['lowercase']
                },
                # Standard tokenizer; tokens go through the n-gram filter, then are lowercased.
                'ngram_analyzer': {
                    'type': 'custom',
                    'filter': ['haystack_ngram', 'lowercase'],
                    'tokenizer': 'standard'
                },
                # Standard tokenizer; tokens go through the edge n-gram filter, then are lowercased.
                'edgengram_analyzer': {
                    'type': 'custom',
                    'filter': ['haystack_edgengram', 'lowercase'],
                    'tokenizer': 'standard'
                }
            },
            # Token filters referenced by the analyzers above.
            'filter': {
                'haystack_edgengram': {
                    'type': 'edgeNGram',
                    'min_gram': 2,
                    'max_gram': 15
                },
                'haystack_ngram': {
                    'type': 'nGram',
                    'min_gram': 2,
                    'max_gram': 15
                }
            }
        }
    }
}
# Haystack configuration (http://django-haystack.readthedocs.io/en/v2.5.0/settings.html)
HAYSTACK_ITERATOR_LOAD_PER_QUERY
=
200
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment