Commit 05331999 by Toby Lawrence Committed by GitHub

Merge pull request #165 from edx/clintonb/replace-tire

Replaced Tire with elasticsearch-model
parents 17c4c7fd 6d7aec75
......@@ -23,4 +23,6 @@ before_install:
- mkdir -p ./mongo/log
- mongod --fork --dbpath ./mongo/db --logpath ./mongo/log/mongodb.log
script: bundle exec rspec
script:
- bin/rake search:initialize
- bin/rspec
......@@ -31,8 +31,8 @@ gem 'will_paginate_mongoid', "~>2.0"
gem 'rdiscount'
gem 'nokogiri', "~>1.6.8"
gem 'tire', "0.6.2"
gem 'tire-contrib'
gem 'elasticsearch', '~> 0.4'
gem 'elasticsearch-model', '~> 0.1.8'
gem 'dalli'
......@@ -47,6 +47,7 @@ group :test do
gem 'guard-unicorn'
gem 'rack-test', :require => 'rack/test'
gem 'rspec', '~> 2.11.0'
gem 'webmock', '~> 1.22'
end
gem 'newrelic_rpm'
......
......@@ -32,7 +32,7 @@ GEM
minitest (~> 5.1)
thread_safe (~> 0.3, >= 0.3.4)
tzinfo (~> 1.1)
ansi (1.5.0)
addressable (2.4.0)
bson (3.2.4)
bson_ext (1.5.1)
builder (3.2.2)
......@@ -41,6 +41,8 @@ GEM
simplecov
url
coderay (1.0.7)
crack (0.4.3)
safe_yaml (~> 1.0.0)
dalli (2.1.0)
database_cleaner (1.5.1)
delayed_job (4.1.1)
......@@ -53,18 +55,33 @@ GEM
docile (1.1.5)
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
elasticsearch (0.4.11)
elasticsearch-api (= 0.4.11)
elasticsearch-transport (= 0.4.11)
elasticsearch-api (0.4.11)
multi_json
elasticsearch-model (0.1.8)
activesupport (> 3)
elasticsearch (> 0.4)
hashie
elasticsearch-transport (0.4.11)
faraday
multi_json
enumerize (0.11.0)
activesupport (>= 3.2)
factory_girl (4.5.0)
activesupport (>= 3.0.0)
faker (1.6.1)
i18n (~> 0.5)
faraday (0.9.2)
multipart-post (>= 1.2, < 3)
guard (1.3.2)
listen (>= 0.4.2)
thor (>= 0.14.6)
guard-unicorn (0.0.7)
guard (>= 1.1)
hashr (0.0.22)
hashdiff (0.2.3)
hashie (3.4.3)
http-cookie (1.0.2)
domain_name (~> 0.5)
i18n (0.7.0)
......@@ -89,6 +106,7 @@ GEM
mongoid
rake
multi_json (1.11.2)
multipart-post (2.0.0)
netrc (0.10.3)
newrelic_rpm (3.16.0.318)
nokogiri (1.6.8)
......@@ -125,6 +143,7 @@ GEM
rspec-expectations (2.11.2)
diff-lcs (~> 1.1.3)
rspec-mocks (2.11.2)
safe_yaml (1.0.4)
simplecov (0.11.1)
docile (~> 1.1.0)
json (~> 1.8)
......@@ -138,16 +157,6 @@ GEM
thor (0.16.0)
thread_safe (0.3.5)
tilt (1.3.3)
tire (0.6.2)
activemodel (>= 3.0)
activesupport
ansi
hashr (~> 0.0.19)
multi_json (~> 1.3)
rake
rest-client (~> 1.6)
tire-contrib (0.1.1)
tire
tzinfo (1.2.2)
thread_safe (~> 0.1)
unf (0.1.4)
......@@ -158,6 +167,10 @@ GEM
rack
raindrops (~> 0.7)
url (0.3.2)
webmock (1.22.3)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff
will_paginate (3.0.7)
will_paginate_mongoid (2.0.1)
mongoid
......@@ -176,6 +189,8 @@ DEPENDENCIES
database_cleaner (~> 1.5.1)
delayed_job
delayed_job_mongoid
elasticsearch (~> 0.4)
elasticsearch-model (~> 0.1.8)
enumerize
factory_girl (~> 4.0)
faker (~> 1.6)
......@@ -199,8 +214,7 @@ DEPENDENCIES
rs_voteable_mongo!
rspec (~> 2.11.0)
sinatra
tire (= 0.6.2)
tire-contrib
unicorn
webmock (~> 1.22)
will_paginate_mongoid (~> 2.0)
yajl-ruby
......@@ -14,9 +14,8 @@ An independent comment system which supports voting and nested comments. It
also supports features including instructor endorsement for education-aimed
discussion platforms.
Running the Server
------------------
Getting Started
---------------
If you are running cs_comments_service as part of edx-platform__ development under
devstack, it is strongly recommended to read `those setup documents`__ first. Note that
devstack will take care of just about all of the installation, configuration, and
......@@ -30,15 +29,49 @@ This service relies on Elasticsearch and MongoDB. By default the service will us
however, if you wish to change these values, refer to `config/application.yml` and `config/mongoid.yml` for the
environment variables that can be set to override the defaults.
Before the server is first run, ensure gems are installed by doing ``bundle install``.
Install the requisite gems:
.. code-block:: bash
$ bundle install
Set up the search index. Note that the command below creates an index with a unique name (e.g. content_20160101), and
assigns it a known alias: content. If you choose not to use the command below, you should still opt to reference your
index by an alias rather than the actual index name. This will enable you to swap out indices (e.g. reindex) without
having to take downtime or modify code with a new index name.
.. code-block:: bash
$ bin/rake search:initialize
Run the server:
.. code-block::
$ ruby app.rb
To run the server, do ``ruby app.rb [-p PORT]`` where PORT defaults to 4567.
By default Sinatra runs on port `4567`. If you'd like to use a different port, pass the `-p` parameter:
.. code-block::
$ ruby app.rb -p 5678
Running Tests
-------------
To run tests, do ``bundle exec rspec``. Append ``--help`` or see rspec documentation
for additional options to this command.
Tests are built using the rspec__ framework, and can be run with the command below:
.. code-block::
$ bin/rspec
If you'd like to view additional options for the command, append the `--help` option:
.. code-block::
$ bin/rspec --help
__ http://rspec.info/
Internationalization (i18n) and Localization (l10n)
---------------------------------------------------
......@@ -62,12 +95,12 @@ follow the instructions here__ to set up your ``.transifexrc`` file.
__ http://support.transifex.com/customer/portal/articles/1000855-configuring-the-client
To upload strings to Transifex for translation when you change the set
of translatable strings: ``bundle exec rake i18n:push``
of translatable strings: ``bin/rake i18n:push``
To fetch the latest translations from Transifex: ``bundle exec rake i18n:pull``
To fetch the latest translations from Transifex: ``bin/rake i18n:pull``
The repository includes some translations so they will be available
upon deployment. To commit an update to these: ``bundle exec rake i18n:commit``
upon deployment. To commit an update to these: ``bin/rake i18n:commit``
License
-------
......
......@@ -17,21 +17,23 @@ rescue LoadError
# no rspec available
end
Tire.configure do
url YAML.load(application_yaml)['elasticsearch_server']
end
LOG = Logger.new(STDERR)
desc 'Load the environment'
task :environment do
environment = ENV['SINATRA_ENV'] || 'development'
Sinatra::Base.environment = environment
Mongoid.load!('config/mongoid.yml')
Mongoid.logger.level = Logger::INFO
module CommentService
class << self;
attr_accessor :config;
attr_accessor :config
def search_enabled?
self.config[:enable_search]
end
end
end
......@@ -41,8 +43,8 @@ task :environment do
Dir[File.dirname(__FILE__) + '/models/*.rb'].each { |file| require file }
end
Dir.glob('lib/tasks/*.rake').each { |r| import r }
task :console => :environment do
binding.pry
end
Dir.glob('lib/tasks/*.rake').each { |r| import r }
get "#{APIPREFIX}/threads" do # retrieve threads by course
threads = Content.where({"_type" => "CommentThread", "course_id" => params["course_id"]})
if params[:commentable_ids]
threads = threads.in({"commentable_id" => params[:commentable_ids].split(",")})
end
handle_threads_query(
threads,
params["user_id"],
params["course_id"],
get_group_ids_from_params(params),
value_to_boolean(params["flagged"]),
value_to_boolean(params["unread"]),
value_to_boolean(params["unanswered"]),
params["sort_key"],
params["sort_order"],
params["page"],
params["per_page"]
threads,
params['user_id'],
params['course_id'],
get_group_ids_from_params(params),
value_to_boolean(params['flagged']),
value_to_boolean(params['unread']),
value_to_boolean(params['unanswered']),
params['sort_key'],
params['sort_order'],
params['page'],
params['per_page']
).to_json
end
......@@ -83,6 +83,12 @@ post "#{APIPREFIX}/threads/:thread_id/comments" do |thread_id|
end
delete "#{APIPREFIX}/threads/:thread_id" do |thread_id|
thread.destroy
begin
thread.destroy
rescue Elasticsearch::Transport::Transport::Errors::NotFound
# If the thread is not in the index, that's actually a good thing given that we just removed it.
# Note that this exception will probably only be encountered for tests that don't wait for the index
# to be refreshed before attempting to destroy a newly-recreated thread.
end
thread.to_hash.to_json
end
......@@ -60,5 +60,4 @@ delete "#{APIPREFIX}/comments/:comment_id" do |comment_id|
pass
end
end
comment.to_hash.to_json
end
get "#{APIPREFIX}/search/threads" do
local_params = params # Necessary for params to be available inside blocks
group_ids = get_group_ids_from_params(local_params)
context = local_params["context"] ? local_params["context"] : "course"
search_text = local_params["text"]
if !search_text
{}.to_json
else
# Because threads and comments are currently separate unrelated documents in
# Elasticsearch, we must first query for all matching documents, then
# extract the set of thread ids, and then sort the threads by the specified
# criteria and paginate. For performance reasons, we currently limit the
# number of documents considered (ordered by update recency), which means
# that matching threads can be missed if the search terms are very common.
def get_thread_ids(context, group_ids, local_params, search_text)
filters = []
filters.push({term: {commentable_id: local_params['commentable_id']}}) if local_params['commentable_id']
filters.push({terms: {commentable_id: local_params['commentable_ids'].split(',')}}) if local_params['commentable_ids']
filters.push({term: {course_id: local_params['course_id']}}) if local_params['course_id']
filters.push({or: [
{not: {exists: {field: :context}}},
{term: {context: context}}
]})
get_matching_thread_ids = lambda do |search_text|
self.class.trace_execution_scoped(["Custom/get_search_threads/es_search"]) do
search = Tire.search Content::ES_INDEX_NAME do
query do
match [:title, :body], search_text, :operator => "AND"
filtered do
filter :term, :commentable_id => local_params["commentable_id"] if local_params["commentable_id"]
filter :terms, :commentable_id => local_params["commentable_ids"].split(",") if local_params["commentable_ids"]
filter :term, :course_id => local_params["course_id"] if local_params["course_id"]
filter :or, [
{:not => {:exists => {:field => :context}}},
{:term => {:context => context}}
]
unless group_ids.empty?
filters.push(
{
or: [
{:not => {:exists => {:field => :group_id}}},
{:terms => {:group_id => group_ids}}
]
}
)
end
if not group_ids.empty?
if group_ids.length > 1
group_id_criteria = {:terms => {:group_id => group_ids}}
else
group_id_criteria = {:term => {:group_id => group_ids[0]}}
end
self.class.trace_execution_scoped(['Custom/get_search_threads/es_search']) do
body = {
size: CommentService.config['max_deep_search_comment_count'].to_i,
sort: [
{updated_at: :desc}
],
query: {
multi_match: {
query: search_text,
fields: [:title, :body],
operator: :AND
},
filtered: {
filter: {
and: filters
}
}
}
}
filter :or, [
{:not => {:exists => {:field => :group_id}}},
group_id_criteria
]
end
response = Elasticsearch::Model.client.search(index: Content::ES_INDEX_NAME, body: body)
end
end
sort do
by "updated_at", "desc"
end
size CommentService.config["max_deep_search_comment_count"].to_i
end
thread_ids = Set.new
search.results.each do |content|
case content.type
when "comment_thread"
thread_ids.add(content.id)
when "comment"
thread_ids.add(content.comment_thread_id)
end
end
thread_ids
thread_ids = Set.new
response['hits']['hits'].each do |hit|
case hit['_type']
when CommentThread.document_type
thread_ids.add(hit['_id'])
when Comment.document_type
thread_ids.add(hit['_source']['comment_thread_id'])
else
# There shouldn't be any other document types. Nevertheless, ignore them, if they are present.
next
end
end
thread_ids
end
end
# Asks Elasticsearch for a phrase suggestion (computed over the `_all` field) for
# the given search text, querying the Content::ES_INDEX_NAME index.
#
# Returns the text of the first option of the first suggestion entry, or nil when
# the response carries no suggestion entries or the first entry has no options.
def get_suggested_text(search_text)
  request_body = {
    suggestions: {
      text: search_text,
      phrase: {
        field: :_all
      }
    }
  }

  reply = Elasticsearch::Model.client.suggest(index: Content::ES_INDEX_NAME, body: request_body)

  # A response without a 'suggestions' key is treated the same as an empty list.
  entries = reply.fetch('suggestions', [])
  return nil if entries.empty?

  # Only the first suggestion entry is consulted, and only its top option is used.
  choices = entries.first['options']
  return nil if choices.empty?

  choices.first['text']
end
def get_threads(context, group_ids, local_params, search_text)
# Because threads and comments are currently separate unrelated documents in
# Elasticsearch, we must first query for all matching documents, then
# extract the set of thread ids, and then sort the threads by the specified
# criteria and paginate. For performance reasons, we currently limit the
# number of documents considered (ordered by update recency), which means
# that matching threads can be missed if the search terms are very common.
thread_ids = get_thread_ids(context, group_ids, local_params, search_text)
corrected_text = nil
if thread_ids.empty?
# Sadly, Elasticsearch does not have a facility for computing suggestions
# with respect to a filter. It would be expensive to determine the best
# suggestion with respect to our filter parameters, so we simply re-query
# with the top suggestion. If that has no results, then we return no results
# and no correction.
thread_ids = get_matching_thread_ids.call(search_text)
corrected_text = nil
if thread_ids.empty?
suggest = Tire.suggest Content::ES_INDEX_NAME do
suggestion "" do
text search_text
phrase :_all
end
end
corrected_text = suggest.results.texts.first
thread_ids = get_matching_thread_ids.call(corrected_text) if corrected_text
corrected_text = nil if thread_ids.empty?
end
corrected_text = get_suggested_text(search_text)
thread_ids = get_thread_ids(context, group_ids, local_params, corrected_text) if corrected_text
corrected_text = nil if thread_ids.empty?
end
result_obj = handle_threads_query(
CommentThread.in({"_id" => thread_ids.to_a}),
local_params["user_id"],
local_params["course_id"],
result_obj = handle_threads_query(
CommentThread.in({_id: thread_ids.to_a}),
local_params['user_id'],
local_params['course_id'],
group_ids,
value_to_boolean(local_params["flagged"]),
value_to_boolean(local_params["unread"]),
value_to_boolean(local_params["unanswered"]),
local_params["sort_key"],
local_params["sort_order"],
local_params["page"],
local_params["per_page"],
value_to_boolean(local_params['flagged']),
value_to_boolean(local_params['unread']),
value_to_boolean(local_params['unanswered']),
local_params['sort_key'],
local_params['sort_order'],
local_params['page'],
local_params['per_page'],
context
)
if !result_obj.empty?
result_obj[:corrected_text] = corrected_text
# NOTE this reflects the total results from ES, but does not consider
# any post-filtering that might happen (e.g. unread, flagged...) before
# results are shown to the user.
result_obj[:total_results] = thread_ids.size
end
result_obj.to_json
)
unless result_obj.empty?
result_obj[:corrected_text] = corrected_text
# NOTE this reflects the total results from ES, but does not consider
# any post-filtering that might happen (e.g. unread, flagged...) before
# results are shown to the user.
result_obj[:total_results] = thread_ids.size
end
result_obj.to_json
end
# Thread search endpoint. Responds with an empty JSON object when no "text"
# parameter is supplied; otherwise delegates to get_threads, defaulting the
# search context to "course" when none is given.
get "#{APIPREFIX}/search/threads" do
  # Keep a local reference so the params remain reachable from inside blocks.
  local_params = params
  group_ids = get_group_ids_from_params(local_params)
  context = local_params["context"] || "course"
  search_text = local_params["text"]

  search_text ? get_threads(context, group_ids, local_params, search_text) : '{}'
end
......@@ -14,6 +14,10 @@ module CommentService
class << self
attr_accessor :config
attr_accessor :blocked_hashes
def search_enabled?
self.config[:enable_search]
end
end
API_VERSION = 'v1'
API_PREFIX = "/api/#{API_VERSION}"
......@@ -26,11 +30,6 @@ end
application_yaml = ERB.new(File.read("config/application.yml")).result()
CommentService.config = YAML.load(application_yaml).with_indifferent_access
Tire.configure do
url CommentService.config[:elasticsearch_server]
logger STDERR if ENV["ENABLE_ELASTICSEARCH_DEBUGGING"]
end
Mongoid.load!("config/mongoid.yml", environment)
Mongoid.logger.level = Logger::INFO
Mongo::Logger.logger.level = ENV["ENABLE_MONGO_DEBUGGING"] ? Logger::DEBUG : Logger::INFO
......@@ -48,11 +47,13 @@ helpers do
end
end
Dir[File.dirname(__FILE__) + '/lib/**/*.rb'].each {|file| require file}
Dir[File.dirname(__FILE__) + '/models/*.rb'].each {|file| require file}
Dir[File.dirname(__FILE__) + '/presenters/*.rb'].each {|file| require file}
Dir[File.dirname(__FILE__) + '/lib/**/*.rb'].each { |file| require file }
Dir[File.dirname(__FILE__) + '/models/*.rb'].each { |file| require file }
Dir[File.dirname(__FILE__) + '/presenters/*.rb'].each { |file| require file }
# Ensure elasticsearch index mappings exist.
Elasticsearch::Model.client = Elasticsearch::Client.new(host: CommentService.config[:elasticsearch_server], log: false)
# Ensure Elasticsearch index mappings exist.
Comment.put_search_index_mapping
CommentThread.put_search_index_mapping
......@@ -106,7 +107,6 @@ class Time
end
# these files must be required in order
require './api/search'
require './api/commentables'
......@@ -138,55 +138,61 @@ error ArgumentError do
error 400, [env['sinatra.error'].message].to_json
end
CommentService.blocked_hashes = Content.mongo_client[:blocked_hash].find(nil, projection: {hash: 1}).map {|d| d["hash"]}
CommentService.blocked_hashes = Content.mongo_client[:blocked_hash].find(nil, projection: {hash: 1}).map { |d| d["hash"] }
# Runs the `isMaster` command against the default Mongoid client and returns the
# command's result object unchanged.
def get_db_is_master
Mongoid::Clients.default.command(isMaster: 1)
end
def get_es_status
res = Tire::Configuration.client.get Tire::Configuration.url
JSON.parse res.body
# Returns the cluster health as reported through the shared Elasticsearch::Model
# client. Callers in this file read its 'status' and 'timed_out' entries.
def elasticsearch_health
Elasticsearch::Model.client.cluster.health
end
get '/heartbeat' do
# mongo is reachable and ready to handle requests
db_ok = false
def is_mongo_available?
begin
res = get_db_is_master
db_ok = res.ok? && res.documents.first['ismaster'] == true
response = get_db_is_master
return response.ok? && (response.documents.first['ismaster'] == true)
rescue
# ignored
end
error 500, JSON.generate({"OK" => false, "check" => "db"}) unless db_ok
# E_S is reachable and ready to handle requests
es_ok = false
false
end
def is_elasticsearch_available?
begin
es_status = get_es_status
es_ok = es_status["status"] == 200
health = elasticsearch_health
return !health['timed_out'] && %w(yellow green).include?(health['status'])
rescue
# ignored
end
error 500, JSON.generate({"OK" => false, "check" => "es"}) unless es_ok
JSON.generate({"OK" => true})
false
end
# Heartbeat endpoint. MongoDB is checked first, then Elasticsearch; the first
# failing check produces a 500 whose JSON body names it (check: db / check: es).
# When both checks pass, the body is {"OK": true}.
get '/heartbeat' do
error 500, JSON.generate({OK: false, check: :db}) unless is_mongo_available?
error 500, JSON.generate({OK: false, check: :es}) unless is_elasticsearch_available?
JSON.generate({OK: true})
end
get '/selftest' do
begin
t1 = Time.now
status = {
"db" => get_db_is_master,
"es" => get_es_status,
"last_post_created" => (Content.last.created_at rescue nil),
"total_posts" => Content.count,
"total_users" => User.count,
"elapsed_time" => Time.now - t1
db: get_db_is_master,
es: elasticsearch_health,
last_post_created: (Content.last.created_at rescue nil),
total_posts: Content.count,
total_users: User.count,
elapsed_time: Time.now - t1
}
JSON.generate(status)
rescue => ex
[ 500,
{'Content-Type' => 'text/plain'},
"#{ex.backtrace.first}: #{ex.message} (#{ex.class})\n\t#{ex.backtrace[1..-1].join("\n\t")}"
[500,
{'Content-Type' => 'text/plain'},
"#{ex.backtrace.first}: #{ex.message} (#{ex.class})\n\t#{ex.backtrace[1..-1].join("\n\t")}"
]
end
end
......@@ -4,3 +4,4 @@ elasticsearch_server: <%= ENV['SEARCH_SERVER'] || 'http://localhost:9200' %>
max_deep_search_comment_count: 5000
default_locale: <%= ENV['SERVICE_LANGUAGE'] || 'en-US' %>
manual_pagination_batch_size: <%= ENV['MANUAL_PAGINATION_BATCH_SIZE'] || 500 %>
enable_search: true
require 'elasticsearch'
module TaskHelpers
module ElasticsearchHelper
LOG = Logger.new(STDERR)
def self.create_index(name=nil)
name ||= "#{Content::ES_INDEX_NAME}_#{Time.now.strftime('%Y%m%d%H%M%S')}"
index = Tire.index(name)
LOG.info "Creating new index: #{name}..."
index.create
[CommentThread, Comment].each do |model|
LOG.info "Applying index mappings for #{model.name}"
model.put_search_index_mapping(index)
mappings = {}
[Comment, CommentThread].each do |model|
mappings.merge! model.mappings.to_hash
end
LOG.info '...done!'
index
Elasticsearch::Model.client.indices.create(index: name, body: {mappings: mappings})
LOG.info "Created new index: #{name}."
name
end
def self.delete_index(name)
Tire.index(name).delete
begin
Elasticsearch::Model.client.indices.delete(index: name)
LOG.info "Deleted index: #{name}."
rescue Elasticsearch::Transport::Transport::Errors::NotFound
# NOTE (CCB): Future versions of the Elasticsearch client support the ignore parameter,
# that can be used to ignore 404 errors.
LOG.info "Unable to delete non-existent index: #{name}."
end
end
def self.get_index
CommentThread.tire.index
def self.get_index_shard_count(name)
settings = Elasticsearch::Model.client.indices.get_settings(index: name)
settings[name]['settings']['index']['number_of_shards']
end
def self.get_index_shard_count(name)
settings = Tire.index(name)
settings['index.number_of_shards']
# Points the alias +alias_name+ at the index +index_name+.
#
# Any indices that currently carry the alias are detached from it, and the new
# index is attached, in a single update_aliases request (removals first, then
# the addition). If the alias does not exist yet, only the addition is sent.
#
# alias_name - name of the alias to create or move
# index_name - name of the index the alias should point to afterwards
def self.move_alias(alias_name, index_name)
  actions = [
    {add: {index: index_name, alias: alias_name}}
  ]

  begin
    response = Elasticsearch::Model.client.indices.get_alias(name: alias_name)
    # The response is treated as a hash keyed by index name (the original code
    # read `response.keys` the same way). Build one remove action per index so
    # that an empty response adds nothing: the previous `if response.length`
    # guard was always truthy in Ruby (0 is not falsy) and would queue a remove
    # action with an empty index name.
    removals = response.keys.map do |current_index|
      {remove: {index: current_index, alias: alias_name}}
    end
    actions = removals + actions
  rescue Elasticsearch::Transport::Transport::Errors::NotFound
    # NOTE (CCB): Future versions of the Elasticsearch client support the ignore parameter,
    # that can be used to ignore 404 errors.
  end

  body = {actions: actions}
  Elasticsearch::Model.client.indices.update_aliases(body: body)
  LOG.info "Alias [#{alias_name}] now points to index [#{index_name}]."
end
def self.refresh_index(name)
Elasticsearch::Model.client.indices.refresh(index: name)
end
end
end
......@@ -91,11 +91,6 @@ namespace :db do
end
task :seed => [:environment, :clean] do
Tire.index 'comment_threads' do
delete
end
CommentThread.create_elasticsearch_index
beginning_time = Time.now
(1..10).map { |id| create_test_user(id) }
......
require 'task_helpers'
namespace :search do
def import_from_cursor(cursor, index, opts)
tot = cursor.count
cnt = 0
t = Time.now
index.import cursor, {:method => :paginate, :per_page => opts[:batch_size]} do |documents|
if cnt % opts[:batch_size] == 0 then
elapsed_secs = (Time.now - t).round(2)
pct_complete = (100 * (cnt/tot.to_f)).round(2)
LOG.info "#{index.name}: imported #{cnt} of #{tot} (#{pct_complete}% complete after #{elapsed_secs} seconds)"
end
cnt += documents.length
sleep opts[:sleep_time]
documents
end
LOG.info "#{index.name}: finished importing #{cnt} documents"
cnt
end
def move_alias_to(name, index)
# if there was a previous index, switch over the alias to point to the new index
alias_ = Tire::Alias.find name
if alias_
# does the alias already point to this index?
if alias_.indices.include? index.name
return false
end
# remove the alias from wherever it points to now
LOG.info "alias already exists (will move): #{alias_.indices.to_ary.join(',')}"
alias_.indices.each do |old_index_name|
alias_.indices.delete old_index_name unless old_index_name == name
end
else
# create the alias
LOG.info "alias \"#{name}\" does not yet exist - creating."
alias_ = Tire::Alias.new :name => name
end
# point the alias at our new index
alias_.indices.add index.name
alias_.save
LOG.info "alias \"#{name}\" now points to index #{index.name}."
true
end
def do_reindex (opts, in_place=false)
start_time = Time.now
# create the new index with a unique name
new_index = TaskHelpers::ElasticsearchHelper.create_index
# unless the user is forcing a rebuild, or the index does not yet exist, we
# can do a Tire api reindex which is much faster than reimporting documents
# from mongo.
#
# Checking if the index exists is tricky. Tire automatically created an index
# for the model class when the app loaded if one did not already exist. However,
# it won't create an alias, which is what our app uses. So if the index exists
# but not the alias, we know that it's auto-created.
old_index = TaskHelpers::ElasticsearchHelper.get_index
alias_name = old_index.name
alias_ = Tire::Alias.find alias_name
if alias_.nil?
# edge case.
# the alias doesn't exist, so we know the index was auto-created.
# We will delete it and replace it with an alias.
raise RuntimeError, 'Cannot reindex in-place, no valid source index' if in_place
LOG.warn 'deleting auto-created index to make room for the alias'
old_index.delete
# NOTE on the small chance that another process re-auto-creates the index
# we just deleted before we have a chance to create the alias, this next
# call will fail.
move_alias_to(Content::ES_INDEX_NAME, new_index_name)
end
op = in_place ? 'reindex' : '(re)build index'
LOG.info "preparing to #{op}"
content_types = %w(Comment CommentThread)
if in_place
# reindex, moving source documents directly from old index to new
LOG.info 'copying documents from original index (this may take a while!)'
old_index.reindex new_index.name
LOG.info 'done copying!'
else
# fetch all the documents ever, up til start_time
cursor = Content.where(:_type.in => content_types, :updated_at.lte => start_time)
# import them to the new index
import_from_cursor(cursor, new_index, opts)
end
# move the alias if necessary
did_alias_move = move_alias_to(Content::ES_INDEX_NAME, new_index)
if did_alias_move
# Reimport any source documents that got updated since start_time,
# while the alias still pointed to the old index.
# Elasticsearch understands our document ids, so re-indexing the same
# document won't create duplicates.
LOG.info "importing any documents that changed between #{start_time} and now"
cursor = Content.where(:_type.in => content_types, :updated_at.gte => start_time)
import_from_cursor(cursor, new_index, opts)
end
end
desc 'Copies contents of MongoDB into Elasticsearch if updated in the last N minutes.'
task :catchup, [:minutes, :batch_size, :sleep_time] => :environment do |t, args|
opts = batch_opts args
the_index = TaskHelpers::ElasticsearchHelper.get_index
alias_ = Tire::Alias.find the_index.name
# this check makes sure we are working with the index to which
# the desired model's alias presently points.
raise RuntimeError, "could not find live index" if alias_.nil?
desc 'Indexes content updated in the last N minutes.'
task :catchup, [:minutes] => :environment do |t, args|
start_time = Time.now - (args[:minutes].to_i * 60)
cursor = Content.where(:_type.in => %w(Comment CommentThread), :updated_at.gte => start_time)
import_from_cursor(cursor, the_index, opts)
end
def batch_opts(args)
args = args.to_hash
{:batch_size => args[:batch_size].nil? ? 500 : args[:batch_size].to_i,
:sleep_time => args[:sleep_time].nil? ? 0 : args[:sleep_time].to_i}
[Comment, CommentThread].each do |model|
model.where(:updated_at.gte => start_time).import(index: Content::ES_INDEX_NAME)
end
end
desc 'Removes any data from Elasticsearch that no longer exists in MongoDB.'
task :prune, [:batch_size, :sleep_time] => :environment do |t, args|
opts = batch_opts args
the_index = TaskHelpers::ElasticsearchHelper.get_index
puts "pruning #{the_index.name}"
alias_ = Tire::Alias.find the_index.name
raise RuntimeError, 'could not find live index' if alias_.nil?
scan_size = opts[:batch_size] / TaskHelpers::ElasticsearchHelper.get_index_shard_count(the_index.name)
cnt = 0
[CommentThread, Comment].each do |klass|
doc_type = klass.document_type
# this check makes sure we are working with the index to which
# the desired model's alias presently points.
search = Tire::Search::Scan.new the_index.name, {size: scan_size, type: doc_type}
search.each do |results|
es_ids = results.map(&:id)
mongo_ids = klass.where(:id.in => es_ids).map { |d| d.id.to_s }
to_delete = es_ids - mongo_ids
if to_delete.size > 0
cnt += to_delete.size
puts "deleting #{to_delete.size} orphaned #{doc_type} documents from elasticsearch"
the_index.bulk_delete (to_delete).map { |v| {"type" => doc_type, "id" => v} }
end
puts "#{the_index.name}/#{doc_type}: processed #{search.seen} of #{search.total}"
sleep opts[:sleep_time]
end
desc 'Reindex all data from the database'
task :reindex, [:index] => :environment do |t, args|
args.with_defaults(:index => Content::ES_INDEX_NAME)
[Comment, CommentThread].each do |model|
model.import(index: args[:index])
end
puts "done pruning #{the_index.name}, deleted a total of #{cnt} orphaned documents"
end
desc 'Rebuild the content index from MongoDB data.'
task :rebuild, [:batch_size, :sleep_time] => :environment do |t, args|
do_reindex(batch_opts(args))
desc 'Generate a new, empty physical index, without bringing it online.'
task :create_index => :environment do
TaskHelpers::ElasticsearchHelper.create_index
end
desc 'Rebuild the content index from already-indexed data (in place).'
task :reindex, [:batch_size, :sleep_time] => :environment do |t, args|
do_reindex(batch_opts(args), true)
desc 'Creates a new search index and points the "content" alias to it'
task :initialize => :environment do
index = TaskHelpers::ElasticsearchHelper.create_index
TaskHelpers::ElasticsearchHelper.move_alias(Content::ES_INDEX_NAME, index)
end
desc 'Generate a new, empty physical index, without bringing it online.'
task :create_index => :environment do
TaskHelpers::ElasticsearchHelper.create_index
desc 'Sets/moves an alias to the specified index'
task :move_alias, [:alias, :index] => :environment do |t, args|
TaskHelpers::ElasticsearchHelper.move_alias(args[:alias], args[:index])
end
end
require_relative 'concerns/searchable'
require_relative 'content'
require_relative 'constants'
......@@ -6,8 +7,7 @@ class Comment < Content
include Mongoid::Timestamps
include Mongoid::MagicCounterCache
include ActiveModel::MassAssignmentSecurity
include Tire::Model::Search
include Tire::Model::Callbacks
include Searchable
voteable self, :up => +1, :down => -1
......
require 'new_relic/agent/method_tracer'
require_relative 'concerns/searchable'
require_relative 'content'
require_relative 'constants'
......@@ -6,8 +7,7 @@ class CommentThread < Content
include Mongoid::Timestamps
include Mongoid::Attributes::Dynamic
include ActiveModel::MassAssignmentSecurity
include Tire::Model::Search
include Tire::Model::Callbacks
include Searchable
extend Enumerize
voteable self, :up => +1, :down => -1
......@@ -39,10 +39,8 @@ class CommentThread < Content
indexes :created_at, type: :date, included_in_all: false
indexes :updated_at, type: :date, included_in_all: false
indexes :last_activity_at, type: :date, included_in_all: false
indexes :comment_count, type: :integer, included_in_all: false
indexes :votes_point, type: :integer, as: 'votes_point', included_in_all: false
indexes :context, type: :string, index: :not_analyzed, included_in_all: false
indexes :course_id, type: :string, index: :not_analyzed, included_in_all: false
indexes :commentable_id, type: :string, index: :not_analyzed, included_in_all: false
......
# Concern that makes an including model searchable through Elasticsearch.
#
# Including it mixes Elasticsearch::Model into the model, registers
# create/update/destroy callbacks that keep the search index in step with the
# model, and adds helpers for applying the model's mapping and serializing it.
# The callbacks do nothing unless CommentService.search_enabled? is truthy.
module Searchable
extend ActiveSupport::Concern
included do
include Elasticsearch::Model
# We specify our own callbacks, instead of using Elasticsearch::Model::Callbacks, so that we can disable
# indexing for tests where search functionality is not needed. This should improve test execution times.
after_create :index_document
after_update :update_indexed_document
after_destroy :delete_document
# Applies this model's mappings to the given index for the model's document type.
# `index` defaults to the model's own index_name. A warning is logged when the
# put_mapping call returns a falsy value.
def self.put_search_index_mapping(index=nil)
index ||= self.index_name
success = self.__elasticsearch__.client.indices.put_mapping(index: index, type: self.document_type, body: self.mappings.to_hash)
unless success
logger.warn "WARNING! could not apply search index mapping for #{self.name}"
end
end
# Serializes the model via as_json with `root: false` merged into the given options.
def as_indexed_json(options={})
# TODO: Play with the `MyModel.indexes` method -- reject non-mapped attributes, `:as` options, etc
self.as_json(options.merge root: false)
end
private
# after_create callback: indexes the document when search is enabled.
def index_document
__elasticsearch__.index_document if CommentService.search_enabled?
end
# This is named in this manner to prevent collisions with Mongoid's update_document method.
def update_indexed_document
__elasticsearch__.update_document if CommentService.search_enabled?
end
# after_destroy callback: removes the document from the index when search is enabled.
def delete_document
__elasticsearch__.delete_document if CommentService.search_enabled?
end
end
end
class Content
include Mongoid::Document
include Mongo::Voteable
ES_INDEX_NAME = 'content'
field :visible, type: Boolean, default: true
field :abuse_flaggers, type: Array, default: []
field :historical_abuse_flaggers, type: Array, default: [] #preserve abuse flaggers after a moderator unflags
......@@ -16,16 +18,6 @@ class Content
index({comment_thread_id: 1, endorsed: 1}, {sparse: true})
index({commentable_id: 1}, {sparse: true, background: true})
ES_INDEX_NAME = 'content'
def self.put_search_index_mapping(idx=nil)
idx ||= self.tire.index
success = idx.mapping(self.tire.document_type, {:properties => self.tire.mapping})
unless success
logger.warn "WARNING! could not apply search index mapping for #{self.name}"
end
end
before_save :set_username
......
......@@ -3,6 +3,7 @@ require 'faker'
describe 'app' do
include_context 'search_enabled'
before(:each) { set_api_key_header }
let(:body) { Faker::Lorem.word }
......
......@@ -3,15 +3,14 @@ require 'unicode_shared_examples'
describe "app" do
describe "search" do
include_context 'search_enabled'
before (:each) { set_api_key_header }
let(:author) { create_test_user(42) }
let(:course_id) { "test/course/id" }
# Returns the "id" of every entry in the "collection" list of a parsed search response.
def get_result_ids(result)
  result["collection"].map { |t| t["id"] }
end
describe "GET /api/v1/search/threads" do
......@@ -21,17 +20,17 @@ describe "app" do
result.should == {}
end
it "returns an empty reuslt if text parameter is missing" do
it "returns an empty result if text parameter is missing" do
get "/api/v1/search/threads", course_id: course_id
assert_empty_response
end
it "returns an empty reuslt if sort key is invalid" do
it "returns an empty result if sort key is invalid" do
get "/api/v1/search/threads", course_id: course_id, text: "foobar", sort_key: "invalid", sort_order: "desc"
assert_empty_response
end
it "returns an empty reuslt if sort order is invalid" do
it "returns an empty result if sort order is invalid" do
get "/api/v1/search/threads", course_id: course_id, text: "foobar", sort_key: "date", sort_order: "invalid"
assert_empty_response
end
......@@ -69,13 +68,13 @@ describe "app" do
last_response.should be_ok
result = parse(last_response.body)
actual_ids = Set.new get_result_ids(result)
expected_ids = Set.new expected_thread_indexes.map {|i| threads[i].id.to_s}
expected_ids = Set.new expected_thread_indexes.map { |i| threads[i].id.to_s }
actual_ids.should == expected_ids
end
it "by course_id" do
get "/api/v1/search/threads", text: "text", course_id: "test/course/id0"
assert_response_contains((0..29).find_all {|i| i % 2 == 0})
assert_response_contains((0..29).find_all { |i| i % 2 == 0 })
end
it "by context" do
......@@ -87,7 +86,7 @@ describe "app" do
user = create_test_user(Random.new)
user.mark_as_read(threads[0])
get "/api/v1/search/threads", text: "text", course_id: "test/course/id0", user_id: user.id, unread: true
assert_response_contains((1..29).find_all {|i| i % 2 == 0})
assert_response_contains((1..29).find_all { |i| i % 2 == 0 })
end
it "with flagged filter" do
......@@ -121,22 +120,22 @@ describe "app" do
it "by commentable_id" do
get "/api/v1/search/threads", text: "text", commentable_id: "commentable0"
assert_response_contains((0..29).find_all {|i| i % 3 == 0})
assert_response_contains((0..29).find_all { |i| i % 3 == 0 })
end
it "by commentable_ids" do
get "/api/v1/search/threads", text: "text", commentable_ids: "commentable0,commentable1"
assert_response_contains((0..29).find_all {|i| i % 3 == 0 || i % 3 == 1})
assert_response_contains((0..29).find_all { |i| i % 3 == 0 || i % 3 == 1 })
end
it "by group_id" do
get "/api/v1/search/threads", text: "text", group_id: "1"
assert_response_contains((0..29).find_all {|i| i % 5 == 0 || i % 5 == 1})
assert_response_contains((0..29).find_all { |i| i % 5 == 0 || i % 5 == 1 })
end
it "by group_ids" do
get "/api/v1/search/threads", text: "text", group_ids: "1,2"
expected_ids = (0..29).find_all {|i| i % 5 == 0 || i % 5 == 1 || i % 5 == 2}
expected_ids = (0..29).find_all { |i| i % 5 == 0 || i % 5 == 1 || i % 5 == 2 }
assert_response_contains(expected_ids)
end
......@@ -148,8 +147,8 @@ describe "app" do
describe "sorting works" do
let!(:threads) do
threads = (0..5).map {|i| make_thread(author, "text", course_id, "dummy")}
[1, 2].map {|i| author.vote(threads[i], :up)}
threads = (0..5).map { |i| make_thread(author, "text", course_id, "dummy") }
[1, 2].map { |i| author.vote(threads[i], :up) }
[1, 3].map do |i|
threads[i].comment_count = 5
threads[i].save!
......@@ -164,7 +163,7 @@ describe "app" do
last_response.should be_ok
result = parse(last_response.body)
actual_ids = get_result_ids(result)
expected_ids = expected_thread_indexes.map {|i| threads[i].id.to_s}
expected_ids = expected_thread_indexes.map { |i| threads[i].id.to_s }
actual_ids.should == expected_ids
end
......@@ -197,7 +196,7 @@ describe "app" do
describe "pagination" do
let!(:threads) do
threads = (1..50).map {|i| make_thread(author, "text", course_id, "dummy")}
threads = (1..50).map { |i| make_thread(author, "text", course_id, "dummy") }
refresh_es_index
threads
end
......@@ -210,7 +209,7 @@ describe "app" do
result = parse(last_response.body)
result_ids += get_result_ids(result)
end
result_ids.should == threads.reverse.map {|t| t.id.to_s}
result_ids.should == threads.reverse.map { |t| t.id.to_s }
end
it "works correctly with page size 1" do
......@@ -227,7 +226,7 @@ describe "app" do
end
describe "spelling correction" do
let(:commentable_id) {"test_commentable"}
let(:commentable_id) { "test_commentable" }
def check_correction(original_text, corrected_text)
get "/api/v1/search/threads", text: original_text
......@@ -292,8 +291,8 @@ describe "app" do
end
end
it "returns the correct values for total_results and num_pages" do
course_id = "test/course/id"
it 'returns the correct values for total_results and num_pages' do
course_id = 'test/course/id'
for i in 1..100 do
text = "all"
text += " half" if i % 2 == 0
......@@ -302,15 +301,14 @@ describe "app" do
text += " one" if i == 100
# There is currently a bug that causes only 10 threads with matching
# titles/bodies to be considered, so this test case uses comments.
thread = make_thread(author, "dummy text", course_id, "dummy_commentable")
make_comment(author, thread, text)
create(:comment, course_id: course_id, body: text)
end
# Elasticsearch does not necessarily make newly indexed content
# available immediately, so we must explicitly refresh the index
refresh_es_index
test_text = lambda do |text, expected_total_results, expected_num_pages|
get "/api/v1/search/threads", course_id: course_id, text: text, per_page: "10"
get '/api/v1/search/threads', course_id: course_id, text: text, per_page: '10'
last_response.should be_ok
result = parse(last_response.body)
result["total_results"].should == expected_total_results
......
......@@ -16,6 +16,9 @@ require 'yajl'
require 'support/database_cleaner'
require 'support/elasticsearch'
require 'support/factory_girl'
require 'webmock/rspec'
WebMock.allow_net_connect!
# setup test environment
set :environment, :test
......
def delete_es_index
Tire.index Content::ES_INDEX_NAME do
delete
end
require 'task_helpers'
# Asks TaskHelpers::ElasticsearchHelper to refresh the index named by Content::ES_INDEX_NAME.
def refresh_es_index
  content_index = Content::ES_INDEX_NAME
  TaskHelpers::ElasticsearchHelper.refresh_index(content_index)
end
def create_es_index
new_index = Tire.index Content::ES_INDEX_NAME
new_index.create
[CommentThread, Comment].each do |klass|
klass.put_search_index_mapping
RSpec.shared_context 'search_enabled' do
before(:all) do
CommentService.config[:enable_search] = true
end
end
def refresh_es_index
es_index_name = Content::ES_INDEX_NAME
Tire.index es_index_name do
refresh
before(:each) do
index = TaskHelpers::ElasticsearchHelper.create_index
TaskHelpers::ElasticsearchHelper.move_alias(Content::ES_INDEX_NAME, index)
end
after(:each) do
TaskHelpers::ElasticsearchHelper.delete_index(Content::ES_INDEX_NAME)
end
end
RSpec.configure do |config|
config.before(:each) do
delete_es_index
create_es_index
config.before(:suite) do
CommentService.config[:enable_search] = false
end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment