Skip to content

Commit

Permalink
Added scroll api to fetch search code (#3846)
Browse files Browse the repository at this point in the history
* Added scroll api to fetch search code

* fixed tests
  • Loading branch information
amir-qayyum-khan authored Feb 26, 2018
1 parent ea466ef commit cc8311a
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 23 deletions.
44 changes: 29 additions & 15 deletions search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,28 @@ def execute_search(search_obj):
return search_obj.execute()


def scan_search(search_obj):
    """
    Runs a scan query for the given search object and hands back the
    iterator that walks over every document matching the query.

    Args:
        search_obj (Search): elasticsearch_dsl Search object. It is expected
            to have been built through create_search_obj, which sets
            important fields like the index.

    Returns:
        generator:
            The result of search_obj.scan(), which iterates over all the
            documents matching the query

    Raises:
        ImproperlyConfigured: if the search object has no index set
    """
    has_index = search_obj._index is not None  # pylint: disable=protected-access
    if not has_index:
        # Reaching this point means a Search() was created without going
        # through create_search_obj, so the index was never set on it.
        raise ImproperlyConfigured("search object is missing an index")

    # make sure there is a live connection before starting the scan
    get_conn()
    return search_obj.scan()


def get_searchable_programs(user, staff_program_ids):
"""
Determines the programs a user is eligible to search
Expand Down Expand Up @@ -190,14 +212,13 @@ def prepare_and_execute_search(user, search_param_dict=None, search_func=execute
return search_func(search_obj)


def search_for_field(search_obj, field_name, page_size=DEFAULT_ES_LOOP_PAGE_SIZE):
def search_for_field(search_obj, field_name):
"""
Retrieves all unique instances of a field for documents that match an ES query
Args:
search_obj (Search): Search object
field_name (str): The name of the field for the value to get
page_size (int): Number of docs per page of results
Returns:
set: Set of unique values
Expand All @@ -207,21 +228,14 @@ def search_for_field(search_obj, field_name, page_size=DEFAULT_ES_LOOP_PAGE_SIZE
# index is altered during the loop.
# This also limits the query to only return the field value.
search_obj = search_obj.sort('_doc').source(include=[field_name])
loop = 0
all_results_returned = False
while not all_results_returned:
from_index = loop * page_size
to_index = from_index + page_size
search_results = execute_search(search_obj[from_index: to_index])
# add the field value for every search result hit to the set
for hit in search_results.hits:
results.add(getattr(hit, field_name))
all_results_returned = to_index >= search_results.hits.total
loop += 1
search_results = scan_search(search_obj)
# add the field value for every search result hit to the set
for hit in search_results:
results.add(getattr(hit, field_name))
return results


def get_all_query_matching_emails(search_obj, page_size=DEFAULT_ES_LOOP_PAGE_SIZE):
def get_all_query_matching_emails(search_obj):
"""
Retrieves all unique emails for documents that match an ES query
Expand All @@ -232,7 +246,7 @@ def get_all_query_matching_emails(search_obj, page_size=DEFAULT_ES_LOOP_PAGE_SIZ
Returns:
set: Set of unique emails
"""
return search_for_field(search_obj, "email", page_size=page_size)
return search_for_field(search_obj, "email")


def search_percolate_queries(program_enrollment_id, source_type):
Expand Down
22 changes: 14 additions & 8 deletions search/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,21 +213,27 @@ def test_search_for_field(self):
"""
Test that a set of search results will yield an expected set of values
"""
test_es_page_size = 2
search = create_search_obj(self.user)
user_ids = self.program.programenrollment_set.values_list("user__id", flat=True).order_by("-user__id")
results = search_for_field(search, 'user_id', page_size=test_es_page_size)
assert results == set(user_ids[:test_es_page_size])
user_ids = self.program.programenrollment_set.values_list(
"user__id", flat=True
).exclude(
user__id=self.user.id
).order_by("-user__id")
results = search_for_field(search, 'user_id')
assert results == set(user_ids)

def test_all_query_matching_emails(self):
"""
Test that a set of search results will yield an expected set of emails
"""
test_es_page_size = 2
search = create_search_obj(self.user)
user_ids = self.program.programenrollment_set.values_list("user__email", flat=True).order_by("-user__id")
results = get_all_query_matching_emails(search, page_size=test_es_page_size)
assert results == set(user_ids[:test_es_page_size])
user_ids = self.program.programenrollment_set.values_list(
"user__email", flat=True
).exclude(
user__email=self.user.email
).order_by("-user__id")
results = get_all_query_matching_emails(search)
assert results == set(user_ids)

# This patch works around on_commit by invoking it immediately, since in TestCase all tests run in transactions
@patch('search.signals.transaction.on_commit', side_effect=lambda callback: callback())
Expand Down

0 comments on commit cc8311a

Please sign in to comment.