Skip to content

Commit

Permalink
Merge pull request #143 from MITLibraries/tco-113-primo-preprocessor
Browse files Browse the repository at this point in the history
Adds preprocessor for incoming primo searches
  • Loading branch information
JPrevost authored Dec 5, 2024
2 parents fa5bb3b + ff94e13 commit 5fbfd28
Show file tree
Hide file tree
Showing 4 changed files with 221 additions and 1 deletion.
76 changes: 76 additions & 0 deletions app/models/preprocessor_primo.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# frozen_string_literal: true

# PreprocessorPrimo handles manipulating incoming data from the Primo UI into a structure that TACOS can work with
class PreprocessorPrimo
  # Separator that Primo and TACOS agreed upon for joining the parts of a multipart query
  QUERY_PART_JOINER = ';;;'

  # Shared Term recorded in place of any query TACOS does not process yet, so its frequency can be tracked
  UNHANDLED_QUERY_TERM = 'unhandled complex primo query'

  # Shared Term recorded in place of any query that lacks the `field,operator,keywords` structure
  INVALID_QUERY_TERM = 'invalid primo query'

  # to_tacos processes the raw incoming query from Primo and checks whether it is a single keyword anywhere search.
  # Anything else drops the entire search from TACOS, logging it as the shared Term `unhandled complex primo query`
  # so we can track how frequently we are dropping terms and come back later to build out more complex handling if
  # this is common enough to warrant the additional work.
  # @param query [String, nil] example `any,contains,this is a keyword search`; nil is treated like an empty query
  # @return [String] the extracted keyword phrase, or one of the shared tracking Terms
  def self.to_tacos(query)
    query_parts = query.to_s.split(QUERY_PART_JOINER)

    if query_parts.count > 1
      Rails.logger.debug('Multipart primo query detected')

      # As we are not currently handling complex queries, always set the value to something we can track frequency of
      UNHANDLED_QUERY_TERM
    else
      Rails.logger.debug('Simple primo query detected')

      # Pass along the split part rather than the raw input so a dangling joiner (`any,contains,term;;;`) never
      # leaks into the keyword. An empty or nil query has no parts at all and falls through as an empty string.
      extract_keyword(query_parts.first.to_s)
    end
  end

  # keyword? confirms whether a portion of a primo query is a keyword search
  # Note: we expect exactly 3 elements in this array for simple keyword searches. Arrays created from the Primo
  # input must already be collapsed so that commas in the original search are back inside the third element
  # (see join_keyword_and_drop_extra_parts).
  # @param query_part_array [Array<String>] example ['any', 'contains', 'this is a keyword search']
  # @return [Boolean]
  def self.keyword?(query_part_array)
    return false unless query_part_array.count == 3
    return false unless query_part_array[0] == 'any'

    # For now, we are allowing all variants of the second portion of the primo query input
    # The expected values are: contains, exact, begins_with, equals
    # Uncommenting the following statement would allow us to restrict to just the default 'contains' if desirable
    #
    # return false unless query_part_array[1] == 'contains'

    true
  end

  # extract_keyword works at the level of a single keyword query input coming from primo and
  # returns a string with just that keyword with the operators removed
  # @param query_part [String] example `any,contains,this is a keyword search`
  # @return [String] the extracted keyword phrase, `invalid primo query` when fewer than three comma separated
  #   parts are present, or `unhandled complex primo query` when the search is not a keyword anywhere search
  def self.extract_keyword(query_part)
    query_part_array = query_part.split(',')

    # We don't anticipate this being a normal state so we are tracking it under the Term `invalid primo query` as well
    # as sending a message to Sentry so we can understand the context in which this happens if it does
    if query_part_array.count < 3
      Sentry.capture_message('PreprocessorPrimo: Invalid Primo query during keyword extraction')
      return INVALID_QUERY_TERM
    end

    the_keywords = join_keyword_and_drop_extra_parts(query_part_array)

    return UNHANDLED_QUERY_TERM unless keyword?([query_part_array[0], query_part_array[1], the_keywords])

    the_keywords
  end

  # join_keyword_and_drop_extra_parts rebuilds a search that contained commas into a single ruby string after the
  # incoming string was separated into an array based on commas. The first two elements (field and operator) are
  # left out of the result.
  # @param query_part_array [Array<String>] example `['any', 'contains', 'this', 'is', 'a', 'keyword', 'search']`
  # @return [String] example 'this,is,a,keyword,search'; an empty string when there are no keyword elements
  def self.join_keyword_and_drop_extra_parts(query_part_array)
    # For complex queries, which we are not handling yet, we'll need to determine how TACOS should handle the final
    # element of the input which will be a boolean operator. For now, we will have stopped processing those by this
    # point during the initial logic in `to_tacos` that splits on `;;;` and returns if the result is more than one query
    #
    # `drop` always returns an array, even when fewer than two elements were supplied, so this never calls `join` on nil
    query_part_array.drop(2).join(',')
  end
end
18 changes: 17 additions & 1 deletion app/models/search_logger.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,24 @@ class SearchLogger
# Receives a phrase and source and creates a search event. Will find or create a term as needed.
# The phrase is first passed through `extract_phrase`, so the Term is looked up (or created) with the
# processed phrase only; no Term is created for the raw incoming value.
# @param phrase [String] search input as received from the source system
# @param source [String] name of the system the search came from
# @return [SearchEvent] the newly created SearchEvent
def self.logevent(phrase, source)
  term = Term.create_or_find_by!(phrase: extract_phrase(phrase, source))
  term.calculate_categorizations
  term.search_events.create!(source:)
end

# Picks the phrase to log for a search based on which `source` it came from. Sources containing `primo`
# are handed to PreprocessorPrimo; every other source gets its incoming `phrase` back untouched.
# Note: sources are matched by pattern rather than by exact string so that variants such as `primo`,
# `primo-testing` and `primo-playground` all take the same path, which is useful when testing in production.
def self.extract_phrase(phrase, source)
  # `===` is used on purpose: it keeps the matching rules of a `case`/`when` on the regexp, which simply
  # answers false for a source that is not text (such as nil) instead of raising.
  unless /primo/ === source
    Rails.logger.debug('default case detected')
    return phrase
  end

  Rails.logger.debug('Primo case detected')
  PreprocessorPrimo.to_tacos(phrase)
end
end
26 changes: 26 additions & 0 deletions test/controllers/graphql_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,30 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
assert_equal 'Transactional', json['data']['lookupTerm']['categories'].first['name']
assert_in_delta 0.95, json['data']['lookupTerm']['categories'].first['confidence']
end

# Checks the Primo path end to end through the GraphQL endpoint: a simple `any,contains,<keywords>` search
# sent from a Primo-like source system must be reported back with only the keywords as its phrase.
test 'primo searches use the preprocessor to extract actual keywords' do
# The source system is `primo-test` rather than plain `primo`; sources are matched on a `primo` pattern
post '/graphql', params: { query: '{
logSearchEvent(sourceSystem: "primo-test",
searchTerm: "any,contains,Super cool search") {
phrase
}
}' }

json = response.parsed_body

# The field (`any`) and operator (`contains`) are gone; only the keyword phrase remains
assert_equal 'Super cool search', json['data']['logSearchEvent']['phrase']
end

# Checks that a multipart Primo query (two parts joined with `;;;`) sent through the GraphQL endpoint is not
# logged under its own text but under the shared tracking phrase for unhandled complex queries.
test 'primo searches use the preprocessor and logs complex queries to a specific term' do
# The search term below holds two `field,operator,keywords` parts separated by the `;;;` joiner
post '/graphql', params: { query: '{
logSearchEvent(sourceSystem: "primo-test",
searchTerm: "any,contains,Super cool search;;;any,contains,uh oh this is getting complicated") {
phrase
}
}' }

json = response.parsed_body

# Neither part of the original search is reported back, only the shared tracking phrase
assert_equal 'unhandled complex primo query', json['data']['logSearchEvent']['phrase']
end
end
102 changes: 102 additions & 0 deletions test/models/preprocessor_primo_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# frozen_string_literal: true

#
require 'test_helper'

# Unit tests for PreprocessorPrimo, covering the public entry point (`to_tacos`) as well as the
# `keyword?` and `extract_keyword` helpers it relies on.
class PreprocessorPrimoTest < ActiveSupport::TestCase
  # Phrase expected back for every query that is not a single keyword anywhere search
  UNHANDLED = 'unhandled complex primo query'

  test 'to_tacos returns unhandled for complex queries' do
    multipart_query = 'any,contains,space;;;any,contains,madness'

    assert_equal UNHANDLED, PreprocessorPrimo.to_tacos(multipart_query)
  end

  test 'to_tacos returns unhandled for targeted field queries' do
    title_query = 'title,contains,space'

    assert_equal UNHANDLED, PreprocessorPrimo.to_tacos(title_query)
  end

  test 'to_tacos returns phrase ready for tacos for simple keyword input' do
    result = PreprocessorPrimo.to_tacos('any,contains,space')

    assert_equal 'space', result
  end

  test 'to_tacos returns phrase ready for complex keyword input' do
    # A full citation, commas included, must come back exactly as it was typed
    citation = 'Yan, F., Krantz, P., Sung, Y., Kjaergaard, M., Campbell, D.L., Orlando, T.P., Gustavsson, S. and Oliver, W.D., 2018. Tunable coupling scheme for implementing high-fidelity two-qubit gates. Physical Review Applied, 10(5), p.054062.'

    assert_equal citation, PreprocessorPrimo.to_tacos("any,contains,#{citation}")
  end

  test 'keyword? returns true for any contains phrase pattern' do
    parts = 'any,contains,popcorn anomoly'.split(',')

    assert PreprocessorPrimo.keyword?(parts)
  end

  test 'keyword? returns false for input with more than 3 array elements' do
    # NOTE: this query entering tacos would work... but it would have been cleaned up prior to running
    # keyword? in our application via the normal flow
    parts = 'any,contains,popcorn anomoly: why life on the moon is complex, and other cat facts'.split(',')

    assert_not PreprocessorPrimo.keyword?(parts)
  end

  test 'keyword? returns false for input with less than 3 array elements' do
    parts = 'any,contains'.split(',')

    assert_not PreprocessorPrimo.keyword?(parts)
  end

  test 'keyword? returns false for non-any input' do
    parts = 'title,contains,popcorn anomoly'.split(',')

    assert_not PreprocessorPrimo.keyword?(parts)
  end

  test 'keyword? returns true for non-contains inputs' do
    # NOTE: this portion of the primo query focuses on how to handle the phrase. All the words, any of the words,
    # the exact phrase, begins_with. For now we treat them all the same as standard keyword queries.
    parts = 'any,exact,popcorn anomoly'.split(',')

    assert PreprocessorPrimo.keyword?(parts)
  end

  test 'extract keyword returns keyword for simple keywords' do
    result = PreprocessorPrimo.extract_keyword('any,contains,popcorn anomoly')

    assert_equal 'popcorn anomoly', result
  end

  test 'extract keyword returns keyword for simple non-contains keywords' do
    result = PreprocessorPrimo.extract_keyword('any,exact,popcorn anomoly')

    assert_equal 'popcorn anomoly', result
  end

  test 'extract keyword returns unhandled complex primo query for non-any searches' do
    result = PreprocessorPrimo.extract_keyword('title,contains,popcorn anomoly')

    assert_equal UNHANDLED, result
  end

  test 'extract keyword returns keyword for keywords with punctuation' do
    phrase = 'popcorn anomoly: a cats! life. on & mars!'

    assert_equal phrase, PreprocessorPrimo.extract_keyword("any,contains,#{phrase}")
  end

  test 'extract keyword returns keyword for keywords with commas' do
    phrase = 'popcorn anomoly, and so can you'

    assert_equal phrase, PreprocessorPrimo.extract_keyword("any,contains,#{phrase}")
  end

  test 'extract keyword returns keyword for keywords with multiple commas and other punctuation' do
    phrase = 'popcorn anomoly: a cats! life. on & mars!, words, of {truth} (and, also not,)'

    assert_equal phrase, PreprocessorPrimo.extract_keyword("any,contains,#{phrase}")
  end
end

0 comments on commit 5fbfd28

Please sign in to comment.