From 73734cfed8cb84518f2d761a17808e1d81394217 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Mon, 12 Aug 2024 17:05:29 -0400 Subject: [PATCH 1/2] Add SuggestedResource matches to metric reports ** Why are these changes being introduced: * We need to include the performance of our SuggestedResource records in the historical performance reports we are generating. ** Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/tco-25 ** How does this address that need: * This copies the approach for counting Journal matches, applying it to the SuggestedResource model as well. This includes a migration to add a field to the report model, and tests and fixtures to confirm that the counts are being generated correctly. ** Document any side effects to this change: * Maybe not a side effect (but maybe it is) - the SuggestedResource model did not have a full_term_match method before this PR. This work adds it as well as tests for its behavior. --- app/models/detector/suggested_resource.rb | 8 ++++-- app/models/metrics/algorithms.rb | 10 ++++++- ...ted_resource_exact_to_metrics_algorithm.rb | 5 ++++ db/schema.rb | 3 ++- .../fixtures/detector/suggested_resources.yml | 6 +++++ test/fixtures/search_events.yml | 7 +++++ test/fixtures/terms.yml | 3 +++ .../detector/suggested_resource_test.rb | 26 +++++++++++++++++++ test/models/metrics/algorithms_test.rb | 18 +++++++++++++ 9 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 db/migrate/20240813181057_add_suggested_resource_exact_to_metrics_algorithm.rb diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb index 3583297..f4ccdcb 100644 --- a/app/models/detector/suggested_resource.rb +++ b/app/models/detector/suggested_resource.rb @@ -23,12 +23,12 @@ class SuggestedResource < ApplicationRecord before_save :update_fingerprint def update_fingerprint - self.fingerprint = calculate_fingerprint(phrase) + self.fingerprint = 
Detector::SuggestedResource.calculate_fingerprint(phrase) end # This implements the OpenRefine fingerprinting algorithm. See # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint - def calculate_fingerprint(old_phrase) + def self.calculate_fingerprint(old_phrase) modified_phrase = old_phrase modified_phrase = modified_phrase.strip modified_phrase = modified_phrase.downcase @@ -76,5 +76,9 @@ def self.bulk_replace(input) record.save end end + + def self.full_term_match(phrase) + SuggestedResource.where(fingerprint: calculate_fingerprint(phrase)) + end end end diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 2ba9b24..8de98c8 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -45,6 +45,7 @@ def generate(month = nil) end Metrics::Algorithms.create(month:, doi: matches[:doi], issn: matches[:issn], isbn: matches[:isbn], pmid: matches[:pmid], journal_exact: matches[:journal_exact], + suggested_resource_exact: matches[:suggested_resource_exact], unmatched: matches[:unmatched]) end @@ -73,8 +74,9 @@ def count_matches(events) def event_matches(event, matches) ids = match_standard_identifiers(event, matches) journal_exact = process_journals(event, matches) + suggested_resource_exact = process_suggested_resources(event, matches) - matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero? + matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? end # Checks for StandardIdentifer matches @@ -107,5 +109,11 @@ def process_journals(event, matches) matches[:journal_exact] += 1 if journal_exact.count.positive? journal_exact end + + def process_suggested_resources(event, matches) + suggested_resource_exact = Detector::SuggestedResource.full_term_match(event.term.phrase) + matches[:suggested_resource_exact] += 1 if suggested_resource_exact.count.positive? 
+ suggested_resource_exact + end end end diff --git a/db/migrate/20240813181057_add_suggested_resource_exact_to_metrics_algorithm.rb b/db/migrate/20240813181057_add_suggested_resource_exact_to_metrics_algorithm.rb new file mode 100644 index 0000000..7d6257c --- /dev/null +++ b/db/migrate/20240813181057_add_suggested_resource_exact_to_metrics_algorithm.rb @@ -0,0 +1,5 @@ +class AddSuggestedResourceExactToMetricsAlgorithm < ActiveRecord::Migration[7.1] + def change + add_column :metrics_algorithms, :suggested_resource_exact, :integer + end +end diff --git a/db/schema.rb b/db/schema.rb index a7d1c32..5be5861 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.1].define(version: 2024_08_05_203949) do +ActiveRecord::Schema[7.1].define(version: 2024_08_13_181057) do create_table "detector_journals", force: :cascade do |t| t.string "name" t.json "additional_info" @@ -40,6 +40,7 @@ t.datetime "created_at", null: false t.datetime "updated_at", null: false t.integer "journal_exact" + t.integer "suggested_resource_exact" end create_table "search_events", force: :cascade do |t| diff --git a/test/fixtures/detector/suggested_resources.yml b/test/fixtures/detector/suggested_resources.yml index 82c43a7..bf56056 100644 --- a/test/fixtures/detector/suggested_resources.yml +++ b/test/fixtures/detector/suggested_resources.yml @@ -29,3 +29,9 @@ web_of_knowledge: url: https://libguides.mit.edu/webofsci phrase: web of knowledge fingerprint: knowledge of web + +nobel_laureate: + title: Professor Moungi Bawendi + url: https://news.mit.edu/2023/mit-chemist-moungi-bawendi-shares-nobel-prize-chemistry-1004 + phrase: moungi bawendi + fingerprint: bawendi moungi diff --git a/test/fixtures/search_events.yml b/test/fixtures/search_events.yml index b99b4cb..168250f 100644 --- a/test/fixtures/search_events.yml +++ b/test/fixtures/search_events.yml @@ -38,3 +38,10 @@ 
old_month_nature_medicine: term: journal_nature_medicine source: test created_at: <%= 1.year.ago %> +suggested_resource_jstor: + term: suggested_resource_jstor + source: test +old_suggested_resource_jstor: + term: suggested_resource_jstor + source: test + created_at: <%= 1.year.ago %> diff --git a/test/fixtures/terms.yml b/test/fixtures/terms.yml index 121cf0f..ef2114e 100644 --- a/test/fixtures/terms.yml +++ b/test/fixtures/terms.yml @@ -28,3 +28,6 @@ isbn_9781319145446: journal_nature_medicine: phrase: 'nature medicine' + +suggested_resource_jstor: + phrase: 'jstor' diff --git a/test/models/detector/suggested_resource_test.rb b/test/models/detector/suggested_resource_test.rb index 57031b9..815f497 100644 --- a/test/models/detector/suggested_resource_test.rb +++ b/test/models/detector/suggested_resource_test.rb @@ -125,5 +125,31 @@ class SuggestedResourceTest < ActiveSupport::TestCase assert_equal 'delta gamma', resource.fingerprint end + + test 'fingerprint matches on search term' do + expected = detector_suggested_resources('jstor') + actual = Detector::SuggestedResource.full_term_match('jstor') + + assert_equal 1, actual.count + assert_equal expected, actual.first + end + + test 'fingerprint matches on any word order or punctuation' do + expected = detector_suggested_resources('nobel_laureate') + actual_one = Detector::SuggestedResource.full_term_match('Moungi Bawendi') + actual_two = Detector::SuggestedResource.full_term_match('Bawendi, Moungi') + + assert_equal 1, actual_one.count + assert_equal expected, actual_one.first + assert_equal actual_one.first, actual_two.first + end + + test 'partial fingerprint matches do not count' do + actual_partial = Detector::SuggestedResource.full_term_match('science web') + actual_extra = Detector::SuggestedResource.full_term_match('the web of science') + + assert_predicate actual_partial.count, :zero? + assert_predicate actual_extra.count, :zero? 
+ end end end diff --git a/test/models/metrics/algorithms_test.rb b/test/models/metrics/algorithms_test.rb index 279a081..7c22377 100644 --- a/test/models/metrics/algorithms_test.rb +++ b/test/models/metrics/algorithms_test.rb @@ -49,6 +49,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 1, aggregate.journal_exact end + test 'suggested_resource exact counts are included in monthly aggregation' do + aggregate = Metrics::Algorithms.new.generate(DateTime.now) + + assert_equal 1, aggregate.suggested_resource_exact + end + test 'unmatched counts are included are included in monthly aggregation' do aggregate = Metrics::Algorithms.new.generate(DateTime.now) @@ -124,6 +130,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 2, aggregate.journal_exact end + test 'suggested_resource exact counts are included in total aggregation' do + aggregate = Metrics::Algorithms.new.generate + + assert_equal 2, aggregate.suggested_resource_exact + end + test 'unmatched counts are included are included in total aggregation' do aggregate = Metrics::Algorithms.new.generate @@ -159,6 +171,11 @@ class Algorithms < ActiveSupport::TestCase SearchEvent.create(term: terms(:journal_nature_medicine), source: 'test') end + suggested_resource_exact_count = rand(1...100) + suggested_resource_exact_count.times do + SearchEvent.create(term: terms(:suggested_resource_jstor), source: 'test') + end + unmatched_expected_count = rand(1...100) unmatched_expected_count.times do SearchEvent.create(term: terms(:hi), source: 'test') @@ -171,6 +188,7 @@ class Algorithms < ActiveSupport::TestCase assert_equal isbn_expected_count, aggregate.isbn assert_equal pmid_expected_count, aggregate.pmid assert_equal journal_exact_count, aggregate.journal_exact + assert_equal suggested_resource_exact_count, aggregate.suggested_resource_exact assert_equal unmatched_expected_count, aggregate.unmatched end end From d0dbd1249630b51e8fbe4a84ee606aca2716c5e8 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt 
Date: Wed, 14 Aug 2024 14:18:23 -0400 Subject: [PATCH 2/2] Add documentation to methods based on code review --- app/models/detector/suggested_resource.rb | 16 ++++++++++++++++ app/models/metrics/algorithms.rb | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/app/models/detector/suggested_resource.rb b/app/models/detector/suggested_resource.rb index f4ccdcb..11bd296 100644 --- a/app/models/detector/suggested_resource.rb +++ b/app/models/detector/suggested_resource.rb @@ -22,12 +22,19 @@ module Detector class SuggestedResource < ApplicationRecord before_save :update_fingerprint + # This exists for the before_save lifecycle hook to call the calculate_fingerprint method, to ensure that these + # records always have a correctly-calculated fingerprint. It has no arguments and returns nothing. def update_fingerprint self.fingerprint = Detector::SuggestedResource.calculate_fingerprint(phrase) end # This implements the OpenRefine fingerprinting algorithm. See # https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint + # + # @param old_phrase [String] A text string which needs to have its fingerprint calculated. This could either be the + # "phrase" field on the SuggestedResource record, or an incoming search term received from a contributing system. + # + # @return [String] A string of all words in the input, downcased, normalized, and alphabetized. def self.calculate_fingerprint(old_phrase) modified_phrase = old_phrase modified_phrase = modified_phrase.strip @@ -77,6 +84,15 @@ def self.bulk_replace(input) end end + # Identify any SuggestedResource record whose pre-calculated fingerprint matches the fingerprint of the incoming + # phrase. + # + # @note There is a uniqueness constraint on the SuggestedResource fingerprint field, so there should only ever be + # one match (if any). + # + # @param phrase [String]. 
A string representation of a search term (not an actual Term object) + # + # @return [ActiveRecord::Relation] A relation of the record (if any) whose fingerprint matches the search term. def self.full_term_match(phrase) SuggestedResource.where(fingerprint: calculate_fingerprint(phrase)) end diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 8de98c8..d3e543d 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -110,6 +110,16 @@ def process_journals(event, matches) journal_exact end + # Checks for SuggestedResource matches + # + # @note This only checks for exact matches of the search term, so any extra or missing words will result in no + # match. + # + # @param event [SearchEvent] an individual search event to check for matches + # @param matches [Hash] a Hash that keeps track of how many of each algorithm we match + # @return [ActiveRecord::Relation] the Detector::SuggestedResource record whose fingerprint matches that of the + # search phrase (if one exists). The uniqueness constraint on the fingerprint should mean there is only ever one + # matched record in the relation. def process_suggested_resources(event, matches) suggested_resource_exact = Detector::SuggestedResource.full_term_match(event.term.phrase) matches[:suggested_resource_exact] += 1 if suggested_resource_exact.count.positive?