Skip to content

Commit

Permalink
Merge pull request #65 from MITLibraries/tco54-historical-snapshot-jo…
Browse files Browse the repository at this point in the history
…urnals

Adds exact journal matches to historical analysis
  • Loading branch information
JPrevost authored Jul 31, 2024
2 parents 9e9d29e + 1e0fdc1 commit eec3780
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 30 deletions.
79 changes: 59 additions & 20 deletions app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
#
# Table name: metrics_algorithms
#
# id :integer not null, primary key
# month :date
# doi :integer
# issn :integer
# isbn :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# id :integer not null, primary key
# month :date
# doi :integer
# issn :integer
# isbn :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# journal_exact :integer
#
module Metrics
# Algorithms aggregates statistics for matches for all SearchEvents
Expand Down Expand Up @@ -43,30 +44,68 @@ def generate(month = nil)
count_matches(SearchEvent.all.includes(:term))
end
Metrics::Algorithms.create(month:, doi: matches[:doi], issn: matches[:issn], isbn: matches[:isbn],
pmid: matches[:pmid], unmatched: matches[:unmatched])
pmid: matches[:pmid], journal_exact: matches[:journal_exact],
unmatched: matches[:unmatched])
end

# Counts matches for supplied events
#
# Matching for each individual event is delegated to event_matches, which updates the tally Hash in place.
#
# @param events [Array of SearchEvents] An array of SearchEvents to check for matches.
# @return [Hash] A Hash with keys for each known algorithm and the count of matched SearchEvents. The Hash is
#   created with a default of 0, so algorithms that never matched read as 0.
def count_matches(events)
  matches = Hash.new(0)

  # The same Hash is handed to every call so the counts accumulate across all events.
  events.each { |event| event_matches(event, matches) }

  matches
end

# Checks for matches for a single event
#
# @note We currently match StandardIdentifiers and Exact Journals. As we add new algorithms, this method will need
#   to expand to handle additional match types.
#
# @param event [SearchEvent] an individual search event to check for matches
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match (updated in place)
# @return [Integer, nil] not meaningful to callers; results are recorded by mutating the supplied matches Hash
def event_matches(event, matches)
  ids = match_standard_identifiers(event, matches)
  journal_exact = process_journals(event, matches)

  # An event is unmatched only when every detector came up empty, and it is counted exactly once.
  matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero?
end

# Checks for StandardIdentifier matches
#
# @param event [SearchEvent] an individual search event to check for matches
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match (updated in place)
# @return [StandardIdentifiers] the StandardIdentifiers instance built from the event's term phrase; callers
#   inspect its identifiers to decide whether anything matched
def match_standard_identifiers(event, matches)
  known_ids = %i[unmatched pmid isbn issn doi]
  ids = StandardIdentifiers.new(event.term.phrase)

  # Each identifier type is tallied at most once per event.
  known_ids.each do |id|
    matches[id] += 1 if ids.identifiers[id].present?
  end
  ids
end

matches
# Checks for exact Journal matches
#
# @note Only exact matches are checked at this time, as the partial match algorithm is more noise than signal.
# @note A detection here is not a guarantee that we understand the search intent. We have not yet validated this
#   algorithm to learn what percentage of its matches are useful. This caveat should be conveyed in any reports
#   that use this data.
#
# @param event [SearchEvent] an individual search event to check for matches
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match (updated in place)
# @return the result of Detector::Journal.full_term_match for the event's term phrase
def process_journals(event, matches)
  phrase = event.term.phrase

  Detector::Journal.full_term_match(phrase).tap do |hits|
    # One tally per event, regardless of how many journals matched.
    matches[:journal_exact] += 1 unless hits.count.zero?
  end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds an integer journal_exact column to the metrics_algorithms table. Metrics::Algorithms#generate writes the
# exact-journal match tally into this column when it creates a record.
class AddJournalExactToMetricsAlgorithm < ActiveRecord::Migration[7.1]
def change
add_column :metrics_algorithms, :journal_exact, :integer
end
end
3 changes: 2 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions test/fixtures/search_events.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,10 @@ current_month_doi:
current_month_isbn:
term: isbn_9781319145446
source: test
current_month_nature_medicine:
term: journal_nature_medicine
source: test
old_month_nature_medicine:
term: journal_nature_medicine
source: test
created_at: <%= 1.year.ago %>
3 changes: 3 additions & 0 deletions test/fixtures/terms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ doi:

isbn_9781319145446:
phrase: 'Sadava, D. E., D. M. Hillis, et al. Life: The Science of Biology. 11th ed. W. H. Freeman, 2016. ISBN: 9781319145446'

journal_nature_medicine:
phrase: 'nature medicine'
8 changes: 8 additions & 0 deletions test/models/detector/journal_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,13 @@ class JournalTest < ActiveSupport::TestCase
actual = Detector::Journal.partial_term_match('words and stuff Nature medicine, 1999')
assert actual.count == 2
end

# Creates a Journal with a mixed-case name and checks that the name read back after a reload is the
# all-lowercase form of the original.
test 'mixed titles are downcased when saved' do
mixed_case = 'ThIs Is A tItLe'
actual = Detector::Journal.create(name: mixed_case)
# reload so the assertions look at the reloaded record rather than the value passed to create
actual.reload
refute_equal(mixed_case, actual.name)
assert_equal(mixed_case.downcase, actual.name)
end
end
end
35 changes: 26 additions & 9 deletions test/models/metrics/algorithms_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
#
# Table name: metrics_algorithms
#
# id :integer not null, primary key
# month :date
# doi :integer
# issn :integer
# isbn :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# id :integer not null, primary key
# month :date
# doi :integer
# issn :integer
# isbn :integer
# pmid :integer
# unmatched :integer
# created_at :datetime not null
# updated_at :datetime not null
# journal_exact :integer
#
require 'test_helper'

Expand All @@ -38,6 +39,11 @@ class Algorithms < ActiveSupport::TestCase
assert aggregate.pmid == 1
end

# The search_events fixtures hold two 'nature medicine' events: one with no explicit created_at and one created a
# year ago. Only one is expected here — presumably generate(month) restricts to events from the supplied month.
test 'journal exact counts are included in monthly aggregation' do
aggregate = Metrics::Algorithms.new.generate(DateTime.now)
assert aggregate.journal_exact == 1
end

test 'unmatched counts are included are included in monthly aggregation' do
aggregate = Metrics::Algorithms.new.generate(DateTime.now)
assert aggregate.unmatched == 2
Expand Down Expand Up @@ -102,6 +108,11 @@ class Algorithms < ActiveSupport::TestCase
assert aggregate.pmid == 2
end

# With no month supplied, generate counts across all SearchEvents, so both 'nature medicine' fixture events
# (the current one and the one created a year ago) are expected in the tally.
test 'journal exact counts are included in total aggregation' do
aggregate = Metrics::Algorithms.new.generate
assert aggregate.journal_exact == 2
end

test 'unmatched counts are included are included in total aggregation' do
aggregate = Metrics::Algorithms.new.generate
assert aggregate.unmatched == 2
Expand Down Expand Up @@ -131,6 +142,11 @@ class Algorithms < ActiveSupport::TestCase
SearchEvent.create(term: terms(:pmid_38908367), source: 'test')
end

journal_exact_count = rand(1...100)
journal_exact_count.times do
SearchEvent.create(term: terms(:journal_nature_medicine), source: 'test')
end

unmatched_expected_count = rand(1...100)
unmatched_expected_count.times do
SearchEvent.create(term: terms(:hi), source: 'test')
Expand All @@ -142,6 +158,7 @@ class Algorithms < ActiveSupport::TestCase
assert issn_expected_count == aggregate.issn
assert isbn_expected_count == aggregate.isbn
assert pmid_expected_count == aggregate.pmid
assert journal_exact_count == aggregate.journal_exact
assert unmatched_expected_count == aggregate.unmatched
end
end

0 comments on commit eec3780

Please sign in to comment.