From 1c911cf87c054e97d30500bf68a75ee1ae37340c Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Tue, 15 Oct 2024 13:50:00 -0400 Subject: [PATCH 1/2] Update variable name in Detectors to phrase Why are these changes being introduced: * we were often passing Strings but calling the variable `term` which was misleading as our strings were often phrases that are part of the `Term` object. Aka we were calling `Term.phrase` a `term`. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TCO-106 How does this address that need: * Renamed `term` to `phrase` to better reflect it often comes from a `Term.phrase` rather than being the actual `Term` object * Improved some documentation --- app/models/detector/lcsh.rb | 11 +++++++---- app/models/detector/pattern_checker.rb | 19 ++++++++++++++----- app/models/detector/standard_identifiers.rb | 11 +++++++---- test/models/detector/lcsh_test.rb | 8 ++++---- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb index 0cf5121..b1e0f22 100644 --- a/app/models/detector/lcsh.rb +++ b/app/models/detector/lcsh.rb @@ -12,9 +12,12 @@ class Lcsh # For now the initialize method just needs to run the pattern checker. A space for future development would be to # write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is # actually an LCSH. - def initialize(term) + # + # @param phrase String. Often a `Term.phrase`. + # @return Nothing intentional. Data is written to Hash `@detections` during processing. + def initialize(phrase) @detections = {} - term_pattern_checker(term) + pattern_checker(phrase) end # The record method will consult the set of regex-based detectors that are defined in Detector::Lcsh. Any matches @@ -41,10 +44,10 @@ def self.record(term) private - # term_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH + # patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH # string. At the moment there is only one - for the separator character " -- " - but others might be possible if # there are regex-able vocabulary quirks which might separate subject values from non-subject values. - def term_patterns + def patterns { separator: /(.*)\s--\s(.*)/ } diff --git a/app/models/detector/pattern_checker.rb b/app/models/detector/pattern_checker.rb index d1dbedd..dc13c23 100644 --- a/app/models/detector/pattern_checker.rb +++ b/app/models/detector/pattern_checker.rb @@ -4,9 +4,13 @@ class Detector # PatternChecker is intended to be added to Detectors via `include Detector::PatternChecker` to make # these methods available to instances of the class module PatternChecker - def term_pattern_checker(term) - term_patterns.each_pair do |type, pattern| - @detections[type.to_sym] = match(pattern, term) if match(pattern, term).present? + # pattern_checker iterates over all patterns defined in the calling object's `pattern` method. + # + # @param phrase [String]. Often a `Term.phrase`. + # @return Nothing intentional. Data is written to Hash `@detections` during processing. + def pattern_checker(phrase) + patterns.each_pair do |type, pattern| + @detections[type.to_sym] = match(pattern, phrase) if match(pattern, phrase).present? end end @@ -15,8 +19,13 @@ def term_pattern_checker(term) # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing # that change did result in unintended results so it was backed out for now. - def match(pattern, term) - pattern.match(term).to_s.strip + # + # @param pattern Regexp + # @param phrase String. Often a `Term.phrase`. + # + # @return String + def match(pattern, phrase) + pattern.match(phrase).to_s.strip end end end diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb index 3455309..7b03979 100644 --- a/app/models/detector/standard_identifiers.rb +++ b/app/models/detector/standard_identifiers.rb @@ -13,9 +13,12 @@ def self.table_name_prefix # shared instance methods include Detector::PatternChecker - def initialize(term) + # Initialization process will run pattern checkers and strip invalid ISSN detections. + # @param phrase String. Often a `Term.phrase`. + # @return Nothing intentional. Data is written to Hash `@detections` during processing. + def initialize(phrase) @detections = {} - term_pattern_checker(term) + pattern_checker(phrase) strip_invalid_issns end @@ -43,8 +46,8 @@ def self.record(term) private - # term_patterns are regex patterns to be applied to the basic search box input - def term_patterns + # patterns are regex patterns to be applied to the basic search box input + def patterns { isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/, diff --git a/test/models/detector/lcsh_test.rb b/test/models/detector/lcsh_test.rb index 6e6b1cd..7cf41ad 100644 --- a/test/models/detector/lcsh_test.rb +++ b/test/models/detector/lcsh_test.rb @@ -10,8 +10,8 @@ class LcshTest < ActiveSupport::TestCase 'Space vehicles -- Materials -- Congresses' ] - true_samples.each do |term| - actual = Detector::Lcsh.new(term).detections + true_samples.each do |phrase| + actual = Detector::Lcsh.new(phrase).detections assert_includes(actual, :separator) end @@ -25,8 +25,8 @@ class LcshTest < ActiveSupport::TestCase 'This one should--also not work' ] - false_samples.each do |term| - actual = Detector::Lcsh.new(term).detections + false_samples.each do |phrase| + actual = Detector::Lcsh.new(phrase).detections assert_not_includes(actual, :separator) end From 81d604d120061a2aa186097dab0d13a953fd62af Mon Sep 17 00:00:00 2001 From: Jeremy Prevost Date: Tue, 15 Oct 2024 16:11:28 -0400 Subject: [PATCH 2/2] Update citation to use phrases instead of Terms Why are these changes being introduced: * this normalizes all Citation to work like our other Detectors to take string input rather than Terms for their non-record entry-points Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TCO-106 How does this address that need: * Reworked relevant methods and documentation (and some tests) --- app/models/detector/citation.rb | 81 ++++++++++++++------------- test/models/detector/citation_test.rb | 74 ++++++++++++------------ 2 files changed, 80 insertions(+), 75 deletions(-) diff --git a/app/models/detector/citation.rb b/app/models/detector/citation.rb index d4ba7e2..7ef5b2c 100644 --- a/app/models/detector/citation.rb +++ b/app/models/detector/citation.rb @@ -5,9 +5,10 @@ class Detector # targeted at a particular citation format, but was designed based on characteristics of five formats: APA, MLA, # Chicago, Terabian, and IEEE. # - # It receives a Term object, which is parsed in various ways en route to calculating a final score. Terms with a - # higher score are more citation-like, while a score of 0 indicates a Term that has no hallmarks of being a citation. - # Terms whose score is higher than the REQUIRED_SCORE value can be registered as a Detection. + # It receives a phrase (often from `Term.phrase`), which is parsed in various ways en route to calculating a final + # score. Phrases with a higher score are more citation-like, while a score of 0 indicates a phrase that has no + # hallmarks of being a citation. + # Phrases whose score is higher than the REQUIRED_SCORE value can be registered as a Detection. class Citation attr_reader :score, :subpatterns, :summary @@ -26,12 +27,13 @@ class Citation quotes: /".*?"/ }.freeze - # The required score value is the threshold needed for a Term to be officially recorded with a Detection. + # The required score value is the threshold needed for a phrase to be officially recorded with a Detection via it's + # associated Term. REQUIRED_SCORE = 6 # Summary thresholds are used by the calculate_score method. This class counts the number of occurrences of specific # characters in the @summary instance variable. The thresholds here determine whether any of those counts are high - # enough to contribute to the Term's citation score. + # enough to contribute to the phrase's citation score. SUMMARY_THRESHOLDS = { characters: 25, colons: 2, @@ -48,28 +50,31 @@ def detection? @score >= REQUIRED_SCORE end - # The initializer handles the parsing of a Term object, and subsequent population of the @subpatterns, @summary, + # The initializer handles the parsing of a phrase, and subsequent population of the @subpatterns, @summary, # and @score instance variables. @subpatterns contains all the citation components which have been flagged by the - # CITATION_PATTERNS hash. @summary contains counts of how often certain characters or words appear in the Term. + # CITATION_PATTERNS hash. @summary contains counts of how often certain characters or words appear in the phrase. # Finally, the @score value is a summary of how many elements in the subpatterns or summary report were detected. # - # @note This method can be called directly via Detector::Citation.new(Term). It is also called indirectly via the + # @note This method can be called directly via Detector::Citation.new(phrase). It is also called indirectly via the # Detector::Citation.record(Term) instance method. This method can be called directly when a Detection is not # desired. - def initialize(term) + # @param phrase String. Often a `Term.phrase`. + # @return Nothing intentional. Data is written to Hashes `@subpatterns`, `@summary`, + # and `@score` during processing. + def initialize(phrase) @subpatterns = {} @summary = {} - pattern_checker(term.phrase) - summarize(term.phrase) + pattern_checker(phrase) + summarize(phrase) @score = calculate_score end # The record method first runs all of the parsers by running the initialize method. If the resulting score is higher # than the REQUIRED_SCORE value, then a Detection is registered. - # + # @param term [Term] # @return nil def self.record(term) - cit = Detector::Citation.new(term) + cit = Detector::Citation.new(term.phrase) return unless cit.detection? Detection.find_or_create_by( @@ -90,7 +95,7 @@ def self.record(term) # if the brackets pattern finds two matches, it still only adds one to the final score. # # For the summary report, each value is compared with a threshold value in the SUMMARY_THRESHOLDS constant. The - # number of values which meet or exceed their threshold are added to the score. As an example, if a search term has + # number of values which meet or exceed their threshold are added to the score. As an example, if a search phrase has # five words, this value is compared to the word threshold (also five). Because the threshold is met, the score gets # incremented by one. # @@ -103,45 +108,45 @@ def calculate_score summary_score + @subpatterns.length end - # This calculates the number of characters in the search term. It is called by the summarize method. - def characters(term) - term.length + # This calculates the number of characters in the search phrase. It is called by the summarize method. + def characters(phrase) + phrase.length end - # This counts the number of colons that appear in the search term, because they tend to appear more often in + # This counts the number of colons that appear in the search phrase, because they tend to appear more often in # citations than in other searches. It is called by the summarize method. - def colons(term) - term.count(':') + def colons(phrase) + phrase.count(':') end - # This counts the number of commas in the search term. It is called by the summarize method. - def commas(term) - term.count(',') + # This counts the number of commas in the search phrase. It is called by the summarize method. + def commas(phrase) + phrase.count(',') end # This builds one of the two main components of the Citation detector - the subpattern report. It uses each of the # regular expressions in the CITATION_PATTERNS constant, extracting all matches using the scan method. # # @return hash - def pattern_checker(term) + def pattern_checker(phrase) CITATION_PATTERNS.each_pair do |type, pattern| - @subpatterns[type.to_sym] = scan(pattern, term) if scan(pattern, term).present? + @subpatterns[type.to_sym] = scan(pattern, phrase) if scan(pattern, phrase).present? end end - # This counts the number of periods in the search term. It is called by the summarize method. - def periods(term) - term.count('.') + # This counts the number of periods in the search phrase. It is called by the summarize method. + def periods(phrase) + phrase.count('.') end # This is a convenience method for the scan method, which is used by pattern_checker. - def scan(pattern, term) - term.scan(pattern).map(&:strip) + def scan(pattern, phrase) + phrase.scan(pattern).map(&:strip) end - # This counts the semicolons in the search term. It is called by the summarize method. - def semicolons(term) - term.count(';') + # This counts the semicolons in the search phrase. It is called by the summarize method. + def semicolons(phrase) + phrase.count(';') end # This builds one of the two main components of the Citation detector - the summary report. It calls each of the @@ -149,15 +154,15 @@ def semicolons(term) # instance variable. # # @return hash - def summarize(term) + def summarize(phrase) %w[characters colons commas periods semicolons words].each do |check| - @summary[check.to_sym] = send(check, term) + @summary[check.to_sym] = send(check, phrase) end end - # This counts the number of words in the search term. It is called by the summarize method. - def words(term) - term.split.length + # This counts the number of words in the search phrase. It is called by the summarize method. + def words(phrase) + phrase.split.length end end end diff --git a/test/models/detector/citation_test.rb b/test/models/detector/citation_test.rb index d553a5c..900a84c 100644 --- a/test/models/detector/citation_test.rb +++ b/test/models/detector/citation_test.rb @@ -6,7 +6,7 @@ class Detector class CitationTest < ActiveSupport::TestCase test 'detector::citation exposes three instance variables' do t = terms('citation') - result = Detector::Citation.new(t) + result = Detector::Citation.new(t.phrase) assert_predicate result.score, :present? @@ -16,194 +16,194 @@ class CitationTest < ActiveSupport::TestCase end test 'detector::citation generates certain summary counts always' do - result = Detector::Citation.new(terms('hi')) + result = Detector::Citation.new(terms('hi').phrase) expected = %i[characters colons commas periods semicolons words] assert_equal expected, result.summary.keys end test 'summary includes a character count' do - result = Detector::Citation.new(Term.new(phrase: 'a')) + result = Detector::Citation.new('a') assert_equal 1, result.summary[:characters] # Multibyte character - result = Detector::Citation.new(Term.new(phrase: 'あ')) + result = Detector::Citation.new('あ') assert_equal 1, result.summary[:characters] # Twelve thousand characters? No problem... phrase = String.new('a' * 12_345) - result = Detector::Citation.new(Term.new(phrase:)) + result = Detector::Citation.new(phrase) assert_equal 12_345, result.summary[:characters] end test 'summary includes a count of colons in term' do - result = Detector::Citation.new(Term.new(phrase: 'No colons here')) + result = Detector::Citation.new('No colons here') assert_equal 0, result.summary[:colons] - result = Detector::Citation.new(Term.new(phrase: 'Three: colons :: here')) + result = Detector::Citation.new('Three: colons :: here') assert_equal 3, result.summary[:colons] end test 'summary includes a count of commas in term' do - result = Detector::Citation.new(Term.new(phrase: 'No commas here')) + result = Detector::Citation.new('No commas here') assert_equal 0, result.summary[:commas] - result = Detector::Citation.new(Term.new(phrase: 'Please, buy, apples, mac, and, cheese, milk, and, bread,.')) + result = Detector::Citation.new('Please, buy, apples, mac, and, cheese, milk, and, bread,.') assert_equal 9, result.summary[:commas] end test 'summary includes a count of periods in term' do - result = Detector::Citation.new(Term.new(phrase: 'No periods here')) + result = Detector::Citation.new('No periods here') assert_equal 0, result.summary[:periods] - result = Detector::Citation.new(Term.new(phrase: 'This has periods. There are two of them.')) + result = Detector::Citation.new('This has periods. There are two of them.') assert_equal 2, result.summary[:periods] - result = Detector::Citation.new(Term.new(phrase: 'This ends with an ellipses, which does not count, but no periods…')) + result = Detector::Citation.new('This ends with an ellipses, which does not count, but no periods…') assert_equal 0, result.summary[:periods] end test 'summary includes a count of semicolons in term' do - result = Detector::Citation.new(Term.new(phrase: 'No semicolons here')) + result = Detector::Citation.new('No semicolons here') assert_equal 0, result.summary[:semicolons] - result = Detector::Citation.new(Term.new(phrase: 'This has one semicolon;')) + result = Detector::Citation.new('This has one semicolon;') assert_equal 1, result.summary[:semicolons] - result = Detector::Citation.new(Term.new(phrase: '"HTML entities are counted"')) + result = Detector::Citation.new('"HTML entities are counted"') assert_equal 2, result.summary[:semicolons] end test 'summary includes a word count' do - result = Detector::Citation.new(Term.new(phrase: 'brief')) + result = Detector::Citation.new('brief') assert_equal 1, result.summary[:words] - result = Detector::Citation.new(Term.new(phrase: ' extra ')) + result = Detector::Citation.new(' extra ') assert_equal 1, result.summary[:words] - result = Detector::Citation.new(Term.new(phrase: 'less brief')) + result = Detector::Citation.new('less brief') assert_equal 2, result.summary[:words] - result = Detector::Citation.new(Term.new(phrase: 'hyphenated-word')) + result = Detector::Citation.new('hyphenated-word') assert_equal 1, result.summary[:words] end test 'summary word count handles non-space separators' do - result = Detector::Citation.new(Term.new(phrase: "tabs\tdo\tcount")) + result = Detector::Citation.new("tabs\tdo\tcount") assert_equal 3, result.summary[:words] - result = Detector::Citation.new(Term.new(phrase: "newlines\nalso\ncount")) + result = Detector::Citation.new("newlines\nalso\ncount") assert_equal 3, result.summary[:words] end test 'subpatterns are empty by default' do - result = Detector::Citation.new(Term.new(phrase: 'nothing here')) + result = Detector::Citation.new('nothing here') assert_empty(result.subpatterns) end test 'subpatterns flag all APA-style "volume(issue)" sequences' do - result = Detector::Citation.new(Term.new(phrase: 'Weinstein, J. (2009). Classical Philology, 104(4), 439-458.')) + result = Detector::Citation.new('Weinstein, J. (2009). Classical Philology, 104(4), 439-458.') assert_equal ['104(4)'], result.subpatterns[:apa_volume_issue] end test 'subpatterns flag all "no." instances with a number' do - result = Detector::Citation.new(Term.new(phrase: 'Yes or no. vol. 6, no. 12, pp. 314')) + result = Detector::Citation.new('Yes or no. vol. 6, no. 12, pp. 314') assert_equal ['no. 12'], result.subpatterns[:no] end test 'subpatterns flag page ranges without spaces' do - result = Detector::Citation.new(Term.new(phrase: 'Read from pages 1-100')) + result = Detector::Citation.new('Read from pages 1-100') assert_equal ['1-100'], result.subpatterns[:pages] - result = Detector::Citation.new(Term.new(phrase: '1 - 100')) + result = Detector::Citation.new('1 - 100') assert_empty(result.subpatterns) end test 'subpatterns flag all "pp." instances with a number' do - result = Detector::Citation.new(Term.new(phrase: 'I love this app. vol. 6, no. 12, pp. 314')) + result = Detector::Citation.new('I love this app. vol. 6, no. 12, pp. 314') assert_equal ['pp. 314'], result.subpatterns[:pp] end test 'subpatterns flag all "vol." instances with a number' do - result = Detector::Citation.new(Term.new(phrase: 'This is frivol. vol. 6, no. 12, pp. 314')) + result = Detector::Citation.new('This is frivol. vol. 6, no. 12, pp. 314') assert_equal ['vol. 6'], result.subpatterns[:vol] end test 'subpatterns flag all years in parentheses' do - result = Detector::Citation.new(Term.new(phrase: 'Only two (2) four-digit years (1996) (1997) here since 2024.')) + result = Detector::Citation.new('Only two (2) four-digit years (1996) (1997) here since 2024.') assert_equal ['(1996)', '(1997)'], result.subpatterns[:year_parens] end test 'subpatterns flag phrases in square brackets' do - result = Detector::Citation.new(Term.new(phrase: 'Artificial intelligence. [Online serial].')) + result = Detector::Citation.new('Artificial intelligence. [Online serial].') assert_equal ['[Online serial]'], result.subpatterns[:brackets] end # This is pretty rough. test 'subpatterns attempts to flag names as they appear in author lists' do - result = Detector::Citation.new(Term.new(phrase: 'Sadava, D. E., D. M. Hillis, et al. Life: The Science of Biology. 11th ed. W. H. Freeman, 2016. ISBN: 9781319145446')) + result = Detector::Citation.new('Sadava, D. E., D. M. Hillis, et al. Life: The Science of Biology. 11th ed. W. H. Freeman, 2016. ISBN: 9781319145446') # This is also catching the last word of the title. assert_equal ['Sadava,', 'Hillis,', 'Biology.', 'Freeman,'], result.subpatterns[:lastnames] end test 'subpatterns flag phrases in quotes' do - result = Detector::Citation.new(Term.new(phrase: '"Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi')) + result = Detector::Citation.new('"Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi') assert_equal ['"Principles of Materials Science and Engineering"'], result.subpatterns[:quotes] # Need two to catch anything - result = Detector::Citation.new(Term.new(phrase: 'Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi')) + result = Detector::Citation.new('Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi') assert_empty(result.subpatterns) end test 'citation score increases as phrase gets more citation-like' do - result = Detector::Citation.new(Term.new(phrase: 'simple search phrase')) + result = Detector::Citation.new('simple search phrase') assert_equal 0, result.score - result = Detector::Citation.new(Term.new(phrase: 'Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.')) + result = Detector::Citation.new('Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.') assert_operator 0, :<, result.score end test 'detection? convenience method returns true for obvious citations' do - result = Detector::Citation.new(terms('citation')) + result = Detector::Citation.new(terms('citation').phrase) assert_predicate result, :detection? end test 'detection? convenience method returns false for obvious non-citations' do - result = Detector::Citation.new(terms('hi')) + result = Detector::Citation.new(terms('hi').phrase) assert_not result.detection? end