diff --git a/app/graphql/types/search_event_type.rb b/app/graphql/types/search_event_type.rb index d1d0d32..9c0b246 100644 --- a/app/graphql/types/search_event_type.rb +++ b/app/graphql/types/search_event_type.rb @@ -15,7 +15,7 @@ def phrase end def standard_identifiers - StandardIdentifiers.new(@object.term.phrase).identifiers.map do |identifier| + Detector::StandardIdentifiers.new(@object.term.phrase).identifiers.map do |identifier| { kind: identifier.first, value: identifier.last } end end diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb new file mode 100644 index 0000000..e869a98 --- /dev/null +++ b/app/models/detector/standard_identifiers.rb @@ -0,0 +1,76 @@ +# frozen_string_literal: true + +# StandardIdentifiers is a PatternDectector implementation that detects the identifiers DOI, ISBN, ISSN, PMID. +# See /docs/reference/pattern_detection_and_enhancement.md for details. + +module Detector + class StandardIdentifiers + attr_reader :identifiers + + def initialize(term) + @identifiers = {} + term_pattern_checker(term) + strip_invalid_issns + end + + private + + def term_pattern_checker(term) + term_patterns.each_pair do |type, pattern| + @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? + end + end + + # Note on the limitations of this implementation + # We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as + # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect + # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing + # that change did result in unintended results so it was backed out for now. + def match(pattern, term) + pattern.match(term).to_s.strip + end + + # term_patterns are regex patterns to be applied to the basic search box input + def term_patterns + { + isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, + issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/, + pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/, + doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b} + } + end + + def strip_invalid_issns + return unless @identifiers[:issn] + + @identifiers.delete(:issn) unless validate_issn(@identifiers[:issn]) + end + + # validate_issn is only called when the regex for an ISSN has indicated an ISSN + # of sufficient format is present - but the regex does not attempt to + # validate that the check digit in the ISSN spec is correct. This method + # does that calculation, so we do not returned falsely detected ISSNs, + # like "2015-2019". + # + # The algorithm is defined at + # https://datatracker.ietf.org/doc/html/rfc3044#section-2.2 + # An example calculation is shared at + # https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format + def validate_issn(candidate) + digits = candidate.delete('-')[..6].chars + check_digit = candidate.last.downcase + sum = 0 + + digits.each_with_index do |digit, idx| + sum += digit.to_i * (8 - idx.to_i) + end + + actual_digit = 11 - sum.modulo(11) + actual_digit = 'x' if actual_digit == 10 + + return true if actual_digit.to_s == check_digit.to_s + + false + end + end +end diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 2ba9b24..c83e9c6 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -84,7 +84,7 @@ def event_matches(event, matches) # @return [Array] an array of matched StandardIdentifiers def match_standard_identifiers(event, matches) known_ids = %i[unmatched pmid isbn issn doi] - ids = StandardIdentifiers.new(event.term.phrase) + ids = Detector::StandardIdentifiers.new(event.term.phrase) known_ids.each do |id| matches[id] += 1 if ids.identifiers[id].present? diff --git a/app/models/standard_identifiers.rb b/app/models/standard_identifiers.rb deleted file mode 100644 index b8fd3ed..0000000 --- a/app/models/standard_identifiers.rb +++ /dev/null @@ -1,73 +0,0 @@ -# frozen_string_literal: true - -# StandardIdentifiers is a PatternDectector implementation that detects the identifiers DOI, ISBN, ISSN, PMID. -# See /docs/reference/pattern_detection_and_enhancement.md for details. -class StandardIdentifiers - attr_reader :identifiers - - def initialize(term) - @identifiers = {} - term_pattern_checker(term) - strip_invalid_issns - end - - private - - def term_pattern_checker(term) - term_patterns.each_pair do |type, pattern| - @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? - end - end - - # Note on the limitations of this implementation - # We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as - # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect - # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing - # that change did result in unintended results so it was backed out for now. - def match(pattern, term) - pattern.match(term).to_s.strip - end - - # term_patterns are regex patterns to be applied to the basic search box input - def term_patterns - { - isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/, - issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/, - pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/, - doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b} - } - end - - def strip_invalid_issns - return unless @identifiers[:issn] - - @identifiers.delete(:issn) unless validate_issn(@identifiers[:issn]) - end - - # validate_issn is only called when the regex for an ISSN has indicated an ISSN - # of sufficient format is present - but the regex does not attempt to - # validate that the check digit in the ISSN spec is correct. This method - # does that calculation, so we do not returned falsely detected ISSNs, - # like "2015-2019". - # - # The algorithm is defined at - # https://datatracker.ietf.org/doc/html/rfc3044#section-2.2 - # An example calculation is shared at - # https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format - def validate_issn(candidate) - digits = candidate.delete('-')[..6].chars - check_digit = candidate.last.downcase - sum = 0 - - digits.each_with_index do |digit, idx| - sum += digit.to_i * (8 - idx.to_i) - end - - actual_digit = 11 - sum.modulo(11) - actual_digit = 'x' if actual_digit == 10 - - return true if actual_digit.to_s == check_digit.to_s - - false - end -end diff --git a/test/models/detector/standard_identifiers_test.rb b/test/models/detector/standard_identifiers_test.rb new file mode 100644 index 0000000..bee47d8 --- /dev/null +++ b/test/models/detector/standard_identifiers_test.rb @@ -0,0 +1,194 @@ +# frozen_string_literal: true + +require 'test_helper' + +module Detector + class StandardIdentifiersTest < ActiveSupport::TestCase + test 'ISBN detected in a string' do + actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').identifiers + + assert_equal('978-3-16-148410-0', actual[:isbn]) + end + + test 'ISBN-10 examples' do + # from wikipedia + samples = ['99921-58-10-7', '9971-5-0210-0', '960-425-059-0', '80-902734-1-6', '85-359-0277-5', + '1-84356-028-3', '0-684-84328-5', '0-8044-2957-X', '0-85131-041-9', '93-86954-21-4', '0-943396-04-2', + '0-9752298-0-X'] + + samples.each do |isbn| + actual = Detector::StandardIdentifiers.new(isbn).identifiers + + assert_equal(isbn, actual[:isbn]) + end + end + + test 'ISBN-13 examples' do + samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5', + '979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4', + '978-0-943396-04-2', '979-0-9752298-0-X'] + + samples.each do |isbn| + actual = Detector::StandardIdentifiers.new(isbn).identifiers + + assert_equal(isbn, actual[:isbn]) + end + end + + test 'not ISBNs' do + samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here'] + + samples.each do |notisbn| + actual = Detector::StandardIdentifiers.new(notisbn).identifiers + + assert_nil(actual[:isbn]) + end + end + + test 'ISBNs need boundaries' do + samples = ['990026671500206761', '979-0-9752298-0-XYZ'] + # note, there is a theoretical case of `asdf979-0-9752298-0-X` returning as an ISBN 10 even though it doesn't + # have a word boundary because the `-` is treated as a boundary so `0-9752298-0-X` would be an ISBN10. We can + # consider whether we care in the future as we look for incorrect real-world matches. + + samples.each do |notisbn| + actual = Detector::StandardIdentifiers.new(notisbn).identifiers + + assert_nil(actual[:isbn]) + end + end + + test 'ISSNs detected in a string' do + actual = Detector::StandardIdentifiers.new('test 0250-6335 test').identifiers + + assert_equal('0250-6335', actual[:issn]) + end + + test 'ISSN examples' do + samples = %w[0250-6335 0000-0019 1864-0761 1877-959X 0973-7758 1877-5683 1440-172X 1040-5631] + + samples.each do |issn| + actual = Detector::StandardIdentifiers.new(issn).identifiers + + assert_equal(issn, actual[:issn]) + end + end + + test 'not ISSN examples' do + samples = ['orange cats like popcorn', '12346798', 'another ISSN not found here', '99921-58-10-7'] + + samples.each do |notissn| + actual = Detector::StandardIdentifiers.new(notissn).identifiers + + assert_nil(actual[:issn]) + end + end + + test 'ISSNs need boundaries' do + actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').identifiers + + assert_nil(actual[:issn]) + end + + test 'ISSN validate rejects ISSNs with wrong check digit' do + samples = %w[ + 1234-5678 + 2015-2016 + 1460-2441 + 1460-2442 + 1460-2443 + 1460-2444 + 1460-2445 + 1460-2446 + 1460-2447 + 1460-2448 + 1460-2449 + 1460-2440 + 0250-6331 + 0250-6332 + 0250-6333 + 0250-6334 + 0250-6336 + 0250-6337 + 0250-6338 + 0250-6339 + 0250-6330 + 0250-633x + 0250-633X + ] + samples.each do |notissn| + actual = Detector::StandardIdentifiers.new(notissn).identifiers + + assert_nil(actual[:issn]) + end + end + + test 'ISSN validate method accepts ISSNs with correct check digit' do + samples = %w[ + 1460-244X + 2015-223x + 0250-6335 + 0973-7758 + ] + samples.each do |issn| + actual = Detector::StandardIdentifiers.new(issn).identifiers + + assert_equal(issn, actual[:issn]) + end + end + + test 'doi detected in string' do + actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\ + "January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers + + assert_equal('10.1038/nphys1170', actual[:doi]) + end + + test 'doi examples' do + samples = %w[10.1038/nphys1170 10.1002/0470841559.ch1 10.1594/PANGAEA.726855 10.1594/GFZ.GEOFON.gfz2009kciu + 10.1594/PANGAEA.667386 10.3207/2959859860 10.3866/PKU.WHXB201112303 10.1430/8105 10.1392/BC1.0] + + samples.each do |doi| + actual = Detector::StandardIdentifiers.new(doi).identifiers + + assert_equal(doi, actual[:doi]) + end + end + + test 'not doi examples' do + samples = ['orange cats like popcorn', '10.1234 almost doi', 'another doi not found here', '99921-58-10-7'] + + samples.each do |notdoi| + actual = Detector::StandardIdentifiers.new(notdoi).identifiers + + assert_nil(actual[:notdoi]) + end + end + + test 'pmid detected in string' do + actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').identifiers + + assert_equal('PMID: 35648703', actual[:pmid]) + end + + test 'pmid examples' do + samples = ['PMID: 35648703', 'pmid: 1234567', 'PMID:35648703'] + + samples.each do |pmid| + actual = Detector::StandardIdentifiers.new(pmid).identifiers + + assert_equal(pmid, actual[:pmid]) + end + end + + test 'not pmid examples' do + samples = ['orange cats like popcorn', 'pmid:almost', 'PMID: asdf', '99921-58-10-7'] + + samples.each do |notpmid| + actual = Detector::StandardIdentifiers.new(notpmid).identifiers + + assert_nil(actual[:pmid]) + end + end + end +end diff --git a/test/models/standard_identifiers_test.rb b/test/models/standard_identifiers_test.rb deleted file mode 100644 index 24df967..0000000 --- a/test/models/standard_identifiers_test.rb +++ /dev/null @@ -1,192 +0,0 @@ -# frozen_string_literal: true - -require 'test_helper' - -class StandardIdentifiersTest < ActiveSupport::TestCase - test 'ISBN detected in a string' do - actual = StandardIdentifiers.new('test 978-3-16-148410-0 test').identifiers - - assert_equal('978-3-16-148410-0', actual[:isbn]) - end - - test 'ISBN-10 examples' do - # from wikipedia - samples = ['99921-58-10-7', '9971-5-0210-0', '960-425-059-0', '80-902734-1-6', '85-359-0277-5', - '1-84356-028-3', '0-684-84328-5', '0-8044-2957-X', '0-85131-041-9', '93-86954-21-4', '0-943396-04-2', - '0-9752298-0-X'] - - samples.each do |isbn| - actual = StandardIdentifiers.new(isbn).identifiers - - assert_equal(isbn, actual[:isbn]) - end - end - - test 'ISBN-13 examples' do - samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5', - '979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4', - '978-0-943396-04-2', '979-0-9752298-0-X'] - - samples.each do |isbn| - actual = StandardIdentifiers.new(isbn).identifiers - - assert_equal(isbn, actual[:isbn]) - end - end - - test 'not ISBNs' do - samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here'] - - samples.each do |notisbn| - actual = StandardIdentifiers.new(notisbn).identifiers - - assert_nil(actual[:isbn]) - end - end - - test 'ISBNs need boundaries' do - samples = ['990026671500206761', '979-0-9752298-0-XYZ'] - # note, there is a theoretical case of `asdf979-0-9752298-0-X` returning as an ISBN 10 even though it doesn't - # have a word boundary because the `-` is treated as a boundary so `0-9752298-0-X` would be an ISBN10. We can - # consider whether we care in the future as we look for incorrect real-world matches. - - samples.each do |notisbn| - actual = StandardIdentifiers.new(notisbn).identifiers - - assert_nil(actual[:isbn]) - end - end - - test 'ISSNs detected in a string' do - actual = StandardIdentifiers.new('test 0250-6335 test').identifiers - - assert_equal('0250-6335', actual[:issn]) - end - - test 'ISSN examples' do - samples = %w[0250-6335 0000-0019 1864-0761 1877-959X 0973-7758 1877-5683 1440-172X 1040-5631] - - samples.each do |issn| - actual = StandardIdentifiers.new(issn).identifiers - - assert_equal(issn, actual[:issn]) - end - end - - test 'not ISSN examples' do - samples = ['orange cats like popcorn', '12346798', 'another ISSN not found here', '99921-58-10-7'] - - samples.each do |notissn| - actual = StandardIdentifiers.new(notissn).identifiers - - assert_nil(actual[:issn]) - end - end - - test 'ISSNs need boundaries' do - actual = StandardIdentifiers.new('12345-5678 1234-56789').identifiers - - assert_nil(actual[:issn]) - end - - test 'ISSN validate rejects ISSNs with wrong check digit' do - samples = %w[ - 1234-5678 - 2015-2016 - 1460-2441 - 1460-2442 - 1460-2443 - 1460-2444 - 1460-2445 - 1460-2446 - 1460-2447 - 1460-2448 - 1460-2449 - 1460-2440 - 0250-6331 - 0250-6332 - 0250-6333 - 0250-6334 - 0250-6336 - 0250-6337 - 0250-6338 - 0250-6339 - 0250-6330 - 0250-633x - 0250-633X - ] - samples.each do |notissn| - actual = StandardIdentifiers.new(notissn).identifiers - - assert_nil(actual[:issn]) - end - end - - test 'ISSN validate method accepts ISSNs with correct check digit' do - samples = %w[ - 1460-244X - 2015-223x - 0250-6335 - 0973-7758 - ] - samples.each do |issn| - actual = StandardIdentifiers.new(issn).identifiers - - assert_equal(issn, actual[:issn]) - end - end - - test 'doi detected in string' do - actual = StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\ - "January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers - - assert_equal('10.1038/nphys1170', actual[:doi]) - end - - test 'doi examples' do - samples = %w[10.1038/nphys1170 10.1002/0470841559.ch1 10.1594/PANGAEA.726855 10.1594/GFZ.GEOFON.gfz2009kciu - 10.1594/PANGAEA.667386 10.3207/2959859860 10.3866/PKU.WHXB201112303 10.1430/8105 10.1392/BC1.0] - - samples.each do |doi| - actual = StandardIdentifiers.new(doi).identifiers - - assert_equal(doi, actual[:doi]) - end - end - - test 'not doi examples' do - samples = ['orange cats like popcorn', '10.1234 almost doi', 'another doi not found here', '99921-58-10-7'] - - samples.each do |notdoi| - actual = StandardIdentifiers.new(notdoi).identifiers - - assert_nil(actual[:notdoi]) - end - end - - test 'pmid detected in string' do - actual = StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').identifiers - - assert_equal('PMID: 35648703', actual[:pmid]) - end - - test 'pmid examples' do - samples = ['PMID: 35648703', 'pmid: 1234567', 'PMID:35648703'] - - samples.each do |pmid| - actual = StandardIdentifiers.new(pmid).identifiers - - assert_equal(pmid, actual[:pmid]) - end - end - - test 'not pmid examples' do - samples = ['orange cats like popcorn', 'pmid:almost', 'PMID: asdf', '99921-58-10-7'] - - samples.each do |notpmid| - actual = StandardIdentifiers.new(notpmid).identifiers - - assert_nil(actual[:pmid]) - end - end -end