From c4c74946dc878efef41f8ea42dcf57fb44eca4b5 Mon Sep 17 00:00:00 2001 From: Peter Mangiafico Date: Mon, 23 Apr 2018 15:54:42 -0700 Subject: [PATCH] automatically only use first initial when not ambiguous --- app/models/author.rb | 7 ++++ config/settings.yml | 1 - lib/agent/author_name.rb | 15 ++++---- lib/web_of_science/query_author.rb | 2 +- spec/factories/author.rb | 13 +++++++ spec/lib/agent/author_name_spec.rb | 55 ++++++++++++++++-------------- spec/models/author_spec.rb | 13 +++++++ 7 files changed, 72 insertions(+), 34 deletions(-) diff --git a/app/models/author.rb b/app/models/author.rb index adf3eb8e7..79d71baf9 100644 --- a/app/models/author.rb +++ b/app/models/author.rb @@ -30,6 +30,13 @@ def institution Settings.HARVESTER.INSTITUTION.name end + # indicates if the LastName, FirstInitial form for this user is ambiguous within our author database and there are no alternate identities + def ambiguous_first_initial? + return true unless first_name && last_name + first_initial_not_unique = self.class.where('preferred_first_name like ? and preferred_last_name = ?', "#{first_name[0]}%", last_name).where(active_in_cap: true, cap_import_enabled: true).size > 1 + (first_initial_not_unique || !author_identities.empty?) + end + # @return [Array] ScienceWireIds for approved publications def approved_sciencewire_ids publications.where("contributions.status = 'approved'") diff --git a/config/settings.yml b/config/settings.yml index bc965fcc5..9815144cd 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -64,7 +64,6 @@ DOI: HARVESTER: USE_MIDDLE_NAME: true - USE_FIRST_INITIAL: false USE_AUTHOR_IDENTITIES: false INSTITUTION: name: Stanford University diff --git a/lib/agent/author_name.rb b/lib/agent/author_name.rb index f3eab4e1a..f3d661414 100644 --- a/lib/agent/author_name.rb +++ b/lib/agent/author_name.rb @@ -51,9 +51,10 @@ def text_search_query text_search_terms.map { |x| "\"#{x}\"" }.join(' or ') end - def text_search_terms + def text_search_terms(options = {}) + use_first_initial = options[:use_first_initial] || true @text_search_terms ||= - [first_name_query, middle_name_query].flatten.reject(&:empty?).uniq + [first_name_query(use_first_initial), middle_name_query(use_first_initial)].flatten.reject(&:empty?).uniq end def ==(other) @@ -68,10 +69,10 @@ def ==(other) # 'Lastname,Firstname' or # 'Lastname,FirstInitial' # @return [Array|String] names - def first_name_query + def first_name_query(use_first_initial) return '' if last.empty? && first.empty? - query = ["#{last_name},#{first_name}"] - query += ["#{last_name},#{first_initial}"] if Settings.HARVESTER.USE_FIRST_INITIAL + query = ["#{last_name},#{first_name}"] + query += ["#{last_name},#{first_initial}"] if use_first_initial query end @@ -80,10 +81,10 @@ def first_name_query # 'Lastname,Firstname,MiddleInitial' or # 'Lastname,FirstInitial,MiddleInitial' # @return [Array|String] names - def middle_name_query + def middle_name_query(use_first_initial) return '' unless middle =~ /^[[:alpha:]]/ query = ["#{last_name},#{first_name},#{middle_name}", "#{last_name},#{first_name},#{middle_initial}"] - query += ["#{last_name},#{first_initial}#{middle_initial}", "#{last_name},#{first_initial},#{middle_initial}"] if Settings.HARVESTER.USE_FIRST_INITIAL + query += ["#{last_name},#{first_initial}#{middle_initial}", "#{last_name},#{first_initial},#{middle_initial}"] if use_first_initial query end diff --git a/lib/web_of_science/query_author.rb b/lib/web_of_science/query_author.rb index 60a069af1..6b4c0499a 100644 --- a/lib/web_of_science/query_author.rb +++ b/lib/web_of_science/query_author.rb @@ -33,7 +33,7 @@ def names ident.last_name, ident.first_name, Settings.HARVESTER.USE_MIDDLE_NAME ? ident.middle_name : '' - ).text_search_terms + ).text_search_terms(use_first_initial: !author.ambiguous_first_initial?) end.flatten.uniq end diff --git a/spec/factories/author.rb b/spec/factories/author.rb index 5715cf92d..0cb204578 100644 --- a/spec/factories/author.rb +++ b/spec/factories/author.rb @@ -54,6 +54,19 @@ end end + factory :odd_name, parent: :author do + active_in_cap { true } + cap_import_enabled { true } + official_first_name { 'Somebody' } + official_last_name { 'WithReallyUnusualName' } + official_middle_name { '' } + preferred_first_name { 'Somebody' } + preferred_last_name { 'WithReallyUnusualName' } + preferred_middle_name { '' } + email { 'Somebody.WithReallyUnusualName@stanford.edu' } + emails_for_harvest { 'Somebody.WithReallyUnusualName@stanford.edu' } + end + # Public data from # - https://stanfordwho.stanford.edu # - https://med.stanford.edu/profiles/russ-altman diff --git a/spec/lib/agent/author_name_spec.rb b/spec/lib/agent/author_name_spec.rb index 8049c2ecd..73f0bb4c2 100644 --- a/spec/lib/agent/author_name_spec.rb +++ b/spec/lib/agent/author_name_spec.rb @@ -117,21 +117,18 @@ describe '#text_search_terms' do it 'includes first_name_query and middle_name_query elements' do - fnames = all_names.send(:first_name_query) - mnames = all_names.send(:middle_name_query) + fnames = all_names.send(:first_name_query, true) + mnames = all_names.send(:middle_name_query, true) expect(all_names.text_search_terms).to include(*fnames, *mnames) end end describe '#first_name_query' do it 'when no names are present returns an empty String' do - expect(no_names.send(:first_name_query)).to eq '' + expect(no_names.send(:first_name_query, true)).to eq '' end - context 'when all names are present' do - let(:fn_query) { all_names.send(:first_name_query) } - before do - allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(false) - end + context 'when all names are present with middle initial' do + let(:fn_query) { all_names.send(:first_name_query, true) } it 'is Array with non-empty unique values' do expect(fn_query).to be_an Array expect(fn_query).to all(be_a(String)) @@ -141,8 +138,8 @@ it 'includes name with first_name' do expect(fn_query).to include "#{all_names.last_name},#{all_names.first_name}" end - it 'excludes name with first_initial when settings do not allow for it' do - expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_initial}" + it 'includes name with first_initial when settings allow for it' do + expect(fn_query).to include "#{all_names.last_name},#{all_names.first_initial}" end it 'does not include name with middle_name' do expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_name}" @@ -153,26 +150,37 @@ expect(fn_query).to all(exclude(",#{all_names.middle_initial}")) end end - context 'when all names are present and settings allow for first initial' do - before do - allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(true) + context 'when all names are present without middle initial' do + let(:fn_query) { all_names.send(:first_name_query, false) } + it 'is Array with non-empty unique values' do + expect(fn_query).to be_an Array + expect(fn_query).to all(be_a(String)) + expect(fn_query).not_to include(be_empty) + expect(fn_query.size).to eq(fn_query.uniq.size) end - let(:fn_query) { all_names.send(:first_name_query) } - it 'includes name with first_initial when settings allow for it' do - expect(fn_query).to include "#{all_names.last_name},#{all_names.first_initial}" + it 'includes name with first_name' do + expect(fn_query).to include "#{all_names.last_name},#{all_names.first_name}" + end + it 'does not include name with first_initial' do + expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_initial}" + end + it 'does not include name with middle_name' do + expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_name}" + expect(fn_query).to all(exclude(",#{all_names.middle_name}")) + end + it 'does not include name with middle_initial' do + expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_initial}" + expect(fn_query).to all(exclude(",#{all_names.middle_initial}")) end end end describe '#middle_name_query' do it 'when no names are present returns an empty String' do - expect(no_names.send(:middle_name_query)).to eq '' + expect(no_names.send(:middle_name_query, false)).to eq '' end context 'when all names are present' do - let(:mn_query) { all_names.send(:middle_name_query) } - before do - allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(false) - end + let(:mn_query) { all_names.send(:middle_name_query, false) } it 'is Array with non-empty unique values' do expect(mn_query).to be_an Array expect(mn_query).to all(be_a(String)) @@ -196,10 +204,7 @@ end end context 'when all names are present and settings allow for first initial' do - let(:mn_query) { all_names.send(:middle_name_query) } - before do - allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(true) - end + let(:mn_query) { all_names.send(:middle_name_query, true) } it 'includes name with middle_initial appended to first initial when settings allow for it' do expect(mn_query).to include "#{all_names.last_name},#{all_names.first_initial}#{all_names.middle_initial}" end diff --git a/spec/models/author_spec.rb b/spec/models/author_spec.rb index 0215a7467..6f2ab51af 100644 --- a/spec/models/author_spec.rb +++ b/spec/models/author_spec.rb @@ -29,6 +29,19 @@ end end + describe '#ambiguous_first_initial?' do + it 'confirms ambiguous first initial' do + subject.update_from_cap_authorship_profile_hash(auth_hash) + expect(subject.author_identities.size).to eq(2) # has alternate identities + expect(subject.ambiguous_first_initial?).to eq(true) # thus cannot search with first initial + end + it 'confirms non-ambiguous first initial' do + odd_name = create :odd_name + expect(odd_name.author_identities.size).to eq(0) # has no alternate identities + expect(odd_name.ambiguous_first_initial?).to eq(false) # and no other odd names likes this at stanford, so ok to search with first initial + end + end + describe '#first_name' do it 'is the preferred_first_name' do subject.update_from_cap_authorship_profile_hash(auth_hash)