-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
…ntroduce IdentifierFactory cloned from unreleased dhp-commons This commit addresses two changes: * upgrade the dhp-schemas dependency version range from [4.0.0, 5.0.0) to [6.0.0, 7.0.0) * introduce IdentifierFactory cloned from unreleased dhp-commons after dhp-schemas dependency upgrade
- Loading branch information
1 parent
364f466
commit 23245fb
Showing
12 changed files
with
653 additions
and
3 deletions.
There are no files selected for viewing
75 changes: 75 additions & 0 deletions
75
...actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/CleaningFunctions.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package eu.dnetlib.iis.wf.export.actionmanager; | ||
|
||
import java.util.HashSet; | ||
import java.util.Objects; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
|
||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; | ||
|
||
public class CleaningFunctions { | ||
|
||
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"; | ||
public static final String DOI_PREFIX = "10."; | ||
|
||
public static final Set<String> PID_BLACKLIST = new HashSet<>(); | ||
|
||
static { | ||
PID_BLACKLIST.add("none"); | ||
PID_BLACKLIST.add("na"); | ||
} | ||
|
||
public CleaningFunctions() { | ||
} | ||
|
||
/** | ||
* Utility method that filter PID values on a per-type basis. | ||
* @param s the PID whose value will be checked. | ||
* @return false if the pid matches the filter criteria, true otherwise. | ||
*/ | ||
public static boolean pidFilter(StructuredProperty s) { | ||
final String pidValue = s.getValue(); | ||
if (Objects.isNull(s.getQualifier()) || | ||
StringUtils.isBlank(pidValue) || | ||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) { | ||
return false; | ||
} | ||
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) { | ||
return false; | ||
} | ||
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue); | ||
} | ||
|
||
/** | ||
* Utility method that normalises PID values on a per-type basis. | ||
* @param pid the PID whose value will be normalised. | ||
* @return the PID containing the normalised value. | ||
*/ | ||
public static StructuredProperty normalizePidValue(StructuredProperty pid) { | ||
pid | ||
.setValue( | ||
normalizePidValue( | ||
pid.getQualifier().getClassid(), | ||
pid.getValue())); | ||
|
||
return pid; | ||
} | ||
|
||
public static String normalizePidValue(String pidType, String pidValue) { | ||
String value = Optional | ||
.ofNullable(pidValue) | ||
.map(String::trim) | ||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); | ||
|
||
switch (pidType) { | ||
|
||
// TODO add cleaning for more PID types as needed | ||
case "doi": | ||
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX); | ||
} | ||
return value; | ||
} | ||
|
||
} |
269 changes: 269 additions & 0 deletions
269
...actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/IdentifierFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,269 @@ | ||
package eu.dnetlib.iis.wf.export.actionmanager; | ||
|
||
import static com.google.common.base.Preconditions.checkArgument; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ARXIV_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATACITE_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.EUROPE_PUBMED_CENTRAL_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OPEN_APC_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OPEN_APC_NAME; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBMED_CENTRAL_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ROHUB_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ZENODO_OD_ID; | ||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ZENODO_R3_ID; | ||
|
||
import java.io.Serializable; | ||
import java.nio.charset.StandardCharsets; | ||
import java.security.MessageDigest; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
import java.util.function.Function; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
import org.apache.commons.codec.binary.Hex; | ||
import org.apache.commons.lang3.StringUtils; | ||
|
||
import com.google.common.collect.HashBiMap; | ||
import com.google.common.collect.Maps; | ||
|
||
import eu.dnetlib.dhp.schema.common.ModelSupport; | ||
import eu.dnetlib.dhp.schema.oaf.Instance; | ||
import eu.dnetlib.dhp.schema.oaf.KeyValue; | ||
import eu.dnetlib.dhp.schema.oaf.OafEntity; | ||
import eu.dnetlib.dhp.schema.oaf.Result; | ||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; | ||
|
||
/** | ||
* Factory class for OpenAIRE identifiers in the Graph | ||
*/ | ||
public class IdentifierFactory implements Serializable { | ||
|
||
public static final String ID_SEPARATOR = "::"; | ||
public static final String ID_PREFIX_SEPARATOR = "|"; | ||
|
||
public static final int ID_PREFIX_LEN = 12; | ||
|
||
/** | ||
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE. | ||
* The id of the record (source_::id) will be rewritten as pidType_::id) | ||
*/ | ||
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap(); | ||
|
||
static { | ||
PID_AUTHORITY.put(PidType.doi, HashBiMap.create()); | ||
PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref"); | ||
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite"); | ||
PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "ZENODO"); | ||
PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "Zenodo"); | ||
|
||
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create()); | ||
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); | ||
PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central"); | ||
|
||
PID_AUTHORITY.put(PidType.pmid, HashBiMap.create()); | ||
PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); | ||
PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central"); | ||
|
||
PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create()); | ||
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive"); | ||
|
||
PID_AUTHORITY.put(PidType.w3id, HashBiMap.create()); | ||
PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ROHub"); | ||
} | ||
|
||
/** | ||
* Declares the associations PID_TYPE -> [DATASOURCE ID, PID SUBSTRING] considered as delegated authority for that | ||
* PID_TYPE. Example, Zenodo is delegated to forge DOIs that contain the 'zenodo' word. | ||
* | ||
* If a record with the same id (same pid) comes from 2 data sources, the one coming from a delegated source wins. E.g. Zenodo records win over those from Datacite. | ||
* See also https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/187 and the class dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java | ||
*/ | ||
public static final Map<PidType, Map<String, String>> DELEGATED_PID_AUTHORITY = Maps.newHashMap(); | ||
|
||
static { | ||
DELEGATED_PID_AUTHORITY.put(PidType.doi, new HashMap<>()); | ||
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "zenodo"); | ||
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "zenodo"); | ||
DELEGATED_PID_AUTHORITY.put(PidType.w3id, new HashMap<>()); | ||
DELEGATED_PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ro-id"); | ||
} | ||
|
||
/** | ||
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] whose records are considered enrichment for the graph. | ||
* Their OpenAIRE ID is built from the declared PID type. Are merged with their corresponding record, identified by | ||
* the same OpenAIRE id. | ||
*/ | ||
public static final Map<PidType, HashBiMap<String, String>> ENRICHMENT_PROVIDER = Maps.newHashMap(); | ||
|
||
static { | ||
ENRICHMENT_PROVIDER.put(PidType.doi, HashBiMap.create()); | ||
ENRICHMENT_PROVIDER.get(PidType.doi).put(OPEN_APC_ID, OPEN_APC_NAME); | ||
} | ||
|
||
/** | ||
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given | ||
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available. | ||
* | ||
* @param entity the entity providing PIDs and a default ID. | ||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported. | ||
* @param md5 indicates whether should hash the PID value or not. | ||
* @return an identifier from the most relevant PID, entity.id otherwise | ||
*/ | ||
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) { | ||
|
||
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier"); | ||
|
||
final Map<String, Set<StructuredProperty>> pids = extractPids(entity); | ||
|
||
return pids | ||
.values() | ||
.stream() | ||
.flatMap(Set::stream) | ||
.min(new PidComparator<>(entity)) | ||
.map( | ||
min -> Optional | ||
.ofNullable(pids.get(min.getQualifier().getClassid())) | ||
.map( | ||
p -> p | ||
.stream() | ||
.sorted(new PidValueComparator()) | ||
.findFirst() | ||
.map(s -> idFromPid(entity, s, md5)) | ||
.orElseGet(entity::getId)) | ||
.orElseGet(entity::getId)) | ||
.orElseGet(entity::getId); | ||
} | ||
|
||
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) { | ||
if (entity instanceof Result) { | ||
return Optional | ||
.ofNullable(((Result) entity).getInstance()) | ||
.map(IdentifierFactory::mapPids) | ||
.orElse(new HashMap<>()); | ||
} else { | ||
return entity | ||
.getPid() | ||
.stream() | ||
.map(CleaningFunctions::normalizePidValue) | ||
.filter(CleaningFunctions::pidFilter) | ||
.collect( | ||
Collectors | ||
.groupingBy( | ||
p -> p.getQualifier().getClassid(), | ||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); | ||
} | ||
} | ||
|
||
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) { | ||
return instance | ||
.stream() | ||
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false)) | ||
.flatMap(Function.identity()) | ||
.collect( | ||
Collectors | ||
.groupingBy( | ||
p -> p.getQualifier().getClassid(), | ||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); | ||
} | ||
|
||
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom, | ||
boolean mapHandles) { | ||
return Optional | ||
.ofNullable(pid) | ||
.map( | ||
pp -> pp | ||
.stream() | ||
// filter away PIDs provided by a DS that is not considered an authority for the | ||
// given PID Type | ||
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles)) | ||
.map(CleaningFunctions::normalizePidValue) | ||
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p)) | ||
.filter(CleaningFunctions::pidFilter)) | ||
.orElse(Stream.empty()); | ||
} | ||
|
||
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) { | ||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); | ||
|
||
if (Objects.isNull(collectedFrom)) { | ||
return false; | ||
} | ||
|
||
boolean isEnrich = Optional | ||
.ofNullable(ENRICHMENT_PROVIDER.get(pType)) | ||
.map( | ||
enrich -> enrich.containsKey(collectedFrom.getKey()) | ||
|| enrich.containsValue(collectedFrom.getValue())) | ||
.orElse(false); | ||
|
||
boolean isAuthority = Optional | ||
.ofNullable(PID_AUTHORITY.get(pType)) | ||
.map( | ||
authorities -> authorities.containsKey(collectedFrom.getKey()) | ||
|| authorities.containsValue(collectedFrom.getValue())) | ||
.orElse(false); | ||
|
||
return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority; | ||
} | ||
|
||
private static boolean isNotFromDelegatedAuthority(KeyValue collectedFrom, StructuredProperty p) { | ||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); | ||
|
||
final Map<String, String> da = DELEGATED_PID_AUTHORITY.get(pType); | ||
if (Objects.isNull(da)) { | ||
return true; | ||
} | ||
if (!da.containsKey(collectedFrom.getKey())) { | ||
return true; | ||
} | ||
return StringUtils.contains(p.getValue(), da.get(collectedFrom.getKey())); | ||
} | ||
|
||
/** | ||
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} | ||
*/ | ||
public static <T extends OafEntity> String createIdentifier(T entity) { | ||
|
||
return createIdentifier(entity, true); | ||
} | ||
|
||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) { | ||
return idFromPid(ModelSupport.getIdPrefix(entity.getClass()), s.getQualifier().getClassid(), s.getValue(), md5); | ||
} | ||
|
||
public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) { | ||
return new StringBuilder() | ||
.append(numericPrefix) | ||
.append(ID_PREFIX_SEPARATOR) | ||
.append(createPrefix(pidType)) | ||
.append(ID_SEPARATOR) | ||
.append(md5 ? md5(pidValue) : pidValue) | ||
.toString(); | ||
} | ||
|
||
// create the prefix (length = 12) | ||
private static String createPrefix(String pidType) { | ||
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); | ||
while (prefix.length() < ID_PREFIX_LEN) { | ||
prefix.append("_"); | ||
} | ||
return prefix.substring(0, ID_PREFIX_LEN); | ||
} | ||
|
||
public static String md5(final String s) { | ||
try { | ||
final MessageDigest md = MessageDigest.getInstance("MD5"); | ||
md.update(s.getBytes(StandardCharsets.UTF_8)); | ||
return new String(Hex.encodeHex(md.digest())); | ||
} catch (final Exception e) { | ||
return null; | ||
} | ||
} | ||
|
||
} |
45 changes: 45 additions & 0 deletions
45
...nager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/OrganizationPidComparator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
package eu.dnetlib.iis.wf.export.actionmanager; | ||
|
||
import java.util.Comparator; | ||
|
||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; | ||
|
||
public class OrganizationPidComparator implements Comparator<StructuredProperty> { | ||
|
||
@Override | ||
public int compare(StructuredProperty left, StructuredProperty right) { | ||
if (left == null) { | ||
return right == null ? 0 : -1; | ||
} else if (right == null) { | ||
return 1; | ||
} | ||
|
||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid()); | ||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid()); | ||
|
||
if (lClass.equals(rClass)) | ||
return 0; | ||
|
||
if (lClass.equals(PidType.openorgs)) | ||
return -1; | ||
if (rClass.equals(PidType.openorgs)) | ||
return 1; | ||
|
||
if (lClass.equals(PidType.GRID)) | ||
return -1; | ||
if (rClass.equals(PidType.GRID)) | ||
return 1; | ||
|
||
if (lClass.equals(PidType.mag_id)) | ||
return -1; | ||
if (rClass.equals(PidType.mag_id)) | ||
return 1; | ||
|
||
if (lClass.equals(PidType.urn)) | ||
return -1; | ||
if (rClass.equals(PidType.urn)) | ||
return 1; | ||
|
||
return 0; | ||
} | ||
} |
Oops, something went wrong.