From 23245fb3997da51e223b7c44ce87977c40d533ed Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Tue, 23 Apr 2024 15:59:43 +0200 Subject: [PATCH] Closes #1453, #1454: Upgrade the dhp-schemas dependency version and introduce IdentifierFactory cloned from unreleased dhp-commons This commit addresses two changes: * upgrade the dhp-schemas dependency version range from [4.0.0, 5.0.0) to [6.0.0, 7.0.0) * introduce IdentifierFactory cloned from unreleased dhp-commons after dhp-schemas dependency upgrade --- .../actionmanager/CleaningFunctions.java | 75 +++++ .../actionmanager/IdentifierFactory.java | 269 ++++++++++++++++++ .../OrganizationPidComparator.java | 45 +++ .../wf/export/actionmanager/PidBlacklist.java | 7 + .../actionmanager/PidBlacklistProvider.java | 40 +++ .../export/actionmanager/PidComparator.java | 48 ++++ .../iis/wf/export/actionmanager/PidType.java | 79 +++++ .../actionmanager/PidValueComparator.java | 32 +++ .../actionmanager/ResultPidComparator.java | 55 ++++ .../entity/patent/PatentExporterJob.java | 2 +- .../entity/software/SoftwareExporterJob.java | 2 +- pom.xml | 2 +- 12 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/CleaningFunctions.java create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/IdentifierFactory.java create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/OrganizationPidComparator.java create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklist.java create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklistProvider.java create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidComparator.java create mode 100644 
iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidType.java
 create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidValueComparator.java
 create mode 100644 iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/ResultPidComparator.java

diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/CleaningFunctions.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/CleaningFunctions.java
new file mode 100644
index 000000000..d55d966f0
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/CleaningFunctions.java
@@ -0,0 +1,75 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class CleaningFunctions {
+
+    public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
+    public static final String DOI_PREFIX = "10.";
+
+    public static final Set<String> PID_BLACKLIST = new HashSet<>(); // type-independent placeholder values, matched exactly
+
+    static {
+        PID_BLACKLIST.add("none");
+        PID_BLACKLIST.add("na");
+    }
+
+    public CleaningFunctions() {
+    }
+
+    /**
+     * Utility method that filters PID values on a per-type basis.
+     * @param s the PID whose value will be checked.
+     * @return false if the PID has no qualifier, a blank value or a blacklisted value; true otherwise.
+     */
+    public static boolean pidFilter(StructuredProperty s) {
+        final String pidValue = s.getValue();
+        if (Objects.isNull(s.getQualifier()) ||
+            StringUtils.isBlank(pidValue) ||
+            StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
+            return false;
+        }
+        if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
+            return false;
+        }
+        return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
+    }
+
+    /**
+     * Utility method that normalises PID values on a per-type basis.
+     * @param pid the PID whose value will be normalised; its qualifier must not be null.
+     * @return the same PID instance, updated in place with the normalised value.
+     */
+    public static StructuredProperty normalizePidValue(StructuredProperty pid) {
+        pid
+            .setValue(
+                normalizePidValue(
+                    pid.getQualifier().getClassid(),
+                    pid.getValue()));
+
+        return pid;
+    }
+
+    public static String normalizePidValue(String pidType, String pidValue) { // trims the value; DOIs are also lower-cased and cut to start at "10."
+        String value = Optional
+            .ofNullable(pidValue)
+            .map(String::trim)
+            .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
+
+        switch (pidType) { // a null pidType fails here with a NullPointerException
+
+        // TODO add cleaning for more PID types as needed
+        case "doi":
+            return value.toLowerCase(java.util.Locale.ROOT).replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX); // Locale.ROOT: result must not depend on the JVM default locale
+        }
+        return value;
+    }
+
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/IdentifierFactory.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/IdentifierFactory.java
new file mode 100644
index 000000000..ef3ace0ff
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/IdentifierFactory.java
@@ -0,0 +1,269 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ARXIV_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
+import static
eu.dnetlib.dhp.schema.common.ModelConstants.DATACITE_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.EUROPE_PUBMED_CENTRAL_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.OPEN_APC_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.OPEN_APC_NAME;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBMED_CENTRAL_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ROHUB_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ZENODO_OD_ID;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ZENODO_R3_ID;
+
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.HashBiMap;
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+/**
+ * Factory class for OpenAIRE identifiers in the Graph
+ */
+public class IdentifierFactory implements Serializable {
+
+    public static final String ID_SEPARATOR = "::";
+    public static final String ID_PREFIX_SEPARATOR = "|";
+
+    public static final int ID_PREFIX_LEN = 12;
+
+    /**
+     * Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE.
+     * The id of the record (source_::id) will be rewritten as (pidType_::id)
+     */
+    public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
+
+    static {
+        PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
+        PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
+        PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
+        PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "ZENODO");
+        PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "Zenodo");
+
+        PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
+        PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
+        PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central");
+
+        PID_AUTHORITY.put(PidType.pmid, HashBiMap.create());
+        PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
+        PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central");
+
+        PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create());
+        PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
+
+        PID_AUTHORITY.put(PidType.w3id, HashBiMap.create());
+        PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ROHub");
+    }
+
+    /**
+     * Declares the associations PID_TYPE -> [DATASOURCE ID, PID SUBSTRING] considered as delegated authority for that
+     * PID_TYPE. For example, Zenodo is delegated to forge DOIs that contain the 'zenodo' word.
+     *
+     * If a record with the same id (same pid) comes from 2 data sources, the one coming from a delegated source wins. E.g. Zenodo records win over those from Datacite.
+     * See also https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/187 and the class dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
+     */
+    public static final Map<PidType, Map<String, String>> DELEGATED_PID_AUTHORITY = Maps.newHashMap();
+
+    static {
+        DELEGATED_PID_AUTHORITY.put(PidType.doi, new HashMap<>());
+        DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "zenodo");
+        DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "zenodo");
+        DELEGATED_PID_AUTHORITY.put(PidType.w3id, new HashMap<>());
+        DELEGATED_PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ro-id");
+    }
+
+    /**
+     * Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] whose records are considered enrichment for the graph.
+     * Their OpenAIRE ID is built from the declared PID type. They are merged with their corresponding record, identified by
+     * the same OpenAIRE id.
+     */
+    public static final Map<PidType, HashBiMap<String, String>> ENRICHMENT_PROVIDER = Maps.newHashMap();
+
+    static {
+        ENRICHMENT_PROVIDER.put(PidType.doi, HashBiMap.create());
+        ENRICHMENT_PROVIDER.get(PidType.doi).put(OPEN_APC_ID, OPEN_APC_NAME);
+    }
+
+    /**
+     * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
+     * entity T. Returns entity.id when none of the PIDs meets the selection criteria.
+     *
+     * @param entity the entity providing PIDs and a default ID.
+     * @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
+     * @param md5 indicates whether the PID value should be hashed or not.
+     * @return an identifier from the most relevant PID, entity.id otherwise
+     */
+    public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
+
+        checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
+
+        final Map<String, Set<StructuredProperty>> pids = extractPids(entity);
+
+        return pids
+            .values()
+            .stream()
+            .flatMap(Set::stream)
+            .min(new PidComparator<>(entity))
+            .map(
+                min -> Optional
+                    .ofNullable(pids.get(min.getQualifier().getClassid()))
+                    .map(
+                        p -> p
+                            .stream()
+                            .sorted(new PidValueComparator())
+                            .findFirst()
+                            .map(s -> idFromPid(entity, s, md5))
+                            .orElseGet(entity::getId))
+                    .orElseGet(entity::getId))
+            .orElseGet(entity::getId);
+    }
+
+    private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) { // groups the usable PIDs by PID type (qualifier classid)
+        if (entity instanceof Result) {
+            return Optional
+                .ofNullable(((Result) entity).getInstance())
+                .map(IdentifierFactory::mapPids)
+                .orElse(new HashMap<>());
+        } else {
+            return entity
+                .getPid()
+                .stream()
+                .map(CleaningFunctions::normalizePidValue)
+                .filter(CleaningFunctions::pidFilter)
+                .collect(
+                    Collectors
+                        .groupingBy(
+                            p -> p.getQualifier().getClassid(),
+                            Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
+        }
+    }
+
+    private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) {
+        return instance
+            .stream()
+            .map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
+            .flatMap(Function.identity())
+            .collect(
+                Collectors
+                    .groupingBy(
+                        p -> p.getQualifier().getClassid(),
+                        Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
+    }
+
+    private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom,
+        boolean mapHandles) {
+        return Optional
+            .ofNullable(pid)
+            .map(
+                pp -> pp
+                    .stream()
+                    // filter away PIDs provided by a DS that is not considered an authority for the
+                    // given PID Type
+                    .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
+                    .map(CleaningFunctions::normalizePidValue)
+                    .filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
+                    .filter(CleaningFunctions::pidFilter))
+            .orElse(Stream.empty());
+    }
+
+    private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) { // despite the name, true means KEEP the PID
+        final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
+
+        if (Objects.isNull(collectedFrom)) {
+            return false;
+        }
+
+        boolean isEnrich = Optional
+            .ofNullable(ENRICHMENT_PROVIDER.get(pType))
+            .map(
+                enrich -> enrich.containsKey(collectedFrom.getKey())
+                    || enrich.containsValue(collectedFrom.getValue()))
+            .orElse(false);
+
+        boolean isAuthority = Optional
+            .ofNullable(PID_AUTHORITY.get(pType))
+            .map(
+                authorities -> authorities.containsKey(collectedFrom.getKey())
+                    || authorities.containsValue(collectedFrom.getValue()))
+            .orElse(false);
+
+        return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority;
+    }
+
+    private static boolean isNotFromDelegatedAuthority(KeyValue collectedFrom, StructuredProperty p) { // collectedFrom must be non-null; pidFromInstance drops null sources first
+        final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
+
+        final Map<String, String> da = DELEGATED_PID_AUTHORITY.get(pType);
+        if (Objects.isNull(da)) {
+            return true;
+        }
+        if (!da.containsKey(collectedFrom.getKey())) {
+            return true;
+        }
+        return StringUtils.contains(p.getValue(), da.get(collectedFrom.getKey()));
+    }
+
+    /**
+     * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
+     */
+    public static <T extends OafEntity> String createIdentifier(T entity) {
+
+        return createIdentifier(entity, true);
+    }
+
+    private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
+        return idFromPid(ModelSupport.getIdPrefix(entity.getClass()), s.getQualifier().getClassid(), s.getValue(), md5);
+    }
+
+    public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) { // numericPrefix + "|" + 12-char type prefix + "::" + (hashed) value
+        return new StringBuilder()
+            .append(numericPrefix)
+            .append(ID_PREFIX_SEPARATOR)
+            .append(createPrefix(pidType))
+            .append(ID_SEPARATOR)
+            .append(md5 ? md5(pidValue) : pidValue)
+            .toString();
+    }
+
+    // create the prefix (length = 12): the PID type truncated or right-padded with '_'
+    private static String createPrefix(String pidType) {
+        StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
+        while (prefix.length() < ID_PREFIX_LEN) {
+            prefix.append("_");
+        }
+        return prefix.substring(0, ID_PREFIX_LEN);
+    }
+
+    public static String md5(final String s) { // lower-case hex MD5 of the UTF-8 bytes of s
+        try {
+            final MessageDigest md = MessageDigest.getInstance("MD5");
+            md.update(s.getBytes(StandardCharsets.UTF_8));
+            return new String(Hex.encodeHex(md.digest()));
+        } catch (final Exception e) {
+            return null; // NOTE(review): failure (including a null s) is swallowed; idFromPid would then append the text "null"
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/OrganizationPidComparator.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/OrganizationPidComparator.java
new file mode 100644
index 000000000..f68d2e607
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/OrganizationPidComparator.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.Comparator;
+
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class OrganizationPidComparator implements Comparator<StructuredProperty> {
+
+    @Override
+    public int compare(StructuredProperty left, StructuredProperty right) {
+        if (left == null) {
+            return right == null ?
0 : -1;
+        } else if (right == null) {
+            return 1;
+        }
+
+        PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
+        PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
+
+        if (lClass.equals(rClass))
+            return 0;
+
+        if (lClass.equals(PidType.openorgs)) // priority order: openorgs, GRID, mag_id, urn; every other type ties
+            return -1;
+        if (rClass.equals(PidType.openorgs))
+            return 1;
+
+        if (lClass.equals(PidType.GRID))
+            return -1;
+        if (rClass.equals(PidType.GRID))
+            return 1;
+
+        if (lClass.equals(PidType.mag_id))
+            return -1;
+        if (rClass.equals(PidType.mag_id))
+            return 1;
+
+        if (lClass.equals(PidType.urn))
+            return -1;
+        if (rClass.equals(PidType.urn))
+            return 1;
+
+        return 0;
+    }
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklist.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklist.java
new file mode 100644
index 000000000..44ffb7046
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklist.java
@@ -0,0 +1,7 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.HashMap;
+import java.util.HashSet;
+
+public class PidBlacklist extends HashMap<String, HashSet<String>> { // PID type -> blacklisted PID values
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklistProvider.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklistProvider.java
new file mode 100644
index 000000000..c9384e93a
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidBlacklistProvider.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+public class PidBlacklistProvider {
+
+    private static final PidBlacklist blacklist;
+
+    static {
+        try { // NOTE(review): pid_blacklist.json must be on the classpath next to IdentifierFactory; this patch does not create it - verify it is supplied elsewhere
+            String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"), java.nio.charset.StandardCharsets.UTF_8); // explicit charset instead of the platform default
+            blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
+
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static PidBlacklist getBlacklist() {
+        return blacklist;
+    }
+
+    public static Set<String> getBlacklist(String pidType) { // never null: an unknown type yields a fresh empty set
+        return Optional
+            .ofNullable(getBlacklist().get(pidType))
+            .orElse(new HashSet<>());
+    }
+
+    private PidBlacklistProvider() {
+    }
+
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidComparator.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidComparator.java
new file mode 100644
index 000000000..b3cd1ad3f
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidComparator.java
@@ -0,0 +1,48 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.Comparator;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
+
+    private final T entity;
+
+    public PidComparator(T entity) {
+        this.entity = entity;
+    }
+
+    @Override
+    public int compare(StructuredProperty left, StructuredProperty right) {
+
+        if (left == null && right == null)
+            return 0;
+        if (left == null)
+            return 1;
+        if (right == null)
+            return -1;
+
+        if (ModelSupport.isSubClass(entity, Result.class)) {
+            return compareResultPids(left, right);
+        }
+
+        if (ModelSupport.isSubClass(entity, Organization.class)) {
+            return compareOrganizationtPids(left, right);
+        }
+
+        // Else (but unlikely), lexicographical
ordering will do.
+        return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
+    }
+
+    private int compareResultPids(StructuredProperty left, StructuredProperty right) {
+        return new ResultPidComparator().compare(left, right);
+    }
+
+    private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
+        return new OrganizationPidComparator().compare(left, right);
+    }
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidType.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidType.java
new file mode 100644
index 000000000..bcbfa5964
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidType.java
@@ -0,0 +1,79 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+
+import org.apache.commons.lang3.EnumUtils;
+
+public enum PidType {
+
+    /**
+     * The DOI syntax shall be made up of a DOI prefix and a DOI suffix separated by a forward slash.
+     *
+     * There is no defined limit on the length of the DOI name, or of the DOI prefix or DOI suffix.
+     *
+     * The DOI name is case-insensitive and can incorporate any printable characters from the legal graphic characters
+     * of Unicode. Further constraints on character use (e.g. use of language-specific alphanumeric characters) can be
+     * defined for an application by the ISO 26324 Registration Authority.
+     *
+     *
+     * DOI prefix: The DOI prefix shall be composed of a directory indicator followed by a registrant code.
+     * These two components shall be separated by a full stop (period). The directory indicator shall be "10" and
+     * distinguishes the entire set of character strings (prefix and suffix) as digital object identifiers within the
+     * resolution system.
+     *
+     * Registrant code: The second element of the DOI prefix shall be the registrant code. The registrant code is a
+     * unique string assigned to a registrant.
+     *
+     * DOI suffix: The DOI suffix shall consist of a character string of any length chosen by the registrant.
+     * Each suffix shall be unique to the prefix element that precedes it. The unique suffix can be a sequential number,
+     * or it might incorporate an identifier generated from or based on another system used by the registrant
+     * (e.g. ISAN, ISBN, ISRC, ISSN, ISTC, ISNI; in such cases, a preferred construction for such a suffix can be
+     * specified, as in Example 1).
+     *
+     * Source: https://www.doi.org/doi_handbook/2_Numbering.html#2.2
+     */
+    doi,
+
+    /**
+     * PubMed Unique Identifier (PMID)
+     *
+     * This field is a 1-to-8 digit accession number with no leading zeros. It is present on all records and is the
+     * accession number for managing and disseminating records. PMIDs are not reused after records are deleted.
+     *
+     * Beginning in February 2012 PMIDs include extensions following a decimal point to account for article versions
+     * (e.g., 21804956.2). All citations are considered version 1 until replaced. The extended PMID is not displayed
+     * on the MEDLINE format.
+     *
+     * View the citation in abstract format in PubMed to access additional versions when available (see the article in
+     * the Jan-Feb 2012 NLM Technical Bulletin).
+     *
+     * Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid
+     */
+    pmid,
+
+    /**
+     * This field contains the unique identifier for the cited article in PubMed Central. The identifier begins with the
+     * prefix PMC.
+     *
+     * Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc
+     */
+    pmc, handle, arXiv, nct, pdb, w3id,
+
+    // Organization
+    openorgs, ROR, GRID, PIC, ISNI, Wikidata, FundRef, corda, corda_h2020, mag_id, urn,
+
+    // Used by dedup
+    undefined, original;
+
+    public static boolean isValid(String type) {
+        return EnumUtils.isValidEnum(PidType.class, type);
+    }
+
+    public static PidType tryValueOf(String s) { // lenient lookup: never throws
+        try {
+            return PidType.valueOf(s);
+        } catch (Exception e) {
+            return PidType.original; // unknown or null names all map to 'original'
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidValueComparator.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidValueComparator.java
new file mode 100644
index 000000000..f9399c8b9
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/PidValueComparator.java
@@ -0,0 +1,32 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.Comparator;
+import java.util.Optional;
+
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class PidValueComparator implements Comparator<StructuredProperty> {
+
+    @Override
+    public int compare(StructuredProperty left, StructuredProperty right) {
+
+        if (left == null && right == null)
+            return 0;
+        if (left == null)
+            return 1;
+        if (right == null)
+            return -1;
+
+        StructuredProperty l = CleaningFunctions.normalizePidValue(left); // NOTE(review): normalizePidValue returns its argument, so l and r are left and right themselves
+        StructuredProperty r = CleaningFunctions.normalizePidValue(right);
+
+        return Optional
+            .ofNullable(l.getValue())
+            .map(
+                lv -> Optional
+                    .ofNullable(r.getValue())
+                    .map(rv -> lv.compareTo(rv))
+                    .orElse(-1))
+            .orElse(1);
+    }
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/ResultPidComparator.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/ResultPidComparator.java
new
file mode 100644
index 000000000..aefe38c00
--- /dev/null
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/ResultPidComparator.java
@@ -0,0 +1,55 @@
+package eu.dnetlib.iis.wf.export.actionmanager;
+
+import java.util.Comparator;
+
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class ResultPidComparator implements Comparator<StructuredProperty> {
+
+    @Override
+    public int compare(StructuredProperty left, StructuredProperty right) { // no null guard here: arguments must be non-null
+
+        PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
+        PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
+
+        if (lClass.equals(rClass))
+            return 0;
+
+        if (lClass.equals(PidType.doi)) // priority order: doi, pmid, pmc, handle, arXiv, nct, pdb; every other type ties
+            return -1;
+        if (rClass.equals(PidType.doi))
+            return 1;
+
+        if (lClass.equals(PidType.pmid))
+            return -1;
+        if (rClass.equals(PidType.pmid))
+            return 1;
+
+        if (lClass.equals(PidType.pmc))
+            return -1;
+        if (rClass.equals(PidType.pmc))
+            return 1;
+
+        if (lClass.equals(PidType.handle))
+            return -1;
+        if (rClass.equals(PidType.handle))
+            return 1;
+
+        if (lClass.equals(PidType.arXiv))
+            return -1;
+        if (rClass.equals(PidType.arXiv))
+            return 1;
+
+        if (lClass.equals(PidType.nct))
+            return -1;
+        if (rClass.equals(PidType.nct))
+            return 1;
+
+        if (lClass.equals(PidType.pdb))
+            return -1;
+        if (rClass.equals(PidType.pdb))
+            return 1;
+
+        return 0;
+    }
+}
\ No newline at end of file
diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/PatentExporterJob.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/PatentExporterJob.java
index 4fbb0e56c..009d320fa 100644
--- a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/PatentExporterJob.java
+++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/PatentExporterJob.java
@@ -6,7 +6,6 @@ import
com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.iis.common.InfoSpaceConstants; import eu.dnetlib.iis.common.java.io.HdfsUtils; import eu.dnetlib.iis.common.spark.JavaSparkContextFactory; @@ -17,6 +16,7 @@ import eu.dnetlib.iis.referenceextraction.patent.schemas.DocumentToPatent; import eu.dnetlib.iis.referenceextraction.patent.schemas.Patent; import eu.dnetlib.iis.wf.export.actionmanager.AtomicActionSerializationUtils; +import eu.dnetlib.iis.wf.export.actionmanager.IdentifierFactory; import eu.dnetlib.iis.wf.export.actionmanager.OafConstants; import eu.dnetlib.iis.wf.export.actionmanager.cfg.StaticConfigurationProvider; import eu.dnetlib.iis.wf.export.actionmanager.entity.ConfidenceLevelUtils; diff --git a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/software/SoftwareExporterJob.java b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/software/SoftwareExporterJob.java index fa2fa0002..3570db223 100644 --- a/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/software/SoftwareExporterJob.java +++ b/iis-wf/iis-wf-export-actionmanager/src/main/java/eu/dnetlib/iis/wf/export/actionmanager/entity/software/SoftwareExporterJob.java @@ -7,7 +7,6 @@ import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.iis.common.InfoSpaceConstants; import eu.dnetlib.iis.common.java.io.HdfsUtils; import eu.dnetlib.iis.common.spark.JavaSparkContextFactory; @@ -17,6 +16,7 @@ import eu.dnetlib.iis.referenceextraction.softwareurl.schemas.DocumentToSoftwareUrlWithMeta; import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal; 
import eu.dnetlib.iis.wf.export.actionmanager.AtomicActionSerializationUtils; +import eu.dnetlib.iis.wf.export.actionmanager.IdentifierFactory; import eu.dnetlib.iis.wf.export.actionmanager.OafConstants; import eu.dnetlib.iis.wf.export.actionmanager.cfg.StaticConfigurationProvider; import eu.dnetlib.iis.wf.export.actionmanager.entity.ConfidenceLevelUtils; diff --git a/pom.xml b/pom.xml index e47dfea12..16e992b4e 100644 --- a/pom.xml +++ b/pom.xml @@ -501,7 +501,7 @@ eu.dnetlib.dhp dhp-schemas - [4.0.0, 5.0.0) + [6.0.0, 7.0.0)