Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SCRUM-4442 & SCRUM-4554 VEP result loading #1700

Merged
merged 9 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/main/cliapp/src/service/DataLoadService.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ export class DataLoadService extends BaseAuthService {
getBackendBulkLoadTypes(loadType) {
const bulkLoadTypes = {
BulkFMSLoad: [
'BIOGRID-ORCS',
'GFF', // This needs to be removed at some point

'GFF_EXON',
'GFF_CDS',
'GFF_TRANSCRIPT',
Expand All @@ -95,8 +95,9 @@ export class DataLoadService extends BaseAuthService {
'PHENOTYPE',
'PARALOGY',
'SEQUENCE_TARGETING_REAGENT',
// 'VARIATION',
'BIOGRID-ORCS',
'VARIATION',
'VEPGENE',
'VEPTRANSCRIPT',
],
BulkURLLoad: [
'ONTOLOGY',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ private Gff3Constants() {
"mRNA", "ncRNA", "piRNA", "lincRNA", "miRNA", "pre_miRNA", "snoRNA", "lncRNA",
"tRNA", "snRNA", "rRNA", "antisense_RNA", "C_gene_segment", "V_gene_segment",
"pseudogene_attribute", "pseudogenic_transcript", "lnc_RNA", "nc_primary_transcript",
"circular_ncRNA"
"circular_ncRNA", "transcript"
);

public static final List<String> STRANDS = List.of("+", "-");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ private ValidationConstants() {
public static final String DUPLICATE_MESSAGE = "Duplicate entries found";
public static final String DUPLICATE_RELATION_PREFIX = "Entries found with same relation field - ";
public static final String UNRECOGNIZED_MESSAGE = "Unrecognized entry"; // To be used instead of INVALID_MESSAGE when entry to be skipped instead of failed
public static final String AMBIGUOUS_MESSAGE = "Could not be unambiguously resolved";

}
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,11 @@ private VocabularyConstants() {
public static final String HTP_DATASET_CATEGORY_TAGS_VOCABULARY = "data_set_category_tags";
public static final String HTP_DATASET_NOTE_TYPE_VOCABULARY_TERM_SET = "htp_expression_dataset_note_type";
public static final String HTP_DATASET_SAMPLE_NOTE_TYPE_VOCABULARY_TERM_SET = "htp_expression_dataset_sample_note_type";

public static final String VEP_IMPACT_VOCABULARY = "vep_impact";
public static final String VEP_CONSEQUENCE_VOCABULARY = "vep_consequence";
public static final String SIFT_PREDICTION_VOCABULARY = "sift_prediction";
public static final String POLYPHEN_PREDICTION_VOCABULARY = "polyphen_prediction";

public static final String HTP_DATASET_SAMPLE_SEQUENCE_FORMAT_VOCABULARY = "htp_data_sample_sequencing_format";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.alliancegenome.curation_api.controllers.crud;

import java.util.List;

import org.alliancegenome.curation_api.controllers.base.BaseEntityCrudController;
import org.alliancegenome.curation_api.dao.PredictedVariantConsequenceDAO;
import org.alliancegenome.curation_api.interfaces.crud.PredictedVariantConsequenceCrudInterface;
import org.alliancegenome.curation_api.jobs.executors.VepGeneExecutor;
import org.alliancegenome.curation_api.jobs.executors.VepTranscriptExecutor;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.services.PredictedVariantConsequenceService;

import jakarta.annotation.PostConstruct;
import jakarta.enterprise.context.RequestScoped;
import jakarta.inject.Inject;

@RequestScoped
public class PredictedVariantConsequenceCrudController extends BaseEntityCrudController<PredictedVariantConsequenceService, PredictedVariantConsequence, PredictedVariantConsequenceDAO>
implements PredictedVariantConsequenceCrudInterface {

@Inject PredictedVariantConsequenceService predictedVariantConsequenceService;
@Inject VepTranscriptExecutor vepTranscriptExecutor;
@Inject VepGeneExecutor vepGeneExecutor;

@Override
@PostConstruct
protected void init() {
setService(predictedVariantConsequenceService);
}

public APIResponse updateTranscriptLevelConsequences(String dataProvider, List<VepTxtDTO> consequenceData) {
return vepTranscriptExecutor.runLoadApi(predictedVariantConsequenceService, dataProvider, consequenceData);
}

public APIResponse updateGeneLevelConsequences(String dataProvider, List<VepTxtDTO> consequenceData) {
return vepGeneExecutor.runLoadApi(dataProvider, consequenceData);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package org.alliancegenome.curation_api.dao;

import org.alliancegenome.curation_api.dao.base.BaseSQLDAO;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;

import jakarta.enterprise.context.ApplicationScoped;

@ApplicationScoped
public class PredictedVariantConsequenceDAO extends BaseSQLDAO<PredictedVariantConsequence> {

protected PredictedVariantConsequenceDAO() {
super(PredictedVariantConsequence.class);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ public enum BackendBulkLoadType {
CONSTRUCT_ASSOCIATION("json"),
VARIANT("json"),
VARIATION("json"), // FMS variants as opposed to direct submission for VARIANT
VEPTRANSCRIPT("tsv"),
VEPGENE("tsv"),

// GFF all from the same file but split out
GFF("gff"), // For Database entries
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.alliancegenome.curation_api.interfaces.crud;

import java.util.List;

import org.alliancegenome.curation_api.interfaces.base.BaseIdCrudInterface;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.view.View;
import org.eclipse.microprofile.openapi.annotations.tags.Tag;

import com.fasterxml.jackson.annotation.JsonView;

import jakarta.ws.rs.Consumes;
import jakarta.ws.rs.POST;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.PathParam;
import jakarta.ws.rs.Produces;
import jakarta.ws.rs.core.MediaType;

@Path("predictedvariantconsequence")
@Tag(name = "CRUD - Predicted Variant Consequence")
@Produces(MediaType.APPLICATION_JSON)
@Consumes(MediaType.APPLICATION_JSON)
public interface PredictedVariantConsequenceCrudInterface extends BaseIdCrudInterface<PredictedVariantConsequence> {

@POST
@Path("/bulk/{dataProvider}/transcriptConsequenceFile")
@JsonView(View.FieldsAndLists.class)
APIResponse updateTranscriptLevelConsequences(@PathParam("dataProvider") String dataProvider, List<VepTxtDTO> consequenceData);

@POST
@Path("/bulk/{dataProvider}/geneConsequenceFile")
@JsonView(View.FieldsAndLists.class)
APIResponse updateGeneLevelConsequences(@PathParam("dataProvider") String dataProvider, List<VepTxtDTO> consequenceData);

}
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ public class BulkLoadJobExecutor {
@Inject Gff3ExonExecutor gff3ExonExecutor;
@Inject Gff3CDSExecutor gff3CDSExecutor;
@Inject Gff3TranscriptExecutor gff3TranscriptExecutor;
@Inject VepTranscriptExecutor vepTranscriptExecutor;
@Inject VepGeneExecutor vepGeneExecutor;

@Inject ExpressionAtlasExecutor expressionAtlasExecutor;

Expand Down Expand Up @@ -140,6 +142,10 @@ public void process(BulkLoadFileHistory bulkLoadFileHistory, Boolean cleanUp) th
expressionAtlasExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.BIOGRID_ORCS) {
biogridOrcExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.VEPTRANSCRIPT) {
vepTranscriptExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.VEPGENE) {
vepGeneExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.HTPDATASAMPLE) {
htpExpressionDatasetSampleAnnotationExecutor.execLoad(bulkLoadFileHistory);
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package org.alliancegenome.curation_api.jobs.executors;

import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;
import lombok.extern.jbosslog.JBossLog;
import static org.alliancegenome.curation_api.services.DataProviderService.RESOURCE_DESCRIPTOR_PREFIX;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.alliancegenome.curation_api.model.entities.CrossReference;
import org.alliancegenome.curation_api.model.entities.DataProvider;
import org.alliancegenome.curation_api.model.entities.Organization;
Expand All @@ -17,15 +20,12 @@
import org.alliancegenome.curation_api.util.ProcessDisplayHelper;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;

import static org.alliancegenome.curation_api.services.DataProviderService.RESOURCE_DESCRIPTOR_PREFIX;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

@JBossLog
@ApplicationScoped
public class ExpressionAtlasExecutor extends LoadFileExecutor {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,13 @@ protected <E extends AuditedObject, T extends BaseDTO> boolean runLoad(BaseUpser
idsAdded.add(dbObject.getId());
}
} catch (ObjectUpdateException e) {
// e.printStackTrace();
history.incrementFailed();
addException(history, e.getData());
} catch (KnownIssueValidationException e) {
Log.debug(e.getMessage());
history.incrementSkipped();
} catch (Exception e) {
// e.printStackTrace();
e.printStackTrace();
history.incrementFailed();
addException(history, new ObjectUpdateExceptionData(dtoObject, e.getMessage(), e.getStackTrace()));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package org.alliancegenome.curation_api.jobs.executors;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import org.alliancegenome.curation_api.dao.PredictedVariantConsequenceDAO;
import org.alliancegenome.curation_api.enums.BackendBulkDataProvider;
import org.alliancegenome.curation_api.exceptions.KnownIssueValidationException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException.ObjectUpdateExceptionData;
import org.alliancegenome.curation_api.jobs.util.CsvSchemaBuilder;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkFMSLoad;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkLoadFileHistory;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.response.LoadHistoryResponce;
import org.alliancegenome.curation_api.services.PredictedVariantConsequenceService;
import org.alliancegenome.curation_api.util.ProcessDisplayHelper;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections4.ListUtils;

import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

import io.quarkus.logging.Log;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

@ApplicationScoped
public class VepGeneExecutor extends LoadFileExecutor {

@Inject PredictedVariantConsequenceDAO predictedVariantConsequenceDAO;
@Inject PredictedVariantConsequenceService predictedVariantConsequenceService;

public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) {
try {

CsvSchema vepTxtSchema = CsvSchemaBuilder.vepTxtSchema();
CsvMapper csvMapper = new CsvMapper();
MappingIterator<VepTxtDTO> it = csvMapper.enable(CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS).readerFor(VepTxtDTO.class).with(vepTxtSchema).readValues(new GZIPInputStream(new FileInputStream(bulkLoadFileHistory.getBulkLoadFile().getLocalFilePath())));
List<VepTxtDTO> vepData = it.readAll();


BulkFMSLoad fmsLoad = (BulkFMSLoad) bulkLoadFileHistory.getBulkLoad();
BackendBulkDataProvider dataProvider = BackendBulkDataProvider.valueOf(fmsLoad.getFmsDataSubType());

List<Long> consequenceIdsLoaded = new ArrayList<>();
List<Long> consequenceIdsBefore = predictedVariantConsequenceService.getGeneLevelIdsByDataProvider(dataProvider);

bulkLoadFileHistory.setCount(vepData.size());
updateHistory(bulkLoadFileHistory);

boolean success = runLoad(bulkLoadFileHistory, dataProvider, vepData, consequenceIdsLoaded);
if (success) {
runCleanup(predictedVariantConsequenceService, bulkLoadFileHistory, dataProvider.name(), consequenceIdsBefore, consequenceIdsLoaded, "gene-level predicted variant consequences");
}
bulkLoadFileHistory.finishLoad();
updateHistory(bulkLoadFileHistory);
updateExceptions(bulkLoadFileHistory);

} catch (Exception e) {
failLoad(bulkLoadFileHistory, e);
e.printStackTrace();
}
}

protected boolean runLoad(BulkLoadFileHistory history, BackendBulkDataProvider dataProvider, List<VepTxtDTO> objectList, List<Long> idsUpdated) {
ProcessDisplayHelper ph = new ProcessDisplayHelper();
ph.addDisplayHandler(loadProcessDisplayService);
if (CollectionUtils.isNotEmpty(objectList)) {
String loadMessage = objectList.get(0).getClass().getSimpleName() + " update";
if (dataProvider != null) {
loadMessage = loadMessage + " for " + dataProvider.name();
}
ph.startProcess(loadMessage, objectList.size());

updateHistory(history);
for (VepTxtDTO dtoObject : objectList) {
try {
Long idUpdated = predictedVariantConsequenceService.updateGeneLevelConsequence(dtoObject);
history.incrementCompleted();
if (idsUpdated != null) {
idsUpdated.add(idUpdated);
}
} catch (ObjectUpdateException e) {
history.incrementFailed();
addException(history, e.getData());
} catch (KnownIssueValidationException e) {
Log.debug(e.getMessage());
history.incrementSkipped();
} catch (Exception e) {
e.printStackTrace();
history.incrementFailed();
addException(history, new ObjectUpdateExceptionData(dtoObject, e.getMessage(), e.getStackTrace()));
}
if (history.getErrorRate() > 0.25) {
Log.error("Failure Rate > 25% aborting load");
updateHistory(history);
updateExceptions(history);
failLoadAboveErrorRateCutoff(history);
return false;
}
ph.progressProcess();
}
updateHistory(history);
updateExceptions(history);
ph.finishProcess();
}
return true;
}

protected void runCleanup(BulkLoadFileHistory history, String dataProviderName, List<Long> annotationIdsBefore, List<Long> annotationIdsAfter, String loadTypeString, Boolean deprecate) {
Log.debug("runLoad: After: " + dataProviderName + " " + annotationIdsAfter.size());

List<Long> distinctAfter = annotationIdsAfter.stream().distinct().collect(Collectors.toList());
Log.debug("runLoad: Distinct: " + dataProviderName + " " + distinctAfter.size());

List<Long> idsToReset = ListUtils.subtract(annotationIdsBefore, distinctAfter);
Log.debug("runLoad: Reset: " + dataProviderName + " " + idsToReset.size());

String countType = loadTypeString + " reset";

long existingResets = history.getCount(countType).getTotal() == null ? 0 : history.getCount(countType).getTotal();
history.setCount(countType, idsToReset.size() + existingResets);

String loadDescription = dataProviderName + " " + loadTypeString + " bulk load (" + history.getBulkLoadFile().getMd5Sum() + ")";

ProcessDisplayHelper ph = new ProcessDisplayHelper(10000);
ph.startProcess("Deletion/deprecation of: " + dataProviderName + " " + loadTypeString, idsToReset.size());

for (Long id : idsToReset) {
try {
predictedVariantConsequenceService.resetGeneLevelConsequence(id, loadDescription);
history.incrementCompleted(countType);
} catch (Exception e) {
history.incrementFailed(countType);
addException(history, new ObjectUpdateExceptionData("{ \"id\": " + id + "}", e.getMessage(), e.getStackTrace()));
}
if (history.getErrorRate(countType) > 0.25) {
Log.error(countType + " failure rate > 25% aborting load");
failLoadAboveErrorRateCutoff(history);
break;
}
ph.progressProcess();
}
updateHistory(history);
updateExceptions(history);
ph.finishProcess();
}

public APIResponse runLoadApi(String dataProviderName, List<VepTxtDTO> consequenceData) {
List<Long> idsLoaded = new ArrayList<>();
BulkLoadFileHistory history = new BulkLoadFileHistory(consequenceData.size());
history = bulkLoadFileHistoryDAO.persist(history);
BackendBulkDataProvider dataProvider = null;
if (dataProviderName != null) {
dataProvider = BackendBulkDataProvider.valueOf(dataProviderName);
}
runLoad(history, dataProvider, consequenceData, idsLoaded);
history.finishLoad();
return new LoadHistoryResponce(history);
}

}
markquintontulloch marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading