Merge pull request #1700 from alliance-genome/SCRUM-4442
SCRUM-4442 & SCRUM-4554 VEP result loading
markquintontulloch authored Nov 5, 2024
2 parents 884d5b1 + 9f43e5d commit fb3060a
Showing 66 changed files with 2,072 additions and 48 deletions.
7 changes: 4 additions & 3 deletions src/main/cliapp/src/service/DataLoadService.js
@@ -81,8 +81,8 @@ export class DataLoadService extends BaseAuthService {
getBackendBulkLoadTypes(loadType) {
const bulkLoadTypes = {
BulkFMSLoad: [
'BIOGRID-ORCS',
'GFF', // This needs to be removed at some point

'GFF_EXON',
'GFF_CDS',
'GFF_TRANSCRIPT',
@@ -95,8 +95,9 @@
'PHENOTYPE',
'PARALOGY',
'SEQUENCE_TARGETING_REAGENT',
// 'VARIATION',
'BIOGRID-ORCS',
'VARIATION',
'VEPGENE',
'VEPTRANSCRIPT',
],
BulkURLLoad: [
'ONTOLOGY',
Gff3Constants.java
@@ -12,7 +12,7 @@ private Gff3Constants() {
"mRNA", "ncRNA", "piRNA", "lincRNA", "miRNA", "pre_miRNA", "snoRNA", "lncRNA",
"tRNA", "snRNA", "rRNA", "antisense_RNA", "C_gene_segment", "V_gene_segment",
"pseudogene_attribute", "pseudogenic_transcript", "lnc_RNA", "nc_primary_transcript",
"circular_ncRNA"
"circular_ncRNA", "transcript"
);

public static final List<String> STRANDS = List.of("+", "-");
ValidationConstants.java
@@ -15,5 +15,6 @@ private ValidationConstants() {
public static final String DUPLICATE_MESSAGE = "Duplicate entries found";
public static final String DUPLICATE_RELATION_PREFIX = "Entries found with same relation field - ";
public static final String UNRECOGNIZED_MESSAGE = "Unrecognized entry"; // To be used instead of INVALID_MESSAGE when entry to be skipped instead of failed
public static final String AMBIGUOUS_MESSAGE = "Could not be unambiguously resolved";

}
VocabularyConstants.java
@@ -87,5 +87,11 @@ private VocabularyConstants() {
public static final String HTP_DATASET_CATEGORY_TAGS_VOCABULARY = "data_set_category_tags";
public static final String HTP_DATASET_NOTE_TYPE_VOCABULARY_TERM_SET = "htp_expression_dataset_note_type";
public static final String HTP_DATASET_SAMPLE_NOTE_TYPE_VOCABULARY_TERM_SET = "htp_expression_dataset_sample_note_type";

public static final String VEP_IMPACT_VOCABULARY = "vep_impact";
public static final String VEP_CONSEQUENCE_VOCABULARY = "vep_consequence";
public static final String SIFT_PREDICTION_VOCABULARY = "sift_prediction";
public static final String POLYPHEN_PREDICTION_VOCABULARY = "polyphen_prediction";

public static final String HTP_DATASET_SAMPLE_SEQUENCE_FORMAT_VOCABULARY = "htp_data_sample_sequencing_format";
}
PredictedVariantConsequenceCrudController.java (new file)
@@ -0,0 +1,40 @@
package org.alliancegenome.curation_api.controllers.crud;

import java.util.List;

import org.alliancegenome.curation_api.controllers.base.BaseEntityCrudController;
import org.alliancegenome.curation_api.dao.PredictedVariantConsequenceDAO;
import org.alliancegenome.curation_api.interfaces.crud.PredictedVariantConsequenceCrudInterface;
import org.alliancegenome.curation_api.jobs.executors.VepGeneExecutor;
import org.alliancegenome.curation_api.jobs.executors.VepTranscriptExecutor;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.services.PredictedVariantConsequenceService;

import jakarta.annotation.PostConstruct;
import jakarta.enterprise.context.RequestScoped;
import jakarta.inject.Inject;

@RequestScoped
public class PredictedVariantConsequenceCrudController extends BaseEntityCrudController<PredictedVariantConsequenceService, PredictedVariantConsequence, PredictedVariantConsequenceDAO>
implements PredictedVariantConsequenceCrudInterface {

@Inject PredictedVariantConsequenceService predictedVariantConsequenceService;
@Inject VepTranscriptExecutor vepTranscriptExecutor;
@Inject VepGeneExecutor vepGeneExecutor;

@Override
@PostConstruct
protected void init() {
setService(predictedVariantConsequenceService);
}

public APIResponse updateTranscriptLevelConsequences(String dataProvider, List<VepTxtDTO> consequenceData) {
return vepTranscriptExecutor.runLoadApi(predictedVariantConsequenceService, dataProvider, consequenceData);
}

public APIResponse updateGeneLevelConsequences(String dataProvider, List<VepTxtDTO> consequenceData) {
return vepGeneExecutor.runLoadApi(dataProvider, consequenceData);
}
}
PredictedVariantConsequenceDAO.java (new file)
@@ -0,0 +1,15 @@
package org.alliancegenome.curation_api.dao;

import org.alliancegenome.curation_api.dao.base.BaseSQLDAO;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;

import jakarta.enterprise.context.ApplicationScoped;

@ApplicationScoped
public class PredictedVariantConsequenceDAO extends BaseSQLDAO<PredictedVariantConsequence> {

protected PredictedVariantConsequenceDAO() {
super(PredictedVariantConsequence.class);
}

}
BackendBulkLoadType.java
@@ -19,6 +19,8 @@ public enum BackendBulkLoadType {
CONSTRUCT_ASSOCIATION("json"),
VARIANT("json"),
VARIATION("json"), // FMS variants as opposed to direct submission for VARIANT
VEPTRANSCRIPT("tsv"),
VEPGENE("tsv"),

// GFF all from the same file but split out
GFF("gff"), // For Database entries
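The two new constants follow the enum's existing pattern of binding each load type to the file extension it expects ("tsv" here, "json" and "gff" elsewhere). A minimal sketch of that pattern, assuming a single String field with an accessor; the constructor and field sit outside this hunk, so the member names below are illustrative only:

public enum BackendBulkLoadType {
	VEPTRANSCRIPT("tsv"),
	VEPGENE("tsv"),
	VARIATION("json"); // existing constant from the hunk above, repeated for context

	private final String fileExtension; // illustrative name; the real field is not shown in this diff

	BackendBulkLoadType(String fileExtension) {
		this.fileExtension = fileExtension;
	}

	public String getFileExtension() {
		return fileExtension;
	}
}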
PredictedVariantConsequenceCrudInterface.java (new file)
@@ -0,0 +1,37 @@
package org.alliancegenome.curation_api.interfaces.crud;

import java.util.List;

import org.alliancegenome.curation_api.interfaces.base.BaseIdCrudInterface;
import org.alliancegenome.curation_api.model.entities.PredictedVariantConsequence;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.view.View;
import org.eclipse.microprofile.openapi.annotations.tags.Tag;

import com.fasterxml.jackson.annotation.JsonView;

import jakarta.ws.rs.Consumes;
import jakarta.ws.rs.POST;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.PathParam;
import jakarta.ws.rs.Produces;
import jakarta.ws.rs.core.MediaType;

@Path("predictedvariantconsequence")
@Tag(name = "CRUD - Predicted Variant Consequence")
@Produces(MediaType.APPLICATION_JSON)
@Consumes(MediaType.APPLICATION_JSON)
public interface PredictedVariantConsequenceCrudInterface extends BaseIdCrudInterface<PredictedVariantConsequence> {

@POST
@Path("/bulk/{dataProvider}/transcriptConsequenceFile")
@JsonView(View.FieldsAndLists.class)
APIResponse updateTranscriptLevelConsequences(@PathParam("dataProvider") String dataProvider, List<VepTxtDTO> consequenceData);

@POST
@Path("/bulk/{dataProvider}/geneConsequenceFile")
@JsonView(View.FieldsAndLists.class)
APIResponse updateGeneLevelConsequences(@PathParam("dataProvider") String dataProvider, List<VepTxtDTO> consequenceData);

}
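Together with the controller above, this interface exposes the new loads as POST endpoints under predictedvariantconsequence/bulk/{dataProvider}/transcriptConsequenceFile and .../geneConsequenceFile. Below is a minimal JAX-RS client sketch for posting pre-built DTOs to the transcript-level endpoint; the base URL and the "WB" data provider value are assumptions for illustration, not taken from this commit:

import java.util.List;

import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;

import jakarta.ws.rs.client.Client;
import jakarta.ws.rs.client.ClientBuilder;
import jakarta.ws.rs.client.Entity;
import jakarta.ws.rs.core.GenericEntity;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;

public class VepConsequenceUploadSketch {
	public static void main(String[] args) {
		List<VepTxtDTO> consequenceData = List.of(); // populate with parsed VEP rows
		Client client = ClientBuilder.newClient();
		try {
			Response response = client
				.target("http://localhost:8080/api") // assumed base URL
				.path("predictedvariantconsequence/bulk/WB/transcriptConsequenceFile")
				.request(MediaType.APPLICATION_JSON)
				.post(Entity.json(new GenericEntity<List<VepTxtDTO>>(consequenceData) {}));
			System.out.println("HTTP status: " + response.getStatus());
		} finally {
			client.close();
		}
	}
}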
BulkLoadJobExecutor.java
@@ -61,6 +61,8 @@ public class BulkLoadJobExecutor {
@Inject Gff3ExonExecutor gff3ExonExecutor;
@Inject Gff3CDSExecutor gff3CDSExecutor;
@Inject Gff3TranscriptExecutor gff3TranscriptExecutor;
@Inject VepTranscriptExecutor vepTranscriptExecutor;
@Inject VepGeneExecutor vepGeneExecutor;

@Inject ExpressionAtlasExecutor expressionAtlasExecutor;

@@ -140,6 +142,10 @@ public void process(BulkLoadFileHistory bulkLoadFileHistory, Boolean cleanUp) th
expressionAtlasExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.BIOGRID_ORCS) {
biogridOrcExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.VEPTRANSCRIPT) {
vepTranscriptExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.VEPGENE) {
vepGeneExecutor.execLoad(bulkLoadFileHistory);
} else if (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType() == BackendBulkLoadType.HTPDATASAMPLE) {
htpExpressionDatasetSampleAnnotationExecutor.execLoad(bulkLoadFileHistory);
} else {
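The new branches extend the existing if/else dispatch on BackendBulkLoadType. For readers skimming the chain, the VEP additions are equivalent to the switch sketch below; this is purely illustrative, and the production code keeps the if/else form shown above:

switch (bulkLoadFileHistory.getBulkLoad().getBackendBulkLoadType()) {
	case VEPTRANSCRIPT -> vepTranscriptExecutor.execLoad(bulkLoadFileHistory);
	case VEPGENE -> vepGeneExecutor.execLoad(bulkLoadFileHistory);
	default -> { /* all other load types are handled by the existing branches */ }
}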
ExpressionAtlasExecutor.java
@@ -1,10 +1,13 @@
package org.alliancegenome.curation_api.jobs.executors;

import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;
import lombok.extern.jbosslog.JBossLog;
import static org.alliancegenome.curation_api.services.DataProviderService.RESOURCE_DESCRIPTOR_PREFIX;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.alliancegenome.curation_api.model.entities.CrossReference;
import org.alliancegenome.curation_api.model.entities.DataProvider;
import org.alliancegenome.curation_api.model.entities.Organization;
@@ -17,15 +20,12 @@
import org.alliancegenome.curation_api.util.ProcessDisplayHelper;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;

import static org.alliancegenome.curation_api.services.DataProviderService.RESOURCE_DESCRIPTOR_PREFIX;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

@JBossLog
@ApplicationScoped
public class ExpressionAtlasExecutor extends LoadFileExecutor {

@@ -218,14 +218,13 @@ protected <E extends AuditedObject, T extends BaseDTO> boolean runLoad(BaseUpser
idsAdded.add(dbObject.getId());
}
} catch (ObjectUpdateException e) {
// e.printStackTrace();
history.incrementFailed();
addException(history, e.getData());
} catch (KnownIssueValidationException e) {
Log.debug(e.getMessage());
history.incrementSkipped();
} catch (Exception e) {
// e.printStackTrace();
e.printStackTrace();
history.incrementFailed();
addException(history, new ObjectUpdateExceptionData(dtoObject, e.getMessage(), e.getStackTrace()));
}
VepGeneExecutor.java (new file)
@@ -0,0 +1,169 @@
package org.alliancegenome.curation_api.jobs.executors;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import org.alliancegenome.curation_api.dao.PredictedVariantConsequenceDAO;
import org.alliancegenome.curation_api.enums.BackendBulkDataProvider;
import org.alliancegenome.curation_api.exceptions.KnownIssueValidationException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException;
import org.alliancegenome.curation_api.exceptions.ObjectUpdateException.ObjectUpdateExceptionData;
import org.alliancegenome.curation_api.jobs.util.CsvSchemaBuilder;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkFMSLoad;
import org.alliancegenome.curation_api.model.entities.bulkloads.BulkLoadFileHistory;
import org.alliancegenome.curation_api.model.ingest.dto.fms.VepTxtDTO;
import org.alliancegenome.curation_api.response.APIResponse;
import org.alliancegenome.curation_api.response.LoadHistoryResponce;
import org.alliancegenome.curation_api.services.PredictedVariantConsequenceService;
import org.alliancegenome.curation_api.util.ProcessDisplayHelper;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections4.ListUtils;

import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

import io.quarkus.logging.Log;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

@ApplicationScoped
public class VepGeneExecutor extends LoadFileExecutor {

@Inject PredictedVariantConsequenceDAO predictedVariantConsequenceDAO;
@Inject PredictedVariantConsequenceService predictedVariantConsequenceService;

public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) {
try {

CsvSchema vepTxtSchema = CsvSchemaBuilder.vepTxtSchema();
CsvMapper csvMapper = new CsvMapper();
MappingIterator<VepTxtDTO> it = csvMapper.enable(CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS).readerFor(VepTxtDTO.class).with(vepTxtSchema).readValues(new GZIPInputStream(new FileInputStream(bulkLoadFileHistory.getBulkLoadFile().getLocalFilePath())));
List<VepTxtDTO> vepData = it.readAll();


BulkFMSLoad fmsLoad = (BulkFMSLoad) bulkLoadFileHistory.getBulkLoad();
BackendBulkDataProvider dataProvider = BackendBulkDataProvider.valueOf(fmsLoad.getFmsDataSubType());

List<Long> consequenceIdsLoaded = new ArrayList<>();
List<Long> consequenceIdsBefore = predictedVariantConsequenceService.getGeneLevelIdsByDataProvider(dataProvider);

bulkLoadFileHistory.setCount(vepData.size());
updateHistory(bulkLoadFileHistory);

boolean success = runLoad(bulkLoadFileHistory, dataProvider, vepData, consequenceIdsLoaded);
if (success) {
runCleanup(predictedVariantConsequenceService, bulkLoadFileHistory, dataProvider.name(), consequenceIdsBefore, consequenceIdsLoaded, "gene-level predicted variant consequences");
}
bulkLoadFileHistory.finishLoad();
updateHistory(bulkLoadFileHistory);
updateExceptions(bulkLoadFileHistory);

} catch (Exception e) {
failLoad(bulkLoadFileHistory, e);
e.printStackTrace();
}
}

protected boolean runLoad(BulkLoadFileHistory history, BackendBulkDataProvider dataProvider, List<VepTxtDTO> objectList, List<Long> idsUpdated) {
ProcessDisplayHelper ph = new ProcessDisplayHelper();
ph.addDisplayHandler(loadProcessDisplayService);
if (CollectionUtils.isNotEmpty(objectList)) {
String loadMessage = objectList.get(0).getClass().getSimpleName() + " update";
if (dataProvider != null) {
loadMessage = loadMessage + " for " + dataProvider.name();
}
ph.startProcess(loadMessage, objectList.size());

updateHistory(history);
for (VepTxtDTO dtoObject : objectList) {
try {
Long idUpdated = predictedVariantConsequenceService.updateGeneLevelConsequence(dtoObject);
history.incrementCompleted();
if (idsUpdated != null) {
idsUpdated.add(idUpdated);
}
} catch (ObjectUpdateException e) {
history.incrementFailed();
addException(history, e.getData());
} catch (KnownIssueValidationException e) {
Log.debug(e.getMessage());
history.incrementSkipped();
} catch (Exception e) {
e.printStackTrace();
history.incrementFailed();
addException(history, new ObjectUpdateExceptionData(dtoObject, e.getMessage(), e.getStackTrace()));
}
if (history.getErrorRate() > 0.25) {
Log.error("Failure Rate > 25% aborting load");
updateHistory(history);
updateExceptions(history);
failLoadAboveErrorRateCutoff(history);
return false;
}
ph.progressProcess();
}
updateHistory(history);
updateExceptions(history);
ph.finishProcess();
}
return true;
}

protected void runCleanup(BulkLoadFileHistory history, String dataProviderName, List<Long> annotationIdsBefore, List<Long> annotationIdsAfter, String loadTypeString, Boolean deprecate) {
Log.debug("runLoad: After: " + dataProviderName + " " + annotationIdsAfter.size());

List<Long> distinctAfter = annotationIdsAfter.stream().distinct().collect(Collectors.toList());
Log.debug("runLoad: Distinct: " + dataProviderName + " " + distinctAfter.size());

List<Long> idsToReset = ListUtils.subtract(annotationIdsBefore, distinctAfter);
Log.debug("runLoad: Reset: " + dataProviderName + " " + idsToReset.size());

String countType = loadTypeString + " reset";

long existingResets = history.getCount(countType).getTotal() == null ? 0 : history.getCount(countType).getTotal();
history.setCount(countType, idsToReset.size() + existingResets);

String loadDescription = dataProviderName + " " + loadTypeString + " bulk load (" + history.getBulkLoadFile().getMd5Sum() + ")";

ProcessDisplayHelper ph = new ProcessDisplayHelper(10000);
ph.startProcess("Deletion/deprecation of: " + dataProviderName + " " + loadTypeString, idsToReset.size());

for (Long id : idsToReset) {
try {
predictedVariantConsequenceService.resetGeneLevelConsequence(id, loadDescription);
history.incrementCompleted(countType);
} catch (Exception e) {
history.incrementFailed(countType);
addException(history, new ObjectUpdateExceptionData("{ \"id\": " + id + "}", e.getMessage(), e.getStackTrace()));
}
if (history.getErrorRate(countType) > 0.25) {
Log.error(countType + " failure rate > 25% aborting load");
failLoadAboveErrorRateCutoff(history);
break;
}
ph.progressProcess();
}
updateHistory(history);
updateExceptions(history);
ph.finishProcess();
}

public APIResponse runLoadApi(String dataProviderName, List<VepTxtDTO> consequenceData) {
List<Long> idsLoaded = new ArrayList<>();
BulkLoadFileHistory history = new BulkLoadFileHistory(consequenceData.size());
history = bulkLoadFileHistoryDAO.persist(history);
BackendBulkDataProvider dataProvider = null;
if (dataProviderName != null) {
dataProvider = BackendBulkDataProvider.valueOf(dataProviderName);
}
runLoad(history, dataProvider, consequenceData, idsLoaded);
history.finishLoad();
return new LoadHistoryResponce(history);
}

}
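The parsing step at the top of execLoad streams a gzipped, column-delimited VEP text file through Jackson's CsvMapper. That pattern can be exercised on its own; below is a self-contained sketch using a hypothetical two-column row type and schema (the real columns come from CsvSchemaBuilder.vepTxtSchema(), which is not part of this diff, and the tab separator is an assumption):

import java.io.FileInputStream;
import java.util.List;
import java.util.zip.GZIPInputStream;

import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvParser;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

public class VepTsvParseSketch {

	// Hypothetical row type; the real load binds rows to VepTxtDTO.
	public static class Row {
		public String geneId;
		public String impact;
	}

	public static void main(String[] args) throws Exception {
		// Two named columns, tab-separated, no header row.
		CsvSchema schema = CsvSchema.builder()
			.addColumn("geneId")
			.addColumn("impact")
			.setColumnSeparator('\t')
			.build();

		// Same parser feature the executor enables: rows shorter than the
		// schema get nulls instead of failing the whole file.
		CsvMapper csvMapper = new CsvMapper();
		MappingIterator<Row> it = csvMapper
			.enable(CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS)
			.readerFor(Row.class)
			.with(schema)
			.readValues(new GZIPInputStream(new FileInputStream(args[0])));

		List<Row> rows = it.readAll();
		System.out.println("Parsed " + rows.size() + " rows");
	}
}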