Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output format #198

Merged
merged 3 commits into from
Aug 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,13 @@
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.boot.WebApplicationType;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
*
* @author Mete Ozguz
* @author Zachary Heins
*/
@SpringBootApplication
Expand All @@ -55,6 +61,7 @@ private static Options getOptions(String[] args)
gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
.addOption("f", "filename", true, "Mutation filename")
.addOption("o", "output-filename", true, "Output filename (including path)")
.addOption("t", "output-format", true, "tcga, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
.addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
.addOption("e", "error-report-location", true, "Error report filename (including path)")
.addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator" )
Expand All @@ -70,7 +77,7 @@ private static void help(Options gnuOptions, int exitStatus)
System.exit(exitStatus);
}

private static void launchJob(String[] args, String filename, String outputFilename, String isoformOverride,
private static void launchJob(String[] args, String filename, String outputFilename, String outputFormat, String isoformOverride,
String errorReportLocation, boolean replace, Integer postIntervalSize) throws Exception
{
SpringApplication app = new SpringApplication(AnnotationPipeline.class);
Expand All @@ -83,6 +90,7 @@ private static void launchJob(String[] args, String filename, String outputFilen
JobParameters jobParameters = new JobParametersBuilder()
.addString("filename", filename)
.addString("outputFilename", outputFilename)
.addString("outputFormat", outputFormat)
.addString("replace", String.valueOf(replace))
.addString("isoformOverride", isoformOverride)
.addString("errorReportLocation", errorReportLocation)
Expand All @@ -104,7 +112,24 @@ public static void main(String[] args) throws Exception
!commandLine.hasOption("output-filename")) {
help(gnuOptions, 0);
}
launchJob(args, commandLine.getOptionValue("filename"), commandLine.getOptionValue("output-filename"),commandLine.getOptionValue("isoform-override"),
// Resolve the optional "output-format" argument. It is either one of the pre-defined
// format names ("tcga" or "minimal"), which are passed through unchanged, or the path
// of a file whose first line holds the output format.
String outputFormat = null;
if (commandLine.hasOption("output-format")) {
    String outputFormatFile = commandLine.getOptionValue("output-format");
    if ("tcga".equals(outputFormatFile) || "minimal".equals(outputFormatFile)) {
        outputFormat = outputFormatFile;
    } else {
        // user supplied a format file instead of pre-defined formats;
        // only the first line of that file is read
        try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {
            outputFormat = br.readLine();
        } catch (IOException e) {
            System.err.println("Error while reading output-format file: " + outputFormatFile);
            // An unreadable format file is a failure: exit with a non-zero status so
            // that shells and calling scripts do not mistake it for a successful run.
            System.exit(1);
        }
    }
}
launchJob(args, commandLine.getOptionValue("filename"), commandLine.getOptionValue("output-filename"), outputFormat, commandLine.getOptionValue("isoform-override"),
commandLine.hasOption("error-report-location") ? commandLine.getOptionValue("error-report-location") : null,
commandLine.hasOption("replace-symbol-entrez"), commandLine.hasOption("post-interval-size") ? Integer.valueOf(commandLine.getOptionValue("post-interval-size")) : -1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,35 @@
import org.springframework.batch.item.file.LineCallbackHandler;
import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.*;

/**
*
* @author Mete Ozguz
*/
public class DefaultLineCallbackHandler implements LineCallbackHandler {

private static final String[] requiredNames = {"Chromosome", "Start_Position", "End_Position", "Reference_Allele"};
private final Logger LOG = LoggerFactory.getLogger(DefaultLineCallbackHandler.class);
private final DelimitedLineTokenizer tokenizer;
private final List<String> inputFileHeaders;

public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer) {
/**
 * Creates a handler for the header line of the input file.
 * Both references are kept as-is (not copied); {@code handleLine} later sets the
 * column names on the tokenizer and appends them to {@code inputFileHeaders}.
 *
 * @param tokenizer Reference for the DefaultLineMapper's LineTokenizer. Non null.
 * @param inputFileHeaders Reference for the header names which will be used for 'minimal' file format. Non null.
 * @throws NullPointerException if {@code tokenizer} or {@code inputFileHeaders} is null
 */
public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer, List<String> inputFileHeaders) {
    // Fail fast here instead of with an anonymous NPE later while handling the header line.
    this.tokenizer = Objects.requireNonNull(tokenizer, "tokenizer");
    this.inputFileHeaders = Objects.requireNonNull(inputFileHeaders, "inputFileHeaders");
}

/**
* Parser and validator of tab separated header names
* Should be invoked only for the header line!
*
* @param line
*/
@Override
public void handleLine(String line) {
String[] names = line.split("\t");
Expand All @@ -37,6 +52,7 @@ public void handleLine(String line) {
LOG.error(errorMessage);
throw new RuntimeException(errorMessage);
}
Collections.addAll(inputFileHeaders, names);
tokenizer.setNames(names); // do not use sorted names here
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.*;
import org.cbioportal.annotator.internal.AnnotationSummaryStatistics;
import org.cbioportal.annotator.Annotator;
import org.cbioportal.format.ExtendedMafFormat;
import org.cbioportal.models.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -59,6 +60,8 @@
*/
public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {

private List<String> inputFileHeaders = new ArrayList<>();

@Value("#{jobParameters[filename]}")
private String filename;

Expand All @@ -74,6 +77,9 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
@Value("#{jobParameters[postIntervalSize]}")
private Integer postIntervalSize;

@Value("#{jobParameters[outputFormat]}")
private String outputFormat;

private AnnotationSummaryStatistics summaryStatistics;
private List<AnnotatedRecord> allAnnotatedRecords = new ArrayList<>();
private Set<String> header = new LinkedHashSet<>();
Expand All @@ -85,19 +91,49 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {

@Override
public void open(ExecutionContext ec) throws ItemStreamException {
this.summaryStatistics = new AnnotationSummaryStatistics(annotator);
summaryStatistics = new AnnotationSummaryStatistics(annotator);
String genomeNexusVersion = annotator.getVersion();

processComments(ec, genomeNexusVersion);
List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
if (!mutationRecords.isEmpty()) {
if (postIntervalSize > 0) {
this.allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
} else {
this.allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
}
for (AnnotatedRecord ar : this.allAnnotatedRecords) {
header.addAll(ar.getHeaderWithAdditionalFields());
// if output-format option is supplied, we only need to convert its data into header
if (outputFormat != null) {
if ("tcga".equals(outputFormat)) {
for(String token : ExtendedMafFormat.headers) {
header.add(token);
}
} else if ("minimal".equals(outputFormat)) {
for(String token : inputFileHeaders) {
header.add(token);
}
} else {
String[] tokens = outputFormat.split(",");
for (int i = 0; i < tokens.length; i++) {
header.add(tokens[i].trim());
}
}
// extra headers should go in the back alphabetically for these options
if ("tcga".equals(outputFormat) || "minimal".equals(outputFormat)) {
Set<String> sortedAllHeaders = new TreeSet<>();
for (AnnotatedRecord ar : allAnnotatedRecords) {
sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
}
for(String token : sortedAllHeaders) {
if (!header.contains(token)) {
header.add(token);
}
}
}
} else {
for (AnnotatedRecord ar : allAnnotatedRecords) {
header.addAll(ar.getHeaderWithAdditionalFields());
}
}
// add 'Annotation_Status' to header if not already present
if (!header.contains("Annotation_Status")) {
Expand Down Expand Up @@ -129,7 +165,7 @@ private List<MutationRecord> loadMutationRecordsFromMaf() {
mapper.setFieldSetMapper(new MutationFieldSetMapper());
reader.setLineMapper(mapper);
reader.setLinesToSkip(1);
reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer));
reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer, inputFileHeaders));
reader.open(new ExecutionContext());
LOG.info("Loading records from: " + filename);
MutationRecord mutationRecord;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ public void open(ExecutionContext ec) throws ItemStreamException {
flatFileItemWriter.setHeaderCallback(new FlatFileHeaderCallback() {
@Override
public void writeHeader(Writer writer) throws IOException {
AnnotatedRecord record = new AnnotatedRecord();

// first write out the comment lines, then write the actual header
for (String comment : commentLines) {
writer.write(comment + "\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.junit.jupiter.api.Test;
import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;

import java.util.ArrayList;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

Expand All @@ -15,7 +17,7 @@ class DefaultLineCallbackHandlerTest {
void handleLine_success_Allele1() {
String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele1";
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
handler.handleLine(testLine);
assertEquals(true, tokenizer.hasNames());
}
Expand All @@ -24,7 +26,7 @@ void handleLine_success_Allele1() {
void handleLine_success_Allele2() {
String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele2";
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
handler.handleLine(testLine);
assertEquals(true, tokenizer.hasNames());
}
Expand Down Expand Up @@ -61,7 +63,7 @@ void handleLine_missing_Tumor_Seq_Allele1_and_Tumor_Seq_Allele2() {

private void handleLine(String line, String expectedMessage) {
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
try {
handler.handleLine(line);
} catch (RuntimeException e) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package org.cbioportal.format;

import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Column names of the extended MAF format, in column order.
 * Created using https://docs.cbioportal.org/file-formats/#extended-maf-format
 * <pre>
 * The extended MAF format recognized by the portal has:
 * 32 columns from the TCGA MAF format.
 * 1 column with the amino acid change.
 * 4 columns with information on reference and variant allele counts in tumor and normal samples.
 * </pre>
 *
 * @author Mete Ozguz
 */
public class ExtendedMafFormat {
    /**
     * The 37 extended MAF column names. Iteration order is the insertion order below.
     * The set is unmodifiable: attempts to add or remove entries throw
     * {@link UnsupportedOperationException}, so the shared constant cannot be corrupted by a caller.
     */
    public static final Set<String> headers;

    static {
        // Built in a local set first so that only the read-only view is ever published.
        Set<String> columns = new LinkedHashSet<>();
        //
        // 32 columns from the TCGA MAF format.
        //
        columns.add("Hugo_Symbol"); // (Required): A HUGO gene symbol.
        columns.add("Entrez_Gene_Id"); // (Optional, but recommended): A Entrez Gene identifier.
        columns.add("Center"); // (Optional): The sequencing center.
        // (Required): The Genome Reference Consortium Build is used by a variant calling software.
        // It must be "GRCh37" or "GRCh38" for a human, and "GRCm38" for a mouse.
        columns.add("NCBI_Build");
        columns.add("Chromosome"); // (Required): A chromosome number, e.g., "7".
        columns.add("Start_Position"); // (Required): Start position of event.
        columns.add("End_Position"); // (Required): End position of event.
        columns.add("Strand"); // (Optional): We assume that the mutation is reported for the + strand.
        // (Required): Translational effect of variant allele, e.g. Missense_Mutation, Silent, etc.
        columns.add("Variant_Classification");
        columns.add("Variant_Type"); // (Optional): Variant Type, e.g. SNP, DNP, etc.
        columns.add("Reference_Allele"); // (Required): The plus strand reference allele at this position.
        columns.add("Tumor_Seq_Allele1"); // (Optional): Primary data genotype.
        columns.add("Tumor_Seq_Allele2"); // (Required): Primary data genotype.
        columns.add("dbSNP_RS"); // (Optional): Latest dbSNP rs ID.
        columns.add("dbSNP_Val_Status"); // (Optional): dbSNP validation status.
        // (Required): This is the sample ID. Either a TCGA barcode (patient identifier will be extracted),
        // or for non-TCGA data, a literal SAMPLE_ID as listed in the clinical data file.
        columns.add("Tumor_Sample_Barcode");
        columns.add("Matched_Norm_Sample_Barcode"); // (Optional): The sample ID for the matched normal sample.
        columns.add("Match_Norm_Seq_Allele1"); // (Optional): Primary data.
        columns.add("Match_Norm_Seq_Allele2"); // (Optional): Primary data.
        columns.add("Tumor_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
        columns.add("Tumor_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
        columns.add("Match_Norm_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
        columns.add("Match_Norm_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
        // (Optional): Second pass results from independent attempt using same methods as primary data source.
        // "Verified", "Unknown" or "NA".
        columns.add("Verification_Status");
        // (Optional): Second pass results from orthogonal technology.
        // "Valid", "Invalid", "Untested", "Inconclusive", "Redacted", "Unknown" or "NA".
        columns.add("Validation_Status");
        // (Optional): "Somatic" or "Germline" are supported by the UI in Mutations tab.
        // "None", "LOH" and "Wildtype" will not be loaded. Other values will be displayed as text.
        columns.add("Mutation_Status");
        columns.add("Sequencing_Phase"); // (Optional): Indicates current sequencing phase.
        // (Optional): Molecular assay type used to produce the analytes used for sequencing.
        columns.add("Sequence_Source");
        columns.add("Validation_Method"); // (Optional): The assay platforms used for the validation call.
        columns.add("Score"); // (Optional): Not used.
        columns.add("BAM_File"); // (Optional): Not used.
        columns.add("Sequencer"); // (Optional): Instrument used to produce primary data.
        //
        // 1 column with the amino acid change.
        //
        columns.add("HGVSp_Short"); // (Required): Amino Acid Change, e.g. p.V600E.
        //
        // 4 columns with information on reference and variant allele counts in tumor and normal samples.
        //
        columns.add("t_alt_count"); // (Optional): Variant allele count (tumor).
        columns.add("t_ref_count"); // (Optional): Reference allele count (tumor).
        columns.add("n_alt_count"); // (Optional): Variant allele count (normal).
        columns.add("n_ref_count"); // (Optional): Reference allele count (normal).

        headers = Collections.unmodifiableSet(columns);
    }
}
1 change: 1 addition & 0 deletions test/data/output_format_example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Annotation_Status