genome-nexus · inodb · Aug 9, 2022 · Jun 25, 2022 · Jul 26, 2022 · Jul 30, 2022
diff --git a/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java b/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java
@@ -42,7 +42,13 @@
 import org.springframework.batch.core.launch.JobLauncher;
 import org.springframework.boot.WebApplicationType;
 
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+
 /**
+ *
+ * @author Mete Ozguz
  * @author Zachary Heins
  */
 @SpringBootApplication
@@ -55,6 +61,7 @@ private static Options getOptions(String[] args)
         gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
             .addOption("f", "filename", true, "Mutation filename")
             .addOption("o", "output-filename", true, "Output filename (including path)")
+            .addOption("t", "output-format", true, "tcga, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
             .addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
             .addOption("e", "error-report-location", true, "Error report filename (including path)")
             .addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator" )
@@ -70,7 +77,7 @@ private static void help(Options gnuOptions, int exitStatus)
         System.exit(exitStatus);
     }
 
-    private static void launchJob(String[] args, String filename, String outputFilename, String isoformOverride,
+    private static void launchJob(String[] args, String filename, String outputFilename, String outputFormat, String isoformOverride,
             String errorReportLocation, boolean replace, Integer postIntervalSize) throws Exception
     {
         SpringApplication app = new SpringApplication(AnnotationPipeline.class);
@@ -83,6 +90,7 @@ private static void launchJob(String[] args, String filename, String outputFilen
         JobParameters jobParameters = new JobParametersBuilder()
             .addString("filename", filename)
             .addString("outputFilename", outputFilename)
+            .addString("outputFormat", outputFormat)
             .addString("replace", String.valueOf(replace))
             .addString("isoformOverride", isoformOverride)
             .addString("errorReportLocation", errorReportLocation)
@@ -104,7 +112,24 @@ public static void main(String[] args) throws Exception
             !commandLine.hasOption("output-filename")) {
             help(gnuOptions, 0);
         }
-        launchJob(args, commandLine.getOptionValue("filename"), commandLine.getOptionValue("output-filename"),commandLine.getOptionValue("isoform-override"),
+        // Resolve the requested output format: a pre-defined name ("tcga" or "minimal") is passed
+        // through as-is, anything else is treated as the path of a file whose first line holds the
+        // format. outputFormat stays null when the option is not supplied.
+        String outputFormat = null;
+        if (commandLine.hasOption("output-format")) {
+            String outputFormatFile = commandLine.getOptionValue("output-format");
+            if ("tcga".equals(outputFormatFile) || "minimal".equals(outputFormatFile)) {
+                outputFormat = outputFormatFile;
+            } else {
+                // user supplied a format file instead of pre-defined formats
+                try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {
+                    outputFormat = br.readLine();
+                } catch (IOException e) {
+                    System.err.println("Error while reading output-format file: " + outputFormatFile);
+                    // a failed read is an error, so signal it to the caller with a non-zero status
+                    System.exit(1);
+                }
+                // readLine() returns null for an empty file, and a blank line carries no column
+                // names; reject both instead of silently continuing without the requested format
+                if (outputFormat == null || outputFormat.trim().isEmpty()) {
+                    System.err.println("Output-format file does not contain a format line: " + outputFormatFile);
+                    System.exit(1);
+                }
+            }
+        }
+        launchJob(args, commandLine.getOptionValue("filename"), commandLine.getOptionValue("output-filename"), outputFormat, commandLine.getOptionValue("isoform-override"),
                 commandLine.hasOption("error-report-location") ? commandLine.getOptionValue("error-report-location") : null,
                 commandLine.hasOption("replace-symbol-entrez"), commandLine.hasOption("post-interval-size") ? Integer.valueOf(commandLine.getOptionValue("post-interval-size")) : -1);
     }

diff --git a/...Pipeline/src/main/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandler.java b/...Pipeline/src/main/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandler.java
@@ -5,20 +5,35 @@
 import org.springframework.batch.item.file.LineCallbackHandler;
 import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;
 
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;
 
+/**
+ *
+ * @author Mete Ozguz
+ */
 public class DefaultLineCallbackHandler implements LineCallbackHandler {
 
     private static final String[] requiredNames = {"Chromosome", "Start_Position", "End_Position", "Reference_Allele"};
     private final Logger LOG = LoggerFactory.getLogger(DefaultLineCallbackHandler.class);
     private final DelimitedLineTokenizer tokenizer;
+    private final List<String> inputFileHeaders;
 
-    public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer) {
+    /**
+     *
+     * @param tokenizer Reference for the DefaultLineMapper's LineTokenizer. Non null.
+     * @param inputFileHeaders List that {@link #handleLine(String)} appends the input file's header names to, in file order; the caller reads it back for the 'minimal' file format. Non null.
+     */
+    public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer, List<String> inputFileHeaders) {
         this.tokenizer = tokenizer;
+        this.inputFileHeaders = inputFileHeaders;
     }
 
+    /**
+     * Parser and validator of tab separated header names
+     * Should be invoked only for the header line!
+     *
+     * @param line the header line, with column names separated by tab characters
+     */
     @Override
     public void handleLine(String line) {
         String[] names = line.split("\t");
@@ -37,6 +52,7 @@ public void handleLine(String line) {
             LOG.error(errorMessage);
             throw new RuntimeException(errorMessage);
         }
+        Collections.addAll(inputFileHeaders, names);
         tokenizer.setNames(names); // do not use sorted names here
     }
 }
diff --git a/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java b/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java
@@ -36,6 +36,7 @@
 import java.util.*;
 import org.cbioportal.annotator.internal.AnnotationSummaryStatistics;
 import org.cbioportal.annotator.Annotator;
+import org.cbioportal.format.ExtendedMafFormat;
 import org.cbioportal.models.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -59,6 +60,8 @@
  */
 public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
 
+    private List<String> inputFileHeaders = new ArrayList<>();
+
     @Value("#{jobParameters[filename]}")
     private String filename;
 
@@ -74,6 +77,9 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
     @Value("#{jobParameters[postIntervalSize]}")
     private Integer postIntervalSize;
 
+    @Value("#{jobParameters[outputFormat]}")
+    private String outputFormat;
+
     private AnnotationSummaryStatistics summaryStatistics;
     private List<AnnotatedRecord> allAnnotatedRecords = new ArrayList<>();
     private Set<String> header = new LinkedHashSet<>();
@@ -85,19 +91,49 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
 
     @Override
     public void open(ExecutionContext ec) throws ItemStreamException {
-        this.summaryStatistics = new AnnotationSummaryStatistics(annotator);
+        summaryStatistics = new AnnotationSummaryStatistics(annotator);
         String genomeNexusVersion = annotator.getVersion();
 
         processComments(ec, genomeNexusVersion);
         List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
         if (!mutationRecords.isEmpty()) {
             if (postIntervalSize > 0) {
-                this.allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
+                allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
             } else {
-                this.allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
+                allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
             }
-            for (AnnotatedRecord ar : this.allAnnotatedRecords) {
-                header.addAll(ar.getHeaderWithAdditionalFields());
+            // Build the output header. Without an output-format option it is the union of the
+            // annotated records' headers; otherwise the requested format decides the column order.
+            if (outputFormat == null) {
+                for (AnnotatedRecord annotatedRecord : allAnnotatedRecords) {
+                    header.addAll(annotatedRecord.getHeaderWithAdditionalFields());
+                }
+            } else if ("tcga".equals(outputFormat) || "minimal".equals(outputFormat)) {
+                // pre-defined formats: the format's own columns come first ...
+                if ("tcga".equals(outputFormat)) {
+                    header.addAll(ExtendedMafFormat.headers);
+                } else {
+                    header.addAll(inputFileHeaders);
+                }
+                // ... followed by every other column of the annotated records, alphabetically
+                Set<String> remainingColumns = new TreeSet<>();
+                for (AnnotatedRecord annotatedRecord : allAnnotatedRecords) {
+                    remainingColumns.addAll(annotatedRecord.getHeaderWithAdditionalFields());
+                }
+                // header is a set, so columns that are already present keep their earlier position
+                header.addAll(remainingColumns);
+            } else {
+                // custom format: comma-separated column names, used exactly as listed
+                for (String columnName : outputFormat.split(",")) {
+                    header.add(columnName.trim());
+                }
             }
             // add 'Annotation_Status' to header if not already present
             if (!header.contains("Annotation_Status")) {
@@ -129,7 +165,7 @@ private List<MutationRecord> loadMutationRecordsFromMaf() {
         mapper.setFieldSetMapper(new MutationFieldSetMapper());
         reader.setLineMapper(mapper);
         reader.setLinesToSkip(1);
-        reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer));
+        reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer, inputFileHeaders));
         reader.open(new ExecutionContext());
         LOG.info("Loading records from: " + filename);
         MutationRecord mutationRecord;

diff --git a/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordWriter.java b/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordWriter.java
@@ -76,8 +76,6 @@ public void open(ExecutionContext ec) throws ItemStreamException {
             flatFileItemWriter.setHeaderCallback(new FlatFileHeaderCallback() {
                 @Override
                 public void writeHeader(Writer writer) throws IOException {
-                    AnnotatedRecord record = new AnnotatedRecord();
-
                     // first write out the comment lines, then write the actual header
                     for (String comment : commentLines) {
                         writer.write(comment + "\n");

diff --git a/...line/src/test/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandlerTest.java b/...line/src/test/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandlerTest.java
@@ -3,6 +3,8 @@
 import org.junit.jupiter.api.Test;
 import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;
 
+import java.util.ArrayList;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -15,7 +17,7 @@ class DefaultLineCallbackHandlerTest {
     void handleLine_success_Allele1() {
         String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele1";
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         handler.handleLine(testLine);
         assertEquals(true, tokenizer.hasNames());
     }
@@ -24,7 +26,7 @@ void handleLine_success_Allele1() {
     void handleLine_success_Allele2() {
         String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele2";
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         handler.handleLine(testLine);
         assertEquals(true, tokenizer.hasNames());
     }
@@ -61,7 +63,7 @@ void handleLine_missing_Tumor_Seq_Allele1_and_Tumor_Seq_Allele2() {
 
     private void handleLine(String line, String expectedMessage) {
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         try {
             handler.handleLine(line);
         } catch (RuntimeException e) {

diff --git a/annotator/src/main/java/org/cbioportal/format/ExtendedMafFormat.java b/annotator/src/main/java/org/cbioportal/format/ExtendedMafFormat.java
@@ -0,0 +1,72 @@
+package org.cbioportal.format;
+
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Created using https://docs.cbioportal.org/file-formats/#extended-maf-format
+ * <pre>
+ * The extended MAF format recognized by the portal has:
+ *    32 columns from the TCGA MAF format.
+ *    1 column with the amino acid change.
+ *    4 columns with information on reference and variant allele counts in tumor and normal samples.
+ * </pre>
+ *
+ * @author Mete Ozguz
+ */
+public class ExtendedMafFormat {
+    /**
+     * Column names of the extended MAF format, iterated in the order they are registered below.
+     * The set is unmodifiable: it is shared by every caller, so an attempt to add or remove a
+     * column throws {@link UnsupportedOperationException} instead of altering the format.
+     */
+    public static final Set<String> headers;
+
+    static {
+        // LinkedHashSet keeps the registration order, which is the column order of the format
+        Set<String> columns = new LinkedHashSet<>();
+        //
+        // 32 columns from the TCGA MAF format.
+        //
+        columns.add("Hugo_Symbol"); // (Required): A HUGO gene symbol.
+        columns.add("Entrez_Gene_Id"); // (Optional, but recommended): A Entrez Gene identifier.
+        columns.add("Center"); // (Optional): The sequencing center.
+        // (Required): The Genome Reference Consortium Build is used by a variant calling software. It must be "GRCh37" or "GRCh38" for a human, and "GRCm38" for a mouse.
+        columns.add("NCBI_Build");
+        columns.add("Chromosome"); // (Required): A chromosome number, e.g., "7".
+        columns.add("Start_Position"); // (Required): Start position of event.
+        columns.add("End_Position"); // (Required): End position of event.
+        columns.add("Strand"); // (Optional): We assume that the mutation is reported for the + strand.
+        columns.add("Variant_Classification"); // (Required): Translational effect of variant allele, e.g. Missense_Mutation, Silent, etc.
+        columns.add("Variant_Type"); // (Optional): Variant Type, e.g. SNP, DNP, etc.
+        columns.add("Reference_Allele"); // (Required): The plus strand reference allele at this position.
+        columns.add("Tumor_Seq_Allele1"); // (Optional): Primary data genotype.
+        columns.add("Tumor_Seq_Allele2"); // (Required): Primary data genotype.
+        columns.add("dbSNP_RS"); // (Optional): Latest dbSNP rs ID.
+        columns.add("dbSNP_Val_Status"); // (Optional): dbSNP validation status.
+        // (Required): This is the sample ID. Either a TCGA barcode (patient identifier will be extracted), or for non-TCGA data, a literal SAMPLE_ID as listed in the clinical data file.
+        columns.add("Tumor_Sample_Barcode");
+        columns.add("Matched_Norm_Sample_Barcode"); // (Optional): The sample ID for the matched normal sample.
+        columns.add("Match_Norm_Seq_Allele1"); // (Optional): Primary data.
+        columns.add("Match_Norm_Seq_Allele2"); // (Optional): Primary data.
+        columns.add("Tumor_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
+        columns.add("Tumor_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
+        columns.add("Match_Norm_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
+        columns.add("Match_Norm_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
+        columns.add("Verification_Status"); // (Optional): Second pass results from independent attempt using same methods as primary data source. "Verified", "Unknown" or "NA".
+        // (Optional): Second pass results from orthogonal technology. "Valid", "Invalid", "Untested", "Inconclusive", "Redacted", "Unknown" or "NA".
+        columns.add("Validation_Status");
+        // (Optional): "Somatic" or "Germline" are supported by the UI in Mutations tab. "None", "LOH" and "Wildtype" will not be loaded. Other values will be displayed as text.
+        columns.add("Mutation_Status");
+        columns.add("Sequencing_Phase"); // (Optional): Indicates current sequencing phase.
+        columns.add("Sequence_Source"); // (Optional): Molecular assay type used to produce the analytes used for sequencing.
+        columns.add("Validation_Method"); // (Optional): The assay platforms used for the validation call.
+        columns.add("Score"); // (Optional): Not used.
+        columns.add("BAM_File"); // (Optional): Not used.
+        columns.add("Sequencer"); // (Optional): Instrument used to produce primary data.
+        //
+        // 1 column with the amino acid change.
+        //
+        columns.add("HGVSp_Short"); // (Required): Amino Acid Change, e.g. p.V600E.
+        //
+        // 4 columns with information on reference and variant allele counts in tumor and normal samples.
+        //
+        columns.add("t_alt_count"); // (Optional): Variant allele count (tumor).
+        columns.add("t_ref_count"); // (Optional): Reference allele count (tumor).
+        columns.add("n_alt_count"); // (Optional): Variant allele count (normal).
+        columns.add("n_ref_count"); // (Optional): Reference allele count (normal).
+
+        // publish a read-only view so no caller can alter the shared column list
+        headers = Collections.unmodifiableSet(columns);
+    }
+}
diff --git a/test/data/output_format_example.txt b/test/data/output_format_example.txt
@@ -0,0 +1 @@
+Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Annotation_Status
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Annotation_Status