Add output format minimal

genome-nexus · Aug 6, 2022 · 30e63bb · 30e63bb
1 parent 179a8be
commit 30e63bb
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 22 deletions.
diff --git a/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java b/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java
@@ -47,6 +47,8 @@
 import java.io.IOException;
 
 /**
+ *
+ * @author Mete Ozguz
  * @author Zachary Heins
  */
 @SpringBootApplication
@@ -59,7 +61,7 @@ private static Options getOptions(String[] args)
         gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
             .addOption("f", "filename", true, "Mutation filename")
             .addOption("o", "output-filename", true, "Output filename (including path)")
-            .addOption("t", "output-format", true, "tcga or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
+            .addOption("t", "output-format", true, "tcga, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
             .addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
             .addOption("e", "error-report-location", true, "Error report filename (including path)")
             .addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator" )
@@ -115,6 +117,8 @@ public static void main(String[] args) throws Exception
             String outputFormatFile = commandLine.getOptionValue("output-format");
             if ("tcga".equals(outputFormatFile)) {
                 outputFormat = "tcga";
+            } else if ("minimal".equals(outputFormatFile)) {
+                outputFormat = "minimal";
             } else {
                 // user supplied a format file instead of pre-defined formats
                 try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {

diff --git a/...Pipeline/src/main/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandler.java b/...Pipeline/src/main/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandler.java
@@ -5,20 +5,35 @@
 import org.springframework.batch.item.file.LineCallbackHandler;
 import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;
 
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
+import java.util.*;
 
+/**
+ *
+ * @author Mete Ozguz
+ */
 public class DefaultLineCallbackHandler implements LineCallbackHandler {
 
     private static final String[] requiredNames = {"Chromosome", "Start_Position", "End_Position", "Reference_Allele"};
     private final Logger LOG = LoggerFactory.getLogger(DefaultLineCallbackHandler.class);
     private final DelimitedLineTokenizer tokenizer;
+    private final List<String> inputFileHeaders;
 
-    public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer) {
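+    /**
+     * Creates a handler for the header line of the input file.
+     * @param tokenizer The DefaultLineMapper's LineTokenizer, which receives the column names from the header line. Must not be null.
+     * @param inputFileHeaders List to which the header names are appended; used for the 'minimal' output format. Must not be null.
+     */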
+    public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer, List<String> inputFileHeaders) {
         this.tokenizer = tokenizer;
+        this.inputFileHeaders = inputFileHeaders;
     }
 
+    /**
+     * Parses and validates the tab-separated column names of the header line.
+     * Must be invoked only for the header line!
+     *
+     * @param line the header line, with column names separated by tabs
+     */
     @Override
     public void handleLine(String line) {
         String[] names = line.split("\t");
@@ -37,6 +52,7 @@ public void handleLine(String line) {
             LOG.error(errorMessage);
             throw new RuntimeException(errorMessage);
         }
+        Collections.addAll(inputFileHeaders, names);
         tokenizer.setNames(names); // do not use sorted names here
     }
 }
diff --git a/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java b/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java
@@ -60,6 +60,8 @@
  */
 public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
 
+    private List<String> inputFileHeaders = new ArrayList<>();
+
     @Value("#{jobParameters[filename]}")
     private String filename;
 
@@ -89,41 +91,47 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
 
     @Override
     public void open(ExecutionContext ec) throws ItemStreamException {
-        this.summaryStatistics = new AnnotationSummaryStatistics(annotator);
+        summaryStatistics = new AnnotationSummaryStatistics(annotator);
         String genomeNexusVersion = annotator.getVersion();
 
         processComments(ec, genomeNexusVersion);
         List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
         if (!mutationRecords.isEmpty()) {
             if (postIntervalSize > 0) {
-                this.allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
+                allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
             } else {
-                this.allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
+                allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
             }
             // if output-format option is supplied, we only need to convert its data into header
             if (outputFormat != null) {
                 if ("tcga".equals(outputFormat)) {
-                    Set<String> sortedAllHeaders = new TreeSet<>();
-                    for (AnnotatedRecord ar : this.allAnnotatedRecords) {
-                        sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
-                    }
                     for(String token : ExtendedMafFormat.headers) {
                         header.add(token);
                     }
-                    // extra headers should go in the back alphabetically
-                    for(String token : sortedAllHeaders) {
-                        if (!header.contains(token)) {
-                            header.add(token);
-                        }
+                } else if ("minimal".equals(outputFormat)) {
+                    for(String token : inputFileHeaders) {
+                        header.add(token);
                     }
                 } else {
                     String[] tokens = outputFormat.split(",");
                     for (int i = 0; i < tokens.length; i++) {
                         header.add(tokens[i].trim());
                     }
                 }
+                // for the tcga and minimal formats, extra headers are appended at the back in alphabetical order
+                if ("tcga".equals(outputFormat) || "minimal".equals(outputFormat)) {
+                    Set<String> sortedAllHeaders = new TreeSet<>();
+                    for (AnnotatedRecord ar : allAnnotatedRecords) {
+                        sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
+                    }
+                    for(String token : sortedAllHeaders) {
+                        if (!header.contains(token)) {
+                            header.add(token);
+                        }
+                    }
+                }
             } else {
-                for (AnnotatedRecord ar : this.allAnnotatedRecords) {
+                for (AnnotatedRecord ar : allAnnotatedRecords) {
                     header.addAll(ar.getHeaderWithAdditionalFields());
                 }
             }
@@ -157,7 +165,7 @@ private List<MutationRecord> loadMutationRecordsFromMaf() {
         mapper.setFieldSetMapper(new MutationFieldSetMapper());
         reader.setLineMapper(mapper);
         reader.setLinesToSkip(1);
-        reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer));
+        reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer, inputFileHeaders));
         reader.open(new ExecutionContext());
         LOG.info("Loading records from: " + filename);
         MutationRecord mutationRecord;

diff --git a/...line/src/test/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandlerTest.java b/...line/src/test/java/org/cbioportal/annotation/pipeline/DefaultLineCallbackHandlerTest.java
@@ -3,6 +3,8 @@
 import org.junit.jupiter.api.Test;
 import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;
 
+import java.util.ArrayList;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -15,7 +17,7 @@ class DefaultLineCallbackHandlerTest {
     void handleLine_success_Allele1() {
         String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele1";
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         handler.handleLine(testLine);
         assertEquals(true, tokenizer.hasNames());
     }
@@ -24,7 +26,7 @@ void handleLine_success_Allele1() {
     void handleLine_success_Allele2() {
         String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele2";
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         handler.handleLine(testLine);
         assertEquals(true, tokenizer.hasNames());
     }
@@ -61,7 +63,7 @@ void handleLine_missing_Tumor_Seq_Allele1_and_Tumor_Seq_Allele2() {
 
     private void handleLine(String line, String expectedMessage) {
         DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
-        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
+        DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
         try {
             handler.handleLine(line);
         } catch (RuntimeException e) {