Add output format tcga

genome-nexus · Aug 6, 2022 · 179a8be · 179a8be
1 parent faa50cf
commit 179a8be
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 8 deletions.
diff --git a/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java b/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java
@@ -59,7 +59,7 @@ private static Options getOptions(String[] args)
         gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
             .addOption("f", "filename", true, "Mutation filename")
             .addOption("o", "output-filename", true, "Output filename (including path)")
-            .addOption("t", "output-format", true, "File path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
+            .addOption("t", "output-format", true, "tcga or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
             .addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
             .addOption("e", "error-report-location", true, "Error report filename (including path)")
             .addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator" )
@@ -113,10 +113,16 @@ public static void main(String[] args) throws Exception
         String outputFormat = null;
         if (commandLine.hasOption("output-format")) {
             String outputFormatFile = commandLine.getOptionValue("output-format");
-            try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {
-                outputFormat = br.readLine();
-            } catch (IOException e) {
-                System.err.println("Error while reading output-format file: " + outputFormatFile);
+            if ("tcga".equals(outputFormatFile)) {
+                outputFormat = "tcga";
+            } else {
+                // user supplied a format file instead of pre-defined formats
+                try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {
+                    outputFormat = br.readLine();
+                } catch (IOException e) {
+                    System.err.println("Error while reading output-format file: " + outputFormatFile);
+                    System.exit(0);
+                }
             }
         }
         launchJob(args, commandLine.getOptionValue("filename"), commandLine.getOptionValue("output-filename"), outputFormat, commandLine.getOptionValue("isoform-override"),

diff --git a/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java b/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java
@@ -36,6 +36,7 @@
 import java.util.*;
 import org.cbioportal.annotator.internal.AnnotationSummaryStatistics;
 import org.cbioportal.annotator.Annotator;
+import org.cbioportal.format.ExtendedMafFormat;
 import org.cbioportal.models.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -101,9 +102,25 @@ public void open(ExecutionContext ec) throws ItemStreamException {
             }
             // if output-format option is supplied, we only need to convert its data into header
             if (outputFormat != null) {
-                String[] tokens = outputFormat.split(",");
-                for (int i = 0; i < tokens.length; i++) {
-                    header.add(tokens[i].trim());
+                if ("tcga".equals(outputFormat)) {
+                    Set<String> sortedAllHeaders = new TreeSet<>();
+                    for (AnnotatedRecord ar : this.allAnnotatedRecords) {
+                        sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
+                    }
+                    for(String token : ExtendedMafFormat.headers) {
+                        header.add(token);
+                    }
+                    // extra headers should go in the back alphabetically
+                    for(String token : sortedAllHeaders) {
+                        if (!header.contains(token)) {
+                            header.add(token);
+                        }
+                    }
+                } else {
+                    String[] tokens = outputFormat.split(",");
+                    for (int i = 0; i < tokens.length; i++) {
+                        header.add(tokens[i].trim());
+                    }
                 }
             } else {
                 for (AnnotatedRecord ar : this.allAnnotatedRecords) {

diff --git a/annotator/src/main/java/org/cbioportal/format/ExtendedMafFormat.java b/annotator/src/main/java/org/cbioportal/format/ExtendedMafFormat.java
@@ -0,0 +1,72 @@
+package org.cbioportal.format;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Created using https://docs.cbioportal.org/file-formats/#extended-maf-format
+ * <pre>
+ * The extended MAF format recognized by the portal has:
+ *    32 columns from the TCGA MAF format.
+ *    1 column with the amino acid change.
+ *    4 columns with information on reference and variant allele counts in tumor and normal samples.
+ * </pre>
+ *
+ * @author Mete Ozguz
+ */
+public class ExtendedMafFormat {
+    public static final Set<String> headers = new LinkedHashSet<>();
+
+    static {
+        //
+        // 32 columns from the TCGA MAF format.
+        //
+        headers.add("Hugo_Symbol"); // (Required): A HUGO gene symbol.
+        headers.add("Entrez_Gene_Id"); // (Optional, but recommended): A Entrez Gene identifier.
+        headers.add("Center"); // (Optional): The sequencing center.
+        headers.add(
+                "NCBI_Build"); // (Required): The Genome Reference Consortium Build is used by a variant calling software. It must be "GRCh37" or "GRCh38" for a human, and "GRCm38" for a mouse.
+        headers.add("Chromosome"); // (Required): A chromosome number, e.g., "7".
+        headers.add("Start_Position"); // (Required): Start position of event.
+        headers.add("End_Position"); // (Required): End position of event.
+        headers.add("Strand"); // (Optional): We assume that the mutation is reported for the + strand.
+        headers.add("Variant_Classification"); // (Required): Translational effect of variant allele, e.g. Missense_Mutation, Silent, etc.
+        headers.add("Variant_Type"); // (Optional): Variant Type, e.g. SNP, DNP, etc.
+        headers.add("Reference_Allele"); // (Required): The plus strand reference allele at this position.
+        headers.add("Tumor_Seq_Allele1"); // (Optional): Primary data genotype.
+        headers.add("Tumor_Seq_Allele2"); // (Required): Primary data genotype.
+        headers.add("dbSNP_RS"); // (Optional): Latest dbSNP rs ID.
+        headers.add("dbSNP_Val_Status"); // (Optional): dbSNP validation status.
+        headers.add(
+                "Tumor_Sample_Barcode"); // (Required): This is the sample ID. Either a TCGA barcode (patient identifier will be extracted), or for non-TCGA data, a literal SAMPLE_ID as listed in the clinical data file.
+        headers.add("Matched_Norm_Sample_Barcode"); // (Optional): The sample ID for the matched normal sample.
+        headers.add("Match_Norm_Seq_Allele1"); // (Optional): Primary data.
+        headers.add("Match_Norm_Seq_Allele2"); // (Optional): Primary data.
+        headers.add("Tumor_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
+        headers.add("Tumor_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
+        headers.add("Match_Norm_Validation_Allele1"); // (Optional): Secondary data from orthogonal technology.
+        headers.add("Match_Norm_Validation_Allele2"); // (Optional): Secondary data from orthogonal technology.
+        headers.add("Verification_Status"); // (Optional): Second pass results from independent attempt using same methods as primary data source. "Verified", "Unknown" or "NA".
+        headers.add(
+                "Validation_Status"); // (Optional): Second pass results from orthogonal technology. "Valid", "Invalid", "Untested", "Inconclusive", "Redacted", "Unknown" or "NA".
+        headers.add(
+                "Mutation_Status"); // (Optional): "Somatic" or "Germline" are supported by the UI in Mutations tab. "None", "LOH" and "Wildtype" will not be loaded. Other values will be displayed as text.
+        headers.add("Sequencing_Phase"); // (Optional): Indicates current sequencing phase.
+        headers.add("Sequence_Source"); // (Optional): Molecular assay type used to produce the analytes used for sequencing.
+        headers.add("Validation_Method"); // (Optional): The assay platforms used for the validation call.
+        headers.add("Score"); // (Optional): Not used.
+        headers.add("BAM_File"); // (Optional): Not used.
+        headers.add("Sequencer"); // (Optional): Instrument used to produce primary data.
+        //
+        // 1 column with the amino acid change.
+        //
+        headers.add("HGVSp_Short"); // (Required): Amino Acid Change, e.g. p.V600E.
+        //
+        // 4 columns with information on reference and variant allele counts in tumor and normal samples.
+        //
+        headers.add("t_alt_count"); // (Optional): Variant allele count (tumor).
+        headers.add("t_ref_count"); // (Optional): Reference allele count (tumor).
+        headers.add("n_alt_count"); // (Optional): Variant allele count (normal).
+        headers.add("n_ref_count"); // (Optional): Reference allele count (normal).
+    }
+}