Skip to content

Commit

Permalink
Add output format minimal
Browse files Browse the repository at this point in the history
  • Loading branch information
ozguzMete committed Aug 6, 2022
1 parent 179a8be commit 30e63bb
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
import java.io.IOException;

/**
*
* @author Mete Ozguz
* @author Zachary Heins
*/
@SpringBootApplication
Expand All @@ -59,7 +61,7 @@ private static Options getOptions(String[] args)
gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
.addOption("f", "filename", true, "Mutation filename")
.addOption("o", "output-filename", true, "Output filename (including path)")
.addOption("t", "output-format", true, "tcga or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
.addOption("t", "output-format", true, "tcga, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)")
.addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
.addOption("e", "error-report-location", true, "Error report filename (including path)")
.addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator" )
Expand Down Expand Up @@ -115,6 +117,8 @@ public static void main(String[] args) throws Exception
String outputFormatFile = commandLine.getOptionValue("output-format");
if ("tcga".equals(outputFormatFile)) {
outputFormat = "tcga";
} else if ("minimal".equals(outputFormatFile)) {
outputFormat = "minimal";
} else {
// user supplied a format file instead of pre-defined formats
try (BufferedReader br = new BufferedReader(new FileReader(outputFormatFile))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,35 @@
import org.springframework.batch.item.file.LineCallbackHandler;
import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.*;

/**
*
* @author Mete Ozguz
*/
public class DefaultLineCallbackHandler implements LineCallbackHandler {

private static final String[] requiredNames = {"Chromosome", "Start_Position", "End_Position", "Reference_Allele"};
private final Logger LOG = LoggerFactory.getLogger(DefaultLineCallbackHandler.class);
private final DelimitedLineTokenizer tokenizer;
private final List<String> inputFileHeaders;

public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer) {
/**
*
* @param tokenizer The DefaultLineMapper's LineTokenizer; its column names are set from the parsed header line. Must not be null.
* @param inputFileHeaders List to which the input file's header names are appended, in file order; used for the 'minimal' output format. Must not be null.
*/
public DefaultLineCallbackHandler(DelimitedLineTokenizer tokenizer, List<String> inputFileHeaders) {
this.tokenizer = tokenizer;
this.inputFileHeaders = inputFileHeaders;
}

/**
* Parses and validates the tab-separated header names.
* Must be invoked only for the header line!
*
* @param line the header line of the input file, with column names separated by tabs
*/
@Override
public void handleLine(String line) {
String[] names = line.split("\t");
Expand All @@ -37,6 +52,7 @@ public void handleLine(String line) {
LOG.error(errorMessage);
throw new RuntimeException(errorMessage);
}
Collections.addAll(inputFileHeaders, names);
tokenizer.setNames(names); // do not use sorted names here
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
*/
public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {

private List<String> inputFileHeaders = new ArrayList<>();

@Value("#{jobParameters[filename]}")
private String filename;

Expand Down Expand Up @@ -89,41 +91,47 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {

@Override
public void open(ExecutionContext ec) throws ItemStreamException {
this.summaryStatistics = new AnnotationSummaryStatistics(annotator);
summaryStatistics = new AnnotationSummaryStatistics(annotator);
String genomeNexusVersion = annotator.getVersion();

processComments(ec, genomeNexusVersion);
List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
if (!mutationRecords.isEmpty()) {
if (postIntervalSize > 0) {
this.allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
} else {
this.allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
}
// if output-format option is supplied, we only need to convert its data into header
if (outputFormat != null) {
if ("tcga".equals(outputFormat)) {
Set<String> sortedAllHeaders = new TreeSet<>();
for (AnnotatedRecord ar : this.allAnnotatedRecords) {
sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
}
for(String token : ExtendedMafFormat.headers) {
header.add(token);
}
// extra headers should go in the back alphabetically
for(String token : sortedAllHeaders) {
if (!header.contains(token)) {
header.add(token);
}
} else if ("minimal".equals(outputFormat)) {
for(String token : inputFileHeaders) {
header.add(token);
}
} else {
String[] tokens = outputFormat.split(",");
for (int i = 0; i < tokens.length; i++) {
header.add(tokens[i].trim());
}
}
// extra headers should go in the back alphabetically for these options
if ("tcga".equals(outputFormat) || "minimal".equals(outputFormat)) {
Set<String> sortedAllHeaders = new TreeSet<>();
for (AnnotatedRecord ar : allAnnotatedRecords) {
sortedAllHeaders.addAll(ar.getHeaderWithAdditionalFields());
}
for(String token : sortedAllHeaders) {
if (!header.contains(token)) {
header.add(token);
}
}
}
} else {
for (AnnotatedRecord ar : this.allAnnotatedRecords) {
for (AnnotatedRecord ar : allAnnotatedRecords) {
header.addAll(ar.getHeaderWithAdditionalFields());
}
}
Expand Down Expand Up @@ -157,7 +165,7 @@ private List<MutationRecord> loadMutationRecordsFromMaf() {
mapper.setFieldSetMapper(new MutationFieldSetMapper());
reader.setLineMapper(mapper);
reader.setLinesToSkip(1);
reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer));
reader.setSkippedLinesCallback(new DefaultLineCallbackHandler(tokenizer, inputFileHeaders));
reader.open(new ExecutionContext());
LOG.info("Loading records from: " + filename);
MutationRecord mutationRecord;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.junit.jupiter.api.Test;
import org.springframework.batch.item.file.transform.DelimitedLineTokenizer;

import java.util.ArrayList;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

Expand All @@ -15,7 +17,7 @@ class DefaultLineCallbackHandlerTest {
void handleLine_success_Allele1() {
String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele1";
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
handler.handleLine(testLine);
assertEquals(true, tokenizer.hasNames());
}
Expand All @@ -24,7 +26,7 @@ void handleLine_success_Allele1() {
void handleLine_success_Allele2() {
String testLine = "Chromosome\tStart_Position\tEnd_Position\tReference_Allele\tTumor_Seq_Allele2";
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
handler.handleLine(testLine);
assertEquals(true, tokenizer.hasNames());
}
Expand Down Expand Up @@ -61,7 +63,7 @@ void handleLine_missing_Tumor_Seq_Allele1_and_Tumor_Seq_Allele2() {

private void handleLine(String line, String expectedMessage) {
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer);
DefaultLineCallbackHandler handler = new DefaultLineCallbackHandler(tokenizer, new ArrayList<>());
try {
handler.handleLine(line);
} catch (RuntimeException e) {
Expand Down

0 comments on commit 30e63bb

Please sign in to comment.