**************************************************************************************
-** TNRunner2 : Feb 2022 **
+** TNRunner2 : March 2022 **
**************************************************************************************
TNRunner is designed to execute several containerized snakmake workflows on tumor
normal datasets via a slurm cluster. Based on the availability of fastq,
@@ -5165,9 +5188,11 @@ Command Line Menus
delimited, the first cell is the pathway ID and description (e.g.
'hsa05210 Colorectal cancer'), subsequent cells, the associated genes.
-r File to save results, should end with .xls
+-k (Optional) Tab delimited file containing KEGG gene IDs and gene Names, one row
+ per gene, for highlighting impacted genes in each KEGG pathway hyperlink.
Example: java -Xmx10G -jar pathTo/USeq/Apps/VariantPathwayComparator -a earlyCRC.txt
- -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls
+ -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls -k keggGeneIDName.txt
**************************************************************************************
@@ -5335,7 +5360,7 @@
Command Line Menus
-e Print verbose debugging output.
Example: java -jar pathToUSeq/Apps/VCFCallFrequency -f Vcf/ -s CFVcfs -v
- Hg38/Somatic/Avatar/Vcf -b -m 0.05 -d ~/GQuery/Data
+ Hg38/Somatic/Avatar/Vcf -b -m 0.07 -d ~/GQuery/Data
**************************************************************************************
diff --git a/Misc/JavaxComOrgInfoNetClasses.zip b/Misc/JavaxComOrgInfoNetClasses.zip
index 3405d88..95a121e 100644
Binary files a/Misc/JavaxComOrgInfoNetClasses.zip and b/Misc/JavaxComOrgInfoNetClasses.zip differ
diff --git a/Source/edu/utah/seq/run/TNRunner2.java b/Source/edu/utah/seq/run/TNRunner2.java
index 73f69ec..946847e 100644
--- a/Source/edu/utah/seq/run/TNRunner2.java
+++ b/Source/edu/utah/seq/run/TNRunner2.java
@@ -864,7 +864,7 @@ public void processArgs(String[] args){
public static void printDocs(){
IO.pl("\n" +
"**************************************************************************************\n" +
- "** TNRunner2 : Feb 2022 **\n" +
+ "** TNRunner2 : March 2022 **\n" +
"**************************************************************************************\n" +
"TNRunner is designed to execute several containerized snakmake workflows on tumor\n"+
"normal datasets via a slurm cluster. Based on the availability of fastq, \n"+
diff --git a/Source/edu/utah/seq/run/TNSample2.java b/Source/edu/utah/seq/run/TNSample2.java
index 694d3ab..08306fd 100644
--- a/Source/edu/utah/seq/run/TNSample2.java
+++ b/Source/edu/utah/seq/run/TNSample2.java
@@ -104,19 +104,30 @@ public TNSample2(File rootDir, TNRunner2 tnRunner) throws IOException{
private void parseMergeClinicalVars() throws IOException {
info.add("Checking clinical variant integration...");
- //look for json file
+ //look for json file and xml vcfs
File[] jsonTestResults = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".json");
+ File[] xmls = null;
+ File[] vcfs = null;
+ File[] toLink = null;
+
if (jsonTestResults == null || jsonTestResults.length !=1) {
- info.add("\tJsonReport\tFAILED to find one xxx.json clinical test report file");
- failed = true;
- return;
+
+ //look for caris xml and vcf.gz
+ xmls = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".xml");
+ vcfs = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".vcf.gz");
+ if (xmls.length != 1 || vcfs.length != 1) {
+ info.add("\tJson/Xml/VcfReport\tFAILED to find one xxx.json or one xxx.vcf.gz and xxx.xml clinical test report file(s)");
+ failed = true;
+ return;
+ }
+ jsonTestResults = null;
+ toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), xmls[0], vcfs[0]};
}
+ else toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), jsonTestResults[0]};
//make dir, ok if it already exists
File jobDir = new File (rootDir, "SomaticVariantCalls/"+id+"_ClinicalVars");
jobDir.mkdirs();
-
- File[] toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), jsonTestResults[0]};
//any files?
HashMap nameFile = IO.fetchNamesAndFiles(jobDir);
diff --git a/Source/edu/utah/seq/vcf/VCFCallFrequency.java b/Source/edu/utah/seq/vcf/VCFCallFrequency.java
index 70a0b54..2986416 100644
--- a/Source/edu/utah/seq/vcf/VCFCallFrequency.java
+++ b/Source/edu/utah/seq/vcf/VCFCallFrequency.java
@@ -25,7 +25,6 @@
import edu.utah.hci.query.MasterQuery;
import edu.utah.hci.query.QueryRequest;
import edu.utah.hci.query.UserQuery;
-import edu.utah.seq.vcf.anno.AnnotatedVcfParser;
/**Calculates call frequencies for each vcf record from a db of vcf files and callable region files.
* Most mutated single site in p53 are seen at 0.039 across all cancers in TCGA. 0.57 for BRAF V600E across all cancers.*/
@@ -292,9 +291,10 @@ public void query(UserQuery userQuery) throws IOException {
//make a single use QueryRequest object
QueryRequest qr = new QueryRequest(masterQuery, null, options, null);
//any errors?
- if (qr.getErrTxtForUser() != null || qr.getWarningTxtForUser().size()!=0) {
+ if (qr.getErrTxtForUser() != null ) {
+ //if (qr.getErrTxtForUser() != null || qr.getWarningTxtForUser().size()!=0) {
if (qr.getErrTxtForUser() !=null) Misc.printErrAndExit("\nInvalid request, "+qr.getErrTxtForUser()+ ",aborting.");
- if (qr.getWarningTxtForUser().size() !=0) Misc.printErrAndExit("\n"+qr.getWarningTxtForUser()+", aborting.");
+ // already printed! so skip, if (qr.getWarningTxtForUser().size() !=0) Misc.printErrAndExit("\n"+qr.getWarningTxtForUser()+", might be an issue.");
}
//set results
else userQuery.setResults(qr.getJsonResults());
@@ -639,7 +639,7 @@ public static void printDocs(){
"\nExample: java -jar pathToUSeq/Apps/VCFCallFrequency -f Vcf/ -s CFVcfs -v \n"+
- " Hg38/Somatic/Avatar/Vcf -b -m 0.05 -d ~/GQuery/Data \n\n" +
+ " Hg38/Somatic/Avatar/Vcf -b -m 0.07 -d ~/GQuery/Data \n\n" +
"**************************************************************************************\n");
}
diff --git a/Source/edu/utah/seq/vcf/VariantPathwayComparator.java b/Source/edu/utah/seq/vcf/VariantPathwayComparator.java
index f6c0c9d..3e7a981 100644
--- a/Source/edu/utah/seq/vcf/VariantPathwayComparator.java
+++ b/Source/edu/utah/seq/vcf/VariantPathwayComparator.java
@@ -11,10 +11,13 @@ public class VariantPathwayComparator {
private File groupAGeneHits;
private File groupBGeneHits;
private File pathwayGenes;
+ private File geneKeggIdName;
//internal
private ArrayList pathways = new ArrayList();
private File resultsSpreadsheet = null;
+ private HashMap keggGeneNameID = null;
+ private double minKeggFreq = 0.025;
//constructor
@@ -52,14 +55,14 @@ private void sortSavePathways() {
try {
PrintWriter out = new PrintWriter( new FileWriter(this.resultsSpreadsheet));
- out.println("NameHyperLink\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tBHits\tBNoHits\tFracBHits\tBGeneHits\tLog2(fracA/fracB)");
- for (Pathway p: loadedPathways) out.println(p.toString());
+ if (geneKeggIdName == null) out.println("NameHyperLink\tPval\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tBHits\tBNoHits\tFracBHits\tBGeneHits\tLog2(fracA/fracB)");
+ else out.println("NameHyperLink\tPval\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tAGeneLinks\tBHits\tBNoHits\tFracBHits\tBGeneHits\tBGeneLinks\tLog2(fracA/fracB)");
+ for (Pathway p: loadedPathways) out.println(p.toString(keggGeneNameID));
out.close();
} catch (IOException e) {
e.printStackTrace();
}
-
}
private void comparePathways() {
@@ -173,6 +176,8 @@ private void loadPathways() {
e.printStackTrace();
Misc.printErrAndExit("\nFailed to parse the pathway file "+pathwayGenes);
}
+
+ if (geneKeggIdName!=null) keggGeneNameID = IO.loadFileIntoHash(geneKeggIdName, 1, 0);
}
private class Pathway implements Comparable{
@@ -197,32 +202,34 @@ private class Pathway implements Comparable{
- public String toString() {
- //name adjPval AHits, ANoHits, fracAHits, AGeneHits, BHits, BNoHits, fracBHits, BGeneHits, log2(fracA/fracB)
+ public String toString(HashMapkeggGeneNameId) {
+ //name pval, adjPval, AHits, ANoHits, fracAHits, AGeneHits, (ALinks,) BHits, BNoHits, fracBHits, BGeneHits, (BLinks), log2(fracA/fracB)
StringBuilder sb = new StringBuilder();
//attempt to parse the hsa0000 pathway number
Matcher mat = hsaPath.matcher(name);
+ String partialHyperLink = null;
if (mat.matches()) {
sb.append("=HYPERLINK(\"https://www.kegg.jp/pathway/");
sb.append(mat.group(1));
+ partialHyperLink = sb.toString();
sb.append("\",\"");
sb.append(name);
sb.append("\")");
}
- else sb.append(name);
- sb.append("\t");
+ else sb.append(name); sb.append("\t");
+ sb.append(Num.formatNumber(pval, 5)); sb.append("\t");
sb.append(Num.formatNumber(adjPval, 5)); sb.append("\t");
sb.append(groupAHits.size()); sb.append("\t");
sb.append(groupANoHits.size()); sb.append("\t");
double aFrac = (double)groupAHits.size()/ (double)(groupAHits.size()+groupANoHits.size());
sb.append(Num.formatNumber(aFrac, 5)); sb.append("\t");
- sb.append(convertToSortedFrequencies(aGenes)); sb.append("\t");
+ sb.append(convertToSortedFrequencies(aGenes, partialHyperLink)); sb.append("\t");
sb.append(groupBHits.size()); sb.append("\t");
sb.append(groupBNoHits.size()); sb.append("\t");
double bFrac = (double)groupBHits.size()/ (double)(groupBHits.size()+groupBNoHits.size());
sb.append(Num.formatNumber(bFrac, 5)); sb.append("\t");
- sb.append(convertToSortedFrequencies(bGenes)); sb.append("\t");
+ sb.append(convertToSortedFrequencies(bGenes, partialHyperLink)); sb.append("\t");
double rto = aFrac/bFrac;
sb.append(Num.formatNumber(Num.log2(rto), 5));
@@ -237,7 +244,7 @@ public int compareTo(Pathway o) {
}
- public static String convertToSortedFrequencies(HashMap keyCounts) {
+ public String convertToSortedFrequencies(HashMap keyCounts, String pathwayName) {
double totalCounts = 0;
for (String key: keyCounts.keySet()) totalCounts+= keyCounts.get(key);
GeneCount[] gc = new GeneCount[keyCounts.size()];
@@ -249,17 +256,43 @@ public static String convertToSortedFrequencies(HashMap keyCount
gc[count++] = new GeneCount(key, geneCount);
}
Arrays.sort(gc);
- StringBuilder sb = new StringBuilder(gc[0].name);
- sb.append("=");
- sb.append(Num.formatNumber(gc[0].fraction, 3));
- for (int i=1; i namesToLink = new ArrayList();
+ int last = gc.length-1;
+ for (int i=0; i= minKeggFreq) namesToLink.add(gc[i].name);
+ }
+ String geneNamesToView = namesToView.toString();
+ if (keggGeneNameID != null) return geneNamesToView+ "\t"+ createGeneHyperLink(pathwayName, namesToLink, geneNamesToView);
+ return geneNamesToView;
+ }
+ private String createGeneHyperLink(String pathwayLink, ArrayList namesToLink, String geneNamesToView) {
+ //https://www.kegg.jp/pathway/hsa03440
+ //must watch size, max 250
+ StringBuilder sb = new StringBuilder(pathwayLink);
+ for (String geneName: namesToLink) {
+ String id = keggGeneNameID.get(geneName);
+ int size = sb.length() + id.length()+1;
+ if (size > 250) break;
+ if (id != null) {
+ sb.append("+");
+ sb.append(id);
+
+ }
}
+ sb.append("\",\"");
+ if (geneNamesToView.length()> 250) sb.append(geneNamesToView.subSequence(0, 247)+"...");
+ else sb.append(geneNamesToView);
+ sb.append("\")");
return sb.toString();
}
+
public static class GeneCount implements Comparable{
double fraction = 0;
String name = null;
@@ -301,6 +334,7 @@ public void processArgs(String[] args){
case 'a': groupAGeneHits = new File(args[++i]); break;
case 'b': groupBGeneHits = new File(args[++i]); break;
case 'p': pathwayGenes = new File(args[++i]); break;
+ case 'k': geneKeggIdName = new File(args[++i]); break;
case 'r': resultsSpreadsheet = new File(args[++i]); break;
case 'h': printDocs(); System.exit(0);
default: Misc.printErrAndExit("\nProblem, unknown option! " + mat.group());
@@ -341,9 +375,11 @@ public static void printDocs(){
" delimited, the first cell is the pathway ID and description (e.g.\n"+
" 'hsa05210 Colorectal cancer'), subsequent cells, the associated genes.\n"+
"-r File to save results, should end with .xls\n"+
+ "-k (Optional) Tab delimited file containing KEGG gene IDs and gene Names, one row\n"+
+ " per gene, for highlighting impacted genes in each KEGG pathway hyperlink.\n"+
"\nExample: java -Xmx10G -jar pathTo/USeq/Apps/VariantPathwayComparator -a earlyCRC.txt\n"+
- " -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls \n"+
+ " -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls -k keggGeneIDName.txt\n"+
"\n**************************************************************************************\n");
diff --git a/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java b/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java
index 324638d..08ff0f3 100644
--- a/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java
+++ b/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java
@@ -110,4 +110,9 @@ public SimpleBed toBed(HashMap name2GeneModels) throws I
if (score == null) score = ".";
return new SimpleBed(model.getChrom(), model.getTxStart(), model.getTxEnd(), geneName+":"+result.toLowerCase().replace(" ", "_"), score, model.getStrand());
}
+
+
+ public String getResult() {
+ return result;
+ }
}
diff --git a/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java b/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java
new file mode 100644
index 0000000..a254f1c
--- /dev/null
+++ b/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java
@@ -0,0 +1,326 @@
+package edu.utah.seq.vcf.xml.caris;
+
+import java.io.*;
+import java.util.regex.*;
+import edu.utah.seq.vcf.VCFParser;
+import util.gen.*;
+import java.util.*;
+
+/**
+ * Takes a patient vcf file parsed from a Caris json report and compares it to a vcf generated by reprocessing the raw data.
+ * Writes out a final arbitrated vcf containing all the Caris calls plus non duplicate recall variants with no FILTER flags.
+ *
+ * @author david.nix@hci.utah.edu
+ **/
+public class CarisVcfComparator {
+
+ //user defined fields
+ private File carisVcf = null;
+ private File recallVcf = null;
+ private File mergedVcf = null;
+ private SimpleVcf[] fVcfs;
+ private SimpleVcf[] rVcfs;
+ private int bpPaddingForOverlap = 2;
+
+ //counters
+ private int numberCaris = 0;
+ private int numberRecall = 0;
+ private int numberExactMatches = 0;
+ private int numberCarisWithOnlyOverlap = 0;
+ private int numberModifiedCarisCalls = 0;
+ private int numberCarisWithNoMatch = 0;
+ private int numberPassingRecallWithNoMatch = 0;
+
+ private ArrayList vcfToPrint = new ArrayList();
+ private ArrayList headerLines = new ArrayList();
+
+
+
+
+ //constructors
+ public CarisVcfComparator(String[] args){
+ try {
+ long startTime = System.currentTimeMillis();
+ processArgs(args);
+
+ //load vcf files
+ fVcfs = load(carisVcf, true);
+ rVcfs = load(recallVcf, true);
+ numberRecall = rVcfs.length;
+
+ compareVcfs();
+
+ processCarisVcfs();
+
+ processRecallVcfs();
+
+ printVcfs();
+
+ printStats();
+
+ //finish and calc run time
+ double diffTime = ((double)(System.currentTimeMillis() -startTime))/60000;
+ System.out.println("\nDone! "+Math.round(diffTime)+" Min\n");
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ Misc.printErrAndExit("\nProblem running CarisJson2Vcf app!");
+ }
+ }
+
+ private void printStats() {
+ System.out.println("\nComparator stats:");
+ System.out.println( numberRecall +"\t# Recall variants");
+ System.out.println( numberCaris +"\t# Caris variants");
+ System.out.println( numberExactMatches +"\t# Exact match");
+ System.out.println( numberCarisWithOnlyOverlap +"\t# Overlap recal variants");
+ System.out.println( numberModifiedCarisCalls +"\t# Recommended for modification");
+ System.out.println( numberCarisWithNoMatch +"\t# No match");
+ System.out.println( numberPassingRecallWithNoMatch +"\t# Passing recall variants with no xml vcf match");
+ }
+
+ private void printVcfs() {
+ //sort vcf
+ SimpleVcf[] vcf = new SimpleVcf[vcfToPrint.size()];
+ vcfToPrint.toArray(vcf);
+ Arrays.sort(vcf);
+ Pattern pat = Pattern.compile("N_DP=.+SOMATIC");
+
+ try {
+ Gzipper out = new Gzipper(mergedVcf);
+ //fetch merged header
+ String[] header = mergeHeaders(headerLines);
+
+ for (String h: header) out.println(h);
+ for (SimpleVcf v: vcf) {
+ if (v.getFilter().toLowerCase().contains("fail") == false) {
+ //fix the the SOMATIC info
+ String line = v.getVcfLine();
+ line = pat.matcher(line).replaceFirst("MIXED_SOMATIC_GERMLINE");
+ out.println(line);
+ }
+ }
+
+ out.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ Misc.printErrAndExit("\nERROR: problem writing out the merged vcf file. "+mergedVcf);
+ }
+ }
+
+ private void processRecallVcfs() {
+ //for each Recall vcf, call this after processing the Caris vcfs, skip those with a fail filter field
+ for (SimpleVcf r:rVcfs){
+ //print it?
+ if (r.isPrint() && r.getFilter().toLowerCase().contains("fail") == false) {
+ vcfToPrint.add(r);
+ if (r.getMatch() == null) numberPassingRecallWithNoMatch++;
+ }
+ }
+ }
+
+ /**Merges header lines eliminating duplicates. Does a bad ID name collision checking, silently keeps first one.
+ * Returns null if CHROM lines differ. */
+ public static String[] mergeHeaders(ArrayList header) {
+
+ LinkedHashSet other = new LinkedHashSet();
+ LinkedHashSet contig = new LinkedHashSet();
+ LinkedHashSet info = new LinkedHashSet();
+ LinkedHashSet filter = new LinkedHashSet();
+ LinkedHashSet format = new LinkedHashSet();
+ TreeSet source = new TreeSet();
+ String chromLine = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTUMOR";
+
+ for (String h: header){
+ h=h.trim();
+ if (h.startsWith("##contig")){
+ if (contig.contains(h) == false) contig.add(h);
+ }
+
+ //modify the caris DP info
+ else if (h.startsWith("##INFO=");
+ }
+
+ //swap out the somatic info
+ else if (h.startsWith("##INFO=");
+ }
+
+ else if (h.startsWith("##INFO")){
+ if (info.contains(h) == false) info.add(h);
+ }
+ else if (h.startsWith("##FILTER")){
+ if (filter.contains(h) == false) filter.add(h);
+ }
+ else if (h.startsWith("##FORMAT")){
+ if (format.contains(h) == false) format.add(h);
+ }
+ else if (h.startsWith("##source=")){
+ source.add(h);
+ }
+ else if (h.startsWith("#CHROM")){
+ //don't do anything, just skip it
+ }
+ else if (other.contains(h) == false) {
+ other.add(h);
+ }
+ }
+
+
+ //add in filter lines
+ filter.add(SimpleVcf.ncFilter);
+
+ //remove ID dups from contig, filter, format, info
+ ArrayList contigAL = VCFParser.mergeHeaderIds(contig);
+ ArrayList filterAL = VCFParser.mergeHeaderIds(filter);
+ ArrayList formatAL = VCFParser.mergeHeaderIds(format);
+ ArrayList infoAL = VCFParser.mergeHeaderIds(info);
+
+ ArrayList lines = new ArrayList();
+ for (String s : other) lines.add(s);
+ for (String s : source) lines.add(s);
+ for (String s : contigAL) lines.add(s);
+ for (String s : filterAL) lines.add(s);
+ for (String s : infoAL) lines.add(s);
+ for (String s : formatAL) lines.add(s);
+ if (chromLine != null) lines.add(chromLine);
+
+ return Misc.stringArrayListToStringArray(lines);
+ }
+
+ private void processCarisVcfs() {
+ //for each Caris record
+ for (SimpleVcf f: fVcfs){
+ numberCaris++;
+ //exact match?
+ if (f.getMatch() != null) {
+ numberExactMatches++;
+ //exact match then add caris info to recall
+ SimpleVcf vcf = f.getMatch();
+ vcf.appendID(f);
+ vcf.appendINFO(f);
+ f.setPrint(false);
+ }
+ else {
+ //So no exact match any overlap?
+ if (f.getOverlap().size()!=0) numberCarisWithOnlyOverlap++;
+
+ //No exact or overlap
+ else {
+ System.err.println("WARNING: No match to this Caris variant:");
+ System.err.println("\t"+f.getVcfLine());
+ numberCarisWithNoMatch++;
+ }
+ //always print it
+ f.appendFilter("NC");
+ vcfToPrint.add(f);
+ }
+ }
+ }
+
+ private void compareVcfs() {
+ //slow comparator, could do many things to speed up...
+ //for each Caris record
+ for (int i=0; i< fVcfs.length; i++){
+ SimpleVcf f = fVcfs[i];
+
+ //for each Recal variant
+ for (int j=0; j< rVcfs.length; j++){
+ SimpleVcf r = rVcfs[j];
+ if (f.compareToExact(r)){
+ if (f.getMatch() !=null || r.getMatch() != null) System.err.println("WARNING: more than one exact match found for \n\t"+f+"\n\t"+r);
+ f.setMatch(r);
+ r.setMatch(f);
+ }
+ else if (f.compareToOverlap(r)){
+ //IO.pl("Overlap xml "+f.getOriginalRecord());
+ //IO.pl("Overlap rec "+r.getOriginalRecord());
+ f.getOverlap().add(r);
+ r.getOverlap().add(f);
+ }
+ }
+ }
+ }
+
+ private SimpleVcf[] load(File vcf, boolean excludeContig) {
+ String[] lines = IO.loadFileIntoStringArray(vcf);
+ ArrayList al = new ArrayList();
+ for (String v: lines){
+ if (v.startsWith("#") == false) {
+ al.add(new SimpleVcf(v, bpPaddingForOverlap));
+ }
+ else {
+ if (excludeContig){
+ if (v.startsWith("##contig") == false) headerLines.add(v);
+ }
+ else headerLines.add(v);
+ }
+ }
+ SimpleVcf[] svs = new SimpleVcf[al.size()];
+ al.toArray(svs);
+ return svs;
+ }
+
+ public static void main(String[] args) {
+ if (args.length ==0){
+ printDocs();
+ System.exit(0);
+ }
+ new CarisVcfComparator(args);
+ }
+
+ /**This method will process each argument and assign new varibles*/
+ public void processArgs(String[] args){
+ Pattern pat = Pattern.compile("-[a-z]");
+ String useqVersion = IO.fetchUSeqVersion();
+ String source = useqVersion+" Args: "+ Misc.stringArrayToString(args, " ");
+ System.out.println("\n"+ source +"\n");
+ for (int i = 0; i xmlVcfFiles = null;
private File saveDirectory;
private boolean includePHI = false;
+ private boolean saveAllVcfRecords = false;
//internal fields
private String source;
@@ -48,6 +49,7 @@ public class CarisXmlVcfParser {
private ArrayList workingCNVAlterations = new ArrayList();
private ArrayList workingTranslocations = new ArrayList();
private ArrayList workingExpressionAlterations = new ArrayList();
+ private ArrayList workingMethylationAlterations = new ArrayList();
//constructors
@@ -76,7 +78,7 @@ public CarisXmlVcfParser(String[] args){
}
private void printSpreadsheet() throws IOException {
- IO.pl("Exporting patient summary spreadsheet... ");
+ IO.pl("\nExporting patient summary spreadsheet... ");
//merge all the keys
LinkedHashSet allKeys = new LinkedHashSet();
for (LinkedHashMap s : allReportAttributes) allKeys.addAll(s.keySet());
@@ -132,6 +134,7 @@ public void parseDatasets() throws Exception{
workingCNVAlterations.clear();
workingTranslocations.clear();
workingExpressionAlterations.clear();
+ workingMethylationAlterations.clear();
workingNumberFailedGenomicMatches = 0;
loadVcf();
@@ -215,10 +218,13 @@ private void loadVcf() throws IOException {
workingVcfHeader.append(SimpleVcf.xrv);
workingVcfHeader.append(SimpleVcf.xrp);
}
- workingVcfHeader.append(line); workingVcfHeader.append("\n");
+ //fix their malformed header line that is causing a vt to error
+ if (line.startsWith("##FORMAT=";
+ workingVcfHeader.append(line);
+ workingVcfHeader.append("\n");
}
else {
- SimpleVcf vcf = new SimpleVcf(line);
+ SimpleVcf vcf = new SimpleVcf(line,0);
String key = vcf.fetchCarisKey();
if (key == null) key = Misc.getRandomString(10);
workingVcfs.put(key, vcf);
@@ -233,14 +239,29 @@ private void saveVcfFile() {
if (workingVcfs.size() == 0) return;
Gzipper out = null;
try {
+ //for each vcf record
+ ArrayList toSave = new ArrayList();
+ int counter = 1;
+ for (SimpleVcf sv : workingVcfs.values()) {
+ if (saveAllVcfRecords) {
+ sv.setId("Caris_"+counter);
+ counter++;
+ toSave.add(sv.toString());
+ }
+ else if (sv.getInfo().startsWith("XRV;XRP=")) {
+ sv.setId("Caris_"+counter);
+ counter++;
+ toSave.add(sv.toString());
+ }
+ }
+ //if (toSave.size() == 0) return; don't do this always want a vcf file output even if empty
+
File vcf = new File (saveDirectory, workingReportName+".vcf.gz");
out = new Gzipper(vcf);
//add header
out.print(workingVcfHeader);
-
- //for each vcf record
- for (SimpleVcf sv : workingVcfs.values()) out.println(sv.toString());
+ for (String s: toSave) out.println(s);
out.close();
} catch (IOException e) {
@@ -258,19 +279,30 @@ private void saveCnvFile() {
Gzipper out = null;
try {
- SimpleBed[] bedRegions = new SimpleBed[workingCNVAlterations.size()];
- for (int i=0; i< bedRegions.length; i++) bedRegions[i] = workingCNVAlterations.get(i).toBed(name2GeneModels);
- Arrays.sort(bedRegions);
+ ArrayList bedToSave = new ArrayList();
+ for (int i=0; i< workingCNVAlterations.size(); i++) {
+ CNVAlteration cnv = workingCNVAlterations.get(i);
+ String res = cnv.getResult().toLowerCase();
+ if (res.contains("not detected") || res.contains("indeterminate")) continue;
+ bedToSave.add(cnv.toBed(name2GeneModels));
+ }
+ if (bedToSave.size() == 0) return;
+
+ SimpleBed[] bedRegions = new SimpleBed[bedToSave.size()];
+ bedToSave.toArray(bedRegions);
+ Arrays.sort(bedRegions);
File bedFile = new File (saveDirectory, workingReportName+".cnv.bed.gz");
out = new Gzipper(bedFile);
//add header
- out.println("#Chr\tGeneStart\tGeneStop\tGeneName:Type\tScore\tGeneStrand\n#Hg38");
+ out.println("#ChrHg38\tGeneStart\tGeneStop\tGeneName:Type\tScore\tGeneStrand");
//for each bed
- for (SimpleBed cnv: bedRegions) out.println(cnv);
+ for (SimpleBed cnv: bedRegions) {
+ if (cnv!= null) out.println(cnv);
+ }
out.close();
} catch (IOException e) {
@@ -291,9 +323,15 @@ private void saveFusionFile() {
ArrayList beds = new ArrayList();
int num = workingTranslocations.size();
for (int i=0; i< num; i++) {
+ Translocation trans = workingTranslocations.get(i);
+ String res = trans.getResult().toLowerCase();
+ if (res.contains("not detected") || res.contains("indeterminate")) continue;
+
SimpleBed[] sbs = workingTranslocations.get(i).toBed(name2GeneModels);
for (SimpleBed sb: sbs) if (sb!=null) beds.add(sb);
}
+ if (beds.size()==0) return;
+
SimpleBed[] bedRegions = new SimpleBed[beds.size()];
beds.toArray(bedRegions);
Arrays.sort(bedRegions);
@@ -303,7 +341,7 @@ private void saveFusionFile() {
out = new Gzipper(bedFile);
//add header
- out.println("#Chr\tGeneStart\tGeneStop\tGeneName1:GeneName2:Type\tScore\tGeneStrand\n#Hg38");
+ out.println("#ChrHg38\tGeneStart\tGeneStop\tGeneName1:GeneName2:Type\tScore\tGeneStrand");
//for each bed
for (SimpleBed fus: bedRegions) out.println(fus);
@@ -365,10 +403,14 @@ private void parseTests(Node node) throws DOMException, ParseException, IOExcept
else if (testName.equals("Exome CNA Panel - Additional Genes")) parseCNVPanel(childNodes);
else if (testName.equals("Transcriptome Detection_v1 Panel")) parseTranslocationPanel(childNodes);
else if (testName.equals("Transcriptome Detection_v1 Variant Panel")) parseTranslocationPanel(childNodes);
+ else if (testName.equals("MGMT-Me")) parseMethyl(childNodes);
else if (ihcTestNames.contains(testName)) parseIHC(childNodes);
else if (testName.equals("Her2 CISH")) parseCNVPanel(childNodes);
else if (testName.equals("EBER ISH")){
- IO.el("Skipping the parsing of 'EBER ISH' test for Epstein-Barr virus genome copy number\n");
+ IO.el("\nSkipping the parsing of 'EBER ISH' test for Epstein-Barr virus genome copy number\n");
+ }
+ else if (testName.equals("ROS1 FISH")){
+ IO.el("\nSkipping the parsing of 'ROS1 FISH' test for ROS1 cytogenic alteration.");
}
else throw new IOException("Found an unknown test! "+testName);
}
@@ -396,6 +438,27 @@ private void parseIHC(NodeList childNodes) throws IOException {
}
}
}
+
+ private void parseMethyl(NodeList childNodes) throws IOException {
+ for (int j = 0; j < childNodes.getLength(); j++) {
+ Node cNode = childNodes.item(j);
+ if (cNode instanceof Element) {
+ String name = cNode.getNodeName();
+ //find testResults
+ if (name.equals("testResults")) {
+ NodeList subNodes = cNode.getChildNodes();
+ for (int i=0; i< subNodes.getLength(); i++) {
+ Node n = subNodes.item(i);
+ if (n instanceof Element) {
+ String subName = n.getNodeName();
+ if (subName.equals("epigeneticAlteration")) workingMethylationAlterations.add( new MethylationAlteration(workingReportAttributes, n.getChildNodes()));
+ else throw new IOException("Found something other than 'epigeneticAlteration' "+subName);
+ }
+ }
+ }
+ }
+ }
+ }
private void parseTranslocationPanel(NodeList childNodes) throws IOException {
for (int j = 0; j < childNodes.getLength(); j++) {
@@ -663,6 +726,9 @@ public void processArgs(String[] args){
System.out.println("\n"+ source +"\n");
File ucscFile = null;
File vcfXmlDir = null;
+ File singleCarisVcf = null;
+ File singleCarisXml = null;
+
for (int i = 0; i report, NodeList nodes) throws IOException {
+ for (int j = 0; j < nodes.getLength(); j++) {
+ Node n = nodes.item(j);
+ if (n instanceof Element) {
+ String name = n.getNodeName();
+ if (name.equals("biomarkerName")) biomarkerName = CarisXmlVcfParser.fetch("biomarkerName", n);
+ else if (name.equals("gene")) gene = CarisXmlVcfParser.fetch("gene", n);
+ else if (name.equals("result")) result = CarisXmlVcfParser.fetch("result", n);
+ }
+ }
+ //IO.pl(toString());
+ addReportInfo(report);
+ }
+
+ private void addReportInfo(LinkedHashMap report) {
+ String bn = biomarkerName.replaceAll(" ", "_");
+ String res = result.replaceAll(" ", "_");
+ report.put(bn+":Result", res);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ if (biomarkerName!=null) sb.append("\nbiomarkerName\t"+biomarkerName);
+ if (gene!=null) sb.append("\ngene\t"+gene);
+ if (result!=null) sb.append("\nresult\t"+result);
+ return sb.toString();
+ }
+}
diff --git a/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java b/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java
index 406c4c9..0989be4 100644
--- a/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java
+++ b/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java
@@ -5,6 +5,8 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import util.gen.Gzipper;
import util.gen.IO;
import util.gen.Misc;
@@ -23,12 +25,22 @@ public class SimpleVcf implements Comparable{
private String info;
private String format;
private String sample;
- private String pathogenicity = null;
+ private int end;
+ private int padPos;
+ private int padEnd;
public static final String xrv = "##INFO=\n";
public static final String xrp = "##INFO=\n";
-
+ private boolean print = true;
+ private SimpleVcf match = null;
+ private ArrayList overlap = new ArrayList();
+ private static final Pattern afPat = Pattern.compile(".+AF=([\\d+\\.]+).*");
+ public static final String ncFilter = "##FILTER=";
+ public static final String nrFilter = "##FILTER=";
+ public static final String mdFilter = "##FILTER=";
+ public static String infoRAF = "##INFO=";
- public SimpleVcf (String vcfLine){
+
+ public SimpleVcf (String vcfLine, int bpPadding){
originalRecord = vcfLine;
//parse fields
String[] t = Misc.TAB.split(vcfLine);
@@ -41,27 +53,75 @@ public SimpleVcf (String vcfLine){
filter = t[6];
info = t[7];
format = t[8];
- sample = t[9];
+ //two samples or one?
+ // Caris just has one sample, the tumor
+ if (t.length==10) sample = t[9];
+ // Strelka has two, the normal and tumor, just take the tumor
+ else if (t.length==11) sample = t[10];
+ else Misc.printErrAndExit("\nERROR: sample mismatch for caris or strelka vcf record:\n"+ vcfLine);
+
+ //define the end as the max effected genomic region
+ int refLen = ref.length();
+ //find max alt length, there many be more than one, not good!
+ String[] splitAlts = Misc.COMMA.split(alt);
+ int altLen = splitAlts[0].length();
+ for (int i=1; i< splitAlts.length; i++){
+ if (splitAlts[i].length() > altLen) altLen = splitAlts[i].length();
+ }
+ if (refLen > altLen) {
+ end = pos+refLen;
+ }
+ else if (refLen < altLen){
+ end = pos+altLen;
+ }
+ else end = pos+altLen;
+
+ //set bp padded positions
+ padPos = pos - bpPadding;
+ padEnd = end + bpPadding;
}
-
+
public void appendInfo(String toAppend) {
info = toAppend+info;
}
-
-
+
+ /**This appends on the AF from the matching SimpleVcf to this vcfLine*/
+ public void appendRAF(SimpleVcf o){
+ Matcher mat = afPat.matcher(o.info);
+ if (mat.matches()) info = info + ";RAF="+mat.group(1);
+ else Misc.printErrAndExit("\nERROR: failed to match AF= from "+o.getVcfLine());
+ }
+
+ /**Appends not confirmed flag to the FILTER field*/
+ public void appendFilter(String flag){
+ //any existing flags?
+ if (filter.equals(".") || filter.toLowerCase().equals("pass")) filter = flag;
+ else filter = filter+";"+flag;
+
+ }
+
+ public void appendID(SimpleVcf o) {
+ if (o.getId().equals(".") == false) id = id+";"+o.getId();
+ else id = o.getId();
+ }
+
+ public void appendINFO(SimpleVcf o) {
+ info = info+";"+o.info;
+ }
+
public String fetchCarisKey() throws IOException {
//chromosome_ref_alt_readDepth_geneName_hgvsCodingChange
StringBuilder sb = new StringBuilder();
sb.append(chr); sb.append("_");
sb.append(ref); sb.append("_");
sb.append(alt); sb.append("_");
-
+
//parse DP from INFO
//DP=1134;TI=NM_003482.3;GI=KMT2D;FC=Nonsense;PC=Q3812*;DC=c.11434C>T;LO=EXON;EXON=39;CI="Pathogenic Variant"
//parse DP, transcript and hgvs
//cant use transcript id, these differ between the vcf file and the xml file
-
+
String[] splitInfo = Misc.SEMI_COLON.split(info);
String dp = null;
String hgvs = null;
@@ -72,6 +132,7 @@ public String fetchCarisKey() throws IOException {
else if (i.startsWith("DP=")) dp = i.substring(3);
}
if (hgvs == null || gi == null || dp == null) {
+ // just a few Caris vcf records with no annotations, just skip
//IO.el("\t\tERROR: failed to parse one or more of the DC=, TI=, DP= INFO values from "+originalRecord);
return null;
}
@@ -83,7 +144,7 @@ public String fetchCarisKey() throws IOException {
return sb.toString();
}
-
+
public String toString(){
//#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
StringBuilder sb = new StringBuilder();
@@ -94,12 +155,17 @@ public String toString(){
sb.append(alt); sb.append("\t");
sb.append(qual); sb.append("\t");
sb.append(filter); sb.append("\t");
- sb.append(info); sb.append("\t");
- sb.append(format); sb.append("\t");
- sb.append(sample);
+ sb.append(info);
+ if (format!=null && sample!=null) {
+ sb.append("\t");
+ sb.append(format);
+ sb.append("\t");
+ sb.append(sample);
+ }
+
return sb.toString();
}
-
+
public int compareTo(SimpleVcf other) {
//sort by chromosome
int x = chr.compareTo(other.chr);
@@ -109,7 +175,21 @@ public int compareTo(SimpleVcf other) {
if (pos > other.pos) return 1;
return 0;
}
-
+
+ public boolean compareToExact(SimpleVcf o) {
+ if (chr.equals(o.chr) == false) return false;
+ if (pos != o.pos) return false;
+ if (ref.equals(o.ref) == false) return false;
+ if (alt.equals(o.alt) == false) return false;
+ return true;
+ }
+ public boolean compareToOverlap(SimpleVcf o) {
+ if (chr.equals(o.chr) == false) return false;
+ if (padPos > o.padEnd) return false;
+ if (o.padPos > padEnd) return false;
+ return true;
+ }
+
/**Sorts a VCF and writes it out compressed but not indexed. This loads everything into memory so it can blow up. Careful.
* @return number of vcf records*/
public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IOException {
@@ -121,7 +201,7 @@ public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IO
while ((line = in.readLine())!=null){
if (line.trim().length() == 0) continue;
if (line.startsWith("#")) out.println(line);
- else records.add(new SimpleVcf(line));
+ else records.add(new SimpleVcf(line, 0));
}
//sort em
SimpleVcf[] toSort = new SimpleVcf[records.size()];
@@ -134,7 +214,7 @@ public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IO
out.close();
return toSort.length;
}
-
+
//getters and setters
public String getVcfLine() {
return toString();
@@ -166,4 +246,32 @@ public void setAlt(String alt) {
public String getId() {
return id;
}
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public boolean isPrint() {
+ return print;
+ }
+
+ public SimpleVcf getMatch() {
+ return match;
+ }
+
+ public void setPrint(boolean print) {
+ this.print = print;
+ }
+
+ public ArrayList getOverlap() {
+ return overlap;
+ }
+
+ public void setOverlap(ArrayList overlap) {
+ this.overlap = overlap;
+ }
+
+ public void setMatch(SimpleVcf match) {
+ this.match = match;
+ }
}
diff --git a/Source/edu/utah/seq/vcf/xml/caris/Translocation.java b/Source/edu/utah/seq/vcf/xml/caris/Translocation.java
index ff68f11..6e1a8b8 100644
--- a/Source/edu/utah/seq/vcf/xml/caris/Translocation.java
+++ b/Source/edu/utah/seq/vcf/xml/caris/Translocation.java
@@ -117,6 +117,11 @@ private UCSCGeneLine fetchModel(String geneName, String chr, HashMap