diff --git a/Documentation/USeqDocumentation/cmdLnMenus.html b/Documentation/USeqDocumentation/cmdLnMenus.html index bf0a7bd..c742dc9 100644 --- a/Documentation/USeqDocumentation/cmdLnMenus.html +++ b/Documentation/USeqDocumentation/cmdLnMenus.html @@ -79,6 +79,8 @@

Command Line Menus

CalculatePerCycleErrorRate
+ CarisVcfComparator
+ CarisXmlVcfParser
ChIPSeq
@@ -1365,29 +1367,50 @@

Command Line Menus

Example: java -Xmx1500M -jar pathTo/USeq/Apps/CalculatePerCycleErrorRate -b /Data/Bam/ -f /Fastas/chrPhiX_Illumina.fasta.gz +************************************************************************************** +

+ +

**************************************************************************************
+**                           Caris Vcf Comparator: March 2022                       **
+**************************************************************************************
+CVC compares a Caris vcf generated with the CarisXmlVcfParser to a recalled vcf.
+Exact recall vars are so noted and removed. Caris vcf with no exact but one
+overlapping record can be merged with -k. Be sure to vt normalize each before running.
+Recall variants failing FILTER are not saved.
+
+Options:
+-c Path to a the Caris xml parsed vcf file, see the CarisXmlVcfParser app.
+-r Path to a recalled snv/indel vcf file.
+-m Path to named vcf file for saving the results.
+
+Example: java -Xmx2G -jar pathToUSeq/Apps/CarisVcfComparator -f TL-18-03CFD6.vcf
+     -r /F1/TL-18-03CFD6_recall.vcf.gz -g -c -m /F1/TL-18-93CFD6_merged.vcf.gz -k 
+
 **************************************************************************************
 

**************************************************************************************
-**                           Caris Xml Vcf Parser: March 2021                       **
+**                           Caris Xml Vcf Parser: March 2022                       **
 **************************************************************************************
 This tool parses Caris paired xml and vcf report files to generate: new vcfs where xml
 reported genomic alternations are annotated, bed files of copy number changes and gene
 fusions as well as a summary spreadsheet of the non NGS test results and report
-attributes.
+attributes. Use Vt to normalize the vcf output files. 
 
 Options:
 -d Path to a directory containing paired xml and vcf files from Caris.
--s Path to a directory for saving the results.
+-s Path to a directory for saving the results. Only impacting vcf, cnv, fusions are
+     saved, if none present, no file(s) are created.
 -u Path to a Hg38 UCSC RefFlat or RefSeq merged gene file for CNV and gene fusion
    coordinate extraction. See: http://www.genome.ucsc.edu/FAQ/FAQformat.html#format9
    (refSeqGeneName name2(optional) chr strand txStart txEnd cdsStart cdsEnd exonCount
    exonStarts exonEnds). Be sure to merge transcripts with the USeq MergeUCSCGeneTable
    app and remove non standard chromosomes.
 -i Include PHI in spreadsheet output, defaults to excluding.
+-a Include all vcf records in output, defaults to just those with an xml match.
 
 Example: java -Xmx2G -jar pathToUSeq/Apps/CarisXmlVcfParser -d CarisReports/
-     -s ParsedCarisReports/ -i -u ~/GRCh38/hg38RefSeq9Dec2020_MergedStdChr.ucsc.gz 
+     -s ParsedCarisReports/ -u ~/GRCh38/hg38RefSeq9Dec2020_MergedStdChr.ucsc.gz 
 
 **************************************************************************************
 

@@ -4891,7 +4914,7 @@

Command Line Menus


**************************************************************************************
-**                                TNRunner2 : Feb 2022                              **
+**                                TNRunner2 : March 2022                            **
 **************************************************************************************
 TNRunner is designed to execute several containerized snakmake workflows on tumor
 normal datasets via a slurm cluster.  Based on the availability of fastq, 
@@ -5165,9 +5188,11 @@ 

Command Line Menus

delimited, the first cell is the pathway ID and description (e.g. 'hsa05210 Colorectal cancer'), subsequent cells, the associated genes. -r File to save results, should end with .xls +-k (Optional) Tab delimited file containing KEGG gene IDs and gene Names, one row + per gene, for highlighting impacted genes in each KEGG pathway hyperlink. Example: java -Xmx10G -jar pathTo/USeq/Apps/VariantPathwayComparator -a earlyCRC.txt - -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls + -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls -k keggGeneIDName.txt **************************************************************************************


@@ -5335,7 +5360,7 @@

Command Line Menus

-e Print verbose debugging output. Example: java -jar pathToUSeq/Apps/VCFCallFrequency -f Vcf/ -s CFVcfs -v - Hg38/Somatic/Avatar/Vcf -b -m 0.05 -d ~/GQuery/Data + Hg38/Somatic/Avatar/Vcf -b -m 0.07 -d ~/GQuery/Data **************************************************************************************

diff --git a/Misc/JavaxComOrgInfoNetClasses.zip b/Misc/JavaxComOrgInfoNetClasses.zip index 3405d88..95a121e 100644 Binary files a/Misc/JavaxComOrgInfoNetClasses.zip and b/Misc/JavaxComOrgInfoNetClasses.zip differ diff --git a/Source/edu/utah/seq/run/TNRunner2.java b/Source/edu/utah/seq/run/TNRunner2.java index 73f69ec..946847e 100644 --- a/Source/edu/utah/seq/run/TNRunner2.java +++ b/Source/edu/utah/seq/run/TNRunner2.java @@ -864,7 +864,7 @@ public void processArgs(String[] args){ public static void printDocs(){ IO.pl("\n" + "**************************************************************************************\n" + - "** TNRunner2 : Feb 2022 **\n" + + "** TNRunner2 : March 2022 **\n" + "**************************************************************************************\n" + "TNRunner is designed to execute several containerized snakmake workflows on tumor\n"+ "normal datasets via a slurm cluster. Based on the availability of fastq, \n"+ diff --git a/Source/edu/utah/seq/run/TNSample2.java b/Source/edu/utah/seq/run/TNSample2.java index 694d3ab..08306fd 100644 --- a/Source/edu/utah/seq/run/TNSample2.java +++ b/Source/edu/utah/seq/run/TNSample2.java @@ -104,19 +104,30 @@ public TNSample2(File rootDir, TNRunner2 tnRunner) throws IOException{ private void parseMergeClinicalVars() throws IOException { info.add("Checking clinical variant integration..."); - //look for json file + //look for json file and xml vcfs File[] jsonTestResults = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".json"); + File[] xmls = null; + File[] vcfs = null; + File[] toLink = null; + if (jsonTestResults == null || jsonTestResults.length !=1) { - info.add("\tJsonReport\tFAILED to find one xxx.json clinical test report file"); - failed = true; - return; + + //look for caris xml and vcf.gz + xmls = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".xml"); + vcfs = IO.extractFiles(new File(rootDir, "ClinicalReport"), ".vcf.gz"); + if (xmls.length != 1 || vcfs.length != 1) { + info.add("\tJson/Xml/VcfReport\tFAILED to find one xxx.json or one xxx.vcf.gz and xxx.xml clinical test report file(s)"); + failed = true; + return; + } + jsonTestResults = null; + toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), xmls[0], vcfs[0]}; } + else toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), jsonTestResults[0]}; //make dir, ok if it already exists File jobDir = new File (rootDir, "SomaticVariantCalls/"+id+"_ClinicalVars"); jobDir.mkdirs(); - - File[] toLink = new File[]{somaticVariants, new File(somaticVariants+".tbi"), jsonTestResults[0]}; //any files? HashMap nameFile = IO.fetchNamesAndFiles(jobDir); diff --git a/Source/edu/utah/seq/vcf/VCFCallFrequency.java b/Source/edu/utah/seq/vcf/VCFCallFrequency.java index 70a0b54..2986416 100644 --- a/Source/edu/utah/seq/vcf/VCFCallFrequency.java +++ b/Source/edu/utah/seq/vcf/VCFCallFrequency.java @@ -25,7 +25,6 @@ import edu.utah.hci.query.MasterQuery; import edu.utah.hci.query.QueryRequest; import edu.utah.hci.query.UserQuery; -import edu.utah.seq.vcf.anno.AnnotatedVcfParser; /**Calculates call frequencies for each vcf record from a db of vcf files and callable region files. * Most mutated single site in p53 are seen at 0.039 across all cancers in TCGA. 0.57 for BRAF V600E across all cancers.*/ @@ -292,9 +291,10 @@ public void query(UserQuery userQuery) throws IOException { //make a single use QueryRequest object QueryRequest qr = new QueryRequest(masterQuery, null, options, null); //any errors? - if (qr.getErrTxtForUser() != null || qr.getWarningTxtForUser().size()!=0) { + if (qr.getErrTxtForUser() != null ) { + //if (qr.getErrTxtForUser() != null || qr.getWarningTxtForUser().size()!=0) { if (qr.getErrTxtForUser() !=null) Misc.printErrAndExit("\nInvalid request, "+qr.getErrTxtForUser()+ ",aborting."); - if (qr.getWarningTxtForUser().size() !=0) Misc.printErrAndExit("\n"+qr.getWarningTxtForUser()+", aborting."); + // already printed! so skip, if (qr.getWarningTxtForUser().size() !=0) Misc.printErrAndExit("\n"+qr.getWarningTxtForUser()+", might be an issue."); } //set results else userQuery.setResults(qr.getJsonResults()); @@ -639,7 +639,7 @@ public static void printDocs(){ "\nExample: java -jar pathToUSeq/Apps/VCFCallFrequency -f Vcf/ -s CFVcfs -v \n"+ - " Hg38/Somatic/Avatar/Vcf -b -m 0.05 -d ~/GQuery/Data \n\n" + + " Hg38/Somatic/Avatar/Vcf -b -m 0.07 -d ~/GQuery/Data \n\n" + "**************************************************************************************\n"); } diff --git a/Source/edu/utah/seq/vcf/VariantPathwayComparator.java b/Source/edu/utah/seq/vcf/VariantPathwayComparator.java index f6c0c9d..3e7a981 100644 --- a/Source/edu/utah/seq/vcf/VariantPathwayComparator.java +++ b/Source/edu/utah/seq/vcf/VariantPathwayComparator.java @@ -11,10 +11,13 @@ public class VariantPathwayComparator { private File groupAGeneHits; private File groupBGeneHits; private File pathwayGenes; + private File geneKeggIdName; //internal private ArrayList pathways = new ArrayList(); private File resultsSpreadsheet = null; + private HashMap keggGeneNameID = null; + private double minKeggFreq = 0.025; //constructor @@ -52,14 +55,14 @@ private void sortSavePathways() { try { PrintWriter out = new PrintWriter( new FileWriter(this.resultsSpreadsheet)); - out.println("NameHyperLink\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tBHits\tBNoHits\tFracBHits\tBGeneHits\tLog2(fracA/fracB)"); - for (Pathway p: loadedPathways) out.println(p.toString()); + if (geneKeggIdName == null) out.println("NameHyperLink\tPval\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tBHits\tBNoHits\tFracBHits\tBGeneHits\tLog2(fracA/fracB)"); + else out.println("NameHyperLink\tPval\tAdjPval\tAHits\tANoHits\tFracAHits\tAGeneHits\tAGeneLinks\tBHits\tBNoHits\tFracBHits\tBGeneHits\tBGeneLinks\tLog2(fracA/fracB)"); + for (Pathway p: loadedPathways) out.println(p.toString(keggGeneNameID)); out.close(); } catch (IOException e) { e.printStackTrace(); } - } private void comparePathways() { @@ -173,6 +176,8 @@ private void loadPathways() { e.printStackTrace(); Misc.printErrAndExit("\nFailed to parse the pathway file "+pathwayGenes); } + + if (geneKeggIdName!=null) keggGeneNameID = IO.loadFileIntoHash(geneKeggIdName, 1, 0); } private class Pathway implements Comparable{ @@ -197,32 +202,34 @@ private class Pathway implements Comparable{ - public String toString() { - //name adjPval AHits, ANoHits, fracAHits, AGeneHits, BHits, BNoHits, fracBHits, BGeneHits, log2(fracA/fracB) + public String toString(HashMapkeggGeneNameId) { + //name pval, adjPval, AHits, ANoHits, fracAHits, AGeneHits, (ALinks,) BHits, BNoHits, fracBHits, BGeneHits, (BLinks), log2(fracA/fracB) StringBuilder sb = new StringBuilder(); //attempt to parse the hsa0000 pathway number Matcher mat = hsaPath.matcher(name); + String partialHyperLink = null; if (mat.matches()) { sb.append("=HYPERLINK(\"https://www.kegg.jp/pathway/"); sb.append(mat.group(1)); + partialHyperLink = sb.toString(); sb.append("\",\""); sb.append(name); sb.append("\")"); } - else sb.append(name); - sb.append("\t"); + else sb.append(name); sb.append("\t"); + sb.append(Num.formatNumber(pval, 5)); sb.append("\t"); sb.append(Num.formatNumber(adjPval, 5)); sb.append("\t"); sb.append(groupAHits.size()); sb.append("\t"); sb.append(groupANoHits.size()); sb.append("\t"); double aFrac = (double)groupAHits.size()/ (double)(groupAHits.size()+groupANoHits.size()); sb.append(Num.formatNumber(aFrac, 5)); sb.append("\t"); - sb.append(convertToSortedFrequencies(aGenes)); sb.append("\t"); + sb.append(convertToSortedFrequencies(aGenes, partialHyperLink)); sb.append("\t"); sb.append(groupBHits.size()); sb.append("\t"); sb.append(groupBNoHits.size()); sb.append("\t"); double bFrac = (double)groupBHits.size()/ (double)(groupBHits.size()+groupBNoHits.size()); sb.append(Num.formatNumber(bFrac, 5)); sb.append("\t"); - sb.append(convertToSortedFrequencies(bGenes)); sb.append("\t"); + sb.append(convertToSortedFrequencies(bGenes, partialHyperLink)); sb.append("\t"); double rto = aFrac/bFrac; sb.append(Num.formatNumber(Num.log2(rto), 5)); @@ -237,7 +244,7 @@ public int compareTo(Pathway o) { } - public static String convertToSortedFrequencies(HashMap keyCounts) { + public String convertToSortedFrequencies(HashMap keyCounts, String pathwayName) { double totalCounts = 0; for (String key: keyCounts.keySet()) totalCounts+= keyCounts.get(key); GeneCount[] gc = new GeneCount[keyCounts.size()]; @@ -249,17 +256,43 @@ public static String convertToSortedFrequencies(HashMap keyCount gc[count++] = new GeneCount(key, geneCount); } Arrays.sort(gc); - StringBuilder sb = new StringBuilder(gc[0].name); - sb.append("="); - sb.append(Num.formatNumber(gc[0].fraction, 3)); - for (int i=1; i= minKeggFreq) namesToLink.add(gc[i].name); + } + String geneNamesToView = namesToView.toString(); + if (keggGeneNameID != null) return geneNamesToView+ "\t"+ createGeneHyperLink(pathwayName, namesToLink, geneNamesToView); + return geneNamesToView; + } + private String createGeneHyperLink(String pathwayLink, ArrayList namesToLink, String geneNamesToView) { + //https://www.kegg.jp/pathway/hsa03440 + //must watch size, max 250 + StringBuilder sb = new StringBuilder(pathwayLink); + for (String geneName: namesToLink) { + String id = keggGeneNameID.get(geneName); + int size = sb.length() + id.length()+1; + if (size > 250) break; + if (id != null) { + sb.append("+"); + sb.append(id); + + } } + sb.append("\",\""); + if (geneNamesToView.length()> 250) sb.append(geneNamesToView.subSequence(0, 247)+"..."); + else sb.append(geneNamesToView); + sb.append("\")"); return sb.toString(); } + public static class GeneCount implements Comparable{ double fraction = 0; String name = null; @@ -301,6 +334,7 @@ public void processArgs(String[] args){ case 'a': groupAGeneHits = new File(args[++i]); break; case 'b': groupBGeneHits = new File(args[++i]); break; case 'p': pathwayGenes = new File(args[++i]); break; + case 'k': geneKeggIdName = new File(args[++i]); break; case 'r': resultsSpreadsheet = new File(args[++i]); break; case 'h': printDocs(); System.exit(0); default: Misc.printErrAndExit("\nProblem, unknown option! " + mat.group()); @@ -341,9 +375,11 @@ public static void printDocs(){ " delimited, the first cell is the pathway ID and description (e.g.\n"+ " 'hsa05210 Colorectal cancer'), subsequent cells, the associated genes.\n"+ "-r File to save results, should end with .xls\n"+ + "-k (Optional) Tab delimited file containing KEGG gene IDs and gene Names, one row\n"+ + " per gene, for highlighting impacted genes in each KEGG pathway hyperlink.\n"+ "\nExample: java -Xmx10G -jar pathTo/USeq/Apps/VariantPathwayComparator -a earlyCRC.txt\n"+ - " -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls \n"+ + " -b lateCRC.txt -p keggHumanPathways.txt -r earlyVsLateVPC.xls -k keggGeneIDName.txt\n"+ "\n**************************************************************************************\n"); diff --git a/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java b/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java index 324638d..08ff0f3 100644 --- a/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java +++ b/Source/edu/utah/seq/vcf/xml/caris/CNVAlteration.java @@ -110,4 +110,9 @@ public SimpleBed toBed(HashMap name2GeneModels) throws I if (score == null) score = "."; return new SimpleBed(model.getChrom(), model.getTxStart(), model.getTxEnd(), geneName+":"+result.toLowerCase().replace(" ", "_"), score, model.getStrand()); } + + + public String getResult() { + return result; + } } diff --git a/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java b/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java new file mode 100644 index 0000000..a254f1c --- /dev/null +++ b/Source/edu/utah/seq/vcf/xml/caris/CarisVcfComparator.java @@ -0,0 +1,326 @@ +package edu.utah.seq.vcf.xml.caris; + +import java.io.*; +import java.util.regex.*; +import edu.utah.seq.vcf.VCFParser; +import util.gen.*; +import java.util.*; + +/** + * Takes a patient vcf file parsed from a Caris json report and compares it to a vcf generated by reprocessing the raw data. + * Writes out a final arbitrated vcf containing all the Caris calls plus non duplicate recall variants with no FILTER flags. + * + * @author david.nix@hci.utah.edu + **/ +public class CarisVcfComparator { + + //user defined fields + private File carisVcf = null; + private File recallVcf = null; + private File mergedVcf = null; + private SimpleVcf[] fVcfs; + private SimpleVcf[] rVcfs; + private int bpPaddingForOverlap = 2; + + //counters + private int numberCaris = 0; + private int numberRecall = 0; + private int numberExactMatches = 0; + private int numberCarisWithOnlyOverlap = 0; + private int numberModifiedCarisCalls = 0; + private int numberCarisWithNoMatch = 0; + private int numberPassingRecallWithNoMatch = 0; + + private ArrayList vcfToPrint = new ArrayList(); + private ArrayList headerLines = new ArrayList(); + + + + + //constructors + public CarisVcfComparator(String[] args){ + try { + long startTime = System.currentTimeMillis(); + processArgs(args); + + //load vcf files + fVcfs = load(carisVcf, true); + rVcfs = load(recallVcf, true); + numberRecall = rVcfs.length; + + compareVcfs(); + + processCarisVcfs(); + + processRecallVcfs(); + + printVcfs(); + + printStats(); + + //finish and calc run time + double diffTime = ((double)(System.currentTimeMillis() -startTime))/60000; + System.out.println("\nDone! "+Math.round(diffTime)+" Min\n"); + + } catch (Exception e) { + e.printStackTrace(); + Misc.printErrAndExit("\nProblem running CarisJson2Vcf app!"); + } + } + + private void printStats() { + System.out.println("\nComparator stats:"); + System.out.println( numberRecall +"\t# Recall variants"); + System.out.println( numberCaris +"\t# Caris variants"); + System.out.println( numberExactMatches +"\t# Exact match"); + System.out.println( numberCarisWithOnlyOverlap +"\t# Overlap recal variants"); + System.out.println( numberModifiedCarisCalls +"\t# Recommended for modification"); + System.out.println( numberCarisWithNoMatch +"\t# No match"); + System.out.println( numberPassingRecallWithNoMatch +"\t# Passing recall variants with no xml vcf match"); + } + + private void printVcfs() { + //sort vcf + SimpleVcf[] vcf = new SimpleVcf[vcfToPrint.size()]; + vcfToPrint.toArray(vcf); + Arrays.sort(vcf); + Pattern pat = Pattern.compile("N_DP=.+SOMATIC"); + + try { + Gzipper out = new Gzipper(mergedVcf); + //fetch merged header + String[] header = mergeHeaders(headerLines); + + for (String h: header) out.println(h); + for (SimpleVcf v: vcf) { + if (v.getFilter().toLowerCase().contains("fail") == false) { + //fix the the SOMATIC info + String line = v.getVcfLine(); + line = pat.matcher(line).replaceFirst("MIXED_SOMATIC_GERMLINE"); + out.println(line); + } + } + + out.close(); + } catch (IOException e) { + e.printStackTrace(); + Misc.printErrAndExit("\nERROR: problem writing out the merged vcf file. "+mergedVcf); + } + } + + private void processRecallVcfs() { + //for each Recall vcf, call this after processing the Caris vcfs, skip those with a fail filter field + for (SimpleVcf r:rVcfs){ + //print it? + if (r.isPrint() && r.getFilter().toLowerCase().contains("fail") == false) { + vcfToPrint.add(r); + if (r.getMatch() == null) numberPassingRecallWithNoMatch++; + } + } + } + + /**Merges header lines eliminating duplicates. Does a bad ID name collision checking, silently keeps first one. + * Returns null if CHROM lines differ. */ + public static String[] mergeHeaders(ArrayList header) { + + LinkedHashSet other = new LinkedHashSet(); + LinkedHashSet contig = new LinkedHashSet(); + LinkedHashSet info = new LinkedHashSet(); + LinkedHashSet filter = new LinkedHashSet(); + LinkedHashSet format = new LinkedHashSet(); + TreeSet source = new TreeSet(); + String chromLine = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTUMOR"; + + for (String h: header){ + h=h.trim(); + if (h.startsWith("##contig")){ + if (contig.contains(h) == false) contig.add(h); + } + + //modify the caris DP info + else if (h.startsWith("##INFO="); + } + + //swap out the somatic info + else if (h.startsWith("##INFO="); + } + + else if (h.startsWith("##INFO")){ + if (info.contains(h) == false) info.add(h); + } + else if (h.startsWith("##FILTER")){ + if (filter.contains(h) == false) filter.add(h); + } + else if (h.startsWith("##FORMAT")){ + if (format.contains(h) == false) format.add(h); + } + else if (h.startsWith("##source=")){ + source.add(h); + } + else if (h.startsWith("#CHROM")){ + //don't do anything, just skip it + } + else if (other.contains(h) == false) { + other.add(h); + } + } + + + //add in filter lines + filter.add(SimpleVcf.ncFilter); + + //remove ID dups from contig, filter, format, info + ArrayList contigAL = VCFParser.mergeHeaderIds(contig); + ArrayList filterAL = VCFParser.mergeHeaderIds(filter); + ArrayList formatAL = VCFParser.mergeHeaderIds(format); + ArrayList infoAL = VCFParser.mergeHeaderIds(info); + + ArrayList lines = new ArrayList(); + for (String s : other) lines.add(s); + for (String s : source) lines.add(s); + for (String s : contigAL) lines.add(s); + for (String s : filterAL) lines.add(s); + for (String s : infoAL) lines.add(s); + for (String s : formatAL) lines.add(s); + if (chromLine != null) lines.add(chromLine); + + return Misc.stringArrayListToStringArray(lines); + } + + private void processCarisVcfs() { + //for each Caris record + for (SimpleVcf f: fVcfs){ + numberCaris++; + //exact match? + if (f.getMatch() != null) { + numberExactMatches++; + //exact match then add caris info to recall + SimpleVcf vcf = f.getMatch(); + vcf.appendID(f); + vcf.appendINFO(f); + f.setPrint(false); + } + else { + //So no exact match any overlap? + if (f.getOverlap().size()!=0) numberCarisWithOnlyOverlap++; + + //No exact or overlap + else { + System.err.println("WARNING: No match to this Caris variant:"); + System.err.println("\t"+f.getVcfLine()); + numberCarisWithNoMatch++; + } + //always print it + f.appendFilter("NC"); + vcfToPrint.add(f); + } + } + } + + private void compareVcfs() { + //slow comparator, could do many things to speed up... + //for each Caris record + for (int i=0; i< fVcfs.length; i++){ + SimpleVcf f = fVcfs[i]; + + //for each Recal variant + for (int j=0; j< rVcfs.length; j++){ + SimpleVcf r = rVcfs[j]; + if (f.compareToExact(r)){ + if (f.getMatch() !=null || r.getMatch() != null) System.err.println("WARNING: more than one exact match found for \n\t"+f+"\n\t"+r); + f.setMatch(r); + r.setMatch(f); + } + else if (f.compareToOverlap(r)){ + //IO.pl("Overlap xml "+f.getOriginalRecord()); + //IO.pl("Overlap rec "+r.getOriginalRecord()); + f.getOverlap().add(r); + r.getOverlap().add(f); + } + } + } + } + + private SimpleVcf[] load(File vcf, boolean excludeContig) { + String[] lines = IO.loadFileIntoStringArray(vcf); + ArrayList al = new ArrayList(); + for (String v: lines){ + if (v.startsWith("#") == false) { + al.add(new SimpleVcf(v, bpPaddingForOverlap)); + } + else { + if (excludeContig){ + if (v.startsWith("##contig") == false) headerLines.add(v); + } + else headerLines.add(v); + } + } + SimpleVcf[] svs = new SimpleVcf[al.size()]; + al.toArray(svs); + return svs; + } + + public static void main(String[] args) { + if (args.length ==0){ + printDocs(); + System.exit(0); + } + new CarisVcfComparator(args); + } + + /**This method will process each argument and assign new varibles*/ + public void processArgs(String[] args){ + Pattern pat = Pattern.compile("-[a-z]"); + String useqVersion = IO.fetchUSeqVersion(); + String source = useqVersion+" Args: "+ Misc.stringArrayToString(args, " "); + System.out.println("\n"+ source +"\n"); + for (int i = 0; i xmlVcfFiles = null; private File saveDirectory; private boolean includePHI = false; + private boolean saveAllVcfRecords = false; //internal fields private String source; @@ -48,6 +49,7 @@ public class CarisXmlVcfParser { private ArrayList workingCNVAlterations = new ArrayList(); private ArrayList workingTranslocations = new ArrayList(); private ArrayList workingExpressionAlterations = new ArrayList(); + private ArrayList workingMethylationAlterations = new ArrayList(); //constructors @@ -76,7 +78,7 @@ public CarisXmlVcfParser(String[] args){ } private void printSpreadsheet() throws IOException { - IO.pl("Exporting patient summary spreadsheet... "); + IO.pl("\nExporting patient summary spreadsheet... "); //merge all the keys LinkedHashSet allKeys = new LinkedHashSet(); for (LinkedHashMap s : allReportAttributes) allKeys.addAll(s.keySet()); @@ -132,6 +134,7 @@ public void parseDatasets() throws Exception{ workingCNVAlterations.clear(); workingTranslocations.clear(); workingExpressionAlterations.clear(); + workingMethylationAlterations.clear(); workingNumberFailedGenomicMatches = 0; loadVcf(); @@ -215,10 +218,13 @@ private void loadVcf() throws IOException { workingVcfHeader.append(SimpleVcf.xrv); workingVcfHeader.append(SimpleVcf.xrp); } - workingVcfHeader.append(line); workingVcfHeader.append("\n"); + //fix their malformed header line that is causing a vt to error + if (line.startsWith("##FORMAT="; + workingVcfHeader.append(line); + workingVcfHeader.append("\n"); } else { - SimpleVcf vcf = new SimpleVcf(line); + SimpleVcf vcf = new SimpleVcf(line,0); String key = vcf.fetchCarisKey(); if (key == null) key = Misc.getRandomString(10); workingVcfs.put(key, vcf); @@ -233,14 +239,29 @@ private void saveVcfFile() { if (workingVcfs.size() == 0) return; Gzipper out = null; try { + //for each vcf record + ArrayList toSave = new ArrayList(); + int counter = 1; + for (SimpleVcf sv : workingVcfs.values()) { + if (saveAllVcfRecords) { + sv.setId("Caris_"+counter); + counter++; + toSave.add(sv.toString()); + } + else if (sv.getInfo().startsWith("XRV;XRP=")) { + sv.setId("Caris_"+counter); + counter++; + toSave.add(sv.toString()); + } + } + //if (toSave.size() == 0) return; don't do this always want a vcf file output even if empty + File vcf = new File (saveDirectory, workingReportName+".vcf.gz"); out = new Gzipper(vcf); //add header out.print(workingVcfHeader); - - //for each vcf record - for (SimpleVcf sv : workingVcfs.values()) out.println(sv.toString()); + for (String s: toSave) out.println(s); out.close(); } catch (IOException e) { @@ -258,19 +279,30 @@ private void saveCnvFile() { Gzipper out = null; try { - SimpleBed[] bedRegions = new SimpleBed[workingCNVAlterations.size()]; - for (int i=0; i< bedRegions.length; i++) bedRegions[i] = workingCNVAlterations.get(i).toBed(name2GeneModels); - Arrays.sort(bedRegions); + ArrayList bedToSave = new ArrayList(); + for (int i=0; i< workingCNVAlterations.size(); i++) { + CNVAlteration cnv = workingCNVAlterations.get(i); + String res = cnv.getResult().toLowerCase(); + if (res.contains("not detected") || res.contains("indeterminate")) continue; + bedToSave.add(cnv.toBed(name2GeneModels)); + } + if (bedToSave.size() == 0) return; + + SimpleBed[] bedRegions = new SimpleBed[bedToSave.size()]; + bedToSave.toArray(bedRegions); + Arrays.sort(bedRegions); File bedFile = new File (saveDirectory, workingReportName+".cnv.bed.gz"); out = new Gzipper(bedFile); //add header - out.println("#Chr\tGeneStart\tGeneStop\tGeneName:Type\tScore\tGeneStrand\n#Hg38"); + out.println("#ChrHg38\tGeneStart\tGeneStop\tGeneName:Type\tScore\tGeneStrand"); //for each bed - for (SimpleBed cnv: bedRegions) out.println(cnv); + for (SimpleBed cnv: bedRegions) { + if (cnv!= null) out.println(cnv); + } out.close(); } catch (IOException e) { @@ -291,9 +323,15 @@ private void saveFusionFile() { ArrayList beds = new ArrayList(); int num = workingTranslocations.size(); for (int i=0; i< num; i++) { + Translocation trans = workingTranslocations.get(i); + String res = trans.getResult().toLowerCase(); + if (res.contains("not detected") || res.contains("indeterminate")) continue; + SimpleBed[] sbs = workingTranslocations.get(i).toBed(name2GeneModels); for (SimpleBed sb: sbs) if (sb!=null) beds.add(sb); } + if (beds.size()==0) return; + SimpleBed[] bedRegions = new SimpleBed[beds.size()]; beds.toArray(bedRegions); Arrays.sort(bedRegions); @@ -303,7 +341,7 @@ private void saveFusionFile() { out = new Gzipper(bedFile); //add header - out.println("#Chr\tGeneStart\tGeneStop\tGeneName1:GeneName2:Type\tScore\tGeneStrand\n#Hg38"); + out.println("#ChrHg38\tGeneStart\tGeneStop\tGeneName1:GeneName2:Type\tScore\tGeneStrand"); //for each bed for (SimpleBed fus: bedRegions) out.println(fus); @@ -365,10 +403,14 @@ private void parseTests(Node node) throws DOMException, ParseException, IOExcept else if (testName.equals("Exome CNA Panel - Additional Genes")) parseCNVPanel(childNodes); else if (testName.equals("Transcriptome Detection_v1 Panel")) parseTranslocationPanel(childNodes); else if (testName.equals("Transcriptome Detection_v1 Variant Panel")) parseTranslocationPanel(childNodes); + else if (testName.equals("MGMT-Me")) parseMethyl(childNodes); else if (ihcTestNames.contains(testName)) parseIHC(childNodes); else if (testName.equals("Her2 CISH")) parseCNVPanel(childNodes); else if (testName.equals("EBER ISH")){ - IO.el("Skipping the parsing of 'EBER ISH' test for Epstein-Barr virus genome copy number\n"); + IO.el("\nSkipping the parsing of 'EBER ISH' test for Epstein-Barr virus genome copy number\n"); + } + else if (testName.equals("ROS1 FISH")){ + IO.el("\nSkipping the parsing of 'ROS1 FISH' test for ROS1 cytogenic alteration."); } else throw new IOException("Found an unknown test! "+testName); } @@ -396,6 +438,27 @@ private void parseIHC(NodeList childNodes) throws IOException { } } } + + private void parseMethyl(NodeList childNodes) throws IOException { + for (int j = 0; j < childNodes.getLength(); j++) { + Node cNode = childNodes.item(j); + if (cNode instanceof Element) { + String name = cNode.getNodeName(); + //find testResults + if (name.equals("testResults")) { + NodeList subNodes = cNode.getChildNodes(); + for (int i=0; i< subNodes.getLength(); i++) { + Node n = subNodes.item(i); + if (n instanceof Element) { + String subName = n.getNodeName(); + if (subName.equals("epigeneticAlteration")) workingMethylationAlterations.add( new MethylationAlteration(workingReportAttributes, n.getChildNodes())); + else throw new IOException("Found something other than 'epigeneticAlteration' "+subName); + } + } + } + } + } + } private void parseTranslocationPanel(NodeList childNodes) throws IOException { for (int j = 0; j < childNodes.getLength(); j++) { @@ -663,6 +726,9 @@ public void processArgs(String[] args){ System.out.println("\n"+ source +"\n"); File ucscFile = null; File vcfXmlDir = null; + File singleCarisVcf = null; + File singleCarisXml = null; + for (int i = 0; i report, NodeList nodes) throws IOException { + for (int j = 0; j < nodes.getLength(); j++) { + Node n = nodes.item(j); + if (n instanceof Element) { + String name = n.getNodeName(); + if (name.equals("biomarkerName")) biomarkerName = CarisXmlVcfParser.fetch("biomarkerName", n); + else if (name.equals("gene")) gene = CarisXmlVcfParser.fetch("gene", n); + else if (name.equals("result")) result = CarisXmlVcfParser.fetch("result", n); + } + } + //IO.pl(toString()); + addReportInfo(report); + } + + private void addReportInfo(LinkedHashMap report) { + String bn = biomarkerName.replaceAll(" ", "_"); + String res = result.replaceAll(" ", "_"); + report.put(bn+":Result", res); + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + if (biomarkerName!=null) sb.append("\nbiomarkerName\t"+biomarkerName); + if (gene!=null) sb.append("\ngene\t"+gene); + if (result!=null) sb.append("\nresult\t"+result); + return sb.toString(); + } +} diff --git a/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java b/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java index 406c4c9..0989be4 100644 --- a/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java +++ b/Source/edu/utah/seq/vcf/xml/caris/SimpleVcf.java @@ -5,6 +5,8 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import util.gen.Gzipper; import util.gen.IO; import util.gen.Misc; @@ -23,12 +25,22 @@ public class SimpleVcf implements Comparable{ private String info; private String format; private String sample; - private String pathogenicity = null; + private int end; + private int padPos; + private int padEnd; public static final String xrv = "##INFO=\n"; public static final String xrp = "##INFO=\n"; - + private boolean print = true; + private SimpleVcf match = null; + private ArrayList overlap = new ArrayList(); + private static final Pattern afPat = Pattern.compile(".+AF=([\\d+\\.]+).*"); + public static final String ncFilter = "##FILTER="; + public static final String nrFilter = "##FILTER="; + public static final String mdFilter = "##FILTER="; + public static String infoRAF = "##INFO="; - public SimpleVcf (String vcfLine){ + + public SimpleVcf (String vcfLine, int bpPadding){ originalRecord = vcfLine; //parse fields String[] t = Misc.TAB.split(vcfLine); @@ -41,27 +53,75 @@ public SimpleVcf (String vcfLine){ filter = t[6]; info = t[7]; format = t[8]; - sample = t[9]; + //two samples or one? + // Caris just has one sample, the tumor + if (t.length==10) sample = t[9]; + // Strelka has two, the normal and tumor, just take the tumor + else if (t.length==11) sample = t[10]; + else Misc.printErrAndExit("\nERROR: sample mismatch for caris or strelka vcf record:\n"+ vcfLine); + + //define the end as the max effected genomic region + int refLen = ref.length(); + //find max alt length, there many be more than one, not good! + String[] splitAlts = Misc.COMMA.split(alt); + int altLen = splitAlts[0].length(); + for (int i=1; i< splitAlts.length; i++){ + if (splitAlts[i].length() > altLen) altLen = splitAlts[i].length(); + } + if (refLen > altLen) { + end = pos+refLen; + } + else if (refLen < altLen){ + end = pos+altLen; + } + else end = pos+altLen; + + //set bp padded positions + padPos = pos - bpPadding; + padEnd = end + bpPadding; } - + public void appendInfo(String toAppend) { info = toAppend+info; } - - + + /**This appends on the AF from the matching SimpleVcf to this vcfLine*/ + public void appendRAF(SimpleVcf o){ + Matcher mat = afPat.matcher(o.info); + if (mat.matches()) info = info + ";RAF="+mat.group(1); + else Misc.printErrAndExit("\nERROR: failed to match AF= from "+o.getVcfLine()); + } + + /**Appends not confirmed flag to the FILTER field*/ + public void appendFilter(String flag){ + //any existing flags? + if (filter.equals(".") || filter.toLowerCase().equals("pass")) filter = flag; + else filter = filter+";"+flag; + + } + + public void appendID(SimpleVcf o) { + if (o.getId().equals(".") == false) id = id+";"+o.getId(); + else id = o.getId(); + } + + public void appendINFO(SimpleVcf o) { + info = info+";"+o.info; + } + public String fetchCarisKey() throws IOException { //chromosome_ref_alt_readDepth_geneName_hgvsCodingChange StringBuilder sb = new StringBuilder(); sb.append(chr); sb.append("_"); sb.append(ref); sb.append("_"); sb.append(alt); sb.append("_"); - + //parse DP from INFO //DP=1134;TI=NM_003482.3;GI=KMT2D;FC=Nonsense;PC=Q3812*;DC=c.11434C>T;LO=EXON;EXON=39;CI="Pathogenic Variant" //parse DP, transcript and hgvs //cant use transcript id, these differ between the vcf file and the xml file - + String[] splitInfo = Misc.SEMI_COLON.split(info); String dp = null; String hgvs = null; @@ -72,6 +132,7 @@ public String fetchCarisKey() throws IOException { else if (i.startsWith("DP=")) dp = i.substring(3); } if (hgvs == null || gi == null || dp == null) { + // just a few Caris vcf records with no annotations, just skip //IO.el("\t\tERROR: failed to parse one or more of the DC=, TI=, DP= INFO values from "+originalRecord); return null; } @@ -83,7 +144,7 @@ public String fetchCarisKey() throws IOException { return sb.toString(); } - + public String toString(){ //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE StringBuilder sb = new StringBuilder(); @@ -94,12 +155,17 @@ public String toString(){ sb.append(alt); sb.append("\t"); sb.append(qual); sb.append("\t"); sb.append(filter); sb.append("\t"); - sb.append(info); sb.append("\t"); - sb.append(format); sb.append("\t"); - sb.append(sample); + sb.append(info); + if (format!=null && sample!=null) { + sb.append("\t"); + sb.append(format); + sb.append("\t"); + sb.append(sample); + } + return sb.toString(); } - + public int compareTo(SimpleVcf other) { //sort by chromosome int x = chr.compareTo(other.chr); @@ -109,7 +175,21 @@ public int compareTo(SimpleVcf other) { if (pos > other.pos) return 1; return 0; } - + + public boolean compareToExact(SimpleVcf o) { + if (chr.equals(o.chr) == false) return false; + if (pos != o.pos) return false; + if (ref.equals(o.ref) == false) return false; + if (alt.equals(o.alt) == false) return false; + return true; + } + public boolean compareToOverlap(SimpleVcf o) { + if (chr.equals(o.chr) == false) return false; + if (padPos > o.padEnd) return false; + if (o.padPos > padEnd) return false; + return true; + } + /**Sorts a VCF and writes it out compressed but not indexed. This loads everything into memory so it can blow up. Careful. * @return number of vcf records*/ public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IOException { @@ -121,7 +201,7 @@ public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IO while ((line = in.readLine())!=null){ if (line.trim().length() == 0) continue; if (line.startsWith("#")) out.println(line); - else records.add(new SimpleVcf(line)); + else records.add(new SimpleVcf(line, 0)); } //sort em SimpleVcf[] toSort = new SimpleVcf[records.size()]; @@ -134,7 +214,7 @@ public static int sortVcf(File unsortedVcfFile, File sortedOutVcfFile) throws IO out.close(); return toSort.length; } - + //getters and setters public String getVcfLine() { return toString(); @@ -166,4 +246,32 @@ public void setAlt(String alt) { public String getId() { return id; } + + public void setId(String id) { + this.id = id; + } + + public boolean isPrint() { + return print; + } + + public SimpleVcf getMatch() { + return match; + } + + public void setPrint(boolean print) { + this.print = print; + } + + public ArrayList getOverlap() { + return overlap; + } + + public void setOverlap(ArrayList overlap) { + this.overlap = overlap; + } + + public void setMatch(SimpleVcf match) { + this.match = match; + } } diff --git a/Source/edu/utah/seq/vcf/xml/caris/Translocation.java b/Source/edu/utah/seq/vcf/xml/caris/Translocation.java index ff68f11..6e1a8b8 100644 --- a/Source/edu/utah/seq/vcf/xml/caris/Translocation.java +++ b/Source/edu/utah/seq/vcf/xml/caris/Translocation.java @@ -117,6 +117,11 @@ private UCSCGeneLine fetchModel(String geneName, String chr, HashMap