diff --git a/Source/edu/utah/seq/amazon/Ec2Launcher.java b/Source/edu/utah/seq/amazon/Ec2Launcher.java deleted file mode 100644 index 95dac674..00000000 --- a/Source/edu/utah/seq/amazon/Ec2Launcher.java +++ /dev/null @@ -1,55 +0,0 @@ -package edu.utah.seq.amazon; -import java.io.InputStream; -import com.jcraft.jsch.*; - -public class Ec2Launcher { - - /** - * @param args - */ - public static void main(String[] args) { - String host="ember5.chpc.utah.edu"; - String user="u0028003"; - String password=""; - String command1="ls -ltr && sleep 5s && hostname"; - try{ - - java.util.Properties config = new java.util.Properties(); - config.put("StrictHostKeyChecking", "no"); - JSch jsch = new JSch(); - Session session=jsch.getSession(user, host, 22); - session.setPassword(password); - session.setConfig(config); - session.connect(); - System.out.println("Connected"); - - Channel channel=session.openChannel("exec"); - ((ChannelExec)channel).setCommand(command1); - channel.setInputStream(null); - ((ChannelExec)channel).setErrStream(System.err); - - InputStream in=channel.getInputStream(); - channel.connect(); - byte[] tmp=new byte[1024]; - while(true){ - while(in.available()>0){ - int i=in.read(tmp, 0, 1024); - if(i<0)break; - System.out.print(new String(tmp, 0, i)); - } - if(channel.isClosed()){ - System.out.println("exit-status: "+channel.getExitStatus()); - break; - } - try{Thread.sleep(1000);}catch(Exception ee){} - } - channel.disconnect(); - session.disconnect(); - System.out.println("DONE"); - }catch(Exception e){ - e.printStackTrace(); - } - - } - -} diff --git a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderGraphite.java b/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderGraphite.java deleted file mode 100644 index 1de519b8..00000000 --- a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderGraphite.java +++ /dev/null @@ -1,254 +0,0 @@ -package edu.utah.seq.parsers.mpileup; - -import java.io.File; -import java.io.IOException; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.query.QueryIndexFileLoader; -import edu.utah.seq.vcf.VCFBackgroundChecker; -import edu.utah.seq.vcf.VCFBackgroundCheckerGraphite; -import htsjdk.tribble.readers.TabixReader; -import util.gen.Misc; -import util.gen.Num; - -/**Pulls mpileup lines that overlap each vcf record and calculates AF and z-score stats.*/ -public class MpileupTabixLoaderGraphite implements Runnable{ - - //fields - private boolean failed = false; - private VCFBackgroundCheckerGraphite vbc; - private TabixReader tabixReader = null; - private ArrayList vcfLines = new ArrayList(); - private ArrayList modVcfRecords = new ArrayList(); - private ArrayList tooFewSamples = new ArrayList(); - private int minBaseQuality; - private int minReadCoverage; - private int minNumSamples; - private boolean removeNonZScoredRecords; - private double minimumZScore; - private double maxSampleAF; - private boolean replaceQualScore; - private int numNotScored = 0; - private int numFailingZscore = 0; - private boolean verbose; - - //internal - public static final Pattern AF = Pattern.compile(".*AF=([\\d\\.]+).*"); - public static final double zscoreForInfinity = 1000; - private static final NumberFormat fourDecimalMax = NumberFormat.getNumberInstance(); - - - - public MpileupTabixLoaderGraphite (File mpileupFile, VCFBackgroundCheckerGraphite vbc) throws IOException{ - this.vbc = vbc; - minBaseQuality = vbc.getMinBaseQuality(); - 
minReadCoverage = vbc.getMinReadCoverage(); - minNumSamples = vbc.getMinNumSamples(); - removeNonZScoredRecords = vbc.isRemoveNonZScoredRecords(); - minimumZScore = vbc.getMinimumZScore(); - maxSampleAF = vbc.getMaxSampleAF(); - replaceQualScore = vbc.isReplaceQualScore(); - verbose = vbc.isVerbose(); - tabixReader = new TabixReader(mpileupFile.getCanonicalPath()); - fourDecimalMax.setMaximumFractionDigits(4); - } - - public void run() { - try { - //get next chunk of work - while (vbc.loadVcfRecords(vcfLines)){ - - //for each - for (String record: vcfLines) score(record); - vbc.saveModifiedVcf(modVcfRecords); - - //cleanup - vcfLines.clear(); - modVcfRecords.clear(); - - } - //update - vbc.update(numNotScored, numFailingZscore, tooFewSamples); - //reset - numNotScored = 0; - numFailingZscore = 0; - tooFewSamples.clear(); - } catch (Exception e) { - failed = true; - System.err.println("\nError: problem fetching mpileup lines\n" ); - e.printStackTrace(); - } - } - - private void score(String record) throws Exception { - //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR - String[] fields = Misc.TAB.split(record); - - //fetch DP and AF - Matcher mat = AF.matcher(fields[7]); - if (mat.matches() == false) throw new IOException ("Failed to parse AF= number from the INFO field in this variant:\n"+record); - double freq = Double.parseDouble(mat.group(1)); - //mat = DP.matcher(fields[7]); - //if (mat.matches() == false) throw new IOException ("Failed to parse DP= number from the INFO field in this variant:\n"+record); - //double depth = Double.parseDouble(mat.group(1)); - - //interbase coor of effected bps - int[] startStop = QueryIndexFileLoader.fetchEffectedBps(fields, true); - if (startStop == null) { - System.err.println("Failed to parse the effected bps."); - if (removeNonZScoredRecords == false) printScoredVcf(fields, record); - tooFewSamples.add(record); - numNotScored++; - return; - } - - //pull mpileup records, if none then warn and save vcf record - int start = startStop[0]+1; - if (start < 1) start = 1; - String tabixCoor = fields[0]+":"+start+"-"+(startStop[1]); - TabixReader.Iterator it = fetchInteratorOnCoordinates(tabixCoor); - if (it == null) { - if (removeNonZScoredRecords == false) printScoredVcf(fields, record); - tooFewSamples.add(record); - numNotScored++; - return; - } - - StringBuilder sb = null; - if (verbose) { - sb = new StringBuilder ("VCF\t"); - sb.append(record); - sb.append("\n"); - } - - //for each mpileup record, find lowest z-score - String mpileupLine = null; - double minZScore = Double.MAX_VALUE; - MpileupSample[] minZScoreSamples = null; - - while ((mpileupLine = it.next()) != null){ - - //parse mpilup line and pull filtered set of samples - MpileupLine ml = new MpileupLine(mpileupLine, minBaseQuality); - if (ml.getChr() == null) throw new IOException ("Failed to parse the mpileup line:\n"+mpileupLine); - MpileupSample[] toExamine = fetchMpileupSamples(ml); - if (toExamine.length < minNumSamples) continue; - - //calculate zscore - double[] meanStd = calcMeanStdev(toExamine); - double zscore = (freq-meanStd[0])/meanStd[1]; - if (Double.isInfinite(zscore)) zscore = zscoreForInfinity; - if (zscore < minZScore) { - minZScore = zscore; - minZScoreSamples = toExamine; - } - if (verbose) { - addPileupInfo(sb, ml, toExamine, zscore); - System.out.println(sb.toString()); - } - } - - //was a z-score calculated? 
- if (minZScore == Double.MAX_VALUE) { - if (removeNonZScoredRecords == false) printScoredVcf(fields, record); - tooFewSamples.add(record); - numNotScored++; - } - - //append min zscore and save - else { - //fetch AFs - String bkafs = fetchFormattedAFs(minZScoreSamples); - String bkzString = Num.formatNumberNoComma(minZScore, 2); - fields[7] = "BKZ="+bkzString+";BKAF="+bkafs+";"+fields[7]; - if (replaceQualScore) fields[5] = bkzString; - String modRecord = Misc.stringArrayToString(fields, "\t"); - //threshold it? - if (minimumZScore == 0) modVcfRecords.add(modRecord); - else { - if (minZScore < minimumZScore) numFailingZscore++; - else modVcfRecords.add(modRecord); - } - } - } - - private void printScoredVcf(String[] fields, String record) throws IOException{ - if (replaceQualScore) { - fields[5] = "0"; - modVcfRecords.add(Misc.stringArrayToString(fields, "\t")); - } - else modVcfRecords.add(record); - } - - public static String fetchFormattedAFs(MpileupSample[] toExamine) { - StringBuilder sb = new StringBuilder(Num.formatNumber(toExamine[0].getAlleleFreqNonRefPlusIndels(), 4)); - for (int i=1; i< toExamine.length; i++){ - sb.append(","); - sb.append(Num.formatNumber(toExamine[i].getAlleleFreqNonRefPlusIndels(), 4)); - } - return sb.toString(); - } - - private static double[] calcMeanStdev(MpileupSample[] samples) { - double[] d = new double[samples.length]; - double mean = 0; - for (int i=0; i< samples.length; i++) { - d[i] = samples[i].getAlleleFreqNonRefPlusIndels(); - mean += d[i]; - } - mean = mean/(double)samples.length; - - double stdev = Num.standardDeviation(d, mean); - return new double[]{mean, stdev}; - } - - private TabixReader.Iterator fetchInteratorOnCoordinates(String coordinates) { - TabixReader.Iterator it = null; - //watch out for no retrieved data error from tabix - try { - it = tabixReader.query(coordinates); - } catch (ArrayIndexOutOfBoundsException e){} - return it; - } - - private void addPileupInfo(StringBuilder sb, MpileupLine ml, MpileupSample[] toExamine, double zscore) { - sb.append("\tMpileup\t"+ml.getChr()+"\t"+(ml.getZeroPos()+1) +"\t"+ml.getRef()+"\t"+zscore+"\n"); - //generate sample info - for (MpileupSample ms: toExamine){ - sb.append("\t\tSample\t"); - ms.appendCounts(sb, true); - sb.append("\t"); - sb.append(ms.getAlleleFreqNonRefPlusIndels()); - sb.append("\n"); - } - } - - private MpileupSample[] fetchMpileupSamples (MpileupLine ml) throws Exception{ - - //fetch samples to Examine - ArrayList al = new ArrayList(); - MpileupSample[] allSamples = ml.getSamples(); - for (int i=0; i< allSamples.length; i++){ - //check depth - if (allSamples[i].getReadCoverageAll() < minReadCoverage) continue; - //check af for each base and ins del, if any exceed then skip, likely germline - if (allSamples[i].findMaxSnvAF() > maxSampleAF || allSamples[i].getAlleleFreqINDEL() > maxSampleAF) continue; - al.add(allSamples[i]); - } - - MpileupSample[] toExamine = new MpileupSample[al.size()]; - al.toArray(toExamine); - return toExamine; - } - - public boolean isFailed() { - return failed; - } - - public TabixReader getTabixReader() { - return tabixReader; - } -} diff --git a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderNz.java b/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderNz.java deleted file mode 100644 index 4349ae5f..00000000 --- a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderNz.java +++ /dev/null @@ -1,252 +0,0 @@ -package edu.utah.seq.parsers.mpileup; - -import java.io.File; -import java.io.IOException; -import java.text.NumberFormat; -import 
java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.query.QueryIndexFileLoader; -import edu.utah.seq.vcf.VCFBkzDepreciated; -import edu.utah.seq.vcf.VCFNz; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.tribble.readers.TabixReader; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -/**Pulls mpileup lines that overlap each vcf record and calculates AF and z-score stats.*/ -public class MpileupTabixLoaderNz implements Runnable{ - - //fields - private boolean failed = false; - private VCFNz vnz; - private TabixReader tabixReader = null; - private ArrayList vcfLines = new ArrayList(); - private ArrayList modVcfRecords = new ArrayList(); - private ArrayList tooFewSamples = new ArrayList(); - private ArrayList baseCounts = new ArrayList(); - private int minBaseQuality; - private boolean removeNonZScoredRecords; - private double maximumZScore; - private HashMap seqMeanStd = null; - private int numNotScored = 0; - private int numFailingZscore = 0; - private boolean verbose; - private int bpPad = 0; - private IndexedFastaSequenceFile fasta = null; - private String workingChromosome = ""; - private String workingSequence = null; - - //internal - private static final NumberFormat fourDecimalMax = NumberFormat.getNumberInstance(); - - public MpileupTabixLoaderNz (File mpileupFile, VCFNz vnz) throws IOException{ - this.vnz = vnz; - minBaseQuality = vnz.getMinBaseQuality(); - removeNonZScoredRecords = vnz.isRemoveNonZScoredRecords(); - maximumZScore = vnz.getMaximumZScore(); - tabixReader = new TabixReader(mpileupFile.getCanonicalPath()); - fourDecimalMax.setMaximumFractionDigits(4); - bpPad = vnz.getBpPad(); - seqMeanStd = vnz.getSeqMeanStd(); - fasta = new IndexedFastaSequenceFile(vnz.getIndexedFasta()); - if (fasta.isIndexed() == false) Misc.printErrAndExit("\nError: cannot find your xxx.fai index or the multi fasta file isn't indexed\n"); - - } - - public void run() { - try { - //get next chunk of work - while (vnz.loadVcfRecords(vcfLines)){ - - //for each - for (String record: vcfLines) score(record); - vnz.save(modVcfRecords, baseCounts); - - //cleanup - vcfLines.clear(); - modVcfRecords.clear(); - baseCounts.clear(); - } - //update - vnz.update(numNotScored, numFailingZscore); - //reset - numNotScored = 0; - numFailingZscore = 0; - fasta.close(); - } catch (Exception e) { - failed = true; - System.err.println("\nError: problem parsing mpileup lines\n" ); - e.printStackTrace(); - } - } - - private void printFailingRecord(String[] fields, String record) throws IOException{ - if (removeNonZScoredRecords == false) printScoredVcf(fields, record); - tooFewSamples.add(record); - numNotScored++; - } - - private void score(String record) throws Exception { - //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR - String[] fields = Misc.TAB.split(record); - - checkChromosome(fields[0]); - - //interbase coor of effected bps - int[] startStop = QueryIndexFileLoader.fetchEffectedBps(fields, true); - if (startStop == null) { - System.err.println("WARNING: Failed to parse the effected bps for : "+record); - printFailingRecord(fields, record); - return; - } - - - //pull mpileup records, if none then warn and save vcf record - int start = startStop[0] + 1 - bpPad; - if (start < 1) start = 1; - String tabixCoor = fields[0]+":"+start+"-"+(startStop[1]+bpPad); - TabixReader.Iterator it = 
fetchInteratorOnCoordinates(tabixCoor); - if (it == null) { - printFailingRecord(fields, record); - return; - } - - StringBuilder sb = null; - if (verbose) { - sb = new StringBuilder ("VCF\t"); - sb.append(record); - sb.append("\n"); - } - - //for each mpileup record, find highest z-score - String mpileupLine = null; - double maxZScore = -1; - - while ((mpileupLine = it.next()) != null){ - - //parse mpilup line and pull first sample - MpileupLine ml = new MpileupLine(mpileupLine, minBaseQuality); - if (ml.getChr() == null) throw new IOException ("Failed to parse the mpileup line:\n"+mpileupLine); - - MpileupSample toExamine = ml.getSamples()[0]; - double afN = toExamine.getAlleleFreqNs(); - - //fetch five mer - int pos = ml.getZeroPos(); - String seq = workingSequence.substring(pos-2, pos+3).toUpperCase(); -IO.pl("Seq:"+seq+" Ref:"+ml.getRef()+" afN:"+afN); - - //fetch meanStd and calc zscore - double[] meanStd = seqMeanStd.get(seq); - double zscore = (afN-meanStd[0])/meanStd[1]; - if (Double.isInfinite(zscore)) zscore = VCFNz.zscoreForInfinity; - if (zscore < VCFNz.zscoreLessThanZero) zscore = VCFNz.zscoreLessThanZero; - - if (zscore > maxZScore) maxZScore = zscore; - - } - - //was a z-score calculated? - if (minZScore == Double.MAX_VALUE) printFailingRecord(fields, record); - - //append min zscore and save - else { - //fetch sorted, smallest to largest AFs - double[] bkgAFs = fetchSortedAFs(minZScoreSamples); - - //was a background AF >= tum AF seen? - boolean bkafFound = (freq <= bkgAFs[bkgAFs.length-1]); - - if (bkafFound && excludeBKAFs) { - numWithBKAF++; - } - else { - if (bkafFound) fields[6] = modifyFilterField(fields[6]); - - //fetch AFs - String bkafString = fetchFormattedAFs(bkgAFs); - String bkzString = Num.formatNumberNoComma(minZScore, 2); - fields[7] = "BKZ="+bkzString+";BKAF="+bkafString+";"+fields[7]; - - if (replaceQualScore) fields[5] = bkzString; - String modRecord = Misc.stringArrayToString(fields, "\t"); - - //build counts for R, first is the tumor sample, the latter are the normals - int[][] counts = fetchCountsForR(minZScoreSamples, numNonRef, depth); - - //threshold it? - if (minimumZScore == 0) { - modVcfRecords.add(modRecord); - baseCounts.add(counts); - } - else { - if (minZScore < minimumZScore) numFailingZscore++; - else { - modVcfRecords.add(modRecord); - baseCounts.add(counts); - } - } - } - - - - } - } - - private void checkChromosome(String chr) throws IOException { - if (chr.equals(workingChromosome) == false) { - workingChromosome = chr; - //find and load sequence - ReferenceSequence p = fasta.getSequence(workingChromosome); - if (p == null ) throw new IOException ("\n\nFailed to find or load a fasta sequence for '"+workingChromosome+"', aborting.\n"); - workingSequence = new String(p.getBases()); - } - } - - /**Adds a BKAF fail to the FILTER field.*/ - private static String modifyFilterField(String filterField) { - //nada or pass? 
- if (filterField.length() == 0 || filterField.equals(".") || filterField.toUpperCase().equals("PASS")) return "BKAF"; - else return "BKAF;" +filterField; - } - - public static boolean highBkgrndAFsFound(double tAF, MpileupSample[] toExamine) { - for (int i=0; i< toExamine.length; i++) if (toExamine[i].getAlleleFreqNonRefPlusIndels() >= tAF) return true; - return false; - } - - private int[][] fetchCountsForR(MpileupSample[] minZScoreSamples, int numNonRef, double depth ){ - int[] nonRef = new int[minZScoreSamples.length+1]; - int[] total = new int[nonRef.length]; - nonRef[0] = numNonRef; - total[0] = ((int)depth) - numNonRef; - int index = 1; - for (MpileupSample ms: minZScoreSamples){ - nonRef[index] = ms.getNonRefBaseCounts() + ms.getInsertions()+ ms.getDeletions(); - total[index++] = ms.getReadCoverageAll(); - } - return new int[][]{nonRef,total}; - } - - private TabixReader.Iterator fetchInteratorOnCoordinates(String coordinates) { - TabixReader.Iterator it = null; - //watch out for no retrieved data error from tabix - try { - it = tabixReader.query(coordinates); - } catch (ArrayIndexOutOfBoundsException e){} - return it; - } - - public boolean isFailed() { - return failed; - } - - public TabixReader getTabixReader() { - return tabixReader; - } -} diff --git a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderRs.java b/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderRs.java deleted file mode 100644 index eb10a60f..00000000 --- a/Source/edu/utah/seq/parsers/mpileup/MpileupTabixLoaderRs.java +++ /dev/null @@ -1,197 +0,0 @@ -package edu.utah.seq.parsers.mpileup; - -import java.io.File; -import java.io.IOException; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.query.QueryIndexFileLoader; -import edu.utah.seq.vcf.VCFBackgroundChecker; -import edu.utah.seq.vcf.VCFBackgroundCheckerGraphite; -import edu.utah.seq.vcf.VCFBamAnnotator; -import edu.utah.seq.vcf.xml.SimpleVcf; -import htsjdk.tribble.readers.TabixReader; -import util.gen.Misc; -import util.gen.Num; - -/**Pulls mpileup lines that overlap each vcf record and calculates R1 and R2 coverage stats.*/ -public class MpileupTabixLoaderRs implements Runnable{ - - //fields - private boolean failed = false; - private VCFBamAnnotator vbc; - private TabixReader tabixReader = null; - private ArrayList vcfLines = new ArrayList(); - private ArrayList modVcfRecords = new ArrayList(); - private ArrayList notAnnotated = new ArrayList(); - private int minBaseQuality; - private boolean verbose; - - - public MpileupTabixLoaderRs (File mpileupFile, VCFBamAnnotator vbc) throws IOException{ - this.vbc = vbc; - minBaseQuality = vbc.getMinBaseQuality(); - verbose = vbc.isVerbose(); - tabixReader = new TabixReader(mpileupFile.getCanonicalPath()); - } - - public void run() { - try { - //get next chunk of work - while (vbc.loadVcfRecords(vcfLines)){ - - //for each - for (String record: vcfLines) score(record); - - //add back records -vbc.addProcessedVcfs(modVcfRecords, notAnnotated); - - //cleanup - vcfLines.clear(); - modVcfRecords.clear(); - notAnnotated.clear(); - } - } catch (Exception e) { - failed = true; - System.err.println("\nError: problem fetching mpileup lines\n" ); - e.printStackTrace(); - } - } - - private void score(String record) throws Exception { - //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR - String[] fields = Misc.TAB.split(record); 
- SimpleVcf vcf = new SimpleVcf(record, 0); - -//System.out.println("\nVCF\t"+record); - - //interbase coor of effected bps - int[] startStop = QueryIndexFileLoader.fetchEffectedBps(fields, true); - if (startStop == null) { - System.err.println("Failed to parse the effected bps for "+record); - notAnnotated.add(record); - return; - } - -//System.out.println("SS\t"+startStop[0]+" "+startStop[1]); - - //pull mpileup records, if none then warn and save vcf record - int start = startStop[0]+1; - if (start < 1) start = 1; - String tabixCoor = fields[0]+":"+start+"-"+(startStop[1]); - TabixReader.Iterator it = fetchInteratorOnCoordinates(tabixCoor); - if (it == null) { - System.err.println("Failed to fetch mpileup data for "+record); - notAnnotated.add(record); - return; - } - - //load mpileup records - ArrayList mLines = new ArrayList(); - String mpileupLine = null; - while ((mpileupLine = it.next()) != null){ - //parse mpilup line and pull filtered set of samples - MpileupLine ml = new MpileupLine(mpileupLine, minBaseQuality); - if (ml.getChr() == null) throw new IOException ("Failed to parse the mpileup line:\n"+record); - mLines.add(ml); - } - - //snp? - if (vcf.isSnv()){ - //check there is only one line - if (mLines.size() !=1) throw new IOException ("Multiple mpileup lines found for a snv?!: "+mLines.size()+"\n"+record); - //check reference - MpileupLine ml = mLines.get(0); - if (vcf.getRef().equals(ml.getRef()) == false) throw new IOException ("Reference of vcf and mpileup differ?!:\n"+mpileupLine +"\n"+record); - //check position - if (vcf.getPos() != ml.getZeroPos()) throw new IOException ("The position of vcf and mpileup differ?!:\n"+mpileupLine +"\n"+record); - //calculate AF and DP for each sample - MpileupSample[] r1r2 = ml.getSamples(); - - StringBuilder sb = new StringBuilder(vcf.toString()); - sb.append("\tGT:AF:DP:R1:R2:DPR1:DPR2"); - for (int i=0; i< r1r2.length; i++){ - - int altBaseIndex = MpileupSample.getIndex(vcf.getAlt().toUpperCase().charAt(0)); - //read one - int R1 = r1r2[i].getForwardGATC()[altBaseIndex] + r1r2[i].getReverseGATC()[altBaseIndex]; - int DPR1 = r1r2[i].getReadCoverageForwardBases() + r1r2[i].getReadCoverageReverseBases(); - i++; - //read two - int R2 = r1r2[i].getForwardGATC()[altBaseIndex] + r1r2[i].getReverseGATC()[altBaseIndex]; - int DPR2 = r1r2[i].getReadCoverageForwardBases() + r1r2[i].getReadCoverageReverseBases(); - //combine - double AF = (double)(R1+R2)/ (DPR1+DPR2); - int DP = DPR1 + DPR2; - - //make string GT:AF:DP:R1:R2:DPR1:DPR2, GT is required by some apps - sb.append("\t./.:"); - sb.append(Num.formatNumberJustMax(AF, 4)); sb.append(":"); - sb.append(DP); sb.append(":"); - sb.append(R1); sb.append(":"); - sb.append(R2); sb.append(":"); - sb.append(DPR1); sb.append(":"); - sb.append(DPR2); - } - System.out.println(sb); - } - } - - /**Adds a BKAF fail to the FILTER field.*/ - private static String modifyFilterField(String filterField) { - //nada or pass? 
- if (filterField.length() == 0 || filterField.equals(".") || filterField.toUpperCase().equals("PASS")) return "BKAF"; - else return "BKAF;" +filterField; - } - - public static boolean highBkgrndAFsFound(double tAF, MpileupSample[] toExamine) { - for (int i=0; i< toExamine.length; i++) if (toExamine[i].getAlleleFreqNonRefPlusIndels() >= tAF) return true; - return false; - } - - private int[][] fetchCountsForR(MpileupSample[] minZScoreSamples, int numNonRef, double depth ){ - int[] nonRef = new int[minZScoreSamples.length+1]; - int[] total = new int[nonRef.length]; - nonRef[0] = numNonRef; - total[0] = ((int)depth) - numNonRef; - int index = 1; - for (MpileupSample ms: minZScoreSamples){ - nonRef[index] = ms.getNonRefBaseCounts() + ms.getInsertions()+ ms.getDeletions(); - total[index++] = ms.getReadCoverageAll(); - } - return new int[][]{nonRef,total}; - } - - private TabixReader.Iterator fetchInteratorOnCoordinates(String coordinates) { - TabixReader.Iterator it = null; - //watch out for no retrieved data error from tabix - try { - it = tabixReader.query(coordinates); - } catch (ArrayIndexOutOfBoundsException e){} - return it; - } - - private void addPileupInfo(StringBuilder sb, MpileupLine ml, MpileupSample[] toExamine, double zscore) { - sb.append("\tMpileup\t"+ml.getChr()+"\t"+(ml.getZeroPos()+1) +"\t"+ml.getRef()+"\t"+zscore+"\n"); - //generate sample info - for (MpileupSample ms: toExamine){ - sb.append("\t\tSample\t"); - ms.appendCounts(sb, true); - sb.append("\t"); - sb.append(ms.getAlleleFreqNonRefPlusIndels()); - sb.append("\n"); - } - } - - public boolean isFailed() { - return failed; - } - - public TabixReader getTabixReader() { - return tabixReader; - } -} diff --git a/Source/edu/utah/seq/vcf/GraphiteZScore.java b/Source/edu/utah/seq/vcf/GraphiteZScore.java deleted file mode 100644 index 3c922fa9..00000000 --- a/Source/edu/utah/seq/vcf/GraphiteZScore.java +++ /dev/null @@ -1,180 +0,0 @@ -package edu.utah.seq.vcf; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.regex.Matcher; - -import edu.utah.seq.parsers.mpileup.MpileupSample; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoader; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -public class GraphiteZScore { - - private File vcfIn; - private File graphiteExe; - private File indexedFasta; - private File[] normalBams; - private File graphiteVcf; - private String optionalBashCmds = null; - private double minimumSampleReadCount; - private int minimumNumSamples; - private double maximumSampleAF; - private boolean deleteTemp = false; - private HashMap vcfGraphiteResults = new HashMap(); - - /**The vcfToProcess must be stripped of any FORMAT and sample fields, only CHROM thru INFO. 
- * @throws IOException */ - public GraphiteZScore(File vcfToProcess, File graphiteExe, File[] normalBams, File indexedFasta, String optionalBashCmds, int minimumSampleReadCount, int minimumNumSamples, double maximumSampleAF) throws Exception { - this.vcfIn = vcfToProcess; - this.graphiteExe = graphiteExe; - this.normalBams = normalBams; - this.indexedFasta = indexedFasta; - this.optionalBashCmds = optionalBashCmds; - this.minimumSampleReadCount = minimumSampleReadCount; - this.minimumNumSamples = minimumNumSamples; - this.maximumSampleAF = maximumSampleAF; - doWork(); - } - - private void doWork() throws Exception { - //graphite -f indexFasta -v vcf -o GResults -t 20 -b bam1 -b bam2 -b bam3 - String name = Misc.removeExtension(vcfIn.getName()); - File resultsDir = new File(vcfIn.getParentFile(), "GraphiteTemp_"+name); - resultsDir.mkdirs(); - graphiteVcf = new File(resultsDir, vcfIn.getName()); - - executeGraphite(resultsDir); - - parseGraphiteResults(); - - if (deleteTemp) { - graphiteVcf.delete(); - IO.deleteDirectory(resultsDir); - } - - } - - private void parseGraphiteResults() throws Exception{ - - BufferedReader in = IO.fetchBufferedReader(graphiteVcf); - String line; - String[] t; - String[] format; - String[] sample; - String[] dp4; - ArrayList afs = new ArrayList(); - while ((line = in.readLine())!= null){ - if (line.startsWith("#")) continue; - t = Misc.TAB.split(line); - - format = Misc.COLON.split(t[8]); - //find DP4_NFP - int index =0; - for (;index 0) { - if (messages != null) throw new IOException("\nProblem executing\n"+bs.toString()+"\n"+Misc.stringArrayToString(messages, "\n")); - throw new IOException("\nProblem executing\n"+bs.toString()+"\n"); - } - } - - public HashMap getVcfGraphiteResults() { - return vcfGraphiteResults; - } - - -} diff --git a/Source/edu/utah/seq/vcf/VCFBackgroundChecker.java b/Source/edu/utah/seq/vcf/VCFBackgroundChecker.java deleted file mode 100644 index 588a8391..00000000 --- a/Source/edu/utah/seq/vcf/VCFBackgroundChecker.java +++ /dev/null @@ -1,422 +0,0 @@ -package edu.utah.seq.vcf; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoader; -import util.gen.Gzipper; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -/**Given a vcf file compares each record against a set of normal samples from a tabix indexed mpileup file and marks those with evidence of the variant in the normals. 
- * tabix-0.2.6/bgzip ~/repro3Ct8Norm.mpileup - * tabix-0.2.6/tabix -s 1 -b 2 -e 2 ~/repro3Ct8Norm.mpileup.gz - * @author Nix*/ -public class VCFBackgroundChecker { - - //user defined fields - private File[] vcfFiles; - private File mpileup; - private int[] sampleIndexesToUse = null; - private File saveDir; - private int minBaseQuality = 20; - private int minReadCoverage = 20; - private int minNumSamples = 3; - private boolean verbose = false; - private boolean removeNonZScoredRecords = false; - private double minimumZScore = 0; - private double maxSampleAF = 0.3; - private boolean replaceQualScore = false; - private int numberThreads = 0; - private String afInfoName = "T_AF"; - private String dpInfoName = "T_DP"; - private int bpPad = 0; - - //internal - private static final int numVcfToProc = 100; - private MpileupTabixLoader[] loaders; - public static final double zscoreForInfinity = 1000; - public static final double zscoreLessThanZero = 0.001; - - //working - private File vcfFile; - private BufferedReader vcfIn; - private PrintWriter rOut; - private int numRecords = 0; - private int numNotScored = 0; - private int numFailingZscore = 0; - private int numSaved = 0; - private ArrayList tooFewSamples = new ArrayList(); - private ArrayList vcfHeader = new ArrayList(); - private ArrayList vcfRecords = new ArrayList(); - - //constructor - public VCFBackgroundChecker(String[] args){ - long startTime = System.currentTimeMillis(); - try { - processArgs(args); - - //make Loaders - loaders = new MpileupTabixLoader[numberThreads]; - for (int i=0; i< loaders.length; i++) loaders[i] = new MpileupTabixLoader(mpileup, this); - - //for each vcf file - IO.pl("\nParsing vcf files..."); - for (int i=0; i< vcfFiles.length; i++){ - //set values and clear past data - vcfFile = vcfFiles[i]; - numRecords = 0; - numNotScored = 0; - numFailingZscore = 0; - numSaved = 0; - tooFewSamples.clear(); - vcfRecords.clear(); - - IO.pl(vcfFile.getName()); - String name = Misc.removeExtension(vcfFile.getName()); - createReaderSaveHeader(); - File tempRData = new File(saveDir, name+"_RObs.txt"); - - tempRData.deleteOnExit(); - rOut = new PrintWriter( new FileWriter(tempRData)); - - //load mpileup data - ExecutorService executor = Executors.newFixedThreadPool(numberThreads); - for (MpileupTabixLoader l: loaders) executor.execute(l); - executor.shutdown(); - while (!executor.isTerminated()) {} - - //check loaders - for (MpileupTabixLoader l: loaders) { - if (l.isFailed()) throw new IOException("ERROR: File Loader issue! \n"); - } - vcfIn.close(); - rOut.close(); - - VcfSorter[] finalVcfs = fetchVcfSorterArray(); - Arrays.sort(finalVcfs); - - writeOutModifiedVcf (finalVcfs, new File(saveDir, name+".vcf.gz")); - - IO.pl("\t#Rec="+numRecords+" #Saved="+numSaved+" #NotScored="+numNotScored+" #FailingBKZ="+numFailingZscore+"\n"); - } - - } catch (Exception e) { - e.printStackTrace(); - Misc.printErrAndExit("Problem encountered, aborting!"); - } finally { - //shut down loaders - for (MpileupTabixLoader m : loaders) m.getTabixReader().close(); - - //finish and calc run time - double diffTime = ((double)(System.currentTimeMillis() -startTime))/60000; - IO.pl("\nDone! 
"+Math.round(diffTime)+" min\n"); - } - } - - private void writeOutModifiedVcf(VcfSorter[] finalVcfs, File file) throws IOException { - Gzipper out = new Gzipper(file); - //add header - for (String s: vcfHeader) out.println(s); - for (VcfSorter v : finalVcfs) out.println(v.fields, "\t"); - out.close(); - } - - private VcfSorter[] fetchVcfSorterArray() throws Exception { - int num= vcfRecords.size(); - - VcfSorter[] v = new VcfSorter[num]; - for (int x = 0; x < num; x++){ - String vcf = vcfRecords.get(x); - v[x] = new VcfSorter(Misc.TAB.split(vcf)); - } - return v; - } - - private class VcfSorter implements Comparable{ - private String chr; - private int pos; - //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLES - String[] fields; - - public VcfSorter(String[] fields){ - this.fields = fields; - chr = fields[0]; - pos = Integer.parseInt(fields[1]); - } - - public int compareTo(VcfSorter other) { - //sort by chromosome - int x = chr.compareTo(other.chr); - if (x !=0) return x; - //sort by position - if (pos < other.pos) return -1; - if (pos > other.pos) return 1; - return 0; - } - } - - public synchronized void save(ArrayList vcf, ArrayList baseCounts) { - int num = vcf.size(); - numSaved += num; - - //save vcf records - vcfRecords.addAll(vcf); - } - - /**Loads the AL with vcf lines up to the numVcfToProc, if none could be read, returns false, otherwise true.*/ - public synchronized boolean loadVcfRecords(ArrayList chunk) throws IOException{ - String record; - int count = 0; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - chunk.add(record); - if (count++ > numVcfToProc) break; - } - - if (chunk.size() == 0) return false; - numRecords += chunk.size(); - return true; - } - - public synchronized void update(int numNotScored, int numFailingZscore, ArrayList tooFewSamples) { - this.numNotScored += numNotScored; - this.numFailingZscore+= numFailingZscore; - this.tooFewSamples.addAll(tooFewSamples); - } - - private void createReaderSaveHeader() throws Exception { - vcfIn = IO.fetchBufferedReader(vcfFile); - vcfHeader.clear(); - String record; - boolean addInfo = true; - boolean endFound = false; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - if (record.startsWith("#")){ - if (addInfo && record.startsWith("##INFO=")) { - vcfHeader.add("##FILTER== 0.9 x variant AF.\">"); - vcfHeader.add("##INFO="); - vcfHeader.add("##INFO="); - addInfo = false; - } - vcfHeader.add(record); - if (record.startsWith("#CHROM")){ - endFound = true; - break; - } - } - else { - endFound = true; - break; - } - } - if (endFound == false) Misc.printErrAndExit("\nERROR: failed to find the #CHROM line, aborting\n"); - } - - public static void main(String[] args) { - if (args.length ==0){ - printDocs(); - System.exit(0); - } - new VCFBackgroundChecker(args); - } - - - - /**This method will process each argument and assign new variables*/ - public void processArgs(String[] args){ - Pattern pat = Pattern.compile("-[a-z]"); - IO.pl("\n"+IO.fetchUSeqVersion()+" Arguments: "+Misc.stringArrayToString(args, " ")+"\n"); - File forExtraction = null; - for (int i = 0; i numAvail) numberThreads = numAvail - 1; - - printSettings(); - - } - - - - public void printSettings(){ - IO.pl("Settings:\nBackground\t"+mpileup); - IO.pl("Save dir\t"+saveDir); - IO.pl(afInfoName+ "\tTumor AF INFO name"); - IO.pl(dpInfoName+ "\tTumor DP INFO name"); - IO.pl(minReadCoverage+"\tMin mpileup sample read coverage"); - 
IO.pl(minBaseQuality+"\tMin mpileup sample base quality"); - IO.pl(maxSampleAF+"\tMax mpileup sample AF"); - IO.pl(minNumSamples+"\tMin # samples for z-score calc"); - IO.pl(minimumZScore+"\tMin vcf AF z-score to save"); - IO.pl(bpPad+"\tBP padding +/- to vcf record for mpileup scanning"); - if (sampleIndexesToUse != null) IO.pl(Misc.intArrayToString(sampleIndexesToUse, " ")+"\tSample indexes to search"); - IO.pl(numberThreads+"\tCPUs"); - IO.pl(removeNonZScoredRecords+ "\tExclude vcf records that could not be z-scored"); - IO.pl(verbose+"\tVerbose"); - IO.pl(replaceQualScore+"\tReplace QUAL score with z-score and set non scored records to 0"); - } - - - public static void printDocs(){ - IO.pl("\n" + - "**************************************************************************************\n" + - "** VCF Background Checker : Aug 2019 **\n" + - "**************************************************************************************\n" + - "VBC calculates non-reference allele frequencies (AF) from a background multi-sample \n"+ - "mpileup file over each vcf record. It then calculates a z-score for the vcf AF and \n"+ - "appends it to the INFO field. If multiple bps are affected (e.g. INDELs) or bp padding\n"+ - "provided, the lowest bp z-score is appended. Z-scores < ~4 are indicative of non\n"+ - "reference bps in the background samples. A flag is appended the FILTER field if a\n"+ - "background AF was found within 10% of the vcf AF. Note, VBC requires AF and DP tags\n"+ - "in the INFO field of each record to use in the z-score calculation, see -f and -p. \n"+ - - "\nRequired:\n"+ - "-v Path to a xxx.vcf(.gz/.zip OK) file or directory containing such.\n" + - "-m Path to a bgzip compressed and tabix indexed multi-sample mpileup file. e.g.:\n"+ - " 1) Mpileup: 'echo \"#SampleOrder: \"$(ls *bam) > bkg.mpileup; samtools mpileup\n"+ - " -B -R -A -q 13 -d 1000000 -f $fastaIndex -l $bedFile *.bam >> bkg.mpu'\n"+ - " 2) (Optional) MpileupRandomizer: java -jar -Xmx10G ~/USeqApps/MpileupRandomizer\n"+ - " -r 20 -s 3 -m bkg.mpu\n"+ - " 3) Bgzip: 'HTSlib/bgzip --threads 25 bkg.mpu_DP20MS3.txt'\n"+ - " Tabix: 'HTSlib/tabix -s 1 -b 2 -e 2 bkg.mpu_DP20MS3.txt.gz'\n"+ - "-s Path to directory in which to save the modified vcf file(s)\n"+ - - "\nOptional:\n" + - "-f Tumor AF INFO name, defaults to T_AF\n"+ - "-p Tumor DP INFO name, defaults to T_DP\n"+ - "-z Minimum vcf z-score, defaults to 0, no filtering. Unscored vars are kept.\n"+ - "-q Minimum mpileup sample bp quality, defaults to 20\n"+ - "-c Minimum mpileup sample read coverge, defaults to 20\n"+ - "-f Maximum mpileup sample AF, defaults to 0.3\n"+ - "-a Minimum # mpileup samples for z-score calculation, defaults to 3\n" + - "-b Pad the size of each vcf record, defaults to 0 bp\n"+ - "-i Sample indexes to search, comma delimited, no spaces, zero based, defaults to all\n"+ - "-e Exclude vcf records that could not be z-scored\n"+ - "-u Replace QUAL value with z-score. 
Un scored vars will be assigned 0\n"+ - "-d Print verbose debugging output\n" + - "-t Number of threads to use, defaults to all\n"+ - - - "\n"+ - - "Example: java -Xmx4G -jar pathTo/USeq/Apps/VCFBackgroundChecker -v SomaticVcfs/ -z 3\n"+ - "-m bkg.mpileup.gz -s BkgFiltVcfs/ -q 13 -u -b 2\n\n"+ - - "**************************************************************************************\n"); - } - - //getters and setters - - public int getMinBaseQuality() { - return minBaseQuality; - } - - public int getMinReadCoverage() { - return minReadCoverage; - } - - public int getMinNumSamples() { - return minNumSamples; - } - - public boolean isRemoveNonZScoredRecords() { - return removeNonZScoredRecords; - } - - public double getMinimumZScore() { - return minimumZScore; - } - - public double getMaxSampleAF() { - return maxSampleAF; - } - - public boolean isReplaceQualScore() { - return replaceQualScore; - } - - public boolean isVerbose() { - return verbose; - } - - public String getAFInfoName() { - return afInfoName; - } - - public String getDPInfoName() { - return dpInfoName; - } - - public int getBpPad() { - return bpPad; - } - - public int[] getSampleIndexesToUse() { - return sampleIndexesToUse; - } -} diff --git a/Source/edu/utah/seq/vcf/VCFBackgroundCheckerGraphite.java b/Source/edu/utah/seq/vcf/VCFBackgroundCheckerGraphite.java deleted file mode 100644 index 73cfffcd..00000000 --- a/Source/edu/utah/seq/vcf/VCFBackgroundCheckerGraphite.java +++ /dev/null @@ -1,493 +0,0 @@ -package edu.utah.seq.vcf; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.analysis.OverdispersedRegionScanSeqs; -import edu.utah.seq.parsers.mpileup.MpileupSample; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoaderGraphite; -import edu.utah.seq.vcf.GraphiteZScore.GraphiteResult; -import edu.utah.seq.vcf.xml.SimpleVcf; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -/**Given a vcf file compares each record against a set of normal samples from a tabix indexed mpileup file and marks those with evidence of the variant in the normals. 
- * tabix-0.2.6/bgzip ~/repro3Ct8Norm.mpileup - * tabix-0.2.6/tabix -s 1 -b 2 -e 2 ~/repro3Ct8Norm.mpileup.gz - * @author Nix*/ -public class VCFBackgroundCheckerGraphite { - - //user defined fields - private File[] vcfFiles; - private File mpileup; - private File saveDir; - private File indexedFasta; - private String optionalBashCmds = null; - private int minBaseQuality = 20; - private int minReadCoverage = 20; - private int minNumSamples = 3; - private boolean verbose = false; - private boolean removeNonZScoredRecords = false; - private double minimumZScore = 0; - private double maxSampleAF = 0.3; - private boolean replaceQualScore = false; - private int numberThreads = 0; - private File graphite = null; - private File[] graphiteBams = null; - private int bpProximalSnv = 125; - - //internal - private static final int numVcfToProc = 100; - private MpileupTabixLoaderGraphite[] loaders; - private boolean deleteTempFiles = false; - - //working - private File vcfFile; - private File vcfForGraphite; - private File vcfForMPileUp; - private BufferedReader vcfIn; - private PrintWriter vcfOut; - private int numRecords = 0; - private int numNotScored = 0; - private int numFailingZscore = 0; - private int numSaved = 0; - private ArrayList tooFewSamples = new ArrayList(); - private HashMap shortLongVcfGraphite = new HashMap(); - - //constructor - public VCFBackgroundCheckerGraphite(String[] args) { - long startTime = System.currentTimeMillis(); - try { - processArgs(args); - - //make Loaders - loaders = new MpileupTabixLoaderGraphite[numberThreads]; - for (int i=0; i< loaders.length; i++) loaders[i] = new MpileupTabixLoaderGraphite(mpileup, this); - - - //for each vcf file - System.out.println("\nParsing vcf files..."); - for (int i=0; i< vcfFiles.length; i++){ - vcfFile = vcfFiles[i]; - resetWorkingParams(); - String name = Misc.removeExtension(vcfFile.getName()); - System.out.println(name); - - //split into isolated snvs for mpileup and others for graphite - vcfForGraphite = new File(saveDir, name+"ForGraphiteTemp.vcf"); - vcfForMPileUp = new File(saveDir, name+"ForMpileUpTemp.vcf"); - int[] goodBad = parseOutIsolatedSnvVars(vcfForMPileUp, vcfForGraphite); - - File tempVcf = new File(saveDir, name+"_BKZed.vcf"); - if (deleteTempFiles) tempVcf.deleteOnExit(); - vcfOut = new PrintWriter(new FileWriter(tempVcf)); - createReaderSaveHeader(vcfForMPileUp); - - System.out.println("\tProcessing "+goodBad[0]+" SNVs with mpileup..."); - ExecutorService executor = Executors.newFixedThreadPool(numberThreads); - for (MpileupTabixLoaderGraphite l: loaders) executor.execute(l); - executor.shutdown(); - - //spins here until the executer is terminated, e.g. all threads complete - while (!executor.isTerminated()) {} - - //check loaders - for (MpileupTabixLoaderGraphite l: loaders) { - if (l.isFailed()) throw new IOException("ERROR: File Loader issue! \n"); - } - - vcfIn.close(); - - //run graphite - System.out.println("\tProcessing "+goodBad[1]+" INDELS with graphite (slow)..."); - GraphiteZScore g = new GraphiteZScore(vcfForGraphite, graphite, graphiteBams, indexedFasta, optionalBashCmds, minReadCoverage, minNumSamples, maxSampleAF); - addGraphiteResults(g); - - //close out - vcfOut.close(); - - //sort, in memory so hopefully it doesn't blow up - File sorted = new File(tempVcf.getParentFile(), tempVcf.getName()+".gz"); - SimpleVcf.sortVcf(tempVcf, sorted); - - //print summary stats - if (tooFewSamples.size() !=0){ - System.err.println("WARNING, the following did not have any mpileup lines (e.g. 
outside the compiled regions?) or enough background " - + "samples passing thresholds to calculate a z-score. "); - System.err.println(Misc.stringArrayListToString(tooFewSamples, "\n")); - } - System.out.println("\n#Rec="+numRecords+" #Saved="+numSaved+" #NotScored="+numNotScored+" #FailingBKZ="+numFailingZscore+"\n"); - } - - //delete temp files - if (deleteTempFiles) { - if (vcfForGraphite != null) vcfForGraphite.delete(); - if (vcfForMPileUp != null) vcfForMPileUp.delete(); - } - - } catch (Exception e) { - e.printStackTrace(); - } finally { - //shut down loaders - for (MpileupTabixLoaderGraphite m : loaders) m.getTabixReader().close(); - //finish and calc run time - double diffTime = ((double)(System.currentTimeMillis() -startTime))/60000; - System.out.println("\nDone! "+Math.round(diffTime)+" min\n"); - - } - } - - private void addGraphiteResults(GraphiteZScore g) throws IOException { - HashMap vcfGraphiteResults = g.getVcfGraphiteResults(); - - if (vcfGraphiteResults.size() != shortLongVcfGraphite.size()) throw new IOException ("The number of graphite results isn't equal to the number of input vcf records?!"); - for (String shortVcf: vcfGraphiteResults.keySet()){ - GraphiteResult gr = vcfGraphiteResults.get(shortVcf); - String oriVcf = shortLongVcfGraphite.get(shortVcf); - String[] oriFields = Misc.TAB.split(oriVcf); - if (oriVcf == null) throw new IOException ("Failed to find an original vcf record for this short graphite result?! "+shortVcf); - //too few samples? - if (gr == null) { - tooFewSamples.add(oriVcf); - numNotScored++; - if (removeNonZScoredRecords == false) printScoredVcf(oriFields, oriVcf); - } - - //append min zscore and save - else { - //fetch AFs - String bkafs = fetchFormattedAFs(gr.getBkAFs()); - String bkzString = Num.formatNumberNoComma(gr.getzScore(), 2); - oriFields[7] = "BKZ="+bkzString+";BKAF="+bkafs+";"+oriFields[7]; - if (replaceQualScore) oriFields[5] = bkzString; - String modRecord = Misc.stringArrayToString(oriFields, "\t"); - //threshold it? 
- if (minimumZScore == 0) { - vcfOut.println(modRecord); - numSaved++; - } - else { - if (gr.getzScore() < minimumZScore) numFailingZscore++; - else { - vcfOut.println(modRecord); - numSaved++; - } - } - } - } - } - - public static String fetchFormattedAFs(double[] afs) { - StringBuilder sb = new StringBuilder(Num.formatNumber(afs[0], 4)); - for (int i=1; i< afs.length; i++){ - sb.append(","); - sb.append(Num.formatNumber(afs[i], 4)); - } - return sb.toString(); - } - - private void printScoredVcf(String[] fields, String record) throws IOException{ - numSaved++; - if (replaceQualScore) { - fields[5] = "0"; - vcfOut.println(Misc.stringArrayToString(fields, "\t")); - } - else vcfOut.println(record); - } - - private int[] parseOutIsolatedSnvVars(File forMpileup, File forGraphite) throws IOException{ - PrintWriter mpileupOut = new PrintWriter (new FileWriter(forMpileup)); - PrintWriter graphiteOut = new PrintWriter (new FileWriter(forGraphite)); - VCFParser p = new VCFParser(vcfFile, true, false, true); - graphiteOut.println("##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); - mpileupOut.println(Misc.stringArrayListToString(p.getVcfComments().getOriginalComments(), "\n")); - - p.splitVCFRecordsByChromosome(); - HashMap chrVcf = p.getChromosomeVCFRecords(); - int numGoodSnvs = 0; - int numOthers = 0; - //for each chromosome - for (String chr: chrVcf.keySet()){ - VCFLookUp vlu = chrVcf.get(chr); - //for each record - VCFRecord[] records = vlu.getVcfRecord(); - for (VCFRecord r: records){ - if (r.isSNP()){ - int start = r.getPosition()-bpProximalSnv; - if (start < 0) start = 0; - VCFRecord[] ints = vlu.fetchVCFRecords(start, r.getPosition()+bpProximalSnv); - //just itself? - if (ints == null || ints.length <=1) { - mpileupOut.println(r.getOriginalRecord()); - numGoodSnvs++; - } - else { - String shortVcf = r.getTruncatedRecord(); - graphiteOut.println(shortVcf); - shortLongVcfGraphite.put(shortVcf, r.getOriginalRecord()); - numOthers++; - } - } - else { - String shortVcf = r.getTruncatedRecord(); - graphiteOut.println(shortVcf); - shortLongVcfGraphite.put(shortVcf, r.getOriginalRecord()); - numOthers++; - } - } - } - mpileupOut.close(); - graphiteOut.close(); - numRecords = numGoodSnvs + numOthers; - return new int[]{numGoodSnvs, numOthers}; - } - - private void resetWorkingParams() { - numRecords = 0; - numNotScored = 0; - numFailingZscore = 0; - numSaved = 0; - tooFewSamples.clear(); - shortLongVcfGraphite.clear(); - } - - /**Just prints out records.*/ - public synchronized void saveModifiedVcf(ArrayList vcf) throws IOException{ - for (String s: vcf) vcfOut.println(s); - numSaved+= vcf.size(); - } - - /**Loads the AL with vcf lines up to the numVcfToProc, if none could be read, returns false, otherwise true.*/ - public synchronized boolean loadVcfRecords(ArrayList chunk) throws IOException{ - String record; - int count = 0; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - chunk.add(record); - if (count++ > numVcfToProc) break; - } - - if (chunk.size() == 0) return false; - return true; - } - - public synchronized void update(int numNotScored, int numFailingZscore, ArrayList tooFewSamples) { - this.numNotScored += numNotScored; - this.numFailingZscore+= numFailingZscore; - this.tooFewSamples.addAll(tooFewSamples); - } - - private void createReaderSaveHeader(File forMpileupVcf) throws Exception { - vcfIn = IO.fetchBufferedReader(forMpileupVcf); - String record; - boolean addInfo = true; - boolean endFound = 
false; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - if (record.startsWith("#")){ - if (addInfo && record.startsWith("##INFO=")) { - vcfOut.println("##INFO="); - vcfOut.println("##INFO="); - addInfo = false; - } - vcfOut.println(record); - if (record.startsWith("#CHROM")){ - endFound = true; - break; - } - } - else { - endFound = true; - break; - } - } - if (endFound == false) Misc.printErrAndExit("\nERROR: failed to find the #CHROM line, aborting\n"); - } - - public static void main(String[] args) { - if (args.length ==0){ - printDocs(); - System.exit(0); - } - new VCFBackgroundCheckerGraphite(args); - } - - /**This method will process each argument and assign new variables*/ - public void processArgs(String[] args){ - Pattern pat = Pattern.compile("-[a-z]"); - System.out.println("\n"+IO.fetchUSeqVersion()+" Arguments: "+Misc.stringArrayToString(args, " ")+"\n"); - File forExtraction = null; - for (int i = 0; i numAvail) numberThreads = numAvail - 1; - - //graphite - if (graphite == null || graphite.canExecute() == false) Misc.printExit("\nError: cannot find or execute your provided graphite app?! "+graphite); - if (graphiteBams == null || graphiteBams.length == 0) Misc.printExit("\nError: failed to find any background normal bam files for use with graphite?"); - OverdispersedRegionScanSeqs.lookForBaiIndexes(graphiteBams, false); - if (indexedFasta == null || indexedFasta.exists() == false) Misc.printExit("\nError: cannot find your indexed fasta file used to align the normal bams?! "+indexedFasta); - - printSettings(); - } - - public void printSettings(){ - System.out.println("Settings:\nNormal mpileup\t"+mpileup); - System.out.println("Save dir\t"+saveDir); - System.out.println("Normal bam dir\t"+ graphiteBams[0].getParent()); - System.out.println(minReadCoverage+"\tMin mpileup sample read coverage"); - System.out.println(minBaseQuality+"\tMin mpileup sample base quality"); - System.out.println(maxSampleAF+"\tMax mpileup sample AF"); - System.out.println(minNumSamples+"\tMin # samples for z-score calc"); - System.out.println(minimumZScore+"\tMin vcf AF z-score to save"); - System.out.println(bpProximalSnv + "\tBP intersection radius for pushing SNVs to graphite AF estimation "); - System.out.println(numberThreads+"\tCPUs"); - System.out.println(removeNonZScoredRecords+ "\tExclude vcf records that could not be z-scored"); - System.out.println(verbose+"\tVerbose"); - System.out.println(replaceQualScore+"\tReplace QUAL score with z-score and set non scored records to 0"); - } - - - public static void printDocs(){ - System.out.println("\n" + - "**************************************************************************************\n" + - "** VCF Background Checker : April 2016 **\n" + - "**************************************************************************************\n" + - "VBC calculates non-ref allele frequencies (AF) from a normal background multi-sample \n"+ - "mpileup file over each snv or using normal bams and graphite for indel and multi alts.\n"+ - "It then calculates a z-score for the vcf AF and appends it to the INFO field. Z-scores\n"+ - "< ~4 are indicative of non reference bps in the background samples. Note, VBC requires\n"+ - "an AF tag in the INFO field of each record. \n"+ - - "\nRequired:\n"+ - "-v Path to a xxx.vcf(.gz/.zip OK) file or directory containing such.\n" + - "-m Path to a bgzip compressed and tabix indexed multi-sample mpileup file. 
e.g.:\n"+ - " 1) Mpileup: 'echo \"#SampleOrder: \"$(ls *bam) > bkg.mpileup; samtools mpileup\n"+ - " -B -q 13 -d 1000000 -f $fastaIndex -l $bedFile *.bam >> bkg.mpileup'\n"+ - " 2) Bgzip: 'tabix-0.2.6/bgzip bkg.mpileup'\n"+ - " Tabix: 'tabix-0.2.6/tabix -s 1 -b 2 -e 2 bkg.mpileup.gz'\n"+ - "-g Path to graphite executable, see https://github.com/dillonl/graphite.\n"+ - "-n Directory containing normal background indexed bams for graphite AF estimation.\n"+ - "-i Indexed fasta file used to align the normal bams.\n"+ - "-s Path to a directory to save the modified vcf file(s)\n"+ - - "\nOptional:\n" + - "-z Minimum vcf z-score, defaults to 0, no filtering\n"+ - "-q Minimum mpileup sample bp quality, defaults to 20\n"+ - "-c Minimum sample read coverge, defaults to 20\n"+ - "-f Maximum sample AF, defaults to 0.3\n"+ - "-a Minimum # samples for z-score calculation, defaults to 3\n" + - "-e Exclude vcf records that could not be z-scored\n"+ - "-r Replace QUAL value with z-score. Un scored vars will be assigned 0\n"+ - "-d Print verbose debugging output\n" + - "-t Number of threads to use, defaults to all\n"+ - "-o Additional bash shell commands before executing graphite,e.g. 'module load gcc/4.9'\n"+ - "-b BP intersection radius for pushing SNVs to graphite for AF estimation, default 125\n"+ - - "\n"+ - - "Example: java -Xmx4G -jar pathTo/USeq/Apps/VCFBackgroundChecker -v SomaticVcfs/ -z 3\n"+ - "-m bkg.mpileup.gz -s BkgFiltVcfs/ -b 50 -q 13 -e -r -g ~/Graphite/bin/graphite -n\n"+ - "~/BkgrndNormalBams/ -o 'module load gcc/4.9.2' -i ~/B37/b37.fa \n\n"+ - - "**************************************************************************************\n"); - } - - //getters and setters - public int getMinBaseQuality() { - return minBaseQuality; - } - - public int getMinReadCoverage() { - return minReadCoverage; - } - - public int getMinNumSamples() { - return minNumSamples; - } - - public boolean isRemoveNonZScoredRecords() { - return removeNonZScoredRecords; - } - - public double getMinimumZScore() { - return minimumZScore; - } - - public double getMaxSampleAF() { - return maxSampleAF; - } - - public boolean isReplaceQualScore() { - return replaceQualScore; - } - - public boolean isVerbose() { - return verbose; - } - -} diff --git a/Source/edu/utah/seq/vcf/VCFNz.java b/Source/edu/utah/seq/vcf/VCFNz.java deleted file mode 100644 index 1ac6758d..00000000 --- a/Source/edu/utah/seq/vcf/VCFNz.java +++ /dev/null @@ -1,406 +0,0 @@ -package edu.utah.seq.vcf; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoader; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoaderNz; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import util.gen.Gzipper; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -/**Given a vcf file compares each record against a set of normal samples from a tabix indexed mpileup file and marks those with evidence of the variant in the normals. 
diff --git a/Source/edu/utah/seq/vcf/VCFNz.java b/Source/edu/utah/seq/vcf/VCFNz.java deleted file mode 100644 index 1ac6758d..00000000 --- a/Source/edu/utah/seq/vcf/VCFNz.java +++ /dev/null @@ -1,406 +0,0 @@ -package edu.utah.seq.vcf; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoader; -import edu.utah.seq.parsers.mpileup.MpileupTabixLoaderNz; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import util.gen.Gzipper; -import util.gen.IO; -import util.gen.Misc; -import util.gen.Num; - -/**Given a vcf file, compares each record against a set of normal samples from a tabix indexed mpileup file and marks those with evidence of the variant in the normals. - * tabix-0.2.6/bgzip ~/repro3Ct8Norm.mpileup - * tabix-0.2.6/tabix -s 1 -b 2 -e 2 ~/repro3Ct8Norm.mpileup.gz - * @author Nix*/ -public class VCFNz { - - //user defined fields - private File seqStats = null; - private File vcfFile = null; - private File mpileup = null; - private File indexedFasta = null; - private File saveDir; - private int minBaseQuality = 13; - private boolean removeNonZScoredRecords = false; - private double maximumZScore = 0; - private int bpPad = 0; - - //internal - private static final int numVcfToProc = 100; - public static final double zscoreForInfinity = 1000; - public static final double zscoreLessThanZero = 0.001; - private int numberThreads; - private MpileupTabixLoaderNz[] loaders = null; - private HashMap seqMeanStd = null; - private IndexedFastaSequenceFile fasta; - - //working - private BufferedReader vcfIn; - private PrintWriter rOut; - private int numRecords = 0; - private int numNotScored = 0; - private int numFailingZscore = 0; - private int numSaved = 0; - private ArrayList<String> tooFewSamples = new ArrayList<String>(); - private ArrayList<String> vcfHeader = new ArrayList<String>(); - private ArrayList<String> vcfRecords = new ArrayList<String>(); - - //constructor - public VCFNz(String[] args){ - long startTime = System.currentTimeMillis(); - try { - processArgs(args); - - //load seq stats - seqMeanStd = (HashMap)IO.fetchObject(seqStats); - - //make Loaders - loaders = new MpileupTabixLoaderNz[numberThreads]; - for (int i=0; i< loaders.length; i++) loaders[i] = new MpileupTabixLoaderNz(mpileup, this); - - //for each vcf file - IO.pl("\nParsing vcf files..."); - IO.pl("Name\tRecords\tSaved\tNotScored\tFailingNZScore"); - - IO.p(vcfFile.getName()); - String name = Misc.removeExtension(vcfFile.getName()); - createReaderSaveHeader(); - - //load mpileup data - ExecutorService executor = Executors.newFixedThreadPool(numberThreads); - for (MpileupTabixLoaderNz l: loaders) executor.execute(l); - executor.shutdown(); - while (!executor.isTerminated()) {} - - //check loaders - for (MpileupTabixLoaderNz l: loaders) { - if (l.isFailed()) throw new IOException("ERROR: File Loader issue! \n"); - } - vcfIn.close(); - rOut.close(); - - VcfSorter[] finalVcfs = fetchVcfSorterArray(); - Arrays.sort(finalVcfs); - - writeOutModifiedVcf (finalVcfs, new File(saveDir, name+".nz.vcf.gz")); - - IO.pl("\t"+numRecords+"\t"+numSaved+"\t"+numNotScored+"\t"+numFailingZscore); - - - } catch (Exception e) { - e.printStackTrace(); - Misc.printErrAndExit("Problem encountered, aborting!"); - } finally { - //shut down loaders - for (MpileupTabixLoaderNz m : loaders) m.getTabixReader().close(); - - //finish and calc run time - double diffTime = ((double)(System.currentTimeMillis() -startTime))/60000; - IO.pl("\nDone!
"+Math.round(diffTime)+" min\n"); - } - } - - private void writeOutModifiedVcf(VcfSorter[] finalVcfs, File file) throws IOException { - Gzipper out = new Gzipper(file); - //add header - for (String s: vcfHeader) out.println(s); - for (VcfSorter v : finalVcfs) out.println(v.fields, "\t"); - out.close(); - } - - private VcfSorter[] fetchVcfSorterArray() throws Exception { - int num= vcfRecords.size(); - - VcfSorter[] v = new VcfSorter[num]; - for (int x = 0; x < num; x++){ - String vcf = vcfRecords.get(x); - v[x] = new VcfSorter(Misc.TAB.split(vcf)); - } - return v; - } - - private class VcfSorter implements Comparable<VcfSorter>{ - private String chr; - private int pos; - //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLES - String[] fields; - - public VcfSorter(String[] fields){ - this.fields = fields; - chr = fields[0]; - pos = Integer.parseInt(fields[1]); - } - - public int compareTo(VcfSorter other) { - //sort by chromosome - int x = chr.compareTo(other.chr); - if (x !=0) return x; - //sort by position - if (pos < other.pos) return -1; - if (pos > other.pos) return 1; - return 0; - } - } - - public synchronized void save(ArrayList vcf, ArrayList baseCounts) { - int num = vcf.size(); - numSaved += num; - - //save vcf records - vcfRecords.addAll(vcf); - } - - /**Loads the AL with vcf lines up to the numVcfToProc, if none could be read, returns false, otherwise true.*/ - public synchronized boolean loadVcfRecords(ArrayList<String> chunk) throws IOException{ - String record; - int count = 0; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - chunk.add(record); - if (count++ > numVcfToProc) break; - } - - if (chunk.size() == 0) return false; - numRecords += chunk.size(); - return true; - } - - public synchronized void update(int numNotScored, int numFailingZscore) { - this.numNotScored += numNotScored; - this.numFailingZscore+= numFailingZscore; - } - - private void createReaderSaveHeader() throws Exception { - vcfIn = IO.fetchBufferedReader(vcfFile); - vcfHeader.clear(); - String record; - boolean addInfo = true; - boolean endFound = false; - while ((record = vcfIn.readLine()) != null){ - record = record.trim(); - if (record.length() == 0) continue; - if (record.startsWith("#")){ - if (addInfo && record.startsWith("##INFO=")) { - vcfHeader.add("##FILTER== variant AF.\">"); - vcfHeader.add("##INFO="); - vcfHeader.add("##INFO="); - addInfo = false; - } - vcfHeader.add(record); - if (record.startsWith("#CHROM")){ - endFound = true; - break; - } - } - else { - endFound = true; - break; - } - } - if (endFound == false) Misc.printErrAndExit("\nERROR: failed to find the #CHROM line, aborting\n"); - } - - public static void main(String[] args) { - if (args.length ==0){ - printDocs(); - System.exit(0); - } - new VCFNz(args); - } - - - - /**This method will process each argument and assign new variables*/ - public void processArgs(String[] args){ - Pattern pat = Pattern.compile("-[a-z]"); - IO.pl("\n"+IO.fetchUSeqVersion()+" Arguments: "+Misc.stringArrayToString(args, " ")+"\n"); - for (int i = 0; i "+ indexedFasta); - - //save dir? - if (saveDir == null) saveDir = vcfFile.getParentFile(); - else saveDir.mkdirs(); - if (saveDir.exists()==false || saveDir.canWrite()==false) Misc.printExit("\nError: failed to find or create a writeable save directory? Aborting.\n"); - - //tabix indexed mpileup?
- if (mpileup == null || mpileup.canRead() == false) Misc.printExit("\nError: please provide a path to a bgzipped tabix indexed mpileup file.\n"); - File index = new File (mpileup.toString()+".tbi"); - if (index.exists() == false) Misc.printExit("\nError: cannot find the '"+index.getName()+"' index file corresponding to this indexed mpileup file "+mpileup.getName()); - - //threads to use - int numAvail = Runtime.getRuntime().availableProcessors(); - if (numberThreads < 1 || numberThreads > numAvail) numberThreads = numAvail - 1; - - printSettings(); - - } - - private static void printPoNExample() { - IO.pl("#!/bin/bash"); - IO.pl("bed=/scratch/mammoth/serial/Bed/HSV1_GBM_IDT_Probes_Hg38Pad150bps_91K.bed"); - IO.pl("ref=/uufs/chpc.utah.edu/TNRunner/Indexes/Hg38IndexForBwa-0.7.17/hs38DH.fa"); - IO.pl(""); - IO.pl("echo \"#SampleOrder-index; 15352X1-0 15352X2-1 15352X3-2 15352X4-3 15352X6-4 15352X8-5 15352X9-6 15352X10-7 \\"); - IO.pl("15352X11-8 15352X12-9 15352X14-10 15352X16-11 15352X17-12 15352X18-13 15352X19-14 15352X20-15 15352X22-16 15352X24-17 \\"); - IO.pl("> 15352R.mpileup"); - IO.pl("# Be sure to use samtools 1.8 with these settings, any changes should be evaluated carefully."); - IO.pl("~/BioApps/Samtools/1.8/bin/samtools mpileup \\"); - IO.pl("--count-orphans \\"); - IO.pl("--max-depth 100000 \\"); - IO.pl("--max-idepth 100000 \\"); - IO.pl("--gap-frac 0.001 \\"); - IO.pl("--per-sample-mF \\"); - IO.pl("--ignore-RG \\"); - IO.pl("--min-MQ 13 \\"); - IO.pl("--no-BAQ \\"); - IO.pl("--fasta-ref $ref \\"); - IO.pl("--ff UNMAP,SECONDARY,QCFAIL \\"); - IO.pl("-l $bed \\"); - IO.pl("-o 15352R.mpileup \\"); - IO.pl("15352X1_Hg38_final.bam 15352X2_Hg38_final.bam 15352X3_Hg38_final.bam 15352X4_Hg38_final.bam 15352X6_Hg38_final.bam \\"); - IO.pl("15352X8_Hg38_final.bam 15352X9_Hg38_final.bam 15352X10_Hg38_final.bam 15352X11_Hg38_final.bam 15352X12_Hg38_final.bam \\"); - IO.pl("15352X14_Hg38_final.bam 15352X16_Hg38_final.bam 15352X17_Hg38_final.bam 15352X18_Hg38_final.bam 15352X19_Hg38_final.bam \\"); - IO.pl("15352X20_Hg38_final.bam 15352X22_Hg38_final.bam 15352X24_Hg38_final.bam >> 15352R.mpileup"); - IO.pl(""); - IO.pl("~/BioApps/HTSlib/1.3/bin/bgzip --threads 20 15352R.mpileup"); - IO.pl("~/BioApps/HTSlib/1.3/bin/tabix -s 1 -b 2 -e 2 15352R.mpileup.gz"); - } - - public void printSettings(){ - IO.pl("Settings:"); - IO.pl(" -v Vcf file "+ vcfFile); - IO.pl(" -m Mpileup "+ IO.getCanonicalPath(mpileup)); - IO.pl(" -s Save dir "+ IO.getCanonicalPath(saveDir)); - IO.pl(" -x "+ minBaseQuality+"\tMin mpileup sample base quality"); - IO.pl(" -x "+ maximumZScore+"\tMax N z-score"); - IO.pl(" -x "+ bpPad+"\tBP padding +/- to vcf record for mpileup scanning"); - IO.pl(" -x "+ numberThreads+"\tCPUs"); - IO.pl(" -x "+ removeNonZScoredRecords+ "\tExclude vcf records that could not be z-scored"); - - } - - - //FP flags with tumor sample: - //High N frequency - zscore, 3 vs 5 mer - snvs, what depth? 1000 - //Both Ins Del - for ins #ins/#indels, for del #del/#indels - //Proximal calls - - - - public static void printDocs(){ - IO.pl("\n" + - "**************************************************************************************\n" + - "** VCF Bkz : Sept 2019 **\n" + - "**************************************************************************************\n" + - "VCFBkz calculates non-reference allele frequencies (AF) from a background multi-sample \n"+ - "mpileup file over each vcf record. It then calculates a z-score for the vcf AF and \n"+ - "appends it to the INFO field. 
If multiple bps are affected (e.g. INDELs) or bp padding\n"+ - "provided, the lowest bp z-score is appended. Z-scores < ~3 are indicative of non-\n"+ - "reference bps in the background samples. A flag is appended to the FILTER field if a\n"+ - "background AF was found >= vcf AF. Note, VCFBkz requires tumor AF and\n"+ - "DP tags in the INFO field of each record to use in the z-score calculation, see -f\n"+ - "and -p. \n"+ - - "\nRequired:\n"+ - "-v Path to a xxx.vcf(.gz/.zip OK) file or directory containing such.\n" + - "-m Path to a bgzip compressed and tabix indexed multi-sample mpileup file. See -x\n"+ - - "\nOptional:\n" + - "-s Path to a directory to save the modified vcf file(s), defaults to vcf parent dir\n"+ - "-f Tumor AF INFO name, defaults to T_AF\n"+ - "-p Tumor DP INFO name, defaults to T_DP\n"+ - "-z Minimum vcf z-score, defaults to 0, no filtering. Unscored vars are kept.\n"+ - "-q Minimum mpileup sample bp quality, defaults to 20\n"+ - "-c Minimum mpileup sample read coverage, defaults to 20\n"+ - "-f Maximum mpileup sample AF to include, defaults to 0.1, set to exclude germline.\n"+ - "-a Minimum # mpileup samples for z-score calculation, defaults to 3\n" + - "-b Pad the size of each vcf record, defaults to 0 bp\n"+ - "-i Sample indexes to use, comma delimited, no spaces, zero based, defaults to all\n"+ - "-e Exclude vcf records that could not be z-scored\n"+ - "-l Exclude vcf records with a bkg AF >= the record AF.\n"+ - "-u Replace QUAL value with z-score. Unscored vars will be assigned 0\n"+ - "-d Print verbose debugging output\n" + - "-t Number of threads to use, defaults to all\n"+ - "-x Print an example mpileup PoN bash script\n"+ - "\n"+ - - "Example: java -Xmx4G -jar pathTo/USeq/Apps/VCFBkz -v SomaticVcfs/ -z 3\n"+ - "-m bkg.mpileup.gz -s BkgFiltVcfs/ -q 13 -u -b 2\n\n"+ - - "**************************************************************************************\n"); - } - - //getters and setters - public int getMinBaseQuality() { - return minBaseQuality; - } - - public boolean isRemoveNonZScoredRecords() { - return removeNonZScoredRecords; - } - - public double getMaximumZScore() { - return maximumZScore; - } - - public int getBpPad() { - return bpPad; - } - - public HashMap getSeqMeanStd() { - return seqMeanStd; - } - - public File getIndexedFasta() { - return indexedFasta; - } -}
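Finally, a small sketch of the annotation step the VCFBkz usage text above describes: append the computed z-score to the INFO field and, with -u, overwrite QUAL with it, assigning 0 to records that could not be scored. The "BKZ" key and the four-decimal formatting are stand-ins; the actual ##INFO/##FILTER header IDs written by the deleted code are not recoverable from this diff.

```java
// Illustrative only; the real INFO key and formatting used by VCFBkz may differ.
public class VcfZscoreAnnotateSketch {

    /** Appends a z-score tag to INFO and optionally replaces QUAL, per the -z/-u options above. */
    static String annotate(String vcfRecord, Double zscore, boolean replaceQual) {
        String[] f = vcfRecord.split("\t", -1);               // #CHROM POS ID REF ALT QUAL FILTER INFO ...
        String score = (zscore == null) ? "0" : String.format("%.4f", zscore);
        if (zscore != null) f[7] = f[7] + ";BKZ=" + score;    // leave INFO untouched when unscored
        if (replaceQual) f[5] = score;                        // unscored records get a QUAL of 0
        return String.join("\t", f);
    }
}
```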