From 237851de2bb0ccab1b1fa29cf813b41fcec140b0 Mon Sep 17 00:00:00 2001 From: u0028003 Date: Tue, 9 Jul 2024 12:07:25 -0600 Subject: [PATCH] Build out the z-score normalization gene selection method for Copy Analysis --- .../edu/utah/kohli/AccuGenomicsSpikeins.java | 678 ++++++++++++++++-- Source/edu/utah/kohli/CaptureRegion.java | 31 +- Source/edu/utah/kohli/KohliGene.java | 34 + Source/edu/utah/kohli/KohliPatient.java | 13 +- Source/edu/utah/kohli/KohliSample.java | 11 +- build.properties | 2 +- 6 files changed, 694 insertions(+), 75 deletions(-) diff --git a/Source/edu/utah/kohli/AccuGenomicsSpikeins.java b/Source/edu/utah/kohli/AccuGenomicsSpikeins.java index 3ae54b3..4c8bd61 100644 --- a/Source/edu/utah/kohli/AccuGenomicsSpikeins.java +++ b/Source/edu/utah/kohli/AccuGenomicsSpikeins.java @@ -1,11 +1,14 @@ package edu.utah.kohli; import java.io.File; +import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.TreeMap; +import java.util.TreeSet; import edu.utah.kohli.KohliGene.CopyTestResult; import util.gen.IO; @@ -20,76 +23,528 @@ public class AccuGenomicsSpikeins { private ArrayList captureRegions = new ArrayList(); private LinkedHashMap genes = null; private HashMap geneNameNormalizedGene = new HashMap(); - private File boxPlotDirectory = null; + private File resultsDirectory = null; private ArrayList germlineSamplesToRemoveFromPoN = null; private LinkedHashMap copyAnalysisTests = new LinkedHashMap(); - - public static void main(String[] args) { + private String testedGenes = "'ARID1A','PTEN','CCND1','ATM','ZBTB16','CDKN1B','KMT2D','CDK4','MDM2','BRCA2','RB1','FOXA1','AKT1','ZFHX3','TP53','NCOR1','CDK12','BRCA1','SPOP','RNF43','MSH2','ERG','TMPRSS2','CHEK2','CTNNB1','FOXP1','PIK3CB','PIK3CA','PIK3R1','CHD1','APC','CDK6','BRAF','KMT2C','NKX3-1','NCOA2','MYC','COL22A1','CDKN2A','NOTCH1','AR-Enh','AR','OPHN1'"; + private File fullPathToR = new File("/usr/local/bin/R"); + private KohliSample testSample = null; + private File saveDir = null; + private TreeSet genesToNormWith = null; + + //params to pick genes to use in normalizing a test sample + private double maximumZScore = 3.0; + private double minimumFractionPassingGeneCaptureRegions = 0.75; + private double minimumNumberPassingGenes = 3; + + public static void main(String[] args) throws IOException { File anno = new File ("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/sampleMatchingPersonGNomIDs_13June2024.txt"); File vcfCountData = new File("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/BcfToolsForAccGen/justVcfSampleCounts.txt"); - File regionCountData = new File("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/CopyRatio/Split150bp/medianMergedSplit150ProbeCountsQCed.txt"); - File boxPlotDirectory = new File("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/CopyRatio/Split150bp/BoxPlotDataFiles"); + File regionCountData = new File("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/CopyRatio/Split150bp/medianCounts150bpDoOverPostQC.txt"); + File resultsDirectory = new File("/Users/u0028003/HCI/Labs/Kohli_Manish/ProstateCfDNAAnalysis/FinalAggregateAnalysis/CopyRatio/Split150bp/CnvZScoreNormJustTest"); + + //bad germline samples for PoN, still want for the paired analysis ArrayList germlineSamplesToRemoveFromPoN = new ArrayList(); germlineSamplesToRemoveFromPoN.add("23802X11"); - new AccuGenomicsSpikeins(anno, vcfCountData, regionCountData, boxPlotDirectory, germlineSamplesToRemoveFromPoN); + germlineSamplesToRemoveFromPoN.add("23802X4"); + new AccuGenomicsSpikeins(anno, vcfCountData, regionCountData, resultsDirectory, germlineSamplesToRemoveFromPoN); } - public AccuGenomicsSpikeins(File anno, File vcfCountData, File regionCountData, File boxPlotDirectory, ArrayList germlineSamplesToRemoveFromPoN) { - this.boxPlotDirectory = boxPlotDirectory; + public AccuGenomicsSpikeins(File anno, File vcfCountData, File regionCountData, File resultsDirectory, ArrayList germlineSamplesToRemoveFromPoN) throws IOException { + this.resultsDirectory = resultsDirectory; this.germlineSamplesToRemoveFromPoN = germlineSamplesToRemoveFromPoN; - + //load patient sample annotations loadPatients(anno); //for (KohliPatient kp: patients.values()) IO.pl(kp); - + //load Accugenomic spike-in data - //loadAccugenomicSpikeinData(vcfCountData); - //for (KohliSample ks: samples.values()) IO.pl(ks.toStringProbes(accuGenProbes)); + /*loadAccugenomicSpikeinData(vcfCountData); + for (KohliSample ks: samples.values()) IO.pl(ks.toStringProbes(accuGenProbes)); + GOAL: flag probes to ignore + for each probe generate a histogram and boxplot of AFs for the ctDNA and germline + IO.pl("\nAFs!"); + printAFs(); + for each probe generate a histogram and boxplot of DPs for the ctDNA and germline + IO.pl("\nDPs!"); + printDPs(); + GOAL: flag samples to ignore + remove flagged probes + histogram and boxplot each sample for AF and DPs*/ + + //load read coverage for each sample, region, and gene + loadReadCoverageData(regionCountData); + + //mean scale the germline samples to 1000, needed for the TN comp + IO.pl("Initial all germline all region normalization to 1000..."); + if (germlineSamplesToRemoveFromPoN.size()!=0) IO.pl("\tExcluding from PoN: "+germlineSamplesToRemoveFromPoN); + meanScaleGermlineReadCoverageDatasetsAllGenes(1000.0, captureRegions, true); + //File boxPlotDataGermline = new File(boxPlotDirectory, "germlineBoxPlotData.txt"); + //saveScaledGermlineReadCoverageDatasetsForBoxPlot(boxPlotDataGermline); + + //define the paired t/n datasets + //matchedTest = samples.get("23802X1"); + //matchedTest = samples.get("22712X1"); + //genesToNormWith = maximizeGenesWithInPoNZScores(); - //GOAL: flag probes to ignore - //for each probe generate a histogram and boxplot of AFs for the ctDNA and germline - //IO.pl("\nAFs!"); - //printAFs(); - //for each probe generate a histogram and boxplot of DPs for the ctDNA and germline - //IO.pl("\nDPs!"); - //printDPs(); + //runCfDNASamples(); - //GOAL: flag samples to ignore - //remove flagged probes - //histogram and boxplot each sample for AF and DPs + runAllGermlinesAsTest(); + + + + IO.pl("\nCOMPLETE!"); + + } + + private void runAllGermlinesAsTest() throws IOException { + + for (KohliSample ks: samples.values()) { + if (ks.isGermlineSample()==false) continue; + testSample = ks; + + //exclude the test from the pon + germlineSamplesToRemoveFromPoN.add(testSample.getSampleId()); + + saveDir = new File(resultsDirectory, testSample.getPatient().getHciPatientId()+"_"+testSample.getSampleId()); + saveDir.mkdirs(); + IO.pl("Testing Sample "+saveDir.getName()+"..."); + + //mean scale the pon minus the "test" sample + meanScaleGermlineReadCoverageDatasetsAllGenes(1000.0, captureRegions, true); + + //ZScore gene selection method + genesToNormWith = maximizeGenesWithInPoNZScores(); + + //Too few genes? + if (genesToNormWith.size()==0) { + IO.pl("ERROR: too few genes found to normalize to, adjust zscore requirements and restart! SKIPPING"); + continue; + } + //All on one chrom + else if (chromosomeCheck(genesToNormWith) == false) { + IO.pl("ERROR: all genes to normalize were found on one chromosome, adjust zscore requirements and restart! SKIPPING"); + continue; + } + + // Collect the Capture Regions to use in normalizing + ArrayList crsToNorm = new ArrayList(); + for (String gn: genesToNormWith) { + KohliGene kg = genes.get(gn); + crsToNorm.addAll(kg.getCaptureRegions()); + } + + //ReNormalize the PoN + IO.pl("Renormalizing the PoN with the select genes..."); + meanScaleGermlineReadCoverageDatasetsAllGenes(1000.0, crsToNorm, false); + + //Normalize the test + IO.pl("Normalizing the test with the select genes..."); + meanScaleTestReadCoverageDatasetsWithDefinedCaptureRegions(1000.0, crsToNorm); + + //Generate some plots for each gene + generateBoxPlots("("+Misc.treeSetToString(genesToNormWith, ",")+")"); + generateZScoreHistograms("("+Misc.treeSetToString(genesToNormWith, ",")+")"); + + //Stat copy changes + generateGeneStatistics(); + + //remove the test from the PoN blocker + germlineSamplesToRemoveFromPoN.remove(germlineSamplesToRemoveFromPoN.size()-1); + } - //load read coverage for each sample, region, and gene - loadReadCoverageData(regionCountData); + } + + private void runCfDNASamples() throws IOException { + for (KohliPatient kp: patients.values()) { + IO.pl("\nTesting Patient "+kp.getHciPatientId()+"..."); + + for (KohliSample ks: kp.getNonGermlineSamples()) { + testSample = ks; + + saveDir = new File(resultsDirectory, kp.getHciPatientId()+"_"+testSample.getSampleId()); + saveDir.mkdirs(); + IO.pl("Testing Sample "+saveDir.getName()+"..."); + + + //ZScore gene selection method + genesToNormWith = maximizeGenesWithInPoNZScores(); + + //Too few genes? + if (genesToNormWith.size()==0) { + IO.pl("ERROR: too few genes found to normalize to, adjust zscore requirements and restart! SKIPPING"); + continue; + } + //All on one chrom + else if (chromosomeCheck(genesToNormWith) == false) { + IO.pl("ERROR: all genes to normalize were found on one chromosome, adjust zscore requirements and restart! SKIPPING"); + continue; + } + + //IO.pl(genesToNormWith.size()+" genes for normalization: "+genesToNormWith); + // Collect the Capture Regions to use in normalizing + ArrayList crsToNorm = new ArrayList(); + for (String gn: genesToNormWith) { + KohliGene kg = genes.get(gn); + crsToNorm.addAll(kg.getCaptureRegions()); + } + + //ReNormalize the PoN + IO.pl("Renormalizing the PoN with the select genes..."); + meanScaleGermlineReadCoverageDatasetsAllGenes(1000.0, crsToNorm, false); - //for (KohliSample ks: samples.values()) IO.pl(ks.meanRegionCounts(captureRegions)); - //meanScaleAndPrintReadCoverageDatasets(1000.0); - //meanScaleReadCoverageDatasetsBoxPlot(1000.0,"germline"); + //Normalize the test + IO.pl("Normalizing the test with the select genes..."); + meanScaleTestReadCoverageDatasetsWithDefinedCaptureRegions(1000.0, crsToNorm); + + //Generate some plots for each gene + generateBoxPlots("("+Misc.treeSetToString(genesToNormWith, ",")+")"); + generateZScoreHistograms("("+Misc.treeSetToString(genesToNormWith, ",")+")"); + + //Stat copy changes + generateGeneStatistics(); + + //reset the initial PoN values in the CaptureRegions + IO.pl("Resetting the PoN to the original values..."); + for (CaptureRegion cr: captureRegions) cr.copyInitialPoNToCurrentPon(); + + } + } + + } + + private boolean chromosomeCheck(TreeSet geneNames) { + HashSet chrs = new HashSet(); + for (String geneName: geneNames) chrs.add(genes.get(geneName).getChromosome()); + if (chrs.size()>1) return true; + return false; + } + + private void generateGeneStatistics() { + IO.pl("Generating statistic spreadsheet..."); + StringBuilder sb = new StringBuilder(); + sb.append("Patient\tTest Sample\tGermline\tUsed To Norm\tGene\t# Capture Regions\t" + + "# CRs Test Outside PoN\tMean CR Z-Score\tMean CR Scaled Test\tMean CR Scaled PoN\tLog2(Test/PoN)\n"); + for (KohliGene kg: genes.values()) { + sb.append(testSample.getPatient().getHciPatientId()); sb.append("\t"); + sb.append(testSample.getSampleId()); sb.append("\t"); + sb.append(testSample.isGermlineSample()); sb.append("\t"); + sb.append(genesToNormWith.contains(kg.getGeneName())); sb.append("\t"); + sb.append(kg.getGeneName()); sb.append("\t"); + sb.append(kg.getCaptureRegions().size()); sb.append("\t"); + sb.append(kg.getNumberTestValuesOutsideCaptureRegions()); sb.append("\t"); + sb.append(kg.getMeanCaptureRegionZScore()); sb.append("\t"); + double meanTest = kg.getMeanCaptureRegionScaledTestScore(); + sb.append(meanTest); sb.append("\t"); + double meanPoN = kg.getMeanCaptureRegionPoNScore(); + sb.append(meanPoN); sb.append("\t"); + sb.append(Num.log2(meanTest/meanPoN)); sb.append("\t"); + sb.append("\n"); + } + + File spreadsheet = new File( saveDir, saveDir.getName()+".geneStats.xls" ); + IO.writeString(sb.toString(), spreadsheet); + } + + private void generateBoxPlots(String normGenes) throws IOException { + IO.pl("Generating boxplots..."); + //write out data files for R + File rDir = new File(saveDir, "R"); + rDir.mkdir(); + File boxPlotDataGermline = new File(rDir, "germlineBoxPlotData.txt"); + saveScaledGermlineReadCoverageDatasetsForBoxPlot(boxPlotDataGermline); + File boxPlotDataTest = new File(rDir, "testBoxPlotData.txt"); + saveScaledTestReadCoverageDatasetsForBoxPlot(boxPlotDataTest); + + //boxPlot dir + File boxPlots = new File(saveDir, "BoxPlots"); + boxPlots.mkdir(); + + ArrayList rScriptAL = new ArrayList(); + rScriptAL.add("library(ggplot2)"); + rScriptAL.add("setwd('"+boxPlots.getCanonicalPath()+"')"); + rScriptAL.add("df = read.table('"+boxPlotDataGermline.getCanonicalPath()+"', header = TRUE)"); + rScriptAL.add("testDf = read.table('"+boxPlotDataTest.getCanonicalPath()+"', header=TRUE)"); + rScriptAL.add("normGenes = '"+normGenes+"'"); + rScriptAL.add("testDataset = '"+testSample.getSampleId()+"'"); + rScriptAL.add("genes = c("+testedGenes+")"); + rScriptAL.add("for (x in genes) {"); + rScriptAL.add(" print(x)"); + rScriptAL.add(" toSub=paste(x,'_',sep='')"); + rScriptAL.add(" dfSub = subset(df, grepl(toSub, Gene))"); + rScriptAL.add(" dfSubTest = subset(testDf, grepl(toSub, Gene))"); + rScriptAL.add(" boxPlotObj = ggplot(dfSub, aes(x=Gene, y=ScaledCount)) + "); + //rScriptAL.add(" geom_boxplot()+ coord_cartesian(ylim = c(250, 1750)) + "); + rScriptAL.add(" geom_boxplot()+ "); + rScriptAL.add(" ggtitle( paste(testDataset, x, normGenes)) + "); + rScriptAL.add(" ylab('1K Mean Scaled Counts') + "); + rScriptAL.add(" theme( "); + rScriptAL.add(" plot.title = element_text(size=12,face='bold'), "); + rScriptAL.add(" axis.title.x = element_blank(), "); + rScriptAL.add(" axis.title.y = element_text(face='bold', size=12), "); + rScriptAL.add(" axis.text.x = element_text(angle = 90, vjust = 1, hjust=1)) + "); + rScriptAL.add(" geom_point(data=dfSubTest, aes(x=Gene, y=ScaledCount), color = 'red', shape=8) "); + rScriptAL.add(" ggsave(file=paste(x,'.box.png', sep = ''), plot=boxPlotObj, width=10, height=8) "); + rScriptAL.add("}"); + rScriptAL.add("print('COMPLETE')"); + String rScript = Misc.stringArrayListToString(rScriptAL, "\n"); + File rScriptFile = new File(rDir, "boxplotScript.R"); + File rLogFile = new File(rDir, "boxplot.Rout"); + IO.writeString(rScript, rScriptFile ); + + //make command + String[] command = new String[] { + fullPathToR.getCanonicalPath(), + "CMD", + "BATCH", + "--no-save", + "--no-restore", + rScriptFile.getCanonicalPath(), + rLogFile.getCanonicalPath()}; + //execute + IO.executeCommandLine(command); + + //read in results + String[] results = IO.loadFile(rLogFile); + boolean complete = false; + for (String s: results) if (s.contains("COMPLETE"))complete = true; + if (complete == false) Misc.printErrAndExit("\nERROR, failed to find 'COMPLETE' in the boxplot R output, see "+rLogFile); + } + + private TreeSet maximizeGenesWithCenteredCoverage(KohliSample testSample, int[][] upDownSameGermline) { + IO.pl("Scanning scalars to identify genes in the test that resemble the matched normal..."); + UpDownResult[] res = new UpDownResult[10000 - 100 +1]; + int counter = 0; + for (int i=100; i<=10000; i++) { + int[][] upDownSameTest = countGenesWithCenteredCoverage(i, testSample); + ArrayList[] numMatchsOneOffsTwoOffs = compare(upDownSameGermline, upDownSameTest, false); + res[counter++] = new UpDownResult(numMatchsOneOffsTwoOffs, i); + } + //sort by matches, oneOffs, twoOffs, threeOffs + Arrays.sort(res); + IO.pl("\tScalar\tMatches\tOneOffs\tTwoOffs\tThreeOffs"); + IO.pl(res[0]); + TreeSet toNorm = new TreeSet(); + for (String geneName: res[0].matchesOneTwoThreeOffs[0]) toNorm.add(geneName); + for (String geneName: res[0].matchesOneTwoThreeOffs[1]) toNorm.add(geneName); + if (toNorm.size() ==0) { + IO.pl("WARNING: no genes found to normalize to, using all!"); + toNorm.addAll(genes.keySet()); + } + else { + for (int i=1; i< res.length; i++) { + if (res[0].compareTo(res[i]) != 0) break; + IO.pl(res[i]); + //add in matches and one offs? how about two offs? + for (String geneName: res[i].matchesOneTwoThreeOffs[0]) toNorm.add(geneName); + for (String geneName: res[i].matchesOneTwoThreeOffs[1]) toNorm.add(geneName); + } + } + return toNorm; + } + + private TreeSet maximizeGenesWithInPoNZScores() { + IO.pl("Scanning scalars to identify genes in the test that resemble the matched normal..."); + NormalizationResult[] res = new NormalizationResult[10000 - 100 +1]; + int counter = 0; + for (int i=100; i<=10000; i++) { + //scale all to i + meanScaleTestReadCoverageDatasetsWithDefinedCaptureRegions(i , captureRegions); + res[counter++] = countNumberGenesWithTestInPon(i); + } + //sort by highest num genes and then sum fraction copy regions + Arrays.sort(res); + IO.pl("\tScalar\t#Passing\tGeneNames"); + TreeSet toNorm = new TreeSet(); + if (res[0].numPassingGenes >= minimumNumberPassingGenes ) { + IO.pl(res[0]); + for (KohliGene kg: res[0].genesPassingThresholds) { + toNorm.add(kg.getGeneName()); + } + } + return toNorm; + } + + private NormalizationResult countNumberGenesWithTestInPon(int normTarget) { + //for each gene, count the number of capture regions where the test is within the PoN zscores + ArrayList testInPoN = new ArrayList(); + ArrayList fracCRs = new ArrayList(); + for (KohliGene kg : genes.values()) { + double inPoN = 0; + for (CaptureRegion cr: kg.getCaptureRegions()) { + if (Math.abs(cr.calculateZScoreFromPoN()) <= maximumZScore) inPoN++; + } + double frac = inPoN/ (double)kg.getCaptureRegions().size(); + if (frac >= minimumFractionPassingGeneCaptureRegions) { + testInPoN.add(kg); + fracCRs.add(frac); + } + } + return new NormalizationResult(normTarget, testInPoN, fracCRs); + } + + private class UpDownResult implements Comparable{ - //mean scale the germline samples - //meanScaleGermlineReadCoverageDatasetsAllGenes(1000.0); + int normTarget; + ArrayList[] matchesOneTwoThreeOffs = null; + int[] numMatchesOneTwoThreeOffs = null; - //print out the genes - //IO.pl("Genes, their capture regions, the mean of all of the germline samples and the individual values"); - //for (KohliGene kg: genes.values()) { - // IO.pl(kg.toString()); - //} + public UpDownResult(ArrayList[] matchesOneTwoThreeOffs, int normTarget) { + this.normTarget = normTarget; + this.matchesOneTwoThreeOffs = matchesOneTwoThreeOffs; + numMatchesOneTwoThreeOffs = new int[] {matchesOneTwoThreeOffs[0].size(), matchesOneTwoThreeOffs[1].size(), matchesOneTwoThreeOffs[2].size(), matchesOneTwoThreeOffs[3].size()}; + } + public String toString() { + StringBuilder sb = new StringBuilder("\t"); + sb.append(normTarget); + sb.append("\t"); + sb.append(matchesOneTwoThreeOffs[0]); + for (int i=1; i< matchesOneTwoThreeOffs.length; i++) { + sb.append("\t"); + sb.append(matchesOneTwoThreeOffs[i]); + } + return sb.toString(); + } + + public int compareTo(UpDownResult o) { + //matches + if (this.numMatchesOneTwoThreeOffs[0] > o.numMatchesOneTwoThreeOffs[0]) return -1; + else if (this.numMatchesOneTwoThreeOffs[0] < o.numMatchesOneTwoThreeOffs[0]) return 1; + + //matches same, thus one offs + if (this.numMatchesOneTwoThreeOffs[1] > o.numMatchesOneTwoThreeOffs[1]) return -1; + else if (this.numMatchesOneTwoThreeOffs[1] < o.numMatchesOneTwoThreeOffs[1]) return 1; + + //one offs same, thus two offs + if (this.numMatchesOneTwoThreeOffs[2] > o.numMatchesOneTwoThreeOffs[2]) return -1; + else if (this.numMatchesOneTwoThreeOffs[2] < o.numMatchesOneTwoThreeOffs[2]) return 1; + + //two offs same, thus three offs + if (this.numMatchesOneTwoThreeOffs[3] > o.numMatchesOneTwoThreeOffs[3]) return -1; + else if (this.numMatchesOneTwoThreeOffs[3] < o.numMatchesOneTwoThreeOffs[3]) return 1; + + return 0; + } + } + + private class NormalizationResult implements Comparable{ - //scan the pon to define the accepted range for each normalizer gene - generatePonBackground(); + int normTarget; + ArrayList genesPassingThresholds = null; + ArrayList fracCRs = new ArrayList(); + int numPassingGenes; + double sumFracCRs; - //test a sample against the PoN - test("22712X1"); - //test("23802X1"); - //for (String sampleId : samples.keySet()) { - //test(sampleId); - //} + public NormalizationResult(int normTarget, ArrayList genesPassingThresholds, ArrayList fracCRs) { + this.genesPassingThresholds = genesPassingThresholds; + this.normTarget = normTarget; + numPassingGenes = genesPassingThresholds.size(); + this.fracCRs = fracCRs; + sumFracCRs = Num.sumArray(Num.arrayListOfDoubleToArray(fracCRs)); + } + public String toString() { + StringBuilder sb = new StringBuilder("\t"); + sb.append(normTarget); + sb.append("\t"); + sb.append(numPassingGenes); + if (numPassingGenes > 0) { + sb.append("\t"); + sb.append(genesPassingThresholds.get(0).getGeneName()); + sb.append("("); + sb.append(Num.formatNumber(fracCRs.get(0), 2)); + sb.append(")"); + for (int i=1; i< numPassingGenes; i++) { + sb.append(","); + sb.append(genesPassingThresholds.get(i).getGeneName()); + sb.append("("); + sb.append(Num.formatNumber(fracCRs.get(i), 2)); + sb.append(")"); + } + } + + return sb.toString(); + } + + public int compareTo(NormalizationResult o) { + if (this.numPassingGenes > o.numPassingGenes) return -1; + else if (this.numPassingGenes < o.numPassingGenes) return 1; + else { + if (this.sumFracCRs > o.sumFracCRs) return -1; + else if (this.sumFracCRs < o.sumFracCRs) return 1; + return 0; + } + } } - private void test(String sampleIdToTest) { + private ArrayList[] compare(int[][] upDownSameGermline, int[][] upDownSameTest, boolean printStats) { + ArrayList matches = new ArrayList(); + ArrayList oneOffs = new ArrayList(); + ArrayList twoOffs = new ArrayList(); + ArrayList threeOffs = new ArrayList(); + + int i=-1; + for (String geneName: genes.keySet()) { + i++; + if (printStats) IO.p(geneName+"\t"+upDownSameGermline[i][0]+":"+upDownSameGermline[i][1]+":"+upDownSameGermline[i][2]+"\t"+upDownSameTest[i][0]+":"+upDownSameTest[i][1]+":"+upDownSameTest[i][2]); + + //No matches for zero values in germline + if (upDownSameGermline[i][0]==0 || upDownSameGermline[i][1]==0) { + if (printStats) IO.pl("\tZeroInGermlineGeneSkipping"); + } + else { + int diffInUp = Math.abs(upDownSameGermline[i][0] - upDownSameTest[i][0]); + if (diffInUp == 0) { + if (printStats) IO.pl("\tMatch"); + matches.add(geneName); + } + else if (diffInUp == 1) { + if (printStats) IO.pl("\tOneOff\t"+(upDownSameGermline[i][0] - upDownSameTest[i][0])); + oneOffs.add(geneName); + } + else if (diffInUp == 2) { + if (printStats) IO.pl("\tTwoOff\t"+(upDownSameGermline[i][0] - upDownSameTest[i][0])); + twoOffs.add(geneName); + } + else if (diffInUp == 3) { + if (printStats) IO.pl("\tThreeOff\t"+(upDownSameGermline[i][0] - upDownSameTest[i][0])); + threeOffs.add(geneName); + } + else if (printStats) IO.pl("\tNoMatch"); + } + } + if (printStats)IO.pl(matches+ "\t"+ oneOffs+"\t"+twoOffs+"\t"+threeOffs); + return new ArrayList[] {matches, oneOffs, twoOffs, threeOffs}; + } + + private int[][] countGenesWithCenteredCoverage(double target, KohliSample ks) { + //scale the counts using all of the capture regions + ks.createScaledRegionCountMap(captureRegions, captureRegions, target); + HashMap sampleScaledCaptureRegionMeans = ks.getScaledReadCoverageCounts(); + + //for each gene count the above and below means from the PoN + int[][] upDown = new int[genes.size()][]; + int counter = 0; + for (KohliGene kg : genes.values()) { + int numAboveMean = 0; + int numBelowMean = 0; + int numSame = 0; //never happens! + for (CaptureRegion cr: kg.getCaptureRegions()) { + String crId = cr.getOriginalInput(); + double testScaledCount = sampleScaledCaptureRegionMeans.get(crId); + double ponScaledCount = cr.getPonScaledMean(); + if (testScaledCount > ponScaledCount) numAboveMean++; + else if (testScaledCount < ponScaledCount) numBelowMean++; + else numSame++; + } + upDown[counter++] = new int[]{numAboveMean, numBelowMean, numSame}; + } + return upDown; + } + + private void testOneGeneMethod(String sampleIdToTest) { -//IO.pl("Testing, "+sampleIdToTest+"..."); CopyAnalysisTest cat = new CopyAnalysisTest(sampleIdToTest); copyAnalysisTests.put(sampleIdToTest, cat); @@ -109,7 +564,7 @@ private void test(String sampleIdToTest) { //scale the test meanScaleTestReadCoverageDatasetsOneGene(1000, kg.getGeneName(), sampleIdToTest); -File boxPlotDataTest = new File(boxPlotDirectory, "testing_"+sampleIdToTest+"_"+kg.getGeneName()+"_BoxPlotData.txt"); +File boxPlotDataTest = new File(resultsDirectory, "testing_"+sampleIdToTest+"_"+kg.getGeneName()+"_BoxPlotData.txt"); saveScaledTestReadCoverageDatasetsForBoxPlot(boxPlotDataTest); //for each gene compare @@ -152,7 +607,7 @@ private void generatePonBackground() { //scale the PoN, not necessary, just useful for boxplots, not used in zscore calcs meanScaleGermlineReadCoverageDatasetsOneGene(1000, normalizerGeneName); - File boxPlotDataGermline = new File(boxPlotDirectory, normalizerGeneName+"_All_Germline_BoxPlotData.txt"); + File boxPlotDataGermline = new File(resultsDirectory, normalizerGeneName+"_All_Germline_BoxPlotData.txt"); saveScaledGermlineReadCoverageDatasetsForBoxPlot(boxPlotDataGermline); //for each germline sample that isn't flagged, use it as a mock test sample @@ -192,6 +647,90 @@ private void generatePonBackground() { if (germlineSamplesToRemoveFromPoN.size()!=0) IO.pl("\tExcluding user flagged "+Misc.stringArrayListToString(germlineSamplesToRemoveFromPoN, ", ")); } + private void generateZScoreHistograms(String normGenes) throws IOException { + IO.pl("Generating histograms..."); + + //write out data files for R + File rDir = new File(saveDir, "R"); + rDir.mkdir(); + File histogramData = new File(rDir, "histogramData.txt"); + saveTestZScoresForHistograms(histogramData); + + //Histo dir + File histogramPlots = new File(saveDir, "HistogramPlots"); + histogramPlots.mkdir(); + + ArrayList rScriptAL = new ArrayList(); + rScriptAL.add("library(ggplot2)"); + rScriptAL.add("setwd('"+histogramPlots.getCanonicalPath()+"')"); + rScriptAL.add("df = read.table('"+histogramData.getCanonicalPath()+"', header = TRUE)"); + rScriptAL.add("normGenes = '"+normGenes+"'"); + rScriptAL.add("testDataset = '"+testSample.getSampleId()+"'"); + rScriptAL.add("genes = c("+testedGenes+")"); + rScriptAL.add("for (x in genes) {"); + rScriptAL.add(" print(x)"); + rScriptAL.add(" toSub=paste(x,'_',sep='')"); + rScriptAL.add(" dfSub = subset(df, grepl(toSub, Gene))"); + rScriptAL.add(" obj = ggplot(dfSub, aes(x=ZScore)) + "); + rScriptAL.add(" geom_histogram(color='black',fill='gray',bins=20) + "); + rScriptAL.add(" geom_vline(aes(xintercept=mean(ZScore)),color='red', linetype='dashed', size=1)+ "); + rScriptAL.add(" ggtitle( paste(testDataset, x, normGenes)) + "); + rScriptAL.add(" scale_x_continuous(name = 'Capture Region Z-Scores') + "); + rScriptAL.add(" scale_y_continuous(name = 'Count')"); + rScriptAL.add(" ggsave(file=paste(x,'.hist.png', sep = ''), plot=obj, width=10, height=8) "); + rScriptAL.add("}"); + rScriptAL.add("print('COMPLETE')"); + String rScript = Misc.stringArrayListToString(rScriptAL, "\n"); + File rScriptFile = new File(rDir, "histogramScript.R"); + File rLogFile = new File(rDir, "histogram.Rout"); + IO.writeString(rScript, rScriptFile ); + + //make command + String[] command = new String[] { + fullPathToR.getCanonicalPath(), + "CMD", + "BATCH", + "--no-save", + "--no-restore", + rScriptFile.getCanonicalPath(), + rLogFile.getCanonicalPath()}; + //execute + IO.executeCommandLine(command); + + //read in results + String[] results = IO.loadFile(rLogFile); + boolean complete = false; + for (String s: results) if (s.contains("COMPLETE"))complete = true; + if (complete == false) Misc.printErrAndExit("\nERROR, failed to find 'COMPLETE' in the boxplot R output, see "+rLogFile); + + } + + private void saveTestZScoresForHistograms(File toSave) { + //print out all of the scaled counts for boxplots + StringBuilder sb = new StringBuilder(); + //header row + sb.append("Gene\tZScore\n"); + String gene = ""; + int counter = 0; + for (int i=0; i< captureRegions.size(); i++) { + CaptureRegion cr = captureRegions.get(i); + if (cr.getGene().equals(gene) == false) { + gene = cr.getGene(); + counter = 0; + } + counter++; + String counterString = Integer.toString(counter); + if (counterString.length()==1) counterString = "00"+counterString; + else if (counterString.length()==2) counterString = "0"+counterString; + String name = gene+"_"+counterString; + sb.append(name); + sb.append("\t"); + sb.append(Double.toString(cr.calculateZScoreFromPoN())); + sb.append("\n"); + } + IO.writeString(sb.toString(), toSave); + } + private void saveScaledTestReadCoverageDatasetsForBoxPlot(File toSave) { //print out all of the scaled counts for boxplots StringBuilder sb = new StringBuilder(); @@ -207,7 +746,8 @@ private void saveScaledTestReadCoverageDatasetsForBoxPlot(File toSave) { } counter++; String counterString = Integer.toString(counter); - if (counterString.length()==1) counterString = "0"+counterString; + if (counterString.length()==1) counterString = "00"+counterString; + else if (counterString.length()==2) counterString = "0"+counterString; String name = gene+"_"+counterString; sb.append(name); sb.append("\t"); @@ -217,30 +757,25 @@ private void saveScaledTestReadCoverageDatasetsForBoxPlot(File toSave) { IO.writeString(sb.toString(), toSave); } - private void meanScaleGermlineReadCoverageDatasetsAllGenes(double target) { - - //scale the counts using all of the capture regions -//IO.pl("Scaling germline sample counts...."); - for (KohliSample ks: samples.values()) { - if (ks.isGermlineSample()) ks.createScaledRegionCountMap(captureRegions, captureRegions, target); + private void meanScaleGermlineReadCoverageDatasetsAllGenes(double target, ArrayList crsToScaleWith, boolean setInitial) { + //scale the counts using the select capture regions for ALL germline samples + for (KohliSample ks: samples.values()) if (ks.isGermlineSample()) { + ks.createScaledRegionCountMap(crsToScaleWith, captureRegions, target); } - - //for each capture region -//IO.pl("Pulling counts over each capture region for each germline sample...."); + + //for all capture region, collect the scaled germline counts and save them to the cr for those good for PoN for (CaptureRegion cr: captureRegions) { String crId = cr.getOriginalInput(); ArrayList germlineScaledCounts = new ArrayList(); //for each germline pon sample for (KohliSample ks: samples.values()) { - if (ks.isGermlineSample()) { + if (ks.isGermlineSample() && ks.isExcludeFromPon()==false) { HashMap sampleScaledCaptureRegionMeans = ks.getScaledReadCoverageCounts(); germlineScaledCounts.add(sampleScaledCaptureRegionMeans.get(crId)); } } //convert to double[] and set in cr -//IO.pl("Setting counts in "+cr.getOriginalInput()); - cr.setPonScaledCountsCalculateStats(Num.arrayListOfDoubleToArray(germlineScaledCounts)); -//IO.pl("\t"+cr.getPonScaledMean()); + cr.setPonScaledCountsCalculateStats(Num.arrayListOfDoubleToArray(germlineScaledCounts), setInitial); } } @@ -266,7 +801,7 @@ private void meanScaleGermlineReadCoverageDatasetsOneGene(double target, String } } //convert to double[] and set in cr - cr.setPonScaledCountsCalculateStats(Num.arrayListOfDoubleToArray(germlineScaledCounts)); + cr.setPonScaledCountsCalculateStats(Num.arrayListOfDoubleToArray(germlineScaledCounts), false); } } @@ -287,6 +822,16 @@ private void meanScaleTestReadCoverageDatasetsOneGene(double target, String gene } } + private void meanScaleTestReadCoverageDatasetsWithDefinedCaptureRegions(double target, ArrayList forScaling) { + testSample.createScaledRegionCountMap(forScaling, captureRegions, target); + //load the capture regions with the scaled counts + for (CaptureRegion cr: captureRegions) { + String crId = cr.getOriginalInput(); + HashMap sampleScaledCaptureRegionMeans = testSample.getScaledReadCoverageCounts(); + cr.setScaledTestCount(sampleScaledCaptureRegionMeans.get(crId)); + } + } + private void saveScaledGermlineReadCoverageDatasetsForBoxPlot(File toSave) { //header row StringBuilder sb = new StringBuilder("Gene\tScaledCount\n"); @@ -301,7 +846,8 @@ private void saveScaledGermlineReadCoverageDatasetsForBoxPlot(File toSave) { } counter++; String counterString = Integer.toString(counter); - if (counterString.length()==1) counterString = "0"+counterString; + if (counterString.length()==1) counterString = "00"+counterString; + else if (counterString.length()==2) counterString = "0"+counterString; String name = gene+"_"+counterString; //for each sample for (KohliSample ks: samples.values()) { @@ -349,7 +895,6 @@ private void meanScaleAndPrintReadCoverageDatasets(double target) { } IO.pl(sb); } - private void meanScaleReadCoverageDatasetsBoxPlot(double target, boolean germline) { //header row StringBuilder sb = new StringBuilder("Gene\tScaledCount\n"); @@ -364,7 +909,8 @@ private void meanScaleReadCoverageDatasetsBoxPlot(double target, boolean germlin } counter++; String counterString = Integer.toString(counter); - if (counterString.length()==1) counterString = "0"+counterString; + if (counterString.length()==1) counterString = "00"+counterString; + else if (counterString.length()==2) counterString = "0"+counterString; String name = gene+"_"+counterString; IO.pl(cr.getOriginalInput()+"\t"+name); //for each sample @@ -526,7 +1072,7 @@ private void loadPatients(File anno) { else { boolean isGermline = false; if (tokens[2].toLowerCase().contains("germline")) isGermline = true; - kp.getSamples().add(new KohliSample(tokens[1], isGermline, tokens[3])); + kp.getSamples().add(new KohliSample(kp, tokens[1], isGermline, tokens[3])); } } //load the sample treemap @@ -545,6 +1091,4 @@ private void loadPatients(File anno) { } } - - } diff --git a/Source/edu/utah/kohli/CaptureRegion.java b/Source/edu/utah/kohli/CaptureRegion.java index 2508fa5..b01fb20 100644 --- a/Source/edu/utah/kohli/CaptureRegion.java +++ b/Source/edu/utah/kohli/CaptureRegion.java @@ -1,6 +1,6 @@ package edu.utah.kohli; -import util.gen.IO; +import java.util.Arrays; import util.gen.Misc; import util.gen.Num; @@ -18,6 +18,11 @@ public class CaptureRegion { private double ponScaledMean = -1; private double ponStdDev = -1; + //Panel of Normals based on initial 1000 scaling + private double[] initialPonScaledCounts = null; + private double initialPonScaledMean = -1; + private double initialPonStdDev = -1; + //Test dataset private double scaledTestCount = -1; @@ -31,10 +36,21 @@ public CaptureRegion (String rep) { gene = f[3]; } + public boolean isTestOutsidePoN() { + double[] minMax = Num.findMinMaxDoubleValues(ponScaledCounts); + if (scaledTestCount < minMax[0]) return true; + if (scaledTestCount > minMax[1]) return true; + return false; + } + public double calculateZScoreFromPoN(double testValue) { return (testValue-ponScaledMean)/ponStdDev; } + public double calculateZScoreFromPoN() { + return (scaledTestCount-ponScaledMean)/ponStdDev; + } + public String toString() { StringBuilder sb = new StringBuilder(originalInput); sb.append("\t"); @@ -46,12 +62,23 @@ public String toString() { } /**for each of the pons, save the scaled counts and calculate the mean*/ - public void setPonScaledCountsCalculateStats (double[] ponScaledCounts) { + public void setPonScaledCountsCalculateStats (double[] ponScaledCounts, boolean setInitial) { this.ponScaledCounts = ponScaledCounts; ponScaledMean = Num.mean(ponScaledCounts); ponStdDev = Num.standardDeviation(ponScaledCounts, ponScaledMean); + if (setInitial) { + initialPonScaledCounts = Arrays.copyOf(ponScaledCounts, ponScaledCounts.length); + initialPonScaledMean = ponScaledMean; + initialPonStdDev = ponStdDev; + } } + /**Replace the working PoN stats with the initially assigned values.*/ + public void copyInitialPoNToCurrentPon() { + ponScaledCounts = Arrays.copyOf(initialPonScaledCounts, initialPonScaledCounts.length); + ponScaledMean = initialPonScaledMean; + ponStdDev = initialPonStdDev; + } public String getOriginalInput() { return originalInput; diff --git a/Source/edu/utah/kohli/KohliGene.java b/Source/edu/utah/kohli/KohliGene.java index aa5fbbe..9ba2bc9 100644 --- a/Source/edu/utah/kohli/KohliGene.java +++ b/Source/edu/utah/kohli/KohliGene.java @@ -94,4 +94,38 @@ public double[][] getScaledGermlineCounts() { } } + public int getNumberTestValuesOutsideCaptureRegions() { + int numOutside = 0; + for (CaptureRegion cr: captureRegions) { + if (cr.isTestOutsidePoN()) numOutside++; + } + return numOutside; + } + + public double getMeanCaptureRegionZScore() { + double meanZScore = 0.0; + for (CaptureRegion cr: captureRegions) { + meanZScore+= cr.calculateZScoreFromPoN(); + } + return meanZScore/(double)captureRegions.size(); + } + + public double getMeanCaptureRegionScaledTestScore() { + double total = 0.0; + for (CaptureRegion cr: captureRegions) { + total+= cr.getScaledTestCount(); + } + return total/(double)captureRegions.size(); + } + public double getMeanCaptureRegionPoNScore() { + double total = 0.0; + for (CaptureRegion cr: captureRegions) { + total+= cr.getPonScaledMean(); + } + return total/(double)captureRegions.size(); + } + public String getChromosome() { + return captureRegions.get(0).getChr(); + } + } diff --git a/Source/edu/utah/kohli/KohliPatient.java b/Source/edu/utah/kohli/KohliPatient.java index 58941f6..2668bfe 100644 --- a/Source/edu/utah/kohli/KohliPatient.java +++ b/Source/edu/utah/kohli/KohliPatient.java @@ -12,7 +12,7 @@ public KohliPatient (String[] tokens) { this.hciPatientId = tokens[0]; boolean isGermline = false; if (tokens[0].toLowerCase().contains("germline")) isGermline = true; - samples.add(new KohliSample(tokens[1], isGermline, tokens[3])); + samples.add(new KohliSample(this, tokens[1], isGermline, tokens[3])); } public String toString() { @@ -33,4 +33,15 @@ public String getHciPatientId() { public ArrayList getSamples() { return samples; } + + public KohliSample getGermlineSample() { + for (KohliSample ks: samples) if (ks.isGermlineSample()) return ks; + return null; + } + + public ArrayList getNonGermlineSamples() { + ArrayList ng = new ArrayList(); + for (KohliSample ks: samples) if (ks.isGermlineSample()==false) ng.add(ks); + return ng; + } } diff --git a/Source/edu/utah/kohli/KohliSample.java b/Source/edu/utah/kohli/KohliSample.java index eaa612a..cf6f7aa 100644 --- a/Source/edu/utah/kohli/KohliSample.java +++ b/Source/edu/utah/kohli/KohliSample.java @@ -2,12 +2,11 @@ import java.util.ArrayList; import java.util.HashMap; - -import util.gen.IO; import util.gen.Misc; public class KohliSample { + private KohliPatient patient = null; private String sampleId = null; private String type = null; private String dateString = null; @@ -22,7 +21,8 @@ public class KohliSample { private double meanRegionCounts = -1.0; private double regionCountScalar = -1.0; - public KohliSample (String sampleId, boolean isGermline, String dateString) { + public KohliSample (KohliPatient patient, String sampleId, boolean isGermline, String dateString) { + this.patient = patient; this.sampleId = sampleId; if (isGermline) germlineSample = true; else germlineSample = false; @@ -94,7 +94,6 @@ public double[] getScaledRegionCountArray(ArrayList regions, doub public void createScaledRegionCountMap(ArrayList regionsForScalarCalc, ArrayList regionsToScale, double targetMean) { regionCountScalar = targetMean / getMeanRegionCounts(regionsForScalarCalc); - //IO.pl("Scaling using just one gene for "+this.sampleId+ " numRegionsInGene "+regionsForScalarCalc.size()+" scalar "+regionCountScalar); scaledReadCoverageCounts = new HashMap(); //probe info for (int i=0; i< regionsToScale.size(); i++) { @@ -149,5 +148,9 @@ public void setExcludeFromPon(boolean excludeFromPon) { public boolean isGermlineSample() { return germlineSample; } + + public KohliPatient getPatient() { + return patient; + } } diff --git a/build.properties b/build.properties index c4865f0..cfbb43b 100644 --- a/build.properties +++ b/build.properties @@ -50,7 +50,7 @@ edu/utah/seq/data/CalculatePerCycleErrorRate,\ edu/utah/seq/run/caris/CarisDataWrangler,\ edu/utah/seq/vcf/xml/caris/CarisVcfComparator,\ edu/utah/seq/vcf/xml/caris/CarisXmlVcfParser,\ -edu/utah/billing/CBiBilling,\ +edu/utah/billing/CBiBilling2,\ edu/utah/seq/data/cbio/CBio2Sgr,\ edu/utah/seq/analysis/ChIPSeq,\ util/apps/CompareIntersectingRegions,\