Made mods to filter and flag variant calls from Strelka GATK joint

genotyping with "MULTIALLELIC". Added LoH analysis to TNRunner.
HuntsmanCancerInstitute · Nov 22, 2022 · a6beb17 · a6beb17
1 parent b1ce18d
commit a6beb17
Show file tree

Hide file tree

Showing 14 changed files with 427 additions and 97 deletions.
diff --git a/.classpath b/.classpath
@@ -72,6 +72,6 @@
 	<classpathentry kind="lib" path="LibraryJars/picard_2.22.4.jar"/>
 	<classpathentry kind="lib" path="LibraryJars/GQueryCLI.jar"/>
 	<classpathentry kind="lib" path="LibraryJars/commons-text-1.9.jar"/>
-	<classpathentry kind="lib" path="LibraryJars/SubjectMatchMaker_0.1.jar"/>
+	<classpathentry kind="lib" path="LibraryJars/SubjectMatchMaker_0.3.jar"/>
 	<classpathentry kind="output" path="Classes"/>
 </classpath>
diff --git a/Source/edu/utah/seq/cnv/cfdna/LiquidBiopsyCA.java b/Source/edu/utah/seq/cnv/cfdna/LiquidBiopsyCA.java
@@ -9,6 +9,8 @@
 import java.util.regex.*;
 import htsjdk.samtools.*;
 import htsjdk.samtools.cram.ref.ReferenceSource;
+import trans.cel.QuantileNormalization;
+import trans.main.WilcoxonSignedRankTest;
 import util.bio.annotation.Bed;
 import util.gen.*;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; //needed by cram
@@ -21,13 +23,14 @@ public class LiquidBiopsyCA {
 	//fields
 	private File bedFile = null;
 	private LiquidBiopsySample[] samples = null;
-	private File results;
+	private File resultsDirectory;
 	private File fastaFile;
 	private int minMappingQuality = 13;
 	private SamReaderFactory samFactory;
 	private LinkedHashMap<String, ArrayList<Bed>> groupNameBed = null;
 	private int numCpu = 0;
 	private ArrayList<LookupJob[]> jobs = new ArrayList<LookupJob[]>();
+	private int[][] targetSampleCounts = null;
 	private int totalNumberTargets = 0;
 
 	//constructor for stand alone use
@@ -60,7 +63,7 @@ public void doWork() throws Exception {
 		makeLookupJobs();
 
 		//make and execute loaders
-		IO.pl("Launching "+numCpu+" loaders\n\t\tJobName\t#IntFragments\t#NonIntFragments");
+		IO.pl("\nLaunching "+numCpu+" loaders...\n\tJobName\t#IntFragments\t#NonIntFragments");
 		ExecutorService executor = Executors.newFixedThreadPool(numCpu);
 		LookupJobRunner[] loaders = new LookupJobRunner[numCpu];
 		for (int i=0; i< numCpu; i++) {
@@ -78,11 +81,100 @@ public void doWork() throws Exception {
 			if (l.isFailed()) throw new IOException("\nERROR: Lookup Loader issue, aborting!");	
 		}
 
-		printCountTable();
+//saveCountTable();
+		saveCountsForR();
+
+		//normalizeCounts();
+
+		//wilcoxonTestPairedData();
+
+	}
+
+
+	private void wilcoxonTestPairedData() {
+		IO.pl("\nTesting each gene for copy differences...");
+		LookupJob[][] germlines = new LookupJob[samples.length][];
+		int normIndex = 0;
+		for (LiquidBiopsySample s: samples) {
+			IO.pl(s.getSampleName());
+			LookupJob[] cf = s.getCfJobs();
+			LookupJob[] norm = s.getGermlineJobs();
+			germlines[normIndex++] = norm;
+			for (int i=0; i<cf.length; i++) {
+				String geneName = cf[i].getRegions().get(0).getName();
+				IO.pl("\t"+geneName);
+				IO.pl("\t\tCF  : "+Num.floatArrayToString(cf[i].getNormalizedCounts()," "));
+				IO.pl("\t\tNorm: "+Num.floatArrayToString(norm[i].getNormalizedCounts()," "));
+				WilcoxonSignedRankTest w = new WilcoxonSignedRankTest(cf[i].getNormalizedCounts(), norm[i].getNormalizedCounts());
+				IO.pl("\t\tPVal: "+w.getTransformedPValue());
+
+			}
+		}
+		IO.pl("\nTesting each gene for mock copy differences between germline samples...");
+		for (int k=0; k< germlines.length; k++) {
+			LookupJob[] normA = germlines[k];
+			for (int j=k+1; j< germlines.length; j++) {
+				LookupJob[] normB = germlines[j];
+				for (int i=0; i<normA.length; i++) {
+					String geneName = normA[i].getRegions().get(0).getName();
+					IO.p("\t"+geneName);
+					//IO.pl("\t\tCF  : "+Num.floatArrayToString(cf[i].getGermlineizedCounts()," "));
+					//IO.pl("\t\tNorm: "+Num.floatArrayToString(norm[i].getGermlineizedCounts()," "));
+					WilcoxonSignedRankTest w = new WilcoxonSignedRankTest(normA[i].getNormalizedCounts(), normB[i].getNormalizedCounts());
+					IO.pl("\tPVal: "+w.getTransformedPValue());
+
+				}
+			}
+		}
+
+	}
 
+	private void normalizeCounts() {
+		IO.pl("\nQuantile germlineizing target counts...");
+		// TODO Auto-generated method stub
+		float[][] intensities = new float[samples.length*2][];
+		int index = 0;
+		for (LiquidBiopsySample s: samples) {
+			intensities[index++] = s.getCfCounts(totalNumberTargets);
+			intensities[index++] = s.getGermlineCounts(totalNumberTargets);
+		}
+		QuantileNormalization qn = new QuantileNormalization (intensities);
+		intensities = qn.getIntensities();
+		addGermlineizedCountsToLookupJobs(intensities);
+		//printIntensityStats(intensities);
+
 	}
 
-	private void printCountTable() {
+	private void addGermlineizedCountsToLookupJobs(float[][] intensities) {
+		int index = 0;
+		//add back to the LookupJob
+		for (LiquidBiopsySample s: samples) {
+			LookupJob[] cf = s.getCfJobs();
+			LookupJob[] norm = s.getGermlineJobs();
+			float[] normCf = intensities[index++];
+			float[] normNorm = intensities[index++];
+			int counter = 0;
+			for (int i=0; i< cf.length; i++) {
+				int numTargetsInJob = cf[i].getRegions().size();
+				int stop = counter+numTargetsInJob;
+				float[] nCf = Arrays.copyOfRange(normCf, counter, stop);
+				float[] nNorm = Arrays.copyOfRange(normNorm, counter, stop);
+				cf[i].setNormalizedCounts(nCf);
+				norm[i].setNormalizedCounts(nNorm);
+				counter = stop;
+			}
+		}
+	}
+
+	/**Prints statistics about the current intensity float[][] arrays*/
+	public void printIntensityStats(float[][] intensities){
+		for (int i=0; i<intensities.length; i++){
+			Num.statFloatArray(intensities[i], false);
+		}
+	}
+
+	private void saveCountTable() {
+		IO.pl("\nSaving count table...");
 		//create rowNames
 		String[] rowNames = new String[totalNumberTargets];
 		int counter = 0;
@@ -98,11 +190,11 @@ private void printCountTable() {
 		columnNames[counter++] = "Target:Coordinates";
 		for (LiquidBiopsySample l: samples) {
 			columnNames[counter++] = l.getSampleName()+":Somatic";
-			columnNames[counter++] = l.getSampleName()+":Normal";
+			columnNames[counter++] = l.getSampleName()+":Germline";
 		}
 
 		//create int[][] table
-		int[][] targetSampleCounts = new int[totalNumberTargets][samples.length*2];
+		targetSampleCounts = new int[totalNumberTargets][samples.length*2];
 
 		//for each targetBlock
 		int numGroup = groupNameBed.keySet().size();	
@@ -114,10 +206,10 @@ private void printCountTable() {
 			for (int s = 0; s< samples.length; s++) {
 				int blockRowCounter = rowCounter;
 				LookupJob tumor = samples[s].getCfJobs()[t];
-				LookupJob normal = samples[s].getNormalJobs()[t];
+				LookupJob germline = samples[s].getGermlineJobs()[t];
 				for (int r=0; r< numRowsInBlock; r++) {
 					targetSampleCounts[blockRowCounter][columnCounter] = tumor.getCounts()[r];
-					targetSampleCounts[blockRowCounter++][columnCounter+1] = normal.getCounts()[r];
+					targetSampleCounts[blockRowCounter++][columnCounter+1] = germline.getCounts()[r];
 				}
 				columnCounter+=2;
 			}
@@ -147,16 +239,27 @@ private void printCountTable() {
 			sb.append("\n");
 		}
 
-		IO.pl(sb);
+		//IO.pl(sb);
 
 	}
+
+	private void saveCountsForR() throws IOException {
+		IO.pl("\nSaving Counts For R...");
+		File forR = new File(resultsDirectory, "countsForR.txt");
+		PrintWriter out = new PrintWriter( new FileWriter(forR));
+		for (LiquidBiopsySample s: samples) {
+			s.addCfCounts(out);
+			s.addGermlineCounts(out);
+		}
+		out.close();
+	}
 
 	private void closeSamReaders() throws IOException {
 		for (LiquidBiopsySample s: samples) {
 			s.getCfReader().close();
-			s.getNormalReader().close();
+			s.getGermlineReader().close();
 			s.getCfBedOut().close();
-			s.getNormalBedOut().close();
+			s.getGermlineBedOut().close();
 		}
 	}
 
@@ -168,7 +271,7 @@ public synchronized LookupJob[] fetchNextJob() {
 	private void makeLookupJobs() throws IOException {
 		for (LiquidBiopsySample s: samples) {
 			jobs.add(s.buildCFJobs(groupNameBed, samFactory, minMappingQuality));
-			jobs.add(s.buildNormalJobs(groupNameBed, samFactory, minMappingQuality));
+			jobs.add(s.buildGermlineJobs(groupNameBed, samFactory, minMappingQuality));
 		}
 
 	}
@@ -217,7 +320,7 @@ public void processArgs(String[] args) throws Exception{
 					case 's': forExtraction = new File(args[++i]).getCanonicalFile(); break;
 					case 'b': bedFile = new File(args[++i]); break;
 					case 'r': fastaFile = new File(args[++i]); break;
-					case 'o': results = new File(args[++i]); break;
+					case 'o': resultsDirectory = new File(args[++i]); break;
 					case 'm': minMappingQuality = Integer.parseInt(args[++i]); break;
 					case 'p': numCpu = Integer.parseInt(args[++i]); break;
 					case 'h': printDocs(); System.exit(0);
@@ -234,12 +337,12 @@ public void processArgs(String[] args) throws Exception{
 		if (forExtraction == null || forExtraction.exists() == false || forExtraction.isDirectory()== false) Misc.printErrAndExit("\nError: please enter a path to directory containing one or more sample specific sub directories.\n");
 		File[] sampleDirectories = IO.extractOnlyDirectories(forExtraction);
 		samples = new LiquidBiopsySample[sampleDirectories.length];
-		for (int i=0; i< samples.length; i++) samples[i] = new LiquidBiopsySample(sampleDirectories[i], results);
+		for (int i=0; i< samples.length; i++) samples[i] = new LiquidBiopsySample(sampleDirectories[i], resultsDirectory);
 
 		//if (fastaFile == null || fastaFile.canRead() == false)  Misc.printErrAndExit("\nError: please provide an reference genome fasta file and it's index.");		
 		if (bedFile == null ||  bedFile.canRead() == false) Misc.printErrAndExit("\nError: please provide a file of regions in bed format.");
-		if (results == null ) Misc.printErrAndExit("\nError: please provide a directory for saving the output results");
-		results.mkdirs();
+		if (resultsDirectory == null ) Misc.printErrAndExit("\nError: please provide a directory for saving the output results");
+		resultsDirectory.mkdirs();
 
 		//number of workers
 		int numProc = Runtime.getRuntime().availableProcessors();
@@ -257,7 +360,7 @@ public static void printDocs(){
 				"\n"+
 
 				"\nRequired Options:\n"+
-				"-s Path to a directory containing sample specific sub directories with two aligment files representing the cfDNA and matched normal cram or bam files with their indexes. The normal file name should contain the 'norm' string.\n"+
+				"-s Path to a directory containing sample specific sub directories with two aligment files representing the cfDNA and matched germline cram or bam files with their indexes. The germline file name should contain the 'norm' string.\n"+
 				"-b Path to a bed file of non overlapping regions to estimate copy ratio differences, xxx.bed.gz/.zip OK. The name column will be used to group each region into a larger area for merged copy ratio differences.\n"+
 				"-r Path to the reference fasta with xxx.fai index used for alignment.\n"+
 				"-o Path to the output results directory.\n"+