**************************************************************************************
-** Scan Seqs: July 2015 **
+** Scan Seqs: Jan 2024 **
**************************************************************************************
Takes unshifted stranded chromosome specific PointData and uses a sliding window to
calculate several smoothed window statistics. These include a binomial p-value, a
diff --git a/Misc/JavaxComOrgInfoNetClasses.zip b/Misc/JavaxComOrgInfoNetClasses.zip
index 07d4e79e..d442c6cd 100644
Binary files a/Misc/JavaxComOrgInfoNetClasses.zip and b/Misc/JavaxComOrgInfoNetClasses.zip differ
diff --git a/Misc/old.jcoinc.zip b/Misc/OldJavaxComOrgInfoNetClasses.zip
similarity index 94%
rename from Misc/old.jcoinc.zip
rename to Misc/OldJavaxComOrgInfoNetClasses.zip
index 9195e765..9b4764dd 100644
Binary files a/Misc/old.jcoinc.zip and b/Misc/OldJavaxComOrgInfoNetClasses.zip differ
diff --git a/Source/edu/utah/billing/AwsAccountExpense.java b/Source/edu/utah/billing/AwsAccountExpense.java
new file mode 100644
index 00000000..8d902c94
--- /dev/null
+++ b/Source/edu/utah/billing/AwsAccountExpense.java
@@ -0,0 +1,28 @@
+package edu.utah.billing;
+
+import java.util.ArrayList;
+
/** Simple value object pairing an AWS account number with the total expense billed
 * against it for the current invoicing period. Instances are created by the cloud
 * invoice parsers and attached to a BillingGroup. */
public class AwsAccountExpense {

	private String awsAccountNumber = null;
	private float totalExpense = 0f;

	/** @param number AWS account identifier
	 * @param total total expense billed against the account */
	public AwsAccountExpense (String number, float total) {
		awsAccountNumber = number;
		totalExpense = total;
	}

	public String getAwsAccountNumber() {
		return awsAccountNumber;
	}

	public float getTotalExpense() {
		return totalExpense;
	}

	/** Sums the expenses across all of the accounts; returns 0 for an empty list. */
	public static float fetchTotalExpense(ArrayList<AwsAccountExpense> accounts) {
		float total = 0f;
		for (AwsAccountExpense aae: accounts) total+= aae.getTotalExpense();
		return total;
	}
}
diff --git a/Source/edu/utah/billing/AwsXlsxAccountParser.java b/Source/edu/utah/billing/AwsXlsxAccountParser.java
index 8ebc7d47..92e5b23d 100644
--- a/Source/edu/utah/billing/AwsXlsxAccountParser.java
+++ b/Source/edu/utah/billing/AwsXlsxAccountParser.java
@@ -20,23 +20,30 @@
public class AwsXlsxAccountParser {
private TreeMap awsAccountGroupName = new TreeMap();
+ private ArrayList missingAliases = new ArrayList();
- public AwsXlsxAccountParser(File xlsx, boolean debug) {
- parseIt(xlsx);
+ public AwsXlsxAccountParser(File xlsx, boolean debug, HashMap> aliases) {
+ parseIt(xlsx, aliases);
if (debug) {
for (String s: awsAccountGroupName.keySet()) {
IO.pl(s+"\t"+awsAccountGroupName.get(s));
}
}
+
+ if (missingAliases.size()!=0) {
+ for (String a: missingAliases)IO.el("\tMissing entry in masterAccountInfo for "+a+" from "+xlsx.getName());
+ IO.el("\t\tCorrect and restart.\n");
+ System.exit(1);
+ }
}
public static void main(String[] args) {
- AwsXlsxAccountParser p = new AwsXlsxAccountParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/awsAccounts.xlsx"), true);
+ //AwsXlsxAccountParser p = new AwsXlsxAccountParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/awsAccounts.xlsx"), true);
}
- private void parseIt(File inputFile) {
+ private void parseIt(File inputFile, HashMap> aliases) {
try {
//Open up xlsx file
@@ -50,7 +57,7 @@ private void parseIt(File inputFile) {
int numRows = sheet.getPhysicalNumberOfRows();
for (int r = 0; r< numRows; r++) {
Row row = sheet.getRow(r);
- if (row != null) addAccount(row);
+ if (row != null) addAccount(row, aliases);
}
} catch (Exception e) {
System.out.println("Aws Accounts xlsx file is not in the correct format, exiting");
@@ -59,7 +66,7 @@ private void parseIt(File inputFile) {
}
}
- private void addAccount(Row row) {
+ private void addAccount(Row row, HashMap> aliases) {
int numCells = row.getLastCellNum()+1;
if (numCells < 2) return;
@@ -74,6 +81,7 @@ private void addAccount(Row row) {
String accountNumber = cell.toString().trim();
if (accountNumber.length()!=0) {
awsAccountGroupName.put(accountNumber, groupName);
+ if (aliases.containsKey(groupName) == false) missingAliases.add(groupName);
}
}
}
diff --git a/Source/edu/utah/billing/AwsXlsxAccountParser2.java b/Source/edu/utah/billing/AwsXlsxAccountParser2.java
new file mode 100644
index 00000000..253953fc
--- /dev/null
+++ b/Source/edu/utah/billing/AwsXlsxAccountParser2.java
@@ -0,0 +1,79 @@
+package edu.utah.billing;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.TreeMap;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import util.gen.IO;
+
+
+public class AwsXlsxAccountParser2 {
+
+ private TreeMap awsAccountGroupName = new TreeMap();
+
+ public AwsXlsxAccountParser2(File xlsx, boolean debug) {
+ parseIt(xlsx);
+
+ if (debug) {
+ for (String s: awsAccountGroupName.keySet()) {
+ IO.pl(s+"\t"+awsAccountGroupName.get(s));
+ }
+ }
+ }
+
+ public static void main(String[] args) {
+ //AwsXlsxAccountParser p = new AwsXlsxAccountParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/awsAccounts.xlsx"), true);
+
+ }
+
+ private void parseIt(File inputFile) {
+ try {
+
+ //Open up xlsx file
+ Workbook wb = WorkbookFactory.create(inputFile);
+
+ //Find appropriate sheet
+ Sheet sheet = wb.getSheetAt(0);
+ if (sheet == null) throw new IOException("Could not find a sheet in "+inputFile+" ?");
+
+ //Iterate through rows
+ int numRows = sheet.getPhysicalNumberOfRows();
+ for (int r = 0; r< numRows; r++) {
+ Row row = sheet.getRow(r);
+ if (row != null) addAccount(row);
+ }
+ } catch (Exception e) {
+ System.out.println("Aws Accounts xlsx file is not in the correct format, exiting");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ private void addAccount(Row row) {
+ int numCells = row.getLastCellNum()+1;
+ if (numCells < 2) return;
+
+ Cell groupNameCell = row.getCell(0);
+ if (groupNameCell == null) return;
+ String groupName = groupNameCell.toString().trim();
+ if (groupName.startsWith("INVESTIGATOR") || groupName.length()==0) return;
+
+ for (int c=1;c < numCells; c++) {
+ Cell cell = row.getCell(c);
+ if (cell != null) {
+ String accountNumber = cell.toString().trim();
+ if (accountNumber.length()!=0) awsAccountGroupName.put(accountNumber, groupName);
+ }
+ }
+ }
+
+ public TreeMap getAwsAccountGroupName() {
+ return awsAccountGroupName;
+ }
+
+
+}
diff --git a/Source/edu/utah/billing/BillingGroup.java b/Source/edu/utah/billing/BillingGroup.java
new file mode 100644
index 00000000..448207dd
--- /dev/null
+++ b/Source/edu/utah/billing/BillingGroup.java
@@ -0,0 +1,240 @@
+package edu.utah.billing;
+
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import util.gen.Misc;
+import util.gen.Num;
+
+/**Object to represent a particular lab or group we support, e.g. AC Tan Laboratory. It is loaded with all of the info related to billing and generates a group specific invoice.
+ * Add new billing info here. */
+public class BillingGroup {
+
+ //Master Account Info
+ private boolean cancerCenterMember = true;
+ private double totalHoursBilled = 0d;
+ private LinkedHashSet aliases = null;
+ private String groupName = null;
+
+ //Hourly WAF
+ private ArrayList hourlyWafs = new ArrayList();
+ private boolean missingHourlyWafs = false;
+
+ //Compute WAF
+ private ArrayList computeWafs = new ArrayList();
+ private boolean missingComuteWafs = false;
+
+ //Monthly+ AWS Expenses, TDSynnex; where doing these quarterly since some are < $10; thus might be null;
+ private ArrayList awsAccountExpenses = new ArrayList();
+
+ //Misc Compute Expenses, one off stuff. IPA license, AWS downloads
+ private ArrayList miscExpenses = new ArrayList();
+
+ //Jira Hourly Tracking
+ private ArrayList jiraTickets = new ArrayList();
+
+ //Expenses
+ private float totalHoursToBill = 0;
+ private float totalHourlyExpenses = 0f;
+ private String totalHourlySummary = null;
+ private float additionalHourlyExpenses = 0f; //for HSC billing
+ private String additionalHourlySummary = null;
+ private float totalComputeExpenses = 0f;
+
+ public BillingGroup (boolean isCancerMember, double totalHoursBilled, LinkedHashSet aliases) {
+ cancerCenterMember = isCancerMember;
+ this.totalHoursBilled = totalHoursBilled;
+ this.aliases = aliases;
+ //take first alias as groupName
+ groupName = aliases.iterator().next();
+ }
+
+ public void calculateTotalExpenses() {
+ //compute?
+ if (awsAccountExpenses.size()!=0) totalComputeExpenses+= AwsAccountExpense.fetchTotalExpense(awsAccountExpenses);
+ if (miscExpenses.size()!=0) totalComputeExpenses+= MiscExpense.fetchTotalExpense(miscExpenses);
+
+ //hourly?
+ if (jiraTickets.size()!=0) {
+ for (JiraTicketSummary jts: jiraTickets) totalHoursToBill+= Float.parseFloat(jts.getHoursString());
+
+ //calculate the expense, either $70/hr or $100/hr
+ float pricePerHour = CBiBilling2.firstTierPricePerHour;
+ if (totalHoursBilled > CBiBilling2.maxHoursForFirstTier) pricePerHour = CBiBilling2.secondTierPricePerHour;
+ totalHourlyExpenses = totalHoursToBill* pricePerHour;
+ totalHourlySummary = "Hourly Total:\t$"+Num.formatNumber(totalHourlyExpenses, 2)+ "\t("+ totalHoursToBill+ " x $"+pricePerHour+"/hr)";
+
+ totalHoursBilled += totalHoursToBill;
+
+ //non cancer?
+ if (cancerCenterMember==false) {
+ additionalHourlyExpenses = CBiBilling2.hscCostSharing * totalHoursToBill;
+ additionalHourlySummary = "\t(Hourly HSC Cost Sharing Total:\t$"+Num.formatNumber(additionalHourlyExpenses, 2)+ "\t("+ totalHoursToBill+ " x $"+CBiBilling2.hscCostSharing+"/hr))";
+ }
+ }
+ }
+
+ public float getTotalExpenses() {
+ return totalHourlyExpenses+ additionalHourlyExpenses+ totalComputeExpenses;
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ if (cancerCenterMember) sb.append("Cancer\t");
+ else sb.append("Non-Cancer\t");
+
+ sb.append(totalHoursBilled);
+
+ for (String a: aliases) {
+ sb.append("\t");
+ sb.append(a);
+ }
+
+ return sb.toString();
+ }
+
+ public String generateInvoice(String date, String hourlyWafHeader, String cloudWafHeader, boolean includeAdditionalHourlyExpenses, boolean pretty) {
+ ArrayList txtOut = new ArrayList();
+
+ txtOut.add("HCI Cancer Bioinformatics Invoice - "+date+" - "+groupName);
+ if (pretty)txtOut.add("");
+ txtOut.add("Aliases:\t"+Misc.linkedSetToString(aliases, "; "));
+ txtOut.add("YTD Hourly Usage:\t"+Num.formatNumber(totalHoursBilled, 1)+" hrs");
+ if (pretty)txtOut.add("");
+
+ //Hourly Expenses
+ if (totalHourlyExpenses !=0) {
+ txtOut.add("Hourly Billing:");
+ if (hourlyWafs.size()==0) {
+ txtOut.add("\tHourly WAF:\tNo Hourly WAF - contact PI");
+ missingHourlyWafs = true;
+ }
+ else {
+ txtOut.add("\tHourly WAF:\t"+hourlyWafHeader);
+ for (String[] wl: hourlyWafs) txtOut.add("\tHourly WAF:\t"+Misc.stringArrayToString(wl, "\t"));
+ }
+ if (pretty)txtOut.add("");
+
+ txtOut.add("\tHourly Tickets:\t"+JiraTicketSummary.getToStringHeader());
+ for (JiraTicketSummary jt: jiraTickets) txtOut.add("\tHourly Tickets:\t"+jt.toString());
+ if (pretty)txtOut.add("");
+
+ txtOut.add("\t\t"+totalHourlySummary);
+
+
+ //any HSC billing?
+ if (additionalHourlyExpenses > 0 && includeAdditionalHourlyExpenses) {
+ txtOut.add("\t"+additionalHourlySummary);
+ }
+ if (pretty)txtOut.add("");
+ }
+
+
+ //Compute Usage billing
+ if (totalComputeExpenses > 0) {
+//if (totalHourlyExpenses ==0 && pretty) txtOut.add("");
+
+ txtOut.add("Compute Billing:");
+ if (computeWafs.size()==0) {
+ txtOut.add("\tCloud WAF:\tNo Cloud WAF - contact PI");
+ missingComuteWafs = true;
+ }
+ else {
+ txtOut.add("\tCloud WAF:\t"+cloudWafHeader);
+ for (String[] wl: computeWafs) txtOut.add("\tCloud WAF:\t"+Misc.stringArrayToString(wl, "\t"));
+ }
+ if (pretty)txtOut.add("");
+
+ //Any AWS compute expenses?
+ for (AwsAccountExpense aae: awsAccountExpenses) {
+ String ae = "\tCloud AWS Acc:\t"+aae.getAwsAccountNumber()+"\t$"+Num.formatNumber(aae.getTotalExpense(), 2);
+ txtOut.add(ae);
+ }
+ //Any Misc compute expenses?
+ if (pretty && awsAccountExpenses.size()!=0 && miscExpenses.size()!=0) txtOut.add("");
+ for (MiscExpense aae: miscExpenses) {
+ String me = "\tMisc Compute:\t$"+Num.formatNumber(aae.getCost(), 2)+"\t"+aae.getDescription();
+ txtOut.add(me);
+ }
+ if (pretty)txtOut.add("");
+ txtOut.add("\t\tCompute Total:\t$"+Num.formatNumber(totalComputeExpenses, 2));
+ if (pretty)txtOut.add("");
+ }
+
+ txtOut.add("Total Billing:\t$"+Num.formatNumber(totalComputeExpenses+ totalHourlyExpenses, 2)+"\n");
+ if (pretty) {
+ txtOut.add("Questions?\n\tEmail: "+CBiBilling2.contactEmail);
+ txtOut.add("\tOperating Policies and WAF forms: "+CBiBilling2.cbiPolicyUrl);
+ txtOut.add("");
+ }
+
+ return Misc.stringArrayListToString(txtOut, "\n");
+ }
+
+ public boolean isCancerCenterMember() {
+ return cancerCenterMember;
+ }
+
+ public double getTotalHoursBilled() {
+ return totalHoursBilled;
+ }
+
+ public LinkedHashSet getAliases() {
+ return aliases;
+ }
+
+ public ArrayList getHourlyWafs() {
+ return hourlyWafs;
+ }
+
+ public ArrayList getComputeWafs() {
+ return computeWafs;
+ }
+
+ public double getTotalHourlyExpenses() {
+ return totalHourlyExpenses;
+ }
+
+ public double getTotalComputeExpenses() {
+ return totalComputeExpenses;
+ }
+
+ public ArrayList getAwsAccountExpenses() {
+ return awsAccountExpenses;
+ }
+
+ public ArrayList getMiscExpenses() {
+ return miscExpenses;
+ }
+
+ public ArrayList getJiraTickets() {
+ return jiraTickets;
+ }
+
+ public String getTotalHourlySummary() {
+ return totalHourlySummary;
+ }
+
+ public float getAdditionalHourlyExpenses() {
+ return additionalHourlyExpenses;
+ }
+
+ public String getAdditionalHourlySummary() {
+ return additionalHourlySummary;
+ }
+
+ public boolean isMissingHourlyWafs() {
+ return missingHourlyWafs;
+ }
+
+ public boolean isMissingComuteWafs() {
+ return missingComuteWafs;
+ }
+
+ public float getTotalHoursToBill() {
+ return totalHoursToBill;
+ }
+
+ public String getGroupName() {
+ return groupName;
+ }
+}
diff --git a/Source/edu/utah/billing/CBiBilling.java b/Source/edu/utah/billing/CBiBilling.java
index 83994e4b..bf8fcded 100644
--- a/Source/edu/utah/billing/CBiBilling.java
+++ b/Source/edu/utah/billing/CBiBilling.java
@@ -18,6 +18,7 @@ public class CBiBilling {
private File masterAcountInfo;
private File cloudReportsDirectory;
private File awsAccountsFile;
+ private File expenseFile;
//internal fields
private CarahsoftXlsxParser carahsoftParser;
@@ -25,6 +26,7 @@ public class CBiBilling {
private WafXlsxParser cloud = null;
private MasterAccountInfoParser masterAccountInfo = null;
private AwsXlsxAccountParser awsXlsxAccountParser;
+ private MiscExpenseXlsxParser expenseParser = null;
private boolean debug = false;
private LinkedHashMap headerKeyIndex = null;
private TreeMap> groupNameTickets = new TreeMap>();
@@ -44,16 +46,20 @@ public class CBiBilling {
public CBiBilling (String[] args) throws Exception{
processArgs (args);
+ masterAccountInfo = new MasterAccountInfoParser(masterAcountInfo, debug);
+
parseWafs();
+
+ awsXlsxAccountParser = new AwsXlsxAccountParser(awsAccountsFile, debug, masterAccountInfo.getGroupNameAliases());
+
+ if (expenseFile != null) expenseParser = new MiscExpenseXlsxParser(expenseFile, debug);
+
+ parseCloudAccountInvoice();
parseJiraHours();
parseMasterAccountInfo();
- parseAWSAccounts();
-
- //add missing cloud WAFs!
-
printInvoices();
masterAccountInfo.saveUpdatedInfo();
@@ -65,7 +71,7 @@ public CBiBilling (String[] args) throws Exception{
private void printInvoices() throws IOException {
IO.pl("\nPrinting Invoices...");
- HashMap groupNameHoursBilled = this.masterAccountInfo.getGroupNameHoursBilled();
+ HashMap groupNameHoursBilled = masterAccountInfo.getGroupNameHoursBilled();
//two sources of billing, AWS and Hourly
//Hourly
@@ -75,6 +81,7 @@ private void printInvoices() throws IOException {
txtOut.add("HCI Cancer Bioinformatics Invoice\t\t"+date+"\n\n"+groupName+"\n");
//pull Aliases
HashSet nameAliases = masterAccountInfo.getGroupNameAliases().get(groupName);
+
//pull Hourly WAF
ArrayList wafLines = null;
for (String a: nameAliases) {
@@ -125,13 +132,28 @@ private void printInvoices() throws IOException {
nonCancerBilling.add(groupName+"\t"+Num.formatNumber(totalHoursToBill,3)+"\t$"+ Num.formatNumber(additionalBilling, 2));
totalHscBilling+= additionalBilling;
}
+
+ //any misc expenses?
+ float miscExpenses = 0;
+ TreeMap> nameExpense = expenseParser.getGroupNameExpense();
+ if (expenseParser != null) {
+ ArrayList es = null;
+ HashSet ali = masterAccountInfo.getGroupNameAliases().get(groupName);
+ for (String a: ali) {
+ if (nameExpense.containsKey(a)) {
+ es = nameExpense.get(a);
+ break;
+ }
+ }
+ }
+
//look for Aws billing
float cloudExpenses = printCloudInvoice(nameAliases,txtOut);
IO.pl("TotalExpenses:\t$"+Num.formatNumber(hourlyExpenses+cloudExpenses, 2));
IO.pl();
- totalExpenses += (hourlyExpenses+cloudExpenses);
+ totalExpenses += (hourlyExpenses+cloudExpenses+miscExpenses);
txtOut.add("\n$"+Num.formatNumber(hourlyExpenses+cloudExpenses, 2)+"\tTotal Expenses");
//write out the txt file details
@@ -202,7 +224,7 @@ private void printJustCloudInvoices() {
float totalAwsCost = 0f;
for (String line : groupNameBillingInfo.get(groupName)) {
String[] tokens = Misc.TAB.split(line);
- float cost = Float.parseFloat(tokens[1].substring(1));
+ float cost = Float.parseFloat(Misc.COMMA.matcher(tokens[1].substring(1)).replaceAll(""));
totalAwsCost+= cost;
IO.pl("CloudBilling:\t"+ line);
txtOut.add(line);
@@ -216,14 +238,12 @@ private void printJustCloudInvoices() {
txtOut.add("$"+totalString+"\tTotal Expenses");
//write out the txt file details
- String fileName = Misc.COMMA_WHITESPACE.matcher(groupName).replaceAll("_")+".txt";
+ String fileName = Misc.COMMA_WHITESPACE_FWDSLASH.matcher(groupName).replaceAll("_")+".txt";
IO.writeString(Misc.stringArrayListToString(txtOut, "\n"), new File (outputDirectory, fileName));
-
}
}
}
-
private float printCloudInvoice(HashSet groupAliases, ArrayList txtOut) {
float totalAwsCost = 0f;
@@ -265,6 +285,7 @@ private float printCloudInvoice(HashSet groupAliases, ArrayList
for (String[] cw: cloudWafs.get(gn)) {
IO.pl("CloudWAFLine:\t"+Misc.stringArrayToString(cw, "\t"));
}
+ break;
}
}
if (wafFound ==false) IO.pl("CloudWAFLine:\tNo Cloud WAF");
@@ -275,32 +296,34 @@ private float printCloudInvoice(HashSet groupAliases, ArrayList
}
- private void parseAWSAccounts() throws IOException {
+ private void parseCloudAccountInvoice() throws IOException {
//any cloud reports?
if (cloudReportsDirectory != null) {
- IO.pl("\nParsing Carahsoft AWS account files...");
+
+ IO.pl("\nParsing Carahsoft AWS account invoice...");
carahsoftParser = new CarahsoftXlsxParser(cloudReportsDirectory, debug);
+
//anything parsed
if (carahsoftParser.getAwsAccountNumberTotals().size()!=0) {
- //parse the Aws Accounts
- awsXlsxAccountParser = new AwsXlsxAccountParser(awsAccountsFile, debug);
+
//for each carasoft account charge, look to see if it's in the aws accounts
TreeMap awsAccountGroupName = awsXlsxAccountParser.getAwsAccountGroupName();
TreeMap awsAccountNumberTotals = carahsoftParser.getAwsAccountNumberTotals();
+
ArrayList missingAccountNumber = new ArrayList();
for (String awsAccountNumber: awsAccountNumberTotals.keySet()) {
if (awsAccountGroupName.containsKey(awsAccountNumber)==false) missingAccountNumber.add(awsAccountNumber);
}
-
//any missing account numbers
if (missingAccountNumber.size()!=0) {
- for (String acc: missingAccountNumber) IO.el("\tMissing '"+acc+"' in "+awsAccountsFile+", correct and restart.");
+ for (String acc: missingAccountNumber) IO.el("\tMissing '"+acc+"' in "+awsAccountsFile.getName()+", correct and restart.");
System.exit(1);
}
//check for the WAFs
//for each account number from Carahsoft with charges
TreeMap> cloudWafs = cloud.getGroupNameWafLines();
+
for (String awsAccountNumber: awsAccountNumberTotals.keySet()) {
//fetch the name from the aws accounts
String groupName = awsAccountGroupName.get(awsAccountNumber);
@@ -315,8 +338,6 @@ private void parseAWSAccounts() throws IOException {
private void parseMasterAccountInfo() {
- masterAccountInfo = new MasterAccountInfoParser(masterAcountInfo, debug);
-
ArrayList missingGroups = new ArrayList();
HashMap> aliasesMap = masterAccountInfo.getGroupNameAliases();
TreeMap> userWafs = hourly.getGroupNameWafLines();
@@ -482,8 +503,8 @@ private void parseWafs() {
File[] xlsFiles = IO.extractFiles(wafDirectory, ".xlsx");
for (File f: xlsFiles) {
if (f.getName().contains("WAF") && f.getName().startsWith("~")== false) {
- if (f.getName().contains("Cloud")) cloud = new WafXlsxParser(f, debug);
- else if (f.getName().contains("Cloud") == false) hourly = new WafXlsxParser(f, debug);
+ if (f.getName().contains("Cloud")) cloud = new WafXlsxParser(f, debug, masterAccountInfo.getGroupNameAliases());
+ else if (f.getName().contains("Cloud") == false) hourly = new WafXlsxParser(f, debug, masterAccountInfo.getGroupNameAliases());
}
}
if (cloud == null || hourly == null) Misc.printErrAndExit("\nFailed to parse both an hourly and cloud WAF tracking schedule xlsx file.");
@@ -537,7 +558,7 @@ public void processArgs(String[] args){
public static void printDocs(){
System.out.println("\n" +
"**************************************************************************************\n" +
- "** CBI Billing: Sept 2023 **\n" +
+ "** CBI Billing: Jan 2024 **\n" +
"**************************************************************************************\n" +
"Generates billing reports for the Cancer Bioinformatics Shared Resource.\n\n"+
@@ -546,8 +567,7 @@ public static void printDocs(){
"-m Path to the masterAccountInfo.xlsx spreadsheet updated from the prior month.\n"+
"-a Path to the awsAccounts.xlsx spreadsheet.\n"+
"-w Path to a dir containing the hourly and cloud 'WAF Tracking Schedule' xlsx files.\n" +
- "-c Path to a dir containing the cloud AWS Carahsoft xlsx expense reports. May be empty\n"+
- " if are none available.\n"+
+ "-c If available, path to a dir with the cloud AWS Carahsoft xlsx expense reports.\n"+
"-o Path to write the results.\n"+
"\nExample: java -Xmx256M -jar pathTo/USeq/Apps/CBiBilling -j jiraTime.cvs -m \n"+
diff --git a/Source/edu/utah/billing/CBiBilling2.java b/Source/edu/utah/billing/CBiBilling2.java
new file mode 100644
index 00000000..fd975ba8
--- /dev/null
+++ b/Source/edu/utah/billing/CBiBilling2.java
@@ -0,0 +1,399 @@
+package edu.utah.billing;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import util.gen.*;
+
+/**
+ * Generates a billing report for Cancer Bioinformatics from hourly and cloud expenses.
+ */
+public class CBiBilling2 {
+
+ //fields
+ private File jiraReportFile;
+ private File wafDirectory;
+ private File outputDirectory;
+ private File masterAcountInfo;
+ private File cloudReportsDirectory;
+ private File awsAccountsFile;
+ private File expenseFile;
+ private boolean debug = false;
+
+ //internal fields
+ private MasterAccountInfoParser2 masterAccountInfo = null;
+ private HashMap aliasesBillingGroups = null;
+
+ private WafXlsxParser2 hourlyWafParser = null;
+ private WafXlsxParser2 cloudWafParser = null;
+
+ private AwsXlsxAccountParser2 awsXlsxAccountParser = null;
+ private TDSynnexXlsxParser tDSynnexXlsxParser = null;
+ private MiscExpenseXlsxParser miscExpenseParser = null;
+
+ private ArrayList accountNumbersCloudBilled = new ArrayList();
+ private ArrayList nonCancerBilling = new ArrayList();
+
+ static float firstTierPricePerHour = 70f;
+ static float secondTierPricePerHour = 100f;
+ static int maxHoursForFirstTier = 70;
+ static float hscCostSharing = 356f;
+ static float minimalBillingExpense = 5f;
+ static String contactEmail = "hci-srbilling@hci.utah.edu and cbi@hci.utah.edu";
+ static String cbiPolicyUrl = "https://uofuhealth.utah.edu/huntsman/shared-resources/gcb/cbi/cost";
+
+ private int clients = 0;
+ private int ccClientsBilled = 0;
+ private int ncClientsBilled = 0;
+ private float ccTotalHours = 0;
+ private float ncTotalHours = 0;
+ private float totalHourlyExpenses = 0;
+ private float totalHSCExpenses = 0f;
+ private float totalComputeExpenses = 0f;
+ private ArrayList bgsNoHourlyWaf = new ArrayList();
+ private ArrayList bgsNoCloudWaf = new ArrayList();
+ private String date = null;
+
+ //constructor
	/** Drives the entire billing run: parses all inputs, builds and writes invoices,
	 * prints summary statistics, then saves the updated master account info.
	 * @param args command line arguments, see printDocs()
	 * @throws Exception on parsing or IO failures not already handled via System.exit */
	public CBiBilling2 (String[] args) throws Exception{
		processArgs (args);

		//load masterAccountInfo.xlsx, one BillingGroup per lab keyed by every alias
		createBillingGroups();

		//load the hourly and cloud WAF tracking spreadsheets
		parseWafs();

		//any AWS cloud compute stuff to parse
		if (cloudReportsDirectory != null) {
			parseAwsAccounts();
			parseAWSCloudAccountInvoices();
		}

		//any misc expenses, one offs like annual software licenses
		if (expenseFile != null) parseMiscExpenses();

		//Jira ticket hours for hourly billing
		parseJiraHours();

		printInvoices();

		calculateSummaryStatistics();

		printSummaryStatsMissingWafs();

		//persist the advanced YTD hours for next month's run
		masterAccountInfo.saveUpdatedInfo();

		IO.pl("\nComplete!");

	}
+
+
	/** Prints the run-wide totals gathered by calculateSummaryStatistics() and lists
	 * every billed group that has no current WAF in the tracking spreadsheets.
	 * Call only after calculateSummaryStatistics(). */
	private void printSummaryStatsMissingWafs() {

		IO.pl("Summary Statistics (CC CancerCenter, NC NonCC) ...");
		IO.pl("\tClients             :\t"+ clients);
		IO.pl("\tCCClientsBilled     :\t"+ ccClientsBilled);
		IO.pl("\tNCClientsBilled     :\t"+ ncClientsBilled);
		IO.pl("\tTotalCCClientHours  :\t"+ Num.formatNumber(ccTotalHours, 2));
		IO.pl("\tTotalNCClientHours  :\t"+ Num.formatNumber(ncTotalHours, 2));
		IO.pl("\tTotalHourlyExpenses :\t$"+ Num.formatNumber(totalHourlyExpenses, 2));
		IO.pl("\tTotalComputeExpenses:\t$"+ Num.formatNumber(totalComputeExpenses, 2));
		IO.pl("\tTotalHSCCostSharing :\t$"+ Num.formatNumber(totalHSCExpenses, 2));

		IO.pl("\nTotalBilling        :\t$"+ Num.formatNumber((totalHourlyExpenses+ totalHSCExpenses+ totalComputeExpenses), 2));

		IO.pl("\nClients Missing Current WAFs in the Tracking Spreadsheet! Check if these are in the UBox before adding to the contact list!");
		IO.pl("\nClients Missing Hourly WAFs (GroupName CancerMembership UBoxStatus):");
		for (BillingGroup bg: bgsNoHourlyWaf) IO.pl("\t"+bg.getGroupName()+"\t"+bg.isCancerCenterMember()+"\t?");
		IO.pl("\nClients Missing Cloud WAFs (GroupName CancerMember):");
		for (BillingGroup bg: bgsNoCloudWaf) IO.pl("\t"+bg.getGroupName()+"\t"+bg.isCancerCenterMember()+"\t?");
	}
+
+
	/** Tallies run-wide totals (client counts, hours, $ by category) across all
	 * BillingGroups and collects groups that were billed but lack a current WAF.
	 * Call after printInvoices() so each group's expenses have been calculated.
	 * Note: 'clients' counts every group with any expense; the billed counts and
	 * dollar totals only include groups at or above minimalBillingExpense. */
	private void calculateSummaryStatistics() {

		for (BillingGroup bg: masterAccountInfo.getBillingGroups()) {
			if (bg.getTotalExpenses() > 0) {
				clients++;

				//only groups over the minimal expense threshold are actually billed
				if (bg.getTotalExpenses() >= minimalBillingExpense) {
					if (bg.isCancerCenterMember()) {
						ccClientsBilled++;
						ccTotalHours+= bg.getTotalHoursToBill();
					}
					else {
						ncClientsBilled++;
						ncTotalHours+= bg.getTotalHoursToBill();
					}

					totalHourlyExpenses+= bg.getTotalHourlyExpenses();
					totalHSCExpenses+= bg.getAdditionalHourlyExpenses();
					totalComputeExpenses+= bg.getTotalComputeExpenses();
					// missing wafs?
					if (bg.getTotalHourlyExpenses()>0 && bg.getHourlyWafs().size()==0) bgsNoHourlyWaf.add(bg);
					if (bg.getTotalComputeExpenses()>0 && bg.getComputeWafs().size()==0) bgsNoCloudWaf.add(bg);
				}
			}

		}
	}
+
+
+ private void parseJiraHours() throws IOException {
+ JiraHourlyCsvParser jiraParser = new JiraHourlyCsvParser(jiraReportFile, debug);
+
+ //add to BillingGroups and id those missing
+ TreeMap> groupNameTickets = jiraParser.getGroupNameTickets();
+
+ ArrayList missingGroupNames = new ArrayList();
+ for (String gn: groupNameTickets.keySet()) {
+ BillingGroup bg = this.aliasesBillingGroups.get(gn);
+ if (bg == null) missingGroupNames.add(gn);
+ else bg.getJiraTickets().addAll(groupNameTickets.get(gn));
+ }
+
+ //any missing? if so exit.
+ if (missingGroupNames.size()!=0) {
+ IO.el("The following group names from the Jira Hourly parsing are missing from the aliases in the MasterAccountsInfo sheet, add them and restart:");
+ for (String gn: missingGroupNames) IO.el("\t"+gn);
+ System.exit(1);
+ }
+ }
+
+ private void parseMiscExpenses() {
+ IO.pl("\nParsing the miscellaneous compute expense spreadsheet...");
+ //parse the xlsx spreadsheet
+ miscExpenseParser = new MiscExpenseXlsxParser(expenseFile, debug);
+
+ //add to BillingGroups and id those missing
+ TreeMap> groupNameExpense = miscExpenseParser.getGroupNameExpense();
+ ArrayList missingGroupNames = new ArrayList();
+ for (String gn: groupNameExpense.keySet()) {
+ BillingGroup bg = this.aliasesBillingGroups.get(gn);
+ if (bg == null) missingGroupNames.add(gn);
+ else bg.getMiscExpenses().addAll(groupNameExpense.get(gn));
+ }
+
+ //any missing? if so exit.
+ if (missingGroupNames.size()!=0) {
+ IO.el("The following group names from the MiscExpenseXlsx parsing are missing from the aliases in the MasterAccountsInfo sheet, add them and restart:");
+ for (String gn: missingGroupNames) IO.el("\t"+gn);
+ System.exit(1);
+ }
+
+ }
+
+ private void parseAwsAccounts() {
+
+ IO.pl("\nParsing the AWS Account Info spreadsheet...");
+ awsXlsxAccountParser = new AwsXlsxAccountParser2(awsAccountsFile, debug);
+
+ //check that all of the groupNames are in the billing groups
+ ArrayList missingNames = new ArrayList();
+ for (String groupName: awsXlsxAccountParser.getAwsAccountGroupName().values()) {
+ BillingGroup bg = aliasesBillingGroups.get(groupName);
+ if (bg == null) missingNames.add(groupName);
+ }
+
+ if (missingNames.size()!=0) {
+ IO.el("The following group names from the AwsXlsxAccount parsing are missing from the aliases in the MasterAccountsInfo sheet, add them and restart:");
+ for (String gn: missingNames) IO.el("\t"+gn);
+ System.exit(1);
+ }
+ }
+
	/** Parses masterAccountInfo.xlsx and caches the alias -> BillingGroup lookup used
	 * by every subsequent parse step.
	 * @throws IOException if the spreadsheet cannot be read */
	private void createBillingGroups() throws IOException {
		IO.pl("\nParsing the Master Account Info spreadsheet...");

		// Parse the master account xlsx sheet
		masterAccountInfo = new MasterAccountInfoParser2(masterAcountInfo, debug);
		aliasesBillingGroups = masterAccountInfo.getAliasesBillingGroups();
	}
+
+ private void printInvoices() throws IOException {
+ IO.pl("\nPrinting Invoices...");
+
+ //find BillingGroups with > minimumExpense, not billing tiny users, < $5
+ ArrayList bgsToBillCancer = new ArrayList();
+ ArrayList bgsToBillNonCancerWithHSCCostSharing = new ArrayList();
+ ArrayList bgsToBillNonCancerNoHSCCostSharing = new ArrayList();
+ for (BillingGroup bg: masterAccountInfo.getBillingGroups()) {
+ bg.calculateTotalExpenses();
+ if (bg.getTotalExpenses() >= minimalBillingExpense) {
+ if (bg.isCancerCenterMember()) bgsToBillCancer.add(bg);
+ else if (bg.getAdditionalHourlyExpenses()> 0) bgsToBillNonCancerWithHSCCostSharing.add(bg);
+ else bgsToBillNonCancerNoHSCCostSharing.add(bg);
+ }
+ }
+
+
+ //for HCI and HSC Billing
+ IO.pl("\n####################### Cancer Center Members #########################\n");
+ for (BillingGroup bg: bgsToBillCancer) {
+ IO.pl("----------------------------------------------------------------\n");
+ String invoice = bg.generateInvoice(date, hourlyWafParser.getHeaderTabbed(), cloudWafParser.getHeaderTabbed(), true, false);
+ IO.pl(invoice);
+ }
+
+ IO.pl("####################### Non Cancer Center Members No HSC Cost Sharing #########################\n");
+ for (BillingGroup bg: bgsToBillNonCancerNoHSCCostSharing) {
+ IO.pl("----------------------------------------------------------------\n");
+ String invoice = bg.generateInvoice(date, hourlyWafParser.getHeaderTabbed(), cloudWafParser.getHeaderTabbed(), true, false);
+ IO.pl(invoice);
+ }
+
+ IO.pl("####################### Non Cancer Center Members With HSC Cost Sharing #########################\n");
+ for (BillingGroup bg: bgsToBillNonCancerWithHSCCostSharing) {
+ IO.pl("----------------------------------------------------------------\n");
+ String invoice = bg.generateInvoice(date, hourlyWafParser.getHeaderTabbed(), cloudWafParser.getHeaderTabbed(), true, false);
+ IO.pl(invoice);
+ }
+
+ //print individual invoices
+ bgsToBillCancer.addAll(bgsToBillNonCancerNoHSCCostSharing);
+ bgsToBillCancer.addAll(bgsToBillNonCancerWithHSCCostSharing);
+ for (BillingGroup bg: bgsToBillCancer) {
+ String fileName = Misc.COMMA_WHITESPACE.matcher(bg.getGroupName()).replaceAll("_")+"_CBIInvoice_"+date+".txt";
+ String invoice = bg.generateInvoice(date, hourlyWafParser.getHeaderTabbed(), cloudWafParser.getHeaderTabbed(), false, true);
+ IO.writeString(invoice, new File (outputDirectory, fileName));
+ }
+ }
+
+ /**Adds each AWS account's TDSynnex expense total to its owning BillingGroup; exits if any account cannot be mapped to a group.*/
+ private void parseAWSCloudAccountInvoices() throws IOException {
+ IO.pl("\nParsing TDSynnex AWS account invoices...");
+
+ TreeMap accountNumberGroupName = awsXlsxAccountParser.getAwsAccountGroupName();
+
+ //for each billing group charged, add to it their AWS expenses
+ ArrayList missingNames = new ArrayList();
+ tDSynnexXlsxParser = new TDSynnexXlsxParser(cloudReportsDirectory, debug);
+ TreeMap accountNumberTotalExpense= tDSynnexXlsxParser.getAwsAccountNumberTotals();
+ for (String awsAccountNumber: accountNumberTotalExpense.keySet()) {
+ //resolve the account number to a billing group; either the account->name or the name->group alias lookup can miss, both are fatal below
+ String billingGroupName = accountNumberGroupName.get(awsAccountNumber);
+ BillingGroup bg = (billingGroupName == null) ? null : aliasesBillingGroups.get(billingGroupName);
+ if (bg == null) missingNames.add(awsAccountNumber);
+ else {
+ bg.getAwsAccountExpenses().add(new AwsAccountExpense(awsAccountNumber, accountNumberTotalExpense.get(awsAccountNumber)));
+ }
+ }
+
+ if (missingNames.size()!=0) {
+ IO.el("The following AWS account numbers from the TDSynnex Xlsx billing could not be matched to a billing group, add them to the AWS Account Xlsx info sheet and restart:");
+ for (String gn: missingNames) IO.el("\t"+gn);
+ System.exit(1);
+ }
+ }
+
+ private void parseWafs() { //locates the hourly and cloud WAF tracking xlsx files, then distributes their WAF lines to each BillingGroup
+ IO.pl("\nParsing the WAF tracking spreadsheets...");
+ File[] xlsFiles = IO.extractFiles(wafDirectory, ".xlsx");
+ for (File f: xlsFiles) {
+ String name = f.getName().toLowerCase();
+ if (name.contains("waf") && f.getName().startsWith("~")== false) { //skip Excel owner/lock files, their names start with '~$'
+ if (name.contains("cloud")) cloudWafParser = new WafXlsxParser2(f, debug); //'cloud' in the file name -> cloud WAF schedule
+ else hourlyWafParser = new WafXlsxParser2(f, debug); //any other WAF file is assumed to be the hourly schedule
+
+ }
+ }
+
+ if (cloudWafParser == null || hourlyWafParser == null) Misc.printErrAndExit("\nFailed to parse both an hourly and cloud WAF tracking schedule xlsx file.");
+
+ //add cloud compute WAF lines to each Billing Group
+ ArrayList missingAliasCloud = new ArrayList();
+ for (String groupName: cloudWafParser.getGroupNameWafLines().keySet()) {
+ BillingGroup bg = aliasesBillingGroups.get(groupName); //WAF group names must resolve through the master alias map
+ if (bg == null) missingAliasCloud.add(groupName);
+ else bg.getComputeWafs().addAll(cloudWafParser.getGroupNameWafLines().get(groupName));
+ };
+ if (missingAliasCloud.size()!=0) { //unresolvable names are fatal, the master sheet must be corrected first
+ IO.el("The following cloud WAF group names are missing from the MasterAccountsInfo sheet, add them and restart:");
+ for (String gn: missingAliasCloud) IO.el("\t"+gn);
+ System.exit(1);
+ }
+
+ //add hourly WAF lines to each Billing Group
+ ArrayList missingAliasHourly = new ArrayList();
+ for (String groupName: hourlyWafParser.getGroupNameWafLines().keySet()) {
+ BillingGroup bg = aliasesBillingGroups.get(groupName);
+
+ if (bg == null) missingAliasHourly.add(groupName);
+ else bg.getHourlyWafs().addAll(hourlyWafParser.getGroupNameWafLines().get(groupName));
+ }
+ if (missingAliasHourly.size()!=0) {
+ IO.el("The following hourly WAF group names are missing from the MasterAccountsInfo sheet, add them and restart:");
+ for (String gn: missingAliasHourly) IO.el("\t"+gn);
+ System.exit(1);
+ }
+
+ }
+
+ public static void main(String[] args) throws Exception { //entry point: print usage when called with no arguments, otherwise run the billing app
+ if (args.length ==0){
+ printDocs();
+ System.exit(0);
+ }
+ new CBiBilling2(args);
+ }
+
+ /**This method will process each argument and assign new varibles*/
+ public void processArgs(String[] args){
+ Pattern pat = Pattern.compile("-[a-z]");
+ String useqVersion = IO.fetchUSeqVersion();
+ System.out.println("\n"+useqVersion+" Arguments: "+ Misc.stringArrayToString(args, " ") +"\n");
+ for (int i = 0; i $5 of hourly+ compute costs are billed.\n\n"+
+
+ "Required Parameters:\n" +
+ "-j Path to the exported cvs Jira 'Logged Time' report.\n"+
+ "-m Path to the masterAccountInfo.xlsx spreadsheet updated from the prior month.\n"+
+ "-a Path to the awsAccounts.xlsx spreadsheet.\n"+
+ "-w Path to a dir containing the hourly and cloud 'WAF Tracking Schedule' xlsx files.\n" +
+ "-c If available, path to a dir with the cloud AWS TDSynnex xlsx expense reports.\n"+
+ "-e If available, path to a miscellaneous compute usage expense xlsx spreadsheet.\n"+
+ "-o Path to write the Invoices.\n"+
+
+ "\nExample: java -Xmx256M -jar pathTo/USeq/Apps/CBiBilling -j jiraTime.cvs -m \n"+
+ " masterAccountInfo.xlsx -w WAFs/ -c TDSynnex/ -o Invoices\n" +
+
+
+ "**************************************************************************************\n");
+ }
+}
diff --git a/Source/edu/utah/billing/JiraHourlyCsvParser.java b/Source/edu/utah/billing/JiraHourlyCsvParser.java
new file mode 100644
index 00000000..32be560d
--- /dev/null
+++ b/Source/edu/utah/billing/JiraHourlyCsvParser.java
@@ -0,0 +1,162 @@
+package edu.utah.billing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.TreeMap;
+
+import util.gen.IO;
+import util.gen.Misc;
+
+public class JiraHourlyCsvParser {
+
+ private File jiraCsv = null;
+ private boolean debug = false;
+ private LinkedHashMap headerKeyIndex = null;
+ private TreeMap> groupNameTickets = new TreeMap>();
+ private ArrayList errorTickets = new ArrayList();
+
+ public JiraHourlyCsvParser (File jiraCsv, boolean debug) throws IOException { //parses the exported Jira 'Logged Time' csv report immediately on construction
+ this.jiraCsv = jiraCsv;
+ this.debug = debug;
+ parseIt();
+ }
+
+ /**Parses the Jira csv report, groups tickets by billable group, aborts on ticket errors or mixed FTE/Hourly groups, then drops FTE and 'NA' groups from billing.*/
+ private void parseIt() throws IOException {
+ IO.pl("\nParsing the Jira ticket report...");
+ BufferedReader in = IO.fetchBufferedReader(jiraCsv);
+ String line;
+ String [] cells;
+
+ while ((line = in.readLine()) != null) {
+ //empty?
+ line = line.trim();
+ if (line.length()== 0) continue;
+ cells = Misc.splitCsvLine(line);
+ if (cells[0].contains("Issue Key")) parseHeader(cells);
+ else {
+ //data line, add to group
+ if (headerKeyIndex == null) throw new IOException ("Failed to parse a header line from "+jiraCsv);
+ JiraTicketSummary jts = new JiraTicketSummary(cells, line, headerKeyIndex);
+ //any errors?
+ if (jts.getErrors().size()!=0) errorTickets.add(jts);
+ else {
+ String groupName = jts.getGroupToBill();
+ if (groupName == null || groupName.length()==0) groupName = "NA";
+ ArrayList al = groupNameTickets.get(groupName);
+ if (al == null) {
+ al = new ArrayList();
+ groupNameTickets.put(groupName, al);
+ }
+ al.add(jts);
+ }
+ if (jts.getJiraTicket()==null) {
+ Misc.printErrAndExit("Missing jt#: ");
+ }
+ }
+ }
+ in.close();
+
+ //any errors?
+ if (errorTickets.size()!=0) {
+ IO.pl("\nErrors with the following Jira tickets, correct and restart:");
+ for (JiraTicketSummary jts: errorTickets) {
+ for (String e: jts.getErrors()) IO.pl("\t"+e);
+ }
+ System.exit(1);
+ }
+
+ //check groups
+ boolean errorsFound = false;
+ for (String groupName: groupNameTickets.keySet()) {
+ if (groupName.equals("NA")) continue;
+ int numFTEFound = 0;
+ int numHourlyFound = 0;
+ int numInfrastructureFound = 0;
+ for (JiraTicketSummary jts: groupNameTickets.get(groupName)) {
+ if (jts.getWorkType().equals("FTE")) numFTEFound++;
+ else if (jts.getWorkType().equals("Hourly")) numHourlyFound++;
+ else if (jts.getWorkType().equals("Infrastructure")) numInfrastructureFound++; //was incorrectly incrementing numHourlyFound, masking the check below
+ }
+ //Infrastructure tickets assigned to a billable group should have been flagged as ticket errors above; sanity check
+ if (numInfrastructureFound !=0) Misc.printErrAndExit("Non zero infrastructure count for "+groupName);
+
+ //both FTE and Hourly?
+ int total = numFTEFound + numHourlyFound;
+ if (numFTEFound != 0 && numFTEFound != total) {
+ errorsFound = true;
+ IO.pl("\tMixed FTE and Hourly tickets were found for "+groupName);
+ for (JiraTicketSummary jts: groupNameTickets.get(groupName)) {
+ IO.pl("\t\t"+jts.getWorkType()+"\t"+jts.toString());
+ }
+ }
+ }
+ if (errorsFound) System.exit(1);
+
+ //any NA?
+ if (groupNameTickets.containsKey("NA")) {
+ IO.pl("\n\tThe following are missing an Account and were labeled Infrastructure, checkem!");
+ for (JiraTicketSummary jts: groupNameTickets.get("NA")) {
+ IO.pl("\t"+jts.toString());
+
+ }
+ }
+
+ //print out the FTE
+ IO.pl("\n\tThe following are FTE and will be excluded from hourly billing, checkem!");
+ ArrayList groupsToRemove = new ArrayList();
+ for (String groupName: groupNameTickets.keySet()) {
+ if (groupName.equals("NA")) {
+ groupsToRemove.add("NA");
+ continue;
+ }
+ boolean fteGroup = false;
+
+ for (JiraTicketSummary jts: groupNameTickets.get(groupName)) {
+ if (jts.getWorkType().equals("FTE")) {
+ IO.pl("\t"+jts.toString());
+ fteGroup = true;
+ }
+ }
+ if (fteGroup) groupsToRemove.add(groupName);
+ }
+
+ //sum the hours for FTE and remove them from the jira ticket map
+ for (String gn: groupsToRemove) {
+ if (gn.equals("NA")== false) {
+ float totalHours = 0;
+ for (JiraTicketSummary jts: groupNameTickets.get(gn)) totalHours+= Float.parseFloat(jts.getHoursString());
+ IO.pl("\t\t"+gn+"\t"+totalHours);
+ }
+ groupNameTickets.remove(gn);
+ }
+ }
+
+
+ private void parseHeader(String[] cells) throws IOException { //builds the column name -> index lookup from the Jira report header row
+ if (debug) IO.pl("\nParsing Jira Report Header:");
+ headerKeyIndex = new LinkedHashMap();
+ for (int i=0; i< cells.length; i++) {
+ if (headerKeyIndex.containsKey(cells[i]) == false) { //first occurrence wins, duplicate column names are skipped
+ headerKeyIndex.put(cells[i], i);
+ if (debug) IO.pl("\t"+cells[i]+"\t"+i);
+ }
+ else if (debug) IO.pl("\t"+cells[i]+"\tDuplicate header key skipping"+i);
+ }
+
+ //check it contains the required cells
+ String[] toFind = {"CBI - Work Type", "Account Name", "Issue Key", "Full name", "Billed Hours", "Issue summary", "Work Description"};
+
+ for (String tf: toFind) { //fail fast if any required column is absent from the export
+ if (headerKeyIndex.containsKey(tf) == false) throw new IOException("Failed to find the '"+tf+"' header key in "+headerKeyIndex);
+ }
+ }
+
+ public TreeMap> getGroupNameTickets() { //group name -> billable tickets; FTE and 'NA' groups are removed during parseIt()
+ return groupNameTickets;
+ }
+
+
+}
diff --git a/Source/edu/utah/billing/MasterAccountInfoParser2.java b/Source/edu/utah/billing/MasterAccountInfoParser2.java
new file mode 100644
index 00000000..9fab3a90
--- /dev/null
+++ b/Source/edu/utah/billing/MasterAccountInfoParser2.java
@@ -0,0 +1,130 @@
+package edu.utah.billing;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import util.gen.IO;
+import util.gen.Misc;
+
+
+public class MasterAccountInfoParser2 {
+
+ private File masterAccountFile = null;
+ private ArrayList billingGroups = new ArrayList();
+ private HashMap aliasesBillingGroups = new HashMap();
+
+ public MasterAccountInfoParser2(File xlsx, boolean debug) throws IOException {
+ masterAccountFile = xlsx;
+
+ parseIt();
+
+ //load all of the aliases
+ for (BillingGroup bg: billingGroups) {
+ for (String a: bg.getAliases()) {
+ if (aliasesBillingGroups.containsKey(a)) throw new IOException("ERROR: "+a+" was found on multiple lines! Fix "+masterAccountFile);
+ aliasesBillingGroups.put(a, bg);
+ }
+ }
+
+ if (debug) {
+ for (BillingGroup bg: billingGroups) {
+ IO.pl("Found:\t"+ bg);
+ }
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ MasterAccountInfoParser2 p = new MasterAccountInfoParser2(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2024/3-4_CBI_Feb-Mar_2024/masterAccountInfo.xlsx"), true);
+ p.saveUpdatedInfo();
+ }
+
+ /**Prints a txt formatted spreadsheet that can be inspected and then used to replace the old master xlsx file.*/
+ public void saveUpdatedInfo(){
+ File updatedMasterAccountFile = null;
+ try {
+ String originalName = Misc.removeExtension(masterAccountFile.getName());
+ updatedMasterAccountFile = new File(masterAccountFile.getParentFile(), originalName+"_Updated.xls");
+ IO.pl("\nSaving updated master account info, review in Excel and use it for the next billing cycle, "+updatedMasterAccountFile.getName());
+ PrintWriter out = new PrintWriter( new FileWriter(updatedMasterAccountFile));
+ //header
+ out.println("Cancer Status [Cancer|Non-Cancer]\tTotalHoursBilled\tGroupAliasName1\tGroupAliasName2\tEtc");
+ //for each group
+ for (BillingGroup bg: billingGroups) out.println(bg);
+ out.close();
+
+ } catch (Exception e){
+ System.err.println("\nProblem saving updated "+masterAccountFile);
+ if (updatedMasterAccountFile != null) updatedMasterAccountFile.delete();
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ private void parseIt() {
+ try {
+
+ //Open up xlsx file
+ Workbook wb = WorkbookFactory.create(masterAccountFile);
+
+ //Find appropriate sheet
+ Sheet sheet = wb.getSheetAt(0);
+ if (sheet == null) throw new IOException("Could not find a sheet in "+masterAccountFile+" ?");
+
+ //Iterate through rows
+ int numRows = sheet.getPhysicalNumberOfRows()+1;
+ for (int r = 0; r< numRows; r++) {
+ Row row = sheet.getRow(r);
+ if (row != null) parseRow(row);
+ }
+ } catch (Exception e) {
+ System.out.println("MasterAccountInfo xlsx parsing failed, exiting");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ private void parseRow(Row row) { //parses one master sheet row into a BillingGroup: cancer status, total hours billed, then one or more group aliases
+ int numCells = row.getLastCellNum()+1; //NOTE(review): getLastCellNum() already returns last index + 1, so this overshoots by one; harmless since getCell() is null checked
+ if (numCells == 1) return;
+
+
+ //column 0 is Cancer Status [Cancer|Non-Cancer]
+ String cancerStatus = row.getCell(0).toString();
+ if (cancerStatus.startsWith("Cancer Status")) return; //skip header
+ boolean isCancerMember = true;
+ if (cancerStatus.toLowerCase().contains("non")) isCancerMember = false; //anything containing 'non' is treated as Non-Cancer
+
+ //column 1 is Total Hours Billed, for <=70 hrs its $70/hr for more its $100
+ Float totalHours = Float.parseFloat(row.getCell(1).toString());
+
+ LinkedHashSet aliases = new LinkedHashSet();
+ for (int c=2;c < numCells; c++) { //columns 2+ are group alias names, blanks skipped, insertion order preserved
+ Cell cell = row.getCell(c);
+ if (cell != null) {
+ String value = cell.toString().trim();
+ if (value.length()!=0) aliases.add(value);
+ }
+ }
+
+ billingGroups.add( new BillingGroup(isCancerMember, totalHours, aliases) );
+ }
+
+ public ArrayList getBillingGroups() {
+ return billingGroups;
+ }
+
+ public HashMap getAliasesBillingGroups() {
+ return aliasesBillingGroups;
+ }
+
+
+}
diff --git a/Source/edu/utah/billing/MiscExpense.java b/Source/edu/utah/billing/MiscExpense.java
new file mode 100644
index 00000000..e1b170cd
--- /dev/null
+++ b/Source/edu/utah/billing/MiscExpense.java
@@ -0,0 +1,28 @@
+package edu.utah.billing;
+
+import java.util.ArrayList;
+
+public class MiscExpense { //simple immutable-in-practice container for a one-off computing expense charged to a billing group
+ private double cost = 0; //expense amount; presumably USD - TODO confirm
+ private String description = null; //free text description from the expense spreadsheet
+
+ public MiscExpense (double cost, String description){
+ this.cost = cost;
+ this.description = description;
+ }
+
+ public static float fetchTotalExpense(ArrayList accounts) { //sums the cost of every expense in the list
+ float total = 0f;
+ for (MiscExpense aae: accounts) total+= aae.getCost(); //NOTE(review): compound assignment narrows double -> float, minor precision loss
+ return total;
+ }
+
+ public double getCost() {
+ return cost;
+ }
+
+ public String getDescription() {
+ return description;
+ }
+
+}
\ No newline at end of file
diff --git a/Source/edu/utah/billing/MiscExpenseXlsxParser.java b/Source/edu/utah/billing/MiscExpenseXlsxParser.java
new file mode 100644
index 00000000..bfc743b6
--- /dev/null
+++ b/Source/edu/utah/billing/MiscExpenseXlsxParser.java
@@ -0,0 +1,90 @@
+package edu.utah.billing;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.TreeMap;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import util.gen.IO;
+
+public class MiscExpenseXlsxParser {
+
+ private TreeMap> groupNameExpense = new TreeMap>();
+ private File inputFile = null;
+
+ public MiscExpenseXlsxParser(File xlsx, boolean debug) {
+ inputFile = xlsx;
+ parseIt();
+
+ if (debug) {
+ for (String s: groupNameExpense.keySet()) {
+ ArrayList al = groupNameExpense.get(s);
+ for (MiscExpense e: al) {
+ IO.pl(s+"\t"+e.getCost()+"\t"+e.getDescription());
+ }
+ }
+ }
+ }
+
+ public static void main(String[] args) {
+ new MiscExpenseXlsxParser(
+ new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2024/3-4_CBI_Feb-Mar_2024/OneTimeExpenses/miscComputingExpensesFeb2024.xlsx"),
+ true);
+
+ }
+
+ private void parseIt() {
+ try {
+
+ //Open up xlsx file
+ Workbook wb = WorkbookFactory.create(inputFile);
+
+ //Find appropriate sheet
+ Sheet sheet = wb.getSheetAt(0);
+ if (sheet == null) throw new IOException("Could not find a sheet in "+inputFile+" ?");
+
+ //Iterate through rows
+ int numRows = sheet.getPhysicalNumberOfRows()+1;
+ for (int r = 0; r< numRows; r++) {
+ Row row = sheet.getRow(r);
+ if (row != null) parseExpense(r, row);
+ }
+ } catch (Exception e) {
+ System.out.println("The expense xlsx file is not in the correct format, exiting");
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ private void parseExpense(int rowNumber, Row row) throws IOException {
+ int numCells = row.getPhysicalNumberOfCells();
+
+ //skip blanks
+ if (numCells == 0) return;
+ //error if not three
+ if (numCells != 3) throw new IOException("FAILED to find 3 cells in row "+rowNumber+" in "+inputFile);
+
+ Cell groupNameCell = row.getCell(0);
+ String groupName = groupNameCell.toString().trim();
+ //skip header line
+ if (groupName.startsWith("INVESTIGATOR") || groupName.length()==0) return;
+
+ double expense = row.getCell(1).getNumericCellValue();
+ String description = row.getCell(2).toString().trim();
+ ArrayList al = groupNameExpense.get(groupName);
+ if (al == null) {
+ al = new ArrayList();
+ groupNameExpense.put(groupName, al);
+ }
+ al.add(new MiscExpense(expense, description));
+
+ }
+
+ public TreeMap> getGroupNameExpense() {
+ return groupNameExpense;
+ }
+}
diff --git a/Source/edu/utah/billing/TDSynnexXlsxParser.java b/Source/edu/utah/billing/TDSynnexXlsxParser.java
new file mode 100644
index 00000000..004b2374
--- /dev/null
+++ b/Source/edu/utah/billing/TDSynnexXlsxParser.java
@@ -0,0 +1,144 @@
+package edu.utah.billing;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.TreeMap;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import edu.utah.hci.bioinfo.smm.Util;
+import util.gen.IO;
+import util.gen.Misc;
+
+public class TDSynnexXlsxParser {
+
+ private TreeMap> accountExpenses = new TreeMap>();
+ private TreeMap awsAccountNumberTotals = new TreeMap();
+
+ //column names and their found indexes
+ private String accountName = "Account";
+ private int accountIndex = -1;
+ private String expenseName = "`"; //really weird!
+ private int expenseIndex = -1;
+ private int numParsedLines = 0;
+
+ public TDSynnexXlsxParser(File dir, boolean debug) throws IOException {
+
+ //pull all of the xlsx files, there will be several months worth
+ File[] xlsxFiles = IO.extractFiles(dir, ".xlsx");
+ if (xlsxFiles == null || xlsxFiles.length ==0) throw new IOException("ERROR: failed to find xlsx TDSynnex files in "+dir);
+
+ //parse each
+ for (File xlsx: xlsxFiles) parseIt(xlsx);
+
+ //sum all of the expenses for each account
+ float awsTotal = 0.0f;
+ for (String s: accountExpenses.keySet()) {
+ float total = 0;
+ for (Float f: accountExpenses.get(s)) total += f;
+ //trim name
+ if (total >= 0.01f) awsAccountNumberTotals.put(s, total);
+ awsTotal += total;
+ }
+
+ //print results
+ if (debug) {
+ IO.pl("AWS account and total expenses:");
+ for (String s: awsAccountNumberTotals.keySet()) {
+ IO.pl(s+"\t"+awsAccountNumberTotals.get(s));
+ }
+ IO.pl("\nTotal AWS: "+awsTotal);
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ TDSynnexXlsxParser p = new TDSynnexXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2024/3-4_CBI_Feb-Mar_2024/TDSynnex/"), true);
+
+ }
+
+ private void parseIt(File inputFile) {
+ //watchout for working and partial saved excel files
+ if (inputFile.getName().startsWith("~$")) return;
+
+ //reset
+ accountIndex = -1;
+ expenseIndex = -1;
+ numParsedLines = 0;
+
+
+ try {
+ //Open up xlsx file
+ Workbook wb = WorkbookFactory.create(inputFile);
+
+ //Find appropriate sheet
+ Sheet sheet = wb.getSheet("Detail");
+ if (sheet == null) throw new IOException("Could not find the 'Detail' sheet in "+inputFile+" ?");
+
+ //Iterate through rows
+ int numRows = sheet.getPhysicalNumberOfRows();
+ for (int r = 0; r< numRows; r++) {
+ Row row = sheet.getRow(r);
+ if (row != null) parseRow(row);
+ }
+
+ } catch (Exception e) {
+ Util.el("\nERROR parsing "+inputFile);
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ //check that data lines were parsed
+ if (numParsedLines == 0) Misc.printErrAndExit("\nFailed to parse any data lines from "+inputFile);
+
+
+ }
+
+ private void parseRow(Row row) { //identifies the header row (sets account/expense column indexes) or parses a data row into accountExpenses
+ //convert to String[]; use getLastCellNum() (returns last cell index + 1) so cells after blank gaps are not missed, unlike getPhysicalNumberOfCells()
+ int numCells = row.getLastCellNum();
+ if (numCells < 1) return; //row exists but contains no cells; previously this fell through and could NPE on cellStrings[0]
+ String[] cellStrings = new String[numCells];
+ for (int c=0;c < numCells; c++) {
+ Cell cell = row.getCell(c);
+ if (cell != null) cellStrings[c] = cell.toString().trim();
+ }
+
+ //header row; NOTE(review): assumes the first parsed row contains 'Billing', otherwise the check below exits - confirm against the TDSynnex export
+ if (accountIndex == -1) {
+ if (cellStrings[0].contains("Billing")) {
+ for (int i=1; i< numCells; i++) {
+ if (cellStrings[i]!= null) {
+ if (cellStrings[i].equals(accountName)) accountIndex = i;
+ else if (cellStrings[i].equals(expenseName)) expenseIndex = i;
+ }
+ }
+ }
+ //check that both were found
+ if (accountIndex == -1 || expenseIndex == -1) Misc.printErrAndExit("ERROR: failed to find the account or expense indexes in the TDSynnex xlsx sheet.");
+ }
+
+ // data line
+ else {
+ numParsedLines++;
+ //watchout for E formatting in excel sheet for the account #
+ Double lAccName = Double.parseDouble(cellStrings[accountIndex]);
+ String corrAccountName = BigDecimal.valueOf(lAccName).toPlainString();
+ ArrayList expenses = accountExpenses.get(corrAccountName);
+ if (expenses == null) {
+ expenses = new ArrayList();
+ accountExpenses.put(corrAccountName, expenses);
+ }
+ Float cost = Float.parseFloat(cellStrings[expenseIndex]);
+ if (cost > 0.0f) expenses.add(cost);
+ }
+ }
+
+ public TreeMap getAwsAccountNumberTotals() {
+ return awsAccountNumberTotals;
+ }
+
+}
diff --git a/Source/edu/utah/billing/WafXlsxParser.java b/Source/edu/utah/billing/WafXlsxParser.java
index 7b53bc11..fce5f34e 100644
--- a/Source/edu/utah/billing/WafXlsxParser.java
+++ b/Source/edu/utah/billing/WafXlsxParser.java
@@ -25,11 +25,12 @@ public class WafXlsxParser {
private boolean debug = false;
- public WafXlsxParser(File xlsx, boolean debug) {
+ public WafXlsxParser(File xlsx, boolean debug, HashMap> aliases) {
this.debug = debug;
- parseIt(xlsx);
+ parseIt(xlsx, aliases);
headerTabbed = Misc.stringArrayListToString(header, "\t");
+
if (debug) {
IO.pl("Header:\t"+ header);
@@ -47,13 +48,13 @@ public WafXlsxParser(File xlsx, boolean debug) {
public static void main(String[] args) {
//hourly
- new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics WAF Tracking Schedule - FY23.xlsx"), true);
+ //new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics WAF Tracking Schedule - FY23.xlsx"), true);
//cloud
- new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics SB Cloud WAF Tracking Schedule - FY23.xlsx"), true);
+ //new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics SB Cloud WAF Tracking Schedule - FY23.xlsx"), true);
}
- private void parseIt(File inputFile) {
+ private void parseIt(File inputFile, HashMap> aliases) {
try {
//Open up xlsx file
@@ -63,6 +64,8 @@ private void parseIt(File inputFile) {
Sheet sheet = wb.getSheet("Account Table");
if (sheet == null) throw new IOException("Could not find sheet 'Account Table' in "+inputFile+" ?");
+ ArrayList missingAnAlias = new ArrayList();
+
//Iterate through rows
int numRows = sheet.getPhysicalNumberOfRows();
for (int r = 0; r< numRows; r++) {
@@ -110,13 +113,24 @@ private void parseIt(File inputFile) {
ArrayList al = groupNameWafLines.get(cellsToSave[0]);
if (al == null) {
al = new ArrayList();
- groupNameWafLines.put(cellsToSave[0], al);
+
+ //fetch aliases
+ HashSet als = aliases.get(cellsToSave[0]);
+ if (als == null) missingAnAlias.add(cellsToSave[0]);
+ else {
+ for (String a: als) groupNameWafLines.put(a, al);
+ }
}
al.add(cellsToSave);
}
-
}
- }
+ }
+ //any missing aliases?
+ if (missingAnAlias.size()!=0) {
+ for (String a: missingAnAlias)IO.el("\tMissing entry in masterAccountInfo for "+a+" from "+inputFile.getName());
+ IO.el("\t\tCorrect and restart.\n");
+ System.exit(1);
+ }
} catch (Exception e) {
System.out.println("Xlsx file is not in the correct format, exiting -> "+inputFile);
e.printStackTrace();
diff --git a/Source/edu/utah/billing/WafXlsxParser2.java b/Source/edu/utah/billing/WafXlsxParser2.java
new file mode 100644
index 00000000..b68bcee9
--- /dev/null
+++ b/Source/edu/utah/billing/WafXlsxParser2.java
@@ -0,0 +1,140 @@
+package edu.utah.billing;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+
+import util.gen.IO;
+import util.gen.Misc;
+
+public class WafXlsxParser2 {
+ private HashMap headerKeyIndex = new HashMap();
+ private ArrayList header = new ArrayList();
+ private String headerTabbed = null;
+ private TreeMap> groupNameWafLines = new TreeMap>();
+ private boolean debug = false;
+
+ public WafXlsxParser2(File xlsx, boolean debug) {
+ this.debug = debug;
+
+ parseIt(xlsx);
+ headerTabbed = Misc.stringArrayListToString(header, "\t");
+
+
+ if (debug) {
+ IO.pl("Header:\t"+ header);
+ IO.pl("\nHeaderIndexes:\t"+ headerKeyIndex);
+ IO.pl("\nPIs: "+groupNameWafLines.size());
+ for (String pi: groupNameWafLines.keySet()) {
+ IO.pl(pi);
+ for (String[] v: groupNameWafLines.get(pi)) {
+ IO.pl("\t"+ Misc.stringArrayToString(v, "\t"));
+ }
+ }
+ }
+
+ }
+
+ public static void main(String[] args) {
+ //hourly
+ //new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics WAF Tracking Schedule - FY23.xlsx"), true);
+
+ //cloud
+ //new WafXlsxParser(new File ("/Users/u0028003/HCI/CoreAdmin/Billing/AllBillingReports/2023/6_BSR_June_2023/WAF/Bioinformatics SB Cloud WAF Tracking Schedule - FY23.xlsx"), true);
+ }
+
+ private void parseIt(File inputFile) {
+ try {
+
+ //Open up xlsx file
+ Workbook wb = WorkbookFactory.create(inputFile);
+
+ //Find appropriate sheet
+ Sheet sheet = wb.getSheet("Account Table");
+ if (sheet == null) throw new IOException("Could not find sheet 'Account Table' in "+inputFile+" ?");
+
+ ArrayList missingAnAlias = new ArrayList();
+
+ //Iterate through rows
+ int numRows = sheet.getPhysicalNumberOfRows();
+ for (int r = 0; r< numRows; r++) {
+ Row row = sheet.getRow(r);
+ if (row == null) {
+ //if (debug) IO.pl();
+ }
+ else {
+ int numCells = row.getLastCellNum()+1;
+ boolean inHeader = false;
+ String[] cellsToSave = new String[numCells+1];
+ for (int c=0;c < numCells; c++) {
+ //Get cell
+ Cell cell = sheet.getRow(r).getCell(c);
+ if (cell != null) {
+ //trim it and replace any returns
+ String cellStringValue = cell.toString().trim();
+ cellStringValue = Misc.RETURN.matcher(cellStringValue).replaceAll("");
+
+ //if (debug) IO.p(cellStringValue+"\t");
+ if (cellStringValue.contains("INVESTIGATOR") || inHeader == true) {
+ if (cellStringValue.contains("NOTES")) {
+ inHeader = false;
+ c = numCells;
+ }
+ else inHeader = true;
+ //if (debug) IO.pl("Header found! Adding "+cellStringValue+" to "+c);
+ headerKeyIndex.put(cellStringValue, c);
+ header.add(cellStringValue);
+ }
+ if (cellStringValue.contains("EXPIRED")) {
+ //if (debug) IO.pl("No more active WAFS");
+ return;
+ }
+ cellsToSave[c] = cellStringValue;
+ }
+ else {
+ //if (debug) IO.p("\t");
+ if (inHeader) header.add(" ");
+ }
+ }
+
+ //save?
+ if (cellsToSave[0]!=null && cellsToSave[0].length()!=0 && cellsToSave[0].equals("INVESTIGATOR")==false) {
+ ArrayList al = groupNameWafLines.get(cellsToSave[0]);
+ if (al == null) {
+ al = new ArrayList();
+ groupNameWafLines.put(cellsToSave[0], al);
+ }
+ al.add(cellsToSave);
+ }
+ }
+ }
+
+ } catch (Exception e) {
+ System.out.println("Xlsx file is not in the correct format, exiting -> "+inputFile);
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public TreeMap> getGroupNameWafLines() {
+ return groupNameWafLines;
+ }
+
+ public ArrayList getHeader() {
+ return header;
+ }
+
+ public String getHeaderTabbed() {
+ return headerTabbed;
+ }
+}
diff --git a/Source/edu/utah/seq/analysis/ScanSeqs.java b/Source/edu/utah/seq/analysis/ScanSeqs.java
index d979c58c..e0135053 100644
--- a/Source/edu/utah/seq/analysis/ScanSeqs.java
+++ b/Source/edu/utah/seq/analysis/ScanSeqs.java
@@ -1344,24 +1344,26 @@ public void processArgs(String[] args){
if (saveDirectory == null) Misc.printExit("\nError: enter a directory text to save results.\n");
if (saveDirectory.exists() == false) saveDirectory.mkdir();
- //check for R and required libraries
- if (fullPathToR == null || fullPathToR.canExecute()== false) {
- Misc.printExit("\nError: Cannot find or execute the R application -> "+fullPathToR+"\n");
- }
- else {
- /*String errors = IO.runRCommandLookForError("library(DESeq)", fullPathToR, saveDirectory);
+ //check for R and required libraries?
+ if (controlPointDirs != null) {
+ if (fullPathToR == null || fullPathToR.canExecute()== false) {
+ Misc.printExit("\nError: Cannot find or execute the R application -> "+fullPathToR+"\n");
+ }
+ else {
+ /*String errors = IO.runRCommandLookForError("library(DESeq)", fullPathToR, saveDirectory);
if (errors == null || errors.length() !=0){
Misc.printExit("\nError: Cannot find the required R library. Did you install DESeq " +
"(http://www-huber.embl.de/users/anders/DESeq/)? See the author's websites for installation instructions. Once installed, " +
"launch an R terminal and type 'library(DESeq)' to see if it is present. R error message:\n\t\t"+errors+"\n\n");
}*/
- String errors = IO.runRCommandLookForError("library(qvalue)", fullPathToR, saveDirectory);
- if (errors == null || errors.length() !=0){
- Misc.printExit("\nError: Cannot find the required R library. Did you install qvalue " +
- "(http://genomics.princeton.edu/storeylab/qvalue/)? See the author's websites for installation instructions. Once installed, " +
- "launch an R terminal and type 'library(qvalue)' to see if it is present. R error message:\n\t\t"+errors+"\n\n");
- }
- }
+ String errors = IO.runRCommandLookForError("library(qvalue)", fullPathToR, saveDirectory);
+ if (errors == null || errors.length() !=0){
+ Misc.printExit("\nError: Cannot find the required R library. Did you install qvalue " +
+ "(http://genomics.princeton.edu/storeylab/qvalue/)? See the author's websites for installation instructions. Once installed, " +
+ "launch an R terminal and type 'library(qvalue)' to see if it is present. R error message:\n\t\t"+errors+"\n\n");
+ }
+ }
+ }
//set score items
setScoreStrings();
@@ -1379,7 +1381,7 @@ public static float pseudoRound(float d){
public static void printDocs(){
System.out.println("\n" +
"**************************************************************************************\n" +
- "** Scan Seqs: July 2015 **\n" +
+ "** Scan Seqs: Jan 2024 **\n" +
"**************************************************************************************\n" +
"Takes unshifted stranded chromosome specific PointData and uses a sliding window to\n" +
"calculate several smoothed window statistics. These include a binomial p-value, a\n" +
diff --git a/Source/edu/utah/seq/qc/AggregateQCStats2.java b/Source/edu/utah/seq/qc/AggregateQCStats2.java
index d91293bb..ae567813 100644
--- a/Source/edu/utah/seq/qc/AggregateQCStats2.java
+++ b/Source/edu/utah/seq/qc/AggregateQCStats2.java
@@ -18,8 +18,9 @@ public class AggregateQCStats2 {
private File saveDirectory;
private File jobDirectory;
private String prependString = "";
+ private boolean removeNAColumns = true;
- private String alignLogMatch = ".+AlignHg38.log";
+ private String alignLogMatch = ".+Align.log";
private String dupLogMatch = ".+Markdup.log";
private String readCovJsonMatch = ".+UniObRC.json.gz";
private String scJsonMatch = ".+SampleConcordance.json.gz";
@@ -274,7 +275,7 @@ public void processArgs(String[] args){
public static void printDocs(){
System.out.println("\n" +
"**************************************************************************************\n" +
- "** Aggregate QC Stats2: April 2022 **\n" +
+ "** Aggregate QC Stats2: March 2024 **\n" +
"**************************************************************************************\n" +
"Parses and aggregates alignment quality statistics from log and json files produced by\n"+
"the TNRunner2 DnaAlignQC and SampleConcordance workflows.\n"+
@@ -284,7 +285,7 @@ public static void printDocs(){
"-s Directory for saving the AggQC results.\n"+
"\nOptions:\n"+
- "-a Alignment log file match, defaults to '.+_AlignHg38.log'\n"+
+ "-a Alignment log file match, defaults to '.+_Align.log'\n"+
"-d Mark duplicates log file match, defaults to '.+_Markdup.log'\n"+
"-r Read coverage json file match, defaults to '.+_UniObRC.json.gz'\n"+
"-c Sample concordance json file match, defaults to '.+_SampleConcordance.json.gz'\n"+
diff --git a/Source/edu/utah/seq/qc/DnaSample.java b/Source/edu/utah/seq/qc/DnaSample.java
index ce9a08c3..b5922f5b 100644
--- a/Source/edu/utah/seq/qc/DnaSample.java
+++ b/Source/edu/utah/seq/qc/DnaSample.java
@@ -33,7 +33,9 @@ public DnaSample(File alignLog, File dupLog, File readCovJson) throws Exception
parseReadCoverageJson(readCovJson);
}
+ //looking for info from the uniObRC json stats file
private void parseReadCoverageJson(File readCovJson) throws Exception {
+
if (readCovJson == null) return;
JSONObject jo = new JSONObject(IO.loadFile(readCovJson, " ", true));
meanOnTargetCoverage = Double.parseDouble(jo.getString("meanCoverage").replaceAll(",", ""));
@@ -46,6 +48,7 @@ private void parseReadCoverageJson(File readCovJson) throws Exception {
readCovParsed = true;
}
+ //looking for info from samtools markdups
private void parseDupLog(File dupLog) throws Exception {
if (dupLog == null) return;
String line;
@@ -72,6 +75,7 @@ else if (line.startsWith("DUPLICATE TOTAL:")) {
}
}
+ //looking for info from cut adapt
private void parseAlignLog(File alignLog) throws Exception {
if (alignLog == null) return;
String line;
diff --git a/Source/edu/utah/seq/run/TNRunner2.java b/Source/edu/utah/seq/run/TNRunner2.java
index ba02937f..a56c3f6b 100644
--- a/Source/edu/utah/seq/run/TNRunner2.java
+++ b/Source/edu/utah/seq/run/TNRunner2.java
@@ -735,6 +735,7 @@ public void processArgs(String[] args){
//root patient dirs? Looks for Fastq dirs then pulls their parent folder.
if (sampleDir == null || sampleDir.exists() == false) Misc.printErrAndExit("Error: failed to find your starting data directory? "+sampleDir);
+
ArrayList fastqDirs = IO.fetchDirectoriesRecursively(sampleDir, "Fastq");
rootDirs = IO.fetchParentDirectories(fastqDirs);
rootDirs = removeThoseWithComplete(rootDirs);
diff --git a/Source/edu/utah/seq/run/avproj/TumorSample.java b/Source/edu/utah/seq/run/avproj/TumorSample.java
index 0df67578..2c42d6b4 100644
--- a/Source/edu/utah/seq/run/avproj/TumorSample.java
+++ b/Source/edu/utah/seq/run/avproj/TumorSample.java
@@ -34,7 +34,9 @@ else if (tumorDnaName != null) {
}
else {
for (String name : nameFastq.keySet()) {
- if (name.startsWith(tumorRnaName)) tumorRnaFastqCram.add(nameFastq.get(name));
+ if (name.startsWith(tumorRnaName)) {
+ if (nameFastq.get(name)!=null) tumorRnaFastqCram.add(nameFastq.get(name));
+ }
}
}
diff --git a/Source/edu/utah/seq/run/avproj/adw/AvatarAnalysisJob.java b/Source/edu/utah/seq/run/avproj/adw/AvatarAnalysisJob.java
index bd87be26..90dc2bf9 100644
--- a/Source/edu/utah/seq/run/avproj/adw/AvatarAnalysisJob.java
+++ b/Source/edu/utah/seq/run/avproj/adw/AvatarAnalysisJob.java
@@ -4,37 +4,39 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import org.json.JSONArray;
import org.json.JSONObject;
+import edu.utah.hci.misc.Util;
import util.gen.IO;
import util.gen.Misc;
/**Represents one patient's tumor and normal datasets for TNRunner2 processing. Some patients will have several of these due to multiple tumor samples and/or multiple platforms.
* The tumor exome and tumor RNA are merged into the same Tumor Sample*/
public class AvatarAnalysisJob {
-
+
//fields
private TumorSampleADW tumorSample = null;
private ArrayList normalSamples = new ArrayList();
private boolean matchedPlatform = true;
private String submittingGroup = null;
private PatientADW patient = null;
-
+
public AvatarAnalysisJob(PatientADW patient, TumorSampleADW tumorSample, ArrayList normalSamples, boolean matchedPlatform) {
this.patient = patient;
this.tumorSample = tumorSample;
this.normalSamples = normalSamples;
this.matchedPlatform = matchedPlatform;
}
-
+
/**Returns patientId_normalId1-normalId2_tumorExomeId_tumorRnaId; most patients will have just one normal*/
public String getComparisonKey(String patientId) {
StringBuilder sb = new StringBuilder();
//patientId
sb.append(patientId);
-
+
//normalIds
if (normalSamples.size()==0) sb.append("_NA");
else {
@@ -45,17 +47,17 @@ public String getComparisonKey(String patientId) {
sb.append(normalSamples.get(i).getNormalDnaName());
}
}
-
+
//tumorExomeId
sb.append("_");
if (tumorSample.getTumorDnaName() == null) sb.append("NA");
else sb.append(tumorSample.getTumorDnaName());
-
+
//tumorRnaId
sb.append("_");
if (tumorSample.getTumorRnaName() == null) sb.append("NA");
else sb.append(tumorSample.getTumorRnaName());
-
+
return sb.toString();
}
@@ -71,22 +73,85 @@ public boolean isMatchedPlatform() {
return matchedPlatform;
}
- public void makeAnalysisJob(String nameAJ, File testDir, ArrayList dxCmds, ClinicalMolLinkage linkage) throws IOException {
+ public void makeAnalysisJobAster(File fastqDownloadRoot, String nameAJ, File testDir, ArrayList cmds, ClinicalMolLinkage linkage, HashSet dirPathsToDownload) throws IOException {
+ String downloadRoot = fastqDownloadRoot.getCanonicalPath();
+
//Fastq
File fastq = new File (testDir, "Fastq");
fastq.mkdir();
//TumorDNA
if (tumorSample.getTumorDnaName() != null) {
+ tumorSample.getTumorDnaFastqFiles();
File tumorDNA = new File (fastq, "TumorDNA");
tumorDNA.mkdir();
- String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/Whole_Exome/alignment_crams/"+tumorSample.getTumorWesCramFileNameToFetch()+" -o "+tumorDNA.getCanonicalPath()+"/";
+ String tumorFastqDir = tumorDNA.getCanonicalPath()+"/";
+
+ //make link commands, downloads with Aster are a major issue
+ for (String[] s: tumorSample.getTumorWesFastqPathsToFetch()) {
+ String path = Util.stringArrayToString(s, "/");
+ String ln = "ln -s "+ downloadRoot+ path+ " "+ tumorFastqDir;
+ cmds.add(ln);
+ }
+ //update the dirPathsToDownload
+ dirPathsToDownload.add(TumorSampleADW.fetchFastqPathDir(tumorSample.getTumorWesFastqPathsToFetch()));
+ }
+ //TumorRNA
+ if (tumorSample.getTumorRnaName()!= null) {
+ File tumorRNA = new File (fastq, "TumorRNA");
+ tumorRNA.mkdir();
+ String tumorFastqDir = tumorRNA.getCanonicalPath()+"/";
+
+ //make link commands, downloads with Aster are a major issue
+ for (String[] s: tumorSample.getTumorRnaFastqPathsToFetch()) {
+ String path = Util.stringArrayToString(s, "/");
+ String ln = "ln -s "+ downloadRoot+ path+ " "+ tumorFastqDir;
+ cmds.add(ln);
+ }
+
+ //update the dirPathsToDownload
+ dirPathsToDownload.add(TumorSampleADW.fetchFastqPathDir(tumorSample.getTumorRnaFastqPathsToFetch()));
+
+ }
+ //NormalDNAs
+ if (normalSamples.size()!=0) {
+ File normalDNA = new File (fastq, "NormalDNA");
+ normalDNA.mkdir();
+ String normalFastqDir = normalDNA.getCanonicalPath()+"/";
+ for (NormalSampleADW ns: normalSamples) {
+
+ //make link commands, downloads with Aster are a major issue
+ for (String[] s: ns.getNormalWesFastqPathsToFetch()) {
+ String path = Util.stringArrayToString(s, "/");
+ String ln = "ln -s "+ downloadRoot+ path+ " "+ normalFastqDir;
+ cmds.add(ln);
+ }
+
+ //update the dirPathsToDownload
+ dirPathsToDownload.add(TumorSampleADW.fetchFastqPathDir(ns.getNormalWesFastqPathsToFetch()));
+ }
+ }
+ //ClinicalReport
+ File clinicalReport = new File (testDir, "ClinicalReport");
+ clinicalReport.mkdir();
+ writeJson(nameAJ, clinicalReport, linkage);
+ }
+
+ public void makeAnalysisJobDnaNexus(String nameAJ, File testDir, ArrayList dxCmds, ClinicalMolLinkage linkage) throws IOException {
+ //Fastq
+ File fastq = new File (testDir, "Fastq");
+ fastq.mkdir();
+ //TumorDNA
+ if (tumorSample.getTumorDnaName() != null) {
+ File tumorDNA = new File (fastq, "TumorDNA");
+ tumorDNA.mkdir();
+ String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/Whole_Exome/alignment_crams/"+tumorSample.getTumorWesFastqPathsToFetch()+" -o "+tumorDNA.getCanonicalPath()+"/";
dxCmds.add(dx);
}
//TumorRNA
if (tumorSample.getTumorRnaName()!= null) {
File tumorRNA = new File (fastq, "TumorRNA");
tumorRNA.mkdir();
- String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/RNAseq/alignment_crams/"+tumorSample.getTumorRnaCramFileNameToFetch()+" -o "+tumorRNA.getCanonicalPath()+"/";
+ String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/RNAseq/alignment_crams/"+tumorSample.getTumorRnaFastqPathsToFetch()+" -o "+tumorRNA.getCanonicalPath()+"/";
dxCmds.add(dx);
}
//NormalDNAs
@@ -94,10 +159,10 @@ public void makeAnalysisJob(String nameAJ, File testDir, ArrayList dxCmd
File normalDNA = new File (fastq, "NormalDNA");
normalDNA.mkdir();
for (NormalSampleADW ns: normalSamples) {
- String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/Whole_Exome/alignment_crams/"+ns.getNormalWesCramFileNameToFetch()+" -o "+ normalDNA.getCanonicalPath()+"/";
+ String dx = "dx download -f --no-progress HCI_ORIEN_AVATAR_MOLECULAR_DATA:/Whole_Exome/alignment_crams/"+ns.getNormalWesFastqPathsToFetch()+" -o "+ normalDNA.getCanonicalPath()+"/";
dxCmds.add(dx);
}
-
+
}
//ClinicalReport
File clinicalReport = new File (testDir, "ClinicalReport");
@@ -115,7 +180,7 @@ else if (normalSamples.size()!=0) {
NormalSampleADW ns = normalSamples.get(0);
platform = ns.getPlatformName();
}
-
+
//group from tumor DNA sample then try from tumor RNA sample
String group = "NA";
if (tumorSample.getTumorDnaName() != null) {
@@ -126,14 +191,14 @@ else if (tumorSample.getTumorRnaName() != null) {
group = linkage.getKeyDiseaseType().get(patient.getPatientId()+"_"+tumorSample.getTumorRnaName());
if (group == null) throw new IOException("\nFailed to find a disease type for the tumor sample "+tumorSample.getTumorDnaName());
}
-
+
//testId_platform_groupId_gender.json
File json = new File (clinicalReport, nameAJ+"_"+platform+"_"+group+"_"+patient.getSubjectMatchMaker().getGender()+".json");
if (platform == null) {
Misc.printErrAndExit("Null plat "+this.getComparisonKey(patient.getPatientId()));
}
-
+
//non matched?
boolean mixed = false;
if (platform.equals("Mixed")) {
@@ -170,9 +235,9 @@ else if (tumorSample.getTumorRnaName() != null) {
ja.put(nj);
}
main.put("NormalDNA", ja);
-
+
}
-
+
IO.writeString(main.toString(3), json);
}
diff --git a/Source/edu/utah/seq/run/avproj/adw/AvatarDataWrangler.java b/Source/edu/utah/seq/run/avproj/adw/AvatarDataWrangler.java
index e4eaa3cc..d8115284 100644
--- a/Source/edu/utah/seq/run/avproj/adw/AvatarDataWrangler.java
+++ b/Source/edu/utah/seq/run/avproj/adw/AvatarDataWrangler.java
@@ -19,30 +19,33 @@ public class AvatarDataWrangler {
//user fields
private File resourceDirectory = null;
private File jobDir = null;
- private File dxCmdLineFile = null;
+ private File cmdLineFile = null;
+ private File asterDownloadDir = null;
//internal
private File patientPhiFile = null;
private File awsRepoListFile = null;
private File linkageFile = null;
private File qcMetricsFile = null;
- private File wesCramsFile = null;
- private File rnaCramsFile = null;
+ private File wesFastqFile = null;
+ private File rnaFastqFile = null;
private ClinicalMolLinkage linkage = null;
private WesQCMetrics wes = null;
private PersonIDConverter patientPhi = null;
- private HashMap slIdWesCramName = new HashMap();
- private HashMap slIdRnaCramName = new HashMap();
+ private HashMap> slIdWesFastqPaths = new HashMap>();
+ private HashMap> slIdRnaFastqPaths = new HashMap>();
private HashMap patients = new HashMap();
private int numPatientsWithMissingData;
private ArrayList patientsWithJobsToRun = new ArrayList();
+ //Aster downloads, a moving target at present
+ private String asterExecDir = "/uufs/chpc.utah.edu/common/HIPAA/u0028003/BioApps/Aster";
+ private String asterProjectId = "project-F66v00Q0q4045Q4Y6PY2Xv7F";
+
//subject match maker
private File smmDirectory = null;
private File smmRegistryDirectory = null;
-
-
public AvatarDataWrangler (String[] args){
long startTime = System.currentTimeMillis();
try {
@@ -52,7 +55,7 @@ public AvatarDataWrangler (String[] args){
loadM2GenFiles();
- loadAvailableCramFiles();
+ loadAvailableFastqFiles();
buildPatients();
@@ -61,17 +64,11 @@ public AvatarDataWrangler (String[] args){
compareJobsToAlreadyProcessed();
if (patientsWithJobsToRun.size()!=0) {
-
//fetch the PHI
if (loadOrRequestPhi()) {
-
pullMolDataPatientIds();
-
buildAnalysisJobDirs();
}
-
-
-
}
else IO.pl("\nNo new jobs to run, exiting.");
@@ -90,7 +87,8 @@ public AvatarDataWrangler (String[] args){
private void buildAnalysisJobDirs() throws IOException {
IO.pl("\nBuilding Job Directories...");
- ArrayList dxCmds = new ArrayList();
+ ArrayList cliCmds = new ArrayList();
+ HashSet dirPathsToDownload = new HashSet();
//for each analysis job
for (PatientADW p : patientsWithJobsToRun) {
@@ -102,16 +100,34 @@ private void buildAnalysisJobDirs() throws IOException {
File test = new File (jd, ajName);
test.mkdir();
//File testDir, ArrayList dxCmds, HashMap keyDiseaseType
- aj.makeAnalysisJob(ajName, test, dxCmds, linkage);
+ aj.makeAnalysisJobAster(asterDownloadDir, ajName, test, cliCmds, linkage, dirPathsToDownload);
}
- //write out the dxCmds
- if (IO.writeArrayList(dxCmds, dxCmdLineFile) == false) {
- dxCmdLineFile.delete();
- throw new IOException("\nFailed to write out the DNAnexus cmd line file. Deleting it and aborting.");
+
+ //insert Aster download cmds; these freeze after ~8 hrs so they need to be run manually — hopefully Aster will fix this
+ ArrayList dwnldCmds = new ArrayList();
+ dwnldCmds.add("set -e; start=$(date +'%s')");
+ dwnldCmds.add("cd "+ asterDownloadDir);
+ dwnldCmds.add("d="+asterExecDir);
+ int counter = 0;
+ for (String d: dirPathsToDownload) {
+ dwnldCmds.add("echo -e \"\\n---------- "+counter+" Downloading Fastq Dir "+d+" -------- $((($(date +'%s') - $start)/60)) min\"");
+ dwnldCmds.add("python $d/support_scripts/download_project.py"+
+ " --project-id "+ asterProjectId+
+ " --exec $d/rwb.linux.x64"+
+ " --include "+d+" --no-dry-run && touch "+counter+".DONE || touch "+counter+".FAIL &");
+ counter++;
}
- else dxCmdLineFile.setExecutable(true);
+ dwnldCmds.add("echo -e \"\\n---------- Downloading Fastq Dir COMPLETE -------- $((($(date +'%s') - $start)/60)) min\"");
+ dwnldCmds.addAll(cliCmds);
+ dwnldCmds.add("echo -e \"\\n---------- Complete! -------- $((($(date +'%s') - $start)/60)) min total\"");
+
+ //write out the link cmds
+ if (IO.writeArrayList(dwnldCmds, cmdLineFile) == false) {
+ cmdLineFile.delete();
+ throw new IOException("\nFailed to write out the download script file. Deleting it and aborting.");
+ }
+ else cmdLineFile.setExecutable(true);
}
-
}
@@ -230,8 +246,8 @@ private void loadResourceFiles() {
linkageFile = fetchFromResource("ClinicalMolLinkage_V4.csv",issues);
qcMetricsFile = fetchFromResource("WES_QC_Metrics.csv",issues);
- wesCramsFile = fetchFromResource("WesCramList.txt",issues);
- rnaCramsFile = fetchFromResource("RNACramList.txt",issues);
+ wesFastqFile = fetchFromResource("exomeFastqFiles.txt",issues);
+ rnaFastqFile = fetchFromResource("rnaFastqFiles.txt",issues);
awsRepoListFile = fetchFromResource("AWSRepoList.txt",issues);
if (issues.length() !=0) Misc.printErrAndExit("One or more resource files are missing:\n"+issues);
@@ -247,32 +263,45 @@ private File fetchFromResource(String extension, StringBuilder issues) {
issues.append("\tError: failed to find one xxx_"+extension+" file in "+resourceDirectory);
return null;
}
-
- private void loadAvailableCramFiles() throws IOException {
- IO.pl("\nLoading available cram files...");
- // FT-SA134847_st_g_markdup.cram
- // SL261633_st_g_markdup.cram
- // SL261681_st_t_markdup.cram
- // A59553_st_t_markdup.cram
- // A59554_st_g_markdup.cram
- String[] wesCrams = IO.loadFileIntoStringArray(wesCramsFile);
- for (String wc: wesCrams) {
- if (wc.startsWith("#")) continue;
- String[] f = Misc.UNDERSCORE.split(wc);
- if (f.length != 4) throw new IOException ("Failed to find 4 parts in this wes cram file entry "+wc);
- if (slIdWesCramName.containsKey(f[0])) throw new IOException ("Found duplicate SLID "+f[0]);
- slIdWesCramName.put(f[0], wc);
+
+ private void loadAvailableFastqFiles() throws IOException {
+ IO.pl("\nLoading available fastq files...");
+ /* /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/FT-SA212052_R1.fastq.gz
+ /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/FT-SA212052_R2.fastq.gz
+ /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/SL600460_1.fastq.gz
+ /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/SL600460_2.fastq.gz*/
+ String[] wesFastq = IO.loadFileIntoStringArray(wesFastqFile);
+ for (String wc: wesFastq) {
+ // /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/FT-SA212052_R1.fastq.gz
+ // / 1 2 3 4 5
+ String[] l = Misc.FORWARD_SLASH.split(wc);
+ String[] f = Misc.UNDERSCORE.split(l[5]);
+ if (f.length != 2) throw new IOException ("Failed to find 2 parts in this wes fastq file entry "+wc+ " from "+wesFastqFile);
+ ArrayList al = slIdWesFastqPaths.get(f[0]);
+ if (al == null) {
+ al = new ArrayList();
+ slIdWesFastqPaths.put(f[0], al);
+ }
+ al.add(l);
}
- // FT-SA130920R.genome.cram
- // SL316725.genome.cram
- String[] rnaCrams = IO.loadFileIntoStringArray(rnaCramsFile);
- for (String wc: rnaCrams) {
- if (wc.startsWith("#")) continue;
- String[] f = Misc.PERIOD.split(wc);
- if (f.length != 3) throw new IOException ("Failed to find 3 parts in this RNA cram file entry "+wc);
- if (slIdRnaCramName.containsKey(f[0])) throw new IOException ("Found duplicate SLID "+f[0]);
- slIdRnaCramName.put(f[0], wc);
+ /* /Avatar_MolecularData_hg38/2023_03_03/RNAseq/FASTq/FT-SA168005R_R1.fastq.gz
+ /Avatar_MolecularData_hg38/2023_03_03/RNAseq/FASTq/FT-SA168005R_R2.fastq.gz
+ /Avatar_MolecularData_hg38/2023_03_03/RNAseq/FASTq/SL526167_1.fastq.gz
+ /Avatar_MolecularData_hg38/2023_03_03/RNAseq/FASTq/SL526167_2.fastq.gz */
+ String[] rnaFastq = IO.loadFileIntoStringArray(rnaFastqFile);
+ for (String wc: rnaFastq) {
+ // /Avatar_MolecularData_hg38/2023_03_03/RNAseq/FASTq/SL526167_2.fastq.gz
+ // / 1 2 3 4 5
+ String[] l = Misc.FORWARD_SLASH.split(wc);
+ String[] f = Misc.UNDERSCORE.split(l[5]);
+ if (f.length != 2) throw new IOException ("Failed to find 2 parts in this RNASeq fastq file entry "+wc);
+ ArrayList al = slIdRnaFastqPaths.get(f[0]);
+ if (al == null) {
+ al = new ArrayList();
+ slIdRnaFastqPaths.put(f[0], al);
+ }
+ al.add(l);
}
}
@@ -296,19 +325,6 @@ private void buildAnalysisJobs(PatientADW p) throws IOException {
//split tumor samples by trimmed generic specimineId
HashMap> specimineIdTumorSamples = splitTumorSamplesBySpecimine(p.getTumorSamples());
-
-//Delete this test
-/* if (p.getPatientId().equals("A018344")) {
- for (String specimineId: specimineIdTumorSamples.keySet()) {
- IO.pl(specimineId);
- ArrayList tumorSamples = specimineIdTumorSamples.get(specimineId);
- for (TumorSampleADW t: tumorSamples) {
- IO.pl("Tum "+t.getPlatformName()+" "+t.getTumorDnaName()+" "+t.getTumorRnaName());
- }
- IO.pl();
- }
- }
-*/
//for each tumor specimen
for (String specimineId: specimineIdTumorSamples.keySet()) {
@@ -336,7 +352,7 @@ private void buildAnalysisJobs(PatientADW p) throws IOException {
if (numNorm > 1) {
IO.pl("\tWARNING: multiple normal files found in the same platform, these will all be added to the Fastq dir for merging: ");
for (NormalSampleADW ns: normalSamples) {
- IO.pl("\t\t"+ns.getNormalWesCramFileNameToFetch());
+ IO.pl("\t\t"+ns.getNormalWesFastqPathsToFetch());
normalSamplesToAdd.add(ns);
}
}
@@ -354,7 +370,7 @@ private void buildAnalysisJobs(PatientADW p) throws IOException {
IO.pl("\tWARNING: no normal found in the same platform, will add those from all other platforms for merging:");
for (NormalSampleADW ns: p.getNormalSamples()) {
normalSamplesToAdd.add(ns);
- IO.pl("\t\t"+ns.getPlatformName()+"\t"+ ns.getNormalWesCramFileNameToFetch());
+ IO.pl("\t\t"+ns.getPlatformName()+"\t"+ ns.getNormalWesFastqPathsToFetch());
matchedPlatform = false;
}
}
@@ -394,7 +410,7 @@ private ArrayList mergeSplitTumorExomeRNADatasets(ArrayList nameTumorWes = slIdWesFastqPaths.get(wesId);
+ if (nameTumorWes == null || nameTumorWes.size()!=2) {
OK = false;
- IO.pl("\tFailed to find an available tumor WES cram file for "+ wesId+" in the M2Gen project, skipping patient "+patientId);
+ IO.pl("\tFailed to find two tumor WES fastq files for "+ wesId+" in the M2Gen project, skipping patient "+patientId);
}
- else ts.setTumorWesCramFileNameToFetch(nameTumorWes);
+ else ts.setTumorWesFastqPathsToFetch(nameTumorWes);
}
//any tumor rna
if (rnaId != null) {
- String nameTumorRna = slIdRnaCramName.get(rnaId);
- if (nameTumorRna == null) {
+ ArrayList nameTumorRna = slIdRnaFastqPaths.get(rnaId);
+ if (nameTumorRna == null || nameTumorRna.size()!=2) {
OK = false;
- IO.pl("\tFailed to find an available tumor RNA cram file for "+ rnaId+" in the M2Gen project, skipping patient "+patientId);
+ IO.pl("\tFailed to find two tumor RNA fastq files for "+ rnaId+" in the M2Gen project, skipping patient "+patientId);
+ }
+ else {
+ ts.setTumorRnaPathsToFetch(nameTumorRna);
}
- else ts.setTumorRnaCramFileNameToFetch(nameTumorRna);
}
p.getTumorSamples().add(ts);
}
//is it a germline normal dataline
else if (tumorGermline.equals("Germline")) {
- String nameNormalWes = slIdWesCramName.get(wesId);
- if (nameNormalWes == null) {
+ ArrayList nameNormalWes = slIdWesFastqPaths.get(wesId);
+ if (nameNormalWes == null || nameNormalWes.size()!=2) {
OK = false;
- IO.pl("\tFailed to find an available normal WES cram file for "+ wesId+" in the M2Gen project, skipping patient "+patientId);
+ IO.pl("\tFailed to find two normal WES fastq files for "+ wesId+" in the M2Gen project, skipping patient "+patientId);
}
else {
NormalSampleADW ns = new NormalSampleADW(wesId, platform, fields);
- ns.setNormalWesCramFileNameToFetch(nameNormalWes);
+ ns.setNormalWesFastqPathsToFetch(nameNormalWes);
p.getNormalSamples().add(ns);
}
@@ -584,9 +601,10 @@ public void processArgs(String[] args) throws IOException{
switch (test){
case 'r': resourceDirectory = new File(args[++i]); break;
case 'j': jobDir = new File(args[++i]); break;
+ case 'a': asterDownloadDir = new File(args[++i]); break;
case 'p': printScript(); break;
case 't': tmpDir = new File(args[++i]); break;
- case 'd': dxCmdLineFile = new File(args[++i]); break;
+ case 'd': cmdLineFile = new File(args[++i]); break;
case 's': smmRegistryDirectory = new File(args[++i]); break;
case 'h': printDocs(); System.exit(0);
default: Misc.printErrAndExit("\nProblem, unknown option! " + mat.group());
@@ -599,7 +617,7 @@ public void processArgs(String[] args) throws IOException{
}
}
//dx cmd file
- if (dxCmdLineFile == null) Misc.printErrAndExit("\nERROR: please provide a file path to save the DNAnexus download cmds. Aborting. "+dxCmdLineFile);
+ if (cmdLineFile == null) Misc.printErrAndExit("\nERROR: please provide a file path to save the DNAnexus download cmds. Aborting. "+cmdLineFile);
//jobDir
@@ -607,6 +625,11 @@ public void processArgs(String[] args) throws IOException{
jobDir.mkdirs();
if (jobDir.exists() == false || jobDir.canWrite() == false) Misc.printErrAndExit("\nERROR: cannot write into your JobDir? Aborting. "+jobDir);
+ //Aster download dir
+ if (asterDownloadDir == null) Misc.printErrAndExit("\nERROR: cannot find your AsterDownloadDir? Aborting. "+asterDownloadDir);
+ asterDownloadDir.mkdirs();
+ if (asterDownloadDir.exists() == false || jobDir.canWrite() == false) Misc.printErrAndExit("\nERROR: cannot write into your AsterDownloadDir? Aborting. "+asterDownloadDir);
+
//resourceDir
if (resourceDirectory == null || resourceDirectory.exists()== false) Misc.printErrAndExit("\nERROR: cannot find your Resource directory? Aborting. "+resourceDirectory);
@@ -654,7 +677,7 @@ public static void printDocs(){
IO.pl("\n" +
"**************************************************************************************\n" +
- "** Avatar Data Wrangler : March 2023 **\n" +
+ "** Avatar Data Wrangler : Feb 2024 **\n" +
"**************************************************************************************\n" +
"Tool for assembling directories for TNRunner based on files provided by M2Gen via\n"+
"download from DNAnexus. Handles patient datasets from different exome capture\n"+
@@ -677,9 +700,10 @@ public static void printDocs(){
"-t Directory to place temp files with PHI for Subject ID matching\n"+
"-s Directory containing the SubjectMatchMaker 'currentRegistry_' file\n"+
"-d Path to save a bash script for downloading the sequence read data.\n"+
+ "-a Path to where the Aster datasets will be downloaded.\n"+
"\nExample: java -jar ~/USeqApps/AvatarDataWrangler -r Resources/ -j AJobs -t SMM_PHI\n"+
- " -s ~/PHI/SmmRegistry/ -d dxDownloadCmds.sh \n\n"+
+ " -s ~/PHI/SmmRegistry/ -d dxDownloadCmds.sh -a AsterDownloads\n\n"+
"**************************************************************************************\n");
diff --git a/Source/edu/utah/seq/run/avproj/adw/NormalSampleADW.java b/Source/edu/utah/seq/run/avproj/adw/NormalSampleADW.java
index 1d5a5fee..e9a7a99d 100644
--- a/Source/edu/utah/seq/run/avproj/adw/NormalSampleADW.java
+++ b/Source/edu/utah/seq/run/avproj/adw/NormalSampleADW.java
@@ -8,14 +8,13 @@
public class NormalSampleADW {
private String platformName = null;
- private String normalDnaName = null;
- private ArrayList normalDnaFastqCram = new ArrayList();
- private String normalWesCramFileNameToFetch = null;
+ private String normalDnaSampleName = null;
+ private ArrayList normalDnaFastqFiles = new ArrayList();
+ private ArrayList normalWesFastqPathsToFetch = null;
private String[] linkageDataLine = null;
-
public NormalSampleADW (String normalDnaName, String platformName, String[] linkageDataLine) {
- this.normalDnaName = normalDnaName;
+ this.normalDnaSampleName = normalDnaName;
this.platformName = platformName;
this.linkageDataLine = linkageDataLine;
}
@@ -23,11 +22,11 @@ public NormalSampleADW (String normalDnaName, String platformName, String[] link
public JSONObject fetchJson(ClinicalMolLinkage linkage) throws IOException {
JSONObject jo = new JSONObject();
jo.put("capturePlatform", platformName);
- jo.put("normalDNASeqFile", normalWesCramFileNameToFetch);
- jo.put("normalDNASampleLibraryId", normalDnaName);
+ jo.put("normalDNASeqPaths", TumorSampleADW.mergePairedFastqPaths(normalWesFastqPathsToFetch));
+ jo.put("normalDNASampleLibraryId", normalDnaSampleName);
ArrayList al = new ArrayList();
al.add(linkageDataLine);
- TumorSampleADW.addLinkageInfo(normalDnaName, jo, linkage, al, true);
+ TumorSampleADW.addLinkageInfo(normalDnaSampleName, jo, linkage, al, true);
return jo;
}
@@ -36,19 +35,19 @@ public String getPlatformName() {
}
public String getNormalDnaName() {
- return normalDnaName;
+ return normalDnaSampleName;
}
- public ArrayList getNormalDnaFastqCram() {
- return normalDnaFastqCram;
+ public ArrayList getNormalDnaFastqFiles() {
+ return normalDnaFastqFiles;
}
- public String getNormalWesCramFileNameToFetch() {
- return normalWesCramFileNameToFetch;
+ public ArrayList getNormalWesFastqPathsToFetch() {
+ return normalWesFastqPathsToFetch;
}
- public void setNormalWesCramFileNameToFetch(String normalWesCramFileNameToFetch) {
- this.normalWesCramFileNameToFetch = normalWesCramFileNameToFetch;
+ public void setNormalWesFastqPathsToFetch(ArrayList normalWesFastqPathsToFetch) {
+ this.normalWesFastqPathsToFetch = normalWesFastqPathsToFetch;
}
public String[] getLinkageDataLine() {
diff --git a/Source/edu/utah/seq/run/avproj/adw/TumorSampleADW.java b/Source/edu/utah/seq/run/avproj/adw/TumorSampleADW.java
index f5499a15..5dd11ad1 100644
--- a/Source/edu/utah/seq/run/avproj/adw/TumorSampleADW.java
+++ b/Source/edu/utah/seq/run/avproj/adw/TumorSampleADW.java
@@ -4,8 +4,12 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
+
+import org.json.JSONArray;
import org.json.JSONObject;
+import util.gen.Misc;
+
public class TumorSampleADW {
@@ -15,19 +19,19 @@ public class TumorSampleADW {
//Tumor Exome
private String platformName = null;
- private String tumorDnaName = null;
- private ArrayList tumorDnaFastqCram = new ArrayList();
- private String tumorWesCramFileNameToFetch = null;
+ private String tumorDnaSampleName = null;
+ private ArrayList tumorDnaFastqFiles = new ArrayList();
+ private ArrayList tumorWesFastqPathsToFetch = null;
//Tumor RNA
- private String tumorRnaName = null;
- private ArrayList tumorRnaFastqCram = new ArrayList();
- private String tumorRnaCramFileNameToFetch = null;
+ private String tumorRnaSampleName = null;
+ private ArrayList tumorRnaFastqFiles = new ArrayList();
+ private ArrayList tumorRnaFastqPathsToFetch = null;
public TumorSampleADW (String tumorDnaName, String tumorRnaName, String platformName, String trimmedSpecimineId, String[] tumorLinkageDataLine) {
- this.tumorDnaName = tumorDnaName;
- this.tumorRnaName = tumorRnaName;
+ this.tumorDnaSampleName = tumorDnaName;
+ this.tumorRnaSampleName = tumorRnaName;
this.platformName = platformName;
this.genericSpecimineId = trimmedSpecimineId;
tumorLinkageDataLines.add(tumorLinkageDataLine);
@@ -37,21 +41,45 @@ public JSONObject fetchTumorDnaJson(ClinicalMolLinkage linkage) throws IOExcepti
JSONObject jo = new JSONObject();
jo.put("capturePlatform", platformName);
jo.put("trimmedSpecimineId", genericSpecimineId);
- jo.put("tumorDNASeqFile", tumorWesCramFileNameToFetch);
- jo.put("tumorDNASampleLibraryId", tumorDnaName);
- addLinkageInfo(tumorDnaName, jo, linkage, tumorLinkageDataLines, true);
+ JSONArray ja = new JSONArray();
+ String[] paths = mergePairedFastqPaths(tumorWesFastqPathsToFetch);
+ ja.put(paths[0]);
+ ja.put(paths[1]);
+ jo.put("tumorDNASeqPaths", ja);
+ jo.put("tumorDNASampleLibraryId", tumorDnaSampleName);
+ addLinkageInfo(tumorDnaSampleName, jo, linkage, tumorLinkageDataLines, true);
return jo;
}
public JSONObject fetchTumorRnaJson(ClinicalMolLinkage linkage) throws IOException {
JSONObject jo = new JSONObject();
jo.put("trimmedSpecimineId", genericSpecimineId);
- jo.put("tumorRNASeqFile", tumorRnaCramFileNameToFetch);
- jo.put("tumorRNASampleLibraryId", tumorRnaName);
- addLinkageInfo(tumorRnaName, jo, linkage, tumorLinkageDataLines, false);
+ JSONArray ja = new JSONArray();
+ String[] paths = mergePairedFastqPaths(tumorRnaFastqPathsToFetch);
+ ja.put(paths[0]);
+ ja.put(paths[1]);
+ jo.put("tumorRNASeqPaths", ja);
+ jo.put("tumorRNASampleLibraryId", tumorRnaSampleName);
+ addLinkageInfo(tumorRnaSampleName, jo, linkage, tumorLinkageDataLines, false);
return jo;
}
+ public static String[] mergePairedFastqPaths(ArrayList al) throws IOException {
+ if (al.size()!=2 ) throw new IOException("\nDidn't find two fastq file paths.");
+ // removing leading /
+ String merged1 = Misc.stringArrayToString(al.get(0), "/").substring(1);
+ String merged2 = Misc.stringArrayToString(al.get(1), "/").substring(1);
+ return new String[] {merged1, merged2};
+ }
+
+ public static String fetchFastqPathDir(ArrayList al) {
+ // /Avatar_MolecularData_hg38/2023_06_30/Whole_Exome/FASTq/FT-SA212052_R1.fastq.gz
+ // / 1 2 3 4 5
+ // inspect only the first path; both paired reads share the same directory — drop the trailing fastq.gz file name
+ String merged = Misc.stringArrayToString(al.get(0), "/");
+ return merged.substring(0,merged.lastIndexOf('/'));
+ }
+
public static void addLinkageInfo (String sampleName, JSONObject jo, ClinicalMolLinkage linkage, ArrayList linkageDataLines, boolean isWes) throws IOException {
HashMap headerKeyIndex = linkage.getHeaderKeyIndex();
String[] headerKeys = linkage.getHeaderKeys();
@@ -79,35 +107,35 @@ public String getPlatformName() {
}
public String getTumorDnaName() {
- return tumorDnaName;
+ return tumorDnaSampleName;
}
- public ArrayList getTumorDnaFastqCram() {
- return tumorDnaFastqCram;
+ public ArrayList getTumorDnaFastqFiles() {
+ return tumorDnaFastqFiles;
}
public String getTumorRnaName() {
- return tumorRnaName;
+ return tumorRnaSampleName;
}
- public ArrayList getTumorRnaFastqCram() {
- return tumorRnaFastqCram;
+ public ArrayList getTumorRnaFastqFiles() {
+ return tumorRnaFastqFiles;
}
- public String getTumorWesCramFileNameToFetch() {
- return tumorWesCramFileNameToFetch;
+ public ArrayList getTumorWesFastqPathsToFetch() {
+ return tumorWesFastqPathsToFetch;
}
- public void setTumorWesCramFileNameToFetch(String tumorWesCramFileNameToFetch) {
- this.tumorWesCramFileNameToFetch = tumorWesCramFileNameToFetch;
+ public void setTumorWesFastqPathsToFetch(ArrayList tumorWesFastqPathsToFetch) {
+ this.tumorWesFastqPathsToFetch = tumorWesFastqPathsToFetch;
}
- public String getTumorRnaCramFileNameToFetch() {
- return tumorRnaCramFileNameToFetch;
+ public ArrayList getTumorRnaFastqPathsToFetch() {
+ return tumorRnaFastqPathsToFetch;
}
- public void setTumorRnaCramFileNameToFetch(String tumorRnaCramFileNameToFetch) {
- this.tumorRnaCramFileNameToFetch = tumorRnaCramFileNameToFetch;
+ public void setTumorRnaPathsToFetch(ArrayList tumorRnaFastqPathsToFetch) {
+ this.tumorRnaFastqPathsToFetch = tumorRnaFastqPathsToFetch;
}
public String getGenericSpecimineId() {
@@ -115,7 +143,7 @@ public String getGenericSpecimineId() {
}
public void setTumorRnaName(String tumorRnaName) {
- this.tumorRnaName = tumorRnaName;
+ this.tumorRnaSampleName = tumorRnaName;
}
public ArrayList getTumorLinkageDataLines() {
diff --git a/Source/edu/utah/seq/run/caris/CarisDataWrangler.java b/Source/edu/utah/seq/run/caris/CarisDataWrangler.java
index 6100c9d2..76b1ff59 100644
--- a/Source/edu/utah/seq/run/caris/CarisDataWrangler.java
+++ b/Source/edu/utah/seq/run/caris/CarisDataWrangler.java
@@ -310,7 +310,7 @@ public void processArgs(String[] args) throws IOException{
public static void printDocs(){
IO.pl("\n" +
"**************************************************************************************\n" +
- "** Caris Data Wrangler : Aug 2022 **\n" +
+ "** Caris Data Wrangler : Mar 2024 **\n" +
"**************************************************************************************\n" +
"The Caris Data Wrangler downloads complete patient datasets from an AWS bucket, parses\n"+
"the xml test file for patient info, fetches/makes coreIds using the SubjectMatchMaker\n"+
diff --git a/Source/edu/utah/seq/run/caris/CarisPatient.java b/Source/edu/utah/seq/run/caris/CarisPatient.java
index 74ecdf65..84de3a25 100644
--- a/Source/edu/utah/seq/run/caris/CarisPatient.java
+++ b/Source/edu/utah/seq/run/caris/CarisPatient.java
@@ -165,9 +165,11 @@ public void makeJobDirsMoveXml(String coreId) throws Exception {
clinReportDir.mkdirs();
//move the report into the ClinicalReport folder
File deIdXml = carisXml.getDeidentifiedXmlFile();
- if (deIdXml.renameTo(new File(clinReportDir, deIdXml.getName())) == false) {
- throw new IOException("FAILED to move "+deIdXml+" to "+clinReportDir);
- }
+ File dest = new File(clinReportDir, deIdXml.getName());
+ //debugging: renameTo() stopped working, so copy then delete; NOTE(review): deleteOnExit() only deletes at JVM exit — confirm delete() wasn't intended
+ //if (deIdXml.renameTo(dest) == false) throw new IOException("FAILED to move "+deIdXml+" to "+clinReportDir);
+ if (IO.copyViaFileChannel(deIdXml, dest) == false) throw new IOException("FAILED to copy "+deIdXml+" to "+clinReportDir);
+ else deIdXml.deleteOnExit();
}
public boolean isTooYoung() {
diff --git a/Source/util/apps/JobCleaner.java b/Source/util/apps/JobCleaner.java
index f43f1742..009b302b 100644
--- a/Source/util/apps/JobCleaner.java
+++ b/Source/util/apps/JobCleaner.java
@@ -2,6 +2,8 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.gen.IO;
@@ -15,6 +17,7 @@ public class JobCleaner {
private String[] fileExtensionsToDelete = null;
private String[] directoryNamesToZip = null;
private boolean dryRun = false;
+ private boolean mergeInDirectory = false;
//internal
private boolean lookForFiles;
@@ -53,17 +56,57 @@ private void zipDirectories() throws IOException {
for (File f: toZip) IO.pl("\t"+f.toString());
}
else {
-
+
IO.pl("\nZip archiving and then deleting the following files and directories...");
- for (File d: toZip) {
- IO.pl("\t"+d.toString());
- if (d.exists()== false) IO.pl("\t\tMissing, already zipped?! "+d.toString());
- else {
- boolean zipped = IO.zipDirectory(d);
- if (zipped == false) throw new IOException("Failed to zip: "+d);
-
- boolean deleted = IO.deleteDirectorySimple(d);
- if (deleted == false) throw new IOException("Failed to delete directory after zipping: "+d);
+ if (mergeInDirectory) {
+ //split by parent directory
+ HashMap> parentDirs = new HashMap>();
+ for (File d: toZip) {
+ IO.pl("\t"+d.toString());
+ if (d.exists()== false) IO.pl("\t\tMissing, already zipped?! Skipping "+d.toString());
+ else {
+ String parent = d.getParentFile().getCanonicalPath();
+ ArrayList al = parentDirs.get(parent);
+ if (al == null) {
+ al = new ArrayList();
+ parentDirs.put(parent, al);
+ }
+ al.add(d);
+ }
+ }
+ //for each parent dir, zip the children dirs into a single combined archive
+ for (String par: parentDirs.keySet()) {
+ ArrayList children = parentDirs.get(par);
+ File[] toCombine = new File[children.size()];
+ children.toArray(toCombine);
+ Arrays.sort(toCombine);
+ StringBuilder comboName = new StringBuilder(par);
+ comboName.append("/");
+ for (File f: toCombine) comboName.append(f.getName());
+ comboName.append(".zip");
+ File zipFile = new File (comboName.toString());
+ boolean zipped = IO.zipDirectoriesInSameParentDirectory(toCombine, zipFile);
+ if (zipped == false) throw new IOException("Failed to zip: "+comboName);
+
+ //delete the individual dirs
+ for (File f: toCombine) {
+ boolean deleted = IO.deleteDirectorySimple(f);
+ if (deleted == false) throw new IOException("Failed to delete directory after zipping: "+f);
+ }
+
+ }
+ }
+ else {
+ for (File d: toZip) {
+ IO.pl("\t"+d.toString());
+ if (d.exists()== false) IO.pl("\t\tMissing, already zipped?! "+d.toString());
+ else {
+ boolean zipped = IO.zipDirectory(d);
+ if (zipped == false) throw new IOException("Failed to zip: "+d);
+
+ boolean deleted = IO.deleteDirectorySimple(d);
+ if (deleted == false) throw new IOException("Failed to delete directory after zipping: "+d);
+ }
}
}
}
@@ -146,6 +189,7 @@ public void processArgs(String[] args) throws IOException{
case 'e': fileExtensionsToDelete = Misc.COMMA.split(args[++i]); break;
case 'n': directoryNamesToZip = Misc.COMMA.split(args[++i]); break;
case 'd': dryRun = true; break;
+ case 'm': mergeInDirectory = true; break;
default: Misc.printErrAndExit("\nProblem, unknown option! " + mat.group());
}
}
@@ -165,7 +209,7 @@ public void processArgs(String[] args) throws IOException{
public static void printDocs(){
IO.pl("\n" +
"**************************************************************************************\n" +
- "** Job Cleaner : May 2022 **\n" +
+ "** Job Cleaner : Jan 2024 **\n" +
"**************************************************************************************\n" +
"Zip archives particular folders, deletes particular files. Use to clean up analysis \n"+
"result directories prior to cloud upload.\n"+
@@ -174,10 +218,13 @@ public static void printDocs(){
"-r Root directory to recursively look for files and folders.\n"+
"-e File extensions and file names to delete, comma delimited, no spaces.\n" +
"-n Directory names to zip archive and then delete, comma delimited, no spaces.\n"+
+ "-m Create a merged zip archive for directories defined in -n that exist in the same\n"+
+ " parent directory, e.g. LogsRunScripts.zip instead of Logs.zip and RunScripts.zip\n"+
"-d Dry run, just list the files and directories.\n"+
+
"\nExample: java -jar pathToUSeq/Apps/JobCleaner -d -n 'Logs,RunScripts' -r CJobs/ -e \n"+
- " '.tbi,.crai,.bai,COMPLETE' \n"+
+ " '.tbi,.crai,.bai,COMPLETE' -m \n"+
"\n**************************************************************************************\n");
}
diff --git a/Source/util/gen/BucketObjLoadTest.java b/Source/util/gen/BucketObjLoadTest.java
new file mode 100644
index 00000000..114f1ed2
--- /dev/null
+++ b/Source/util/gen/BucketObjLoadTest.java
@@ -0,0 +1,106 @@
+package util.gen;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.math3.distribution.ChiSquaredDistribution;
+
+public class BucketObjLoadTest {
+
+ public static void main(String[] args) throws IOException {
+ BufferedReader in = IO.fetchBufferedReader("/Users/u0028003/Downloads/BucketObjTest/hcibioinfo-patient-molecular-repo.list.16Dec2023.txt.gz");
+ String line;
+ String[] tokens;
+ HashMap> dirFiles = new HashMap>();
+ String delimiter = "/";
+ long totalSize = 0;
+ long numberObjects = 0;
+ long startTime = System.currentTimeMillis();
+
+ while ((line = in.readLine())!=null) {
+ line = line.trim();
+ if (line.length()!=0) {
+ //note: whitespace-splitting will break any directory or file name containing a space, which S3 permits!
+ //each listed object is always a file, never a directory
+ /*
+2023-03-01 09:04:43 2938 Patients/AA2mF6Vy/Avatar/A032049_SL419345_SL419548_SL420681/ClinicalReport/A032049_SL419345_SL419548_SL420681_IDTv1_SAR_F.json
+2023-01-30 21:32:56 7208120597 Patients/AA2mF6Vy/Avatar/A032049_SL419345_SL419548_SL420681/Fastq/NormalDNA/SL419345_1.fastq.gz
+2023-01-30 21:32:56 7423738484 Patients/AA2mF6Vy/Avatar/A032049_SL419345_SL419548_SL420681/Fastq/NormalDNA/SL419345_2.fastq.gz
+ 0 1 2 3
+ */
+ tokens = Misc.WHITESPACE.split(line);
+ long size = Long.parseLong(tokens[2]);
+ totalSize+= size;
+ numberObjects++;
+
+ //IO.pl(tokens.length+"\t"+ line);
+
+ //create fullPath
+ String fullPath = null;
+ //no white space
+ if (tokens.length == 4) fullPath = tokens[3];
+ //with white space in name or directory
+ else {
+ StringBuilder sb = new StringBuilder(tokens[3]);
+ for (int i=4; i< tokens.length; i++) {
+ sb.append(" ");
+ sb.append(tokens[i]);
+ }
+ fullPath = sb.toString();
+ }
+
+ //split full path into dir and name
+ String dir = null;
+ String fileName = null;
+ //is this just in the root dir?
+ if (fullPath.contains(delimiter) == false) {
+ dir = "ROOT";
+ fileName = fullPath;
+ }
+ else {
+ int lastIndex = fullPath.lastIndexOf(delimiter);
+ dir = fullPath.substring(0, lastIndex);
+ fileName = fullPath.substring(lastIndex+1);
+ }
+
+ //add to hash
+ ArrayList files = dirFiles.get(dir);
+ if (files == null) {
+ files = new ArrayList();
+ dirFiles.put(dir, files);
+ if (dir.contains("TL-23-WGGHKD5Z/GermlineVariantCalling")) IO.pl(dir);
+ }
+ files.add(fileName);
+ }
+ }
+ IO.pl(dirFiles.size());
+ IO.pl(numberObjects);
+ IO.pl(totalSize);
+
+ //Issue is with directories that only contain directories, walk the /x/y/z ?
+
+ //finish and calc run time
+ double diffTime = ((double)(System.currentTimeMillis() -startTime))/1000;
+ System.out.println("Done! "+Math.round(diffTime)+" sec\n");
+
+ IO.pl(dirFiles.get("Patients/ADk3Na5eXy/Tempus/TL-23-WGGHKD5Z/GermlineVariantCalling"));
+
+ in.close();
+ }
+
+
+
+
+
+
+
+}
+
+
\ No newline at end of file
diff --git a/Source/util/gen/IO.java b/Source/util/gen/IO.java
index d78d9a88..406713eb 100755
--- a/Source/util/gen/IO.java
+++ b/Source/util/gen/IO.java
@@ -1610,6 +1610,49 @@ public static boolean zipDirectory(File directory) throws IOException{
return true;
}
+ /**Zips the contents of the provided directories using relative paths. Make sure your zipFile ends in .zip */
+ public static boolean zipDirectoriesInSameParentDirectory(File[] dirs, File zipFile) throws IOException{
+ boolean success = false;
+ ZipOutputStream out = null;
+ try {
+ out = new ZipOutputStream(new FileOutputStream(zipFile));
+ byte[] buf = new byte[2048];
+
+ for (File d: dirs) {
+ String toTrim = d.getParentFile().getCanonicalPath()+"/";
+ ArrayList filesToZip = fetchAllFilesAndDirsRecursively(d);
+
+ // Compress the files with a relative path
+ for (File f: filesToZip) {
+ String relPath = f.getCanonicalPath().replace(toTrim, "");
+ if (f.isFile()) {
+ out.putNextEntry(new ZipEntry(relPath));
+ FileInputStream in = new FileInputStream(f);
+ int len;
+ while ((len = in.read(buf)) != -1) out.write(buf, 0, len);
+ out.closeEntry();
+ in.close();
+ }
+ //for directories add the /
+ else {
+ out.putNextEntry(new ZipEntry(relPath+"/"));
+ out.closeEntry();
+ }
+ }
+ }
+ success = true;
+ } catch (IOException e) {
+ IO.el("ERROR zip archiving "+zipFile);
+ zipFile.delete();
+ e.printStackTrace();
+
+ } finally {
+ out.close();
+ }
+ return success;
+ }
+
+
/**Fetches all of the files and dirs in the provided directory.*/
public static ArrayList fetchAllFilesAndDirsRecursively (File directory) throws IOException{
ArrayList files = new ArrayList();
@@ -1761,7 +1804,6 @@ public static BufferedReader fetchBufferedReader(String s){
e.printStackTrace();
}
return null;
-
}
/**Fetches a BufferedReader from a url, zip/gz OK.*/
diff --git a/Source/util/gen/Misc.java b/Source/util/gen/Misc.java
index 41c9382c..eefc50e2 100755
--- a/Source/util/gen/Misc.java
+++ b/Source/util/gen/Misc.java
@@ -24,6 +24,7 @@ public class Misc {
public static final Pattern RETURN = Pattern.compile("\n");
public static final Pattern WHITESPACE = Pattern.compile("\\s+");
public static final Pattern COMMA_WHITESPACE = Pattern.compile("[,\\s]+");
+ public static final Pattern COMMA_WHITESPACE_FWDSLASH = Pattern.compile("[,\\s/]+");
public static final Pattern UNDERSCORE = Pattern.compile("_");
public static final Pattern DASH = Pattern.compile("-");
public static final Pattern COLON = Pattern.compile(":");
@@ -305,6 +306,21 @@ public static String treeSetToString(TreeSet hash, String delimiter){
return s.toString();
}
+ /**Converts a LinkedHashSet to a String, joining the elements with the delimiter, preserving insertion order.*/
+ public static String linkedSetToString(LinkedHashSet hash, String delimiter){
+ if (hash.size() == 0) return "";
+ Iterator it = hash.iterator();
+ StringBuilder s = new StringBuilder();
+ Object obj = (Object)it.next();
+ s.append(obj.toString());
+ while (it.hasNext()){
+ obj = it.next();
+ s.append(delimiter);
+ s.append(obj.toString());
+ }
+ return s.toString();
+ }
+
/**Converts a hash to a String[].*/
public static String[] setToStringArray(Set hash){