Update geneAnalysis.py

mkayasth · May 1, 2024 · b0dda17 · b0dda17
1 parent cc80cd4
commit b0dda17
Showing 1 changed file with 19 additions and 5 deletions.
diff --git a/geneAnalysis/geneAnalysis.py b/geneAnalysis/geneAnalysis.py
@@ -8,7 +8,7 @@
 
 # check if a reference genome is provided as a command-line argument.
 if len(sys.argv) < 2:
-    print("Usage: python3 geneAnalysis.py <reference_genome>")
+    print("Usage: python3.11 geneAnalysis.py <reference_genome>")
     sys.exit(1)
 
 # get the reference genome from the command-line argument.
@@ -27,7 +27,7 @@
 
 # Run a Python script as a child process, forwarding any extra positional arguments to it.
 def run_script(script_path, *args):
-    subprocess.run(['python3', script_path, *args], check=True)
+    subprocess.run(['python3.11', script_path, *args], check=True)  # check=True raises CalledProcessError if the script exits non-zero.
 
 # Calculate the average mutation rate for a protein by comparing every strain against the reference strain.
 def calculate_average_mutation_rate(csv_file): # avg of all strains compared to the reference genome from mutation_rates.csv.
@@ -50,6 +50,18 @@ def append_average_mutation_rate_to_file(file_path, protein_name, mutation_rate)
     # then, append to the file.
     with open(file_path, mode='a', encoding='utf-8') as file:
         file.write(f"Average mutation rate for {protein_name}: {mutation_rate}\n")
+
+# Remove every line mentioning 'hypothetical_protein' (entries Prokka could not annotate) from the file at file_path, rewriting it in place.
+def clean_up_mutation_rates_txt(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()  # the whole file is read into memory before it is reopened for writing.
+
+    # Keep only lines without 'hypothetical_protein'; lowercasing each line makes the match case-insensitive.
+    cleaned_lines = [line for line in lines if 'hypothetical_protein' not in line.lower()]
+
+    # Opening in 'w' mode truncates the file, so only the kept lines remain afterwards.
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.writelines(cleaned_lines)
 
 # Run the scripts in order, including the reference genome as an argument for the first two scripts.
 run_script(genetic_distances_script, reference_genome)
@@ -64,16 +76,18 @@ def append_average_mutation_rate_to_file(file_path, protein_name, mutation_rate)
     reader = csv.reader(file)
     next(reader)  # skip the header row.
     first_row = next(reader, None)  # read the first data row.
-    protein_name = 'UnknownProtein'  # default value, if protein not found.
+    protein_name = 'UnknownProtein'  # default value
     if first_row:
         strain_name = first_row[0]
         protein_name = strain_name.split('|')[1].split(',')[0] if '|' in strain_name else strain_name
 
 # Append the average mutation rate to the text file
 append_average_mutation_rate_to_file(average_mutation_rate_file, protein_name, average_mutation_rate)
 
-# After processing and calculations are done, deleting the intermediate files.
+# Clean the average mutation rates text file
+clean_up_mutation_rates_txt(average_mutation_rate_file)
+
+# After processing and calculations are done, delete the intermediate files.
 os.remove('geneAnalysis-output/genetic_distances.csv')
 os.remove('geneAnalysis-output/branch_length.csv')
 os.remove('geneAnalysis-output/mutation_rates.csv')
-