Skip to content

Commit

Permalink
Update geneAnalysis.py
Browse files Browse the repository at this point in the history
  • Loading branch information
mkayasth authored May 1, 2024
1 parent cc80cd4 commit b0dda17
Showing 1 changed file with 19 additions and 5 deletions.
24 changes: 19 additions & 5 deletions geneAnalysis/geneAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

# check if a reference genome is provided as a command-line argument.
if len(sys.argv) < 2:
print("Usage: python3 geneAnalysis.py <reference_genome>")
print("Usage: python3.11 geneAnalysis.py <reference_genome>")
sys.exit(1)

# get the reference genome from the command-line argument.
Expand All @@ -27,7 +27,7 @@

# function to run a Python script with additional arguments, if needed.
def run_script(script_path, *args):
    """Run another Python script as a child process and wait for it to finish.

    Args:
        script_path: Path of the Python script to execute.
        *args: Extra command-line arguments passed to the script unchanged.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
    """
    # Launch the child with the interpreter that is running this script, so
    # it sees the same installed packages. A hard-coded name such as
    # 'python3.11' fails when that exact binary is not on PATH; it is kept
    # only as a fallback for when the interpreter path cannot be determined.
    interpreter = sys.executable or 'python3.11'
    subprocess.run([interpreter, script_path, *args], check=True)

# Calculate the average mutation rate for a protein by comparing every strain with the reference strain.
def calculate_average_mutation_rate(csv_file): # avg of all strains compared to the reference genome from mutation_rates.csv.
Expand All @@ -50,6 +50,18 @@ def append_average_mutation_rate_to_file(file_path, protein_name, mutation_rate)
# then, append to the file.
with open(file_path, mode='a', encoding='utf-8') as file:
file.write(f"Average mutation rate for {protein_name}: {mutation_rate}\n")

# function that cleans up any hypothetical_protein that Prokka could not annotate.
def clean_up_mutation_rates_txt(file_path):
    """Remove every line that mentions 'hypothetical_protein' from a text file.

    The file is read in full, lines containing 'hypothetical_protein'
    (matched case-insensitively) are dropped, and the remaining lines are
    written back to the same path in their original order.

    Args:
        file_path: Path of the UTF-8 text file to clean in place.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Lower-case each line so 'Hypothetical_Protein' etc. are filtered too.
    cleaned_lines = [line for line in lines if 'hypothetical_protein' not in line.lower()]

    # Nothing was filtered out: leave the file alone instead of truncating
    # and rewriting identical content.
    if len(cleaned_lines) == len(lines):
        return

    # Write the surviving lines back to the same file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(cleaned_lines)

# Run the scripts in order, including the reference genome as an argument for the first two scripts.
run_script(genetic_distances_script, reference_genome)
Expand All @@ -64,16 +76,18 @@ def append_average_mutation_rate_to_file(file_path, protein_name, mutation_rate)
reader = csv.reader(file)
next(reader) # skip the header row.
first_row = next(reader, None) # read the first data row.
protein_name = 'UnknownProtein' # default value, if protein not found.
protein_name = 'UnknownProtein' # default value
if first_row:
strain_name = first_row[0]
protein_name = strain_name.split('|')[1].split(',')[0] if '|' in strain_name else strain_name

# Append the average mutation rate to the text file
append_average_mutation_rate_to_file(average_mutation_rate_file, protein_name, average_mutation_rate)

# After processing and calculations are done, deleting the intermediate files.
# Clean the average mutation rates text file
clean_up_mutation_rates_txt(average_mutation_rate_file)

# After processing and calculations are done, delete the intermediate files.
os.remove('geneAnalysis-output/genetic_distances.csv')
os.remove('geneAnalysis-output/branch_length.csv')
os.remove('geneAnalysis-output/mutation_rates.csv')

0 comments on commit b0dda17

Please sign in to comment.