Update ERGA EAR tool to v24.10.15 (#1523)
* Update ERGA EAR tool

* Flake8 fixes
SaimMomin12 authored Oct 15, 2024
1 parent 71b3dac commit 720787c
Showing 2 changed files with 125 additions and 103 deletions.
2 changes: 1 addition & 1 deletion tools/ear/macros.xml
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@TOOL_VERSION@">24.08.26</token>
+    <token name="@TOOL_VERSION@">24.10.15</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">23.2</token>
     <xml name="creator">
226 changes: 124 additions & 102 deletions tools/ear/make_EAR.py
@@ -21,7 +21,7 @@
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.08.26"
+EAR_version = "v24.10.15"
 
 
 def make_report(yaml_file):
@@ -36,18 +36,29 @@ def format_number(value):
         try:
             value_float = float(value)
             if value_float.is_integer():
-                # format as an integer if no decimal part
+                # format as integer if no decimal
                 return f'{int(value_float):,}'
             else:
                 # format as a float
                 return f'{value_float:,}'
         except ValueError:
-            # return the original value if it can't be converted to a float
+            # return original value if can't be converted to float
             return value
 
     # extract gfastats values
     def extract_gfastats_values(content, keys):
-        return [re.findall(f"{key}: (.+)", content)[0] for key in keys]
+        values = []
+        for key in keys:
+            # colon-separated as default format first
+            match = re.search(rf"{re.escape(key)}:\s*(.+)", content)
+            if not match:
+                # If not try galaxy's tab-separated
+                match = re.search(rf"{re.escape(key)}\t(.+)", content)
+            if match:
+                values.append(match.group(1).strip())
+            else:
+                values.append("N/A")
+        return values
 
     keys = [
         "Total scaffold length",
@@ -79,9 +90,17 @@ def extract_gfastats_values(content, keys):
     def extract_total_bp_from_gfastats(gfastats_path):
         with open(gfastats_path, "r") as f:
             content = f.read()
-        total_bp = re.search(r"Total scaffold length: (.+)", content).group(1)
-        total_bp = int(total_bp.replace(',', ''))
-        return "{:,}".format(total_bp)
+        # Try colon-separated format first
+        match = re.search(r"Total scaffold length:\s*(.+)", content)
+        if not match:
+            # If not found, try tab-separated format
+            match = re.search(r"Total scaffold length\t(.+)", content)
+        if match:
+            total_bp = match.group(1).replace(',', '')
+            return "{:,}".format(int(total_bp))
+        else:
+            logging.error(f"Could not find Total scaffold length in {gfastats_path}")
+            return "N/A"
 
     # compute EBP quality metric
     def compute_ebp_metric(haplotype, gfastats_path, qv_value):
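Again purely illustrative: the reworked extract_total_bp_from_gfastats strips thousands separators, re-formats the integer, and now returns "N/A" instead of raising when neither layout matches (hypothetical input shown).

import re

raw = "Total scaffold length\t1234567890"
match = re.search(r"Total scaffold length:\s*(.+)", raw) or re.search(r"Total scaffold length\t(.+)", raw)
# round-trip: strip commas, parse as int, re-format with separators
print("{:,}".format(int(match.group(1).replace(',', ''))) if match else "N/A")  # 1,234,567,890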
@@ -93,7 +112,6 @@ def compute_ebp_metric(haplotype, gfastats_path, qv_value):
         values = extract_gfastats_values(content, keys_needed)
         contig_n50_log = math.floor(math.log10(int(values[0].replace(',', ''))))
         scaffold_n50_log = math.floor(math.log10(int(values[1].replace(',', ''))))
-
         return f"Obtained EBP quality metric for {haplotype}: {contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}"
 
     # extract qv values
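A worked example of the EBP metric string with made-up values (contig N50 = 5.2 Mb, scaffold N50 = 48 Mb, QV = 61.3):

import math

contig_n50, scaffold_n50, qv_value = 5_200_000, 48_000_000, 61.3
contig_n50_log = math.floor(math.log10(contig_n50))      # floor(6.716) -> 6
scaffold_n50_log = math.floor(math.log10(scaffold_n50))  # floor(7.681) -> 7
print(f"{contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}")  # 6.7.Q61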
@@ -151,25 +169,29 @@ def extract_busco_values(file_path):
     def extract_busco_info(file_path):
         busco_version = None
         lineage_info = None
+        busco_mode = None
+        busco_pred = None
 
         try:
             with open(file_path, 'r') as file:
                 content = file.read()
                 version_match = re.search(r"# BUSCO version is: ([\d.]+)", content)
                 if version_match:
                     busco_version = version_match.group(1)
-                lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)", content)
+                lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of (genomes|species): (\d+), number of BUSCOs: (\d+)\)", content)
                 if lineage_match:
-                    lineage_info = lineage_match.groups()
-                if not lineage_info:
-                    lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)", content)
-                    if lineage_match:
-                        lineage_info = lineage_match.groups()
+                    lineage_info = (lineage_match.group(1), lineage_match.group(3), lineage_match.group(4))
+                mode_match = re.search(r"# BUSCO was run in mode: (\w+)", content)
+                if mode_match:
+                    busco_mode = mode_match.group(1)
+                pred_match = re.search(r"# Gene predictor used: (\w+)", content)
+                if pred_match:
+                    busco_pred = pred_match.group(1)
 
         except Exception as e:
             logging.warning(f"Error reading {file_path}: {str(e)}")
 
-        return busco_version, lineage_info
+        return busco_version, lineage_info, busco_mode, busco_pred
 
     # Function to check and generate warning messages
     def generate_warning_paragraphs(expected, observed, trait):
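The widened lineage regex accepts both the "number of genomes" and "number of species" wordings in a single pass; group(2) captures the wording itself, which is why the tuple is built from groups 1, 3 and 4. A quick standalone check against hypothetical summary lines:

import re

pattern = r"The lineage dataset is: (.*?) \(Creation date:.*?, number of (genomes|species): (\d+), number of BUSCOs: (\d+)\)"
lines = [
    "# The lineage dataset is: vertebrata_odb10 (Creation date: 2021-02-19, number of genomes: 67, number of BUSCOs: 3354)",
    "# The lineage dataset is: aves_odb10 (Creation date: 2024-01-08, number of species: 10, number of BUSCOs: 8338)",
]
for line in lines:
    m = re.search(pattern, line)
    if m:
        print((m.group(1), m.group(3), m.group(4)))  # (lineage, count, BUSCOs)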
@@ -195,6 +217,7 @@ def generate_warning_paragraphs(expected, observed, trait):
             paragraphs.append(Paragraph(message, styles["midiStyle"]))
         except Exception as e:
             logging.warning(f"Error in generating warning for {trait}: {str(e)}")
+
         return paragraphs
 
     # Generate warnings for curated haplotypes (qv, kcomp, busco)
@@ -465,19 +488,29 @@ def generate_pipeline_tree(pipeline_data):
             if isinstance(haplotype_properties, dict):
                 if 'gfastats--nstar-report_txt' in haplotype_properties:
                     file_path = haplotype_properties['gfastats--nstar-report_txt']
-                    with open(file_path, 'r') as file:
-                        content = file.read()
-                    gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+                    try:
+                        with open(file_path, 'r') as file:
+                            content = file.read()
+                        gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+                    except FileNotFoundError:
+                        logging.error(f"Gfastats file not found: {file_path}")
+                    except Exception as e:
+                        logging.error(f"Error processing gfastats file {file_path}: {str(e)}")
 
     gaps_per_gbp_data = {}
     for (asm_stage, haplotypes), values in gfastats_data.items():
         try:
-            gaps = float(values[gaps_index])
-            total_length = float(values[total_length_index])
-            gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
-            gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
-        except (ValueError, ZeroDivisionError):
-            gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
+            gaps = float(values[gaps_index].replace(',', ''))
+            total_length = float(values[total_length_index].replace(',', ''))
+            if total_length > 0:
+                gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
+                gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
+            else:
+                logging.warning(f"Total length is zero for {asm_stage} {haplotypes}")
+                gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
+        except (ValueError, IndexError):
+            logging.warning(f"Could not calculate gaps per Gbp for {asm_stage} {haplotypes}")
+            gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
 
     # Define the contigging table (column names)
     asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
@@ -486,11 +519,10 @@ def generate_pipeline_tree(pipeline_data):
     for i in range(len(display_names)):
         metric = display_names[i]
         if metric not in exclusion_list:
-            asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
+            asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), ['N/A'] * len(keys))[i]) if (asm_stage, haplotypes) in gfastats_data else 'N/A' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # Add the gaps/gbp in between
-    asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
-
+    asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), 'N/A')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
     # get QV, Kmer completeness and BUSCO data
     qv_data = {}
     completeness_data = {}
@@ -747,23 +779,31 @@ def generate_pipeline_tree(pipeline_data):
     # Spacer
     elements.append(Spacer(1, 5))
 
-    # Store BUSCO version and lineage information from each file in list
+    # Store BUSCO information from each file in a list
     busco_info_list = []
     for asm_stages, stage_properties in asm_data.items():
         for i, haplotype_properties in stage_properties.items():
             if isinstance(haplotype_properties, dict):
                 if 'busco_short_summary_txt' in haplotype_properties:
-                    busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
-                    if busco_version and lineage_info:
-                        busco_info_list.append((busco_version, lineage_info))
+                    busco_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
+                    if all(busco_info):
+                        busco_info_list.append(busco_info)
+
+    # Function to format BUSCO information
+    def format_busco_info(info):
+        version, (lineage, genomes, buscos), mode, predictor = info
+        return f"BUSCO: {version} ({mode}, {predictor}) / Lineage: {lineage} (genomes:{genomes}, BUSCOs:{buscos})"
 
     # Checking if all elements in the list are identical
-    if all(info == busco_info_list[0] for info in busco_info_list):
-        busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
-        elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
+    if busco_info_list and all(info == busco_info_list[0] for info in busco_info_list):
+        busco_text = format_busco_info(busco_info_list[0])
+        elements.append(Paragraph(busco_text, styles['miniStyle']))
     else:
-        elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
-        logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")
+        elements.append(Paragraph("Warning! BUSCO versions or lineage datasets are not the same across results:", styles['miniStyle']))
+        logging.warning("WARNING: BUSCO versions or lineage datasets are not the same across results")
+        for info in busco_info_list:
+            busco_text = format_busco_info(info)
+            elements.append(Paragraph(busco_text, styles['miniStyle']))
 
     # Page break
     elements.append(PageBreak())
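Given a hypothetical info tuple, the new format_busco_info helper renders a single summary line, for example:

# made-up tuple in the (version, lineage_info, mode, predictor) shape returned by extract_busco_info
info = ("5.4.3", ("vertebrata_odb10", "67", "3354"), "genome", "metaeuk")
version, (lineage, genomes, buscos), mode, predictor = info
print(f"BUSCO: {version} ({mode}, {predictor}) / Lineage: {lineage} (genomes:{genomes}, BUSCOs:{buscos})")
# BUSCO: 5.4.3 (genome, metaeuk) / Lineage: vertebrata_odb10 (genomes:67, BUSCOs:3354)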
@@ -806,7 +846,7 @@ def generate_pipeline_tree(pipeline_data):

     # Add paragraph for the link
     if link:
-        link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
+        link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[non-permanent LINK]</link>'
     else:
         link_html = f'<b>{haplotype}</b> File link is missing!'
 
@@ -852,93 +892,75 @@ def generate_pipeline_tree(pipeline_data):
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
 
     # Get paths for spectra files
-    spectra_files = {
-        'hap1': {
-            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
-        },
-        'hap2': {
-            'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
-        },
-        'common': {
-            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
-            'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
-        }
-    }
-
-    # Filter out None values and empty strings
-    spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}
+    spectra_files = {'common': {}}
+    for assembly_type, assembly_data in curated_assemblies.items():
+        if 'merqury_hap_spectra_cn_png' in assembly_data:
+            spectra_files[assembly_type] = {'spectra_cn_png': assembly_data['merqury_hap_spectra_cn_png']}
+        if 'merqury_spectra_cn_png' in assembly_data:
+            spectra_files['common']['spectra_cn_png'] = assembly_data['merqury_spectra_cn_png']
+        if 'merqury_spectra_asm_png' in assembly_data:
+            spectra_files['common']['spectra_asm_png'] = assembly_data['merqury_spectra_asm_png']
 
     # Determine the number of spectra-cn files and assign unique names if needed
     spectra_cn_files = [
-        spectra_files['common'].get('spectra_cn_png', None),
-        spectra_files['hap1'].get('spectra_cn_png', None),
-        spectra_files['hap2'].get('spectra_cn_png', None)
+        file_dict.get('spectra_cn_png', None)
+        for file_dict in spectra_files.values()
+        if file_dict.get('spectra_cn_png')
     ]
-    spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values
+    spectra_cn_files = list(set(spectra_cn_files))  # Remove duplicates
 
     if len(spectra_cn_files) == 3:
         # For 3 spectra-cn files
         shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
-        similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
-        if similar_files:
-            unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
     else:
         shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
-        unique_name1 = unique_name2 = None
+        # unique_name1 = unique_name2 = None
 
     # Create image objects and add filename below each image
    images = []
 
     for label, file_dict in spectra_files.items():
         for key, png_file in file_dict.items():
-            if png_file:
-                image = Image(png_file, width=8.4 * cm, height=7 * cm)
-                filename = os.path.basename(png_file)
-
-                if filename.endswith("spectra-asm.ln.png"):
-                    text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
-                elif filename.endswith("spectra-cn.ln.png"):
-                    if len(spectra_cn_files) == 3:
-                        # For 3 spectra-cn files use particular text
-                        if png_file == shortest_spectra_cn_file:
-                            text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
-                        else:
-                            if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
-                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
-                            elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
-                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
-                            else:
-                                text = "Distribution of k-mer counts per copy numbers found in asm"
-                    else:
-                        # For 2 spectra-cn files use same text
-                        text = "Distribution of k-mer counts per copy numbers found in asm"
-                else:
-                    text = filename
-
-                images.append([image, Paragraph(text, styles["midiStyle"])])
-
-    # Filter None values
-    images = [img for img in images if img[0] is not None]
+            if png_file and os.path.exists(png_file):
+                try:
+                    image = Image(png_file, width=8.4 * cm, height=7 * cm)
+                    filename = os.path.basename(png_file)
+
+                    if filename.endswith("spectra-asm.ln.png"):
+                        text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
+                    elif filename.endswith("spectra-cn.ln.png"):
+                        if len(spectra_cn_files) == 3:
+                            if png_file == shortest_spectra_cn_file:
+                                text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
+                            else:
+                                text = f"Distribution of k-mer counts per copy numbers found in {label} (hapl.)"
+                        else:
+                            text = "Distribution of k-mer counts per copy numbers found in asm"
+                    else:
+                        text = filename
+
+                    images.append([image, Paragraph(text, styles["midiStyle"])])
+                except Exception as e:
+                    logging.error(f"Error processing image {png_file}: {str(e)}")
 
-    # Get number of rows and columns for the table
-    num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
-    num_columns = 2
-
-    # Create the table with dynamic size
-    image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
-    image_table = Table(image_table_data)
-
-    # Style the "table"
-    table_style = TableStyle([
-        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
-        ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
-    ])
-
-    # Set the style
-    image_table.setStyle(table_style)
-
-    # Add image table to elements
-    elements.append(image_table)
+    if images:
+        num_rows = (len(images) + 1) // 2
+        num_columns = 2
+        image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
+        image_table = Table(image_table_data)
+
+        # Style the table
+        table_style = TableStyle([
+            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+            ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
+        ])
+
+        image_table.setStyle(table_style)
+        elements.append(image_table)
+    else:
+        elements.append(Paragraph("No K-mer spectra images available.", styles["midiStyle"]))
 
     # Increase counter by the number of PNGs added
     counter += len(images)
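To see what the rewritten collection loop produces, here is a standalone sketch with a hypothetical curated_assemblies mapping; since the hardcoded hap1/hap2 keys are gone, any haplotype naming now works:

# hypothetical YAML-derived mapping, not real pipeline output
curated_assemblies = {
    'hap1': {'merqury_hap_spectra_cn_png': 'hap1.spectra-cn.ln.png',
             'merqury_spectra_cn_png': 'both.spectra-cn.ln.png',
             'merqury_spectra_asm_png': 'both.spectra-asm.ln.png'},
    'hap2': {'merqury_hap_spectra_cn_png': 'hap2.spectra-cn.ln.png'},
}
spectra_files = {'common': {}}
for assembly_type, assembly_data in curated_assemblies.items():
    if 'merqury_hap_spectra_cn_png' in assembly_data:
        spectra_files[assembly_type] = {'spectra_cn_png': assembly_data['merqury_hap_spectra_cn_png']}
    if 'merqury_spectra_cn_png' in assembly_data:
        spectra_files['common']['spectra_cn_png'] = assembly_data['merqury_spectra_cn_png']
    if 'merqury_spectra_asm_png' in assembly_data:
        spectra_files['common']['spectra_asm_png'] = assembly_data['merqury_spectra_asm_png']
print(spectra_files)
# {'common': {'spectra_cn_png': 'both.spectra-cn.ln.png', 'spectra_asm_png': 'both.spectra-asm.ln.png'},
#  'hap1': {'spectra_cn_png': 'hap1.spectra-cn.ln.png'}, 'hap2': {'spectra_cn_png': 'hap2.spectra-cn.ln.png'}}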
