Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to maintenance script / ensembl push #1646

Merged
merged 13 commits into from
Jan 17, 2025
151 changes: 140 additions & 11 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ def test_name(self):
def test_common_name(self):
assert self.species.common_name == "$common_name"

def test_assembly_source(self):
assert self.genome.assembly_source == "$assembly_source"

def test_assembly_build_version(self):
assert self.genome.assembly_build_version == "$assembly_build_version"

# QC Tests. These tests are performed by another contributor
# independently referring to the citations provided in the
# species definition, filling in the appropriate values
Expand Down Expand Up @@ -177,6 +183,10 @@ def black_format(code):


def ensembl_stdpopsim_id(ensembl_id):
if ensembl_id == "canis_lupus_familiaris":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps insert a comment explaining what this is doing?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, explaining why this is necessary

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And, actually, I don't understand why it's necessary. I see below that now species.ensembl_id == "canis_lupus_familiaris", so where does ensembl_id equal "canis_familiaris"? Just missing something here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hah this is because of the transition that happened when running the maintenance script! now i bet i can take it out

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay - could you either take this out, or say in the comment exactly what needs to happen to take it out? (It says "once we have moved to the new names" - but: moved what names? where? I am confused. =)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a larger comment now explaining the context that says:

    # below is do deal with name changes in Ensembl
    # TODO: remove this once we have moved to the new names
    # this was added as a temporary fix to allow the release to be
    # updated to 113, where the names of these species were changed
    # in the future we might keep this code block, but comment it out
    # to show others how maintenance updates were performed in the case
    # where the species name changed in Ensembl

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't what I understood from talking to you - I thought that (a) we could remove this code right now if we wanted, since this if clause will never get triggered; but (b) we were going to leave the code in as an example of how to do this. Is that right?

ensembl_id = "canis_familiaris"
elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845":
ensembl_id = "escherichia_coli"
tmp = ensembl_id.split("_")[:2]
sps_id = "".join([x[0:3].capitalize() for x in tmp])
if len(sps_id) != 6:
Expand Down Expand Up @@ -296,15 +306,80 @@ def write_genome_data(self, ensembl_id):
raise ValueError(
f"Directory {id} corresponding to {ensembl_id} does" + "not exist"
)
logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
path = path / "genome_data.py"

genome_data_path = path / "genome_data.py"
# Check if file exists and has non-Ensembl assembly source
if genome_data_path.exists():
try:
# Get existing data
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
existing_assembly_source = namespace["data"].get(
"assembly_source", "ensembl"
)
if existing_assembly_source != "ensembl":
logger.info(
f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source"
andrewkern marked this conversation as resolved.
Show resolved Hide resolved
)
return ("manual", None)
except Exception as e:
logger.warning(
f"Error checking assembly source for {sps_id}: {e}. "
"Proceeding with update."
)

# Get new data from Ensembl
data = self.ensembl_client.get_genome_data(ensembl_id)

# Preserve existing assembly source or default to "ensembl"
if genome_data_path.exists():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, this duplicates the code above?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(and, it seems like it's doing the same thing, almost, as the code above?)

try:
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
data["assembly_source"] = namespace["data"].get(
"assembly_source", "ensembl"
)
except Exception:
data["assembly_source"] = "ensembl"
else:
data["assembly_source"] = "ensembl"

# Add Ensembl version number if assembly source is Ensembl
if data["assembly_source"] == "ensembl":
data["assembly_build_version"] = str(self.ensembl_client.get_release())
else:
data["assembly_build_version"] = None

# Check if existing genome data exists and compare chromosome names
if genome_data_path.exists():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is doing something different to the code above but duplicates the "open genome_data_path if it exists" code; just do that once?

try:
# Get existing chromosome names
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
existing_chroms = set(namespace["data"]["chromosomes"].keys())
new_chroms = set(data["chromosomes"].keys())

if existing_chroms != new_chroms:
logger.warning(
f"Skipping {sps_id} ({ensembl_id}): chromosome names mismatch."
petrelharp marked this conversation as resolved.
Show resolved Hide resolved
)
return ("chrom_mismatch", (existing_chroms, new_chroms))
except Exception as e:
logger.warning(
f"Error comparing chromosome names for {sps_id}: {e}. \
Proceeding with update."
)

logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
code = f"data = {data}"

# Format the code with Black so we don't get noisy diffs
with self.write(path) as f:
with self.write(genome_data_path) as f:
f.write(black_format(code))
return data
return ("updated", None)
petrelharp marked this conversation as resolved.
Show resolved Hide resolved

def write_genome_data_ncbi(self, ncbi_id, sps_id):
path = catalog_path(sps_id)
Expand Down Expand Up @@ -371,15 +446,69 @@ def update_genome_data(species):
will update the genome data for humans. Multiple species can
be specified. By default all species are updated.
"""
# TODO make this work for NCBI as well
# Track warnings and errors
skipped_species = []

# Original species processing logic
if len(species) == 0:
embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
species_list = list(stdpopsim.all_species())
logger.info(f"Found {len(species_list)} species in catalog")
embl_ids = []
for s in species_list:
logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}")
embl_ids.append((s.id, s.ensembl_id))
else:
embl_ids = [s.lower() for s in species]
embl_ids = [(s, s.lower()) for s in species]

# Process each species, maintaining existing logging
writer = DataWriter()
for eid in embl_ids:
writer.write_genome_data(eid)
writer.write_ensembl_release()
for species_id, eid in embl_ids:
try:
result = writer.write_genome_data(eid)
if result is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how that function could return None?

status, details = result
if status == "manual":
skipped_species.append(
(species_id, eid, "Manually created genome data file")
)
elif status == "chrom_mismatch":
existing_chroms, new_chroms = details
skipped_species.append(
(
species_id,
eid,
(
f"Chromosome names mismatch.\n"
f"Existing: {sorted(existing_chroms)}\n"
f"New: {sorted(new_chroms)}"
),
)
)
except ValueError as e:
logger.error(f"Error processing {eid}: {e}")
skipped_species.append((species_id, eid, str(e)))
except Exception as e:
logger.error(f"Unexpected error processing {eid}: {e}")
skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}"))

# writer.write_ensembl_release()

# Add summary report at the end
if skipped_species:
logger.warning("\n=== Species Update Summary ===")
logger.warning("The following species were not updated:")
for species_id, eid, reason in skipped_species:
if "Chromosome names mismatch" in reason:
# Split the chromosome mismatch message into multiple lines
logger.warning(f" - {species_id} (Ensembl ID: {eid}):")
logger.warning(" Chromosome names mismatch.")
# Parse the chromosome lists from the new format
existing = reason.split("Existing: ")[1].split("\n")[0]
new = reason.split("New: ")[1]
logger.warning(f" Existing chromosomes: {existing}")
logger.warning(f" New chromosomes: {new}")
else:
logger.warning(f" - {species_id} (Ensembl ID: {eid}): {reason}")


@cli.command()
Expand All @@ -391,7 +520,7 @@ def add_species(ensembl_id, force):
"""
writer = DataWriter()
writer.add_species(ensembl_id.lower(), force=force)
writer.write_ensembl_release()
# writer.write_ensembl_release()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove?



# TODO refactor this so that it's an option to add-species. By default
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AedAeg/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
"3": {"length": 409777670, "synonyms": []},
"MT": {"length": 16790, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnaPla/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@
"39": {"length": 2018729, "synonyms": []},
"40": {"length": 1354177, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoCar/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
"LGh": {"length": 248369, "synonyms": []},
"MT": {"length": 17223, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoGam/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
"X": {"length": 24393108, "synonyms": []},
"Mt": {"length": 15363, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
36 changes: 19 additions & 17 deletions stdpopsim/catalog/ApiMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,24 @@
"assembly_accession": "GCA_003254395.2",
"assembly_name": "Amel_HAv3.1",
"chromosomes": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this is why Codecov is complaining (no more synonyms, so this bit of code in ApiMel/species.py is not needed)
Screenshot from 2025-01-13 21-54-18

"CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]},
"CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]},
"CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]},
"CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]},
"CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]},
"CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]},
"CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]},
"CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]},
"CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]},
"CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]},
"CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]},
"CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]},
"CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]},
"CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]},
"CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]},
"CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]},
"CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]},
petrelharp marked this conversation as resolved.
Show resolved Hide resolved
"CM009931.2": {"length": 27754200, "synonyms": []},
"CM009932.2": {"length": 16089512, "synonyms": []},
"CM009933.2": {"length": 13619445, "synonyms": []},
"CM009934.2": {"length": 13404451, "synonyms": []},
"CM009935.2": {"length": 13896941, "synonyms": []},
"CM009936.2": {"length": 17789102, "synonyms": []},
"CM009937.2": {"length": 14198698, "synonyms": []},
"CM009938.2": {"length": 12717210, "synonyms": []},
"CM009939.2": {"length": 12354651, "synonyms": []},
"CM009940.2": {"length": 12360052, "synonyms": []},
"CM009941.2": {"length": 16352600, "synonyms": []},
"CM009942.2": {"length": 11514234, "synonyms": []},
"CM009943.2": {"length": 11279722, "synonyms": []},
"CM009944.2": {"length": 10670842, "synonyms": []},
"CM009945.2": {"length": 9534514, "synonyms": []},
"CM009946.2": {"length": 7238532, "synonyms": []},
"CM009947.2": {"length": 16343, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AraTha/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
"Mt": {"length": 366924, "synonyms": []},
"Pt": {"length": 154478, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
66 changes: 34 additions & 32 deletions stdpopsim/catalog/BosTau/genome_data.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,40 @@
# File autogenerated from Ensembl REST API. Do not edit.
data = {
"assembly_accession": "GCA_002263795.2",
"assembly_name": "ARS-UCD1.2",
"assembly_accession": "GCA_002263795.3",
"assembly_name": "ARS-UCD1.3",
"chromosomes": {
"1": {"length": 158534110, "synonyms": []},
"2": {"length": 136231102, "synonyms": []},
"3": {"length": 121005158, "synonyms": []},
"4": {"length": 120000601, "synonyms": []},
"5": {"length": 120089316, "synonyms": []},
"6": {"length": 117806340, "synonyms": []},
"7": {"length": 110682743, "synonyms": []},
"8": {"length": 113319770, "synonyms": []},
"9": {"length": 105454467, "synonyms": []},
"10": {"length": 103308737, "synonyms": []},
"11": {"length": 106982474, "synonyms": []},
"12": {"length": 87216183, "synonyms": []},
"13": {"length": 83472345, "synonyms": []},
"14": {"length": 82403003, "synonyms": []},
"15": {"length": 85007780, "synonyms": []},
"16": {"length": 81013979, "synonyms": []},
"17": {"length": 73167244, "synonyms": []},
"18": {"length": 65820629, "synonyms": []},
"19": {"length": 63449741, "synonyms": []},
"20": {"length": 71974595, "synonyms": []},
"21": {"length": 69862954, "synonyms": []},
"22": {"length": 60773035, "synonyms": []},
"23": {"length": 52498615, "synonyms": []},
"24": {"length": 62317253, "synonyms": []},
"25": {"length": 42350435, "synonyms": []},
"26": {"length": 51992305, "synonyms": []},
"27": {"length": 45612108, "synonyms": []},
"28": {"length": 45940150, "synonyms": []},
"29": {"length": 51098607, "synonyms": []},
"X": {"length": 139009144, "synonyms": []},
"1": {"length": 158534110, "synonyms": ["chr1"]},
"2": {"length": 136231102, "synonyms": ["chr2"]},
"3": {"length": 121005158, "synonyms": ["chr3"]},
"4": {"length": 120000601, "synonyms": ["chr4"]},
"5": {"length": 120089316, "synonyms": ["chr5"]},
"6": {"length": 117806340, "synonyms": ["chr6"]},
"7": {"length": 110682743, "synonyms": ["chr7"]},
"8": {"length": 113319770, "synonyms": ["chr8"]},
"9": {"length": 105454467, "synonyms": ["chr9"]},
"10": {"length": 103308737, "synonyms": ["chr10"]},
"11": {"length": 106982474, "synonyms": ["chr11"]},
"12": {"length": 87216183, "synonyms": ["chr12"]},
"13": {"length": 83472345, "synonyms": ["chr13"]},
"14": {"length": 82403003, "synonyms": ["chr14"]},
"15": {"length": 85007780, "synonyms": ["chr15"]},
"16": {"length": 81013979, "synonyms": ["chr16"]},
"17": {"length": 73167244, "synonyms": ["chr17"]},
"18": {"length": 65820629, "synonyms": ["chr18"]},
"19": {"length": 63449741, "synonyms": ["chr19"]},
"20": {"length": 71974595, "synonyms": ["chr20"]},
"21": {"length": 69862954, "synonyms": ["chr21"]},
"22": {"length": 60773035, "synonyms": ["chr22"]},
"23": {"length": 52498615, "synonyms": ["chr23"]},
"24": {"length": 62317253, "synonyms": ["chr24"]},
"25": {"length": 42350435, "synonyms": ["chr25"]},
"26": {"length": 51992305, "synonyms": ["chr26"]},
"27": {"length": 45612108, "synonyms": ["chr27"]},
"28": {"length": 45940150, "synonyms": ["chr28"]},
"29": {"length": 51098607, "synonyms": ["chr29"]},
"X": {"length": 139009144, "synonyms": ["chrX"]},
"MT": {"length": 16338, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
Loading
Loading