-
Notifications
You must be signed in to change notification settings - Fork 92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
changes to maintenance script / ensembl push #1646
Changes from 4 commits
4e03aed
2fb9499
71751a0
1a9b881
d120909
18165af
35b8c71
c2f14c8
44bcae9
05d32eb
36e9395
69c37bd
5753a8a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -132,6 +132,12 @@ def test_name(self): | |
def test_common_name(self): | ||
assert self.species.common_name == "$common_name" | ||
|
||
def test_assembly_source(self): | ||
assert self.genome.assembly_source == "$assembly_source" | ||
|
||
def test_assembly_build_version(self): | ||
assert self.genome.assembly_build_version == "$assembly_build_version" | ||
|
||
# QC Tests. These tests are performed by another contributor | ||
# independently referring to the citations provided in the | ||
# species definition, filling in the appropriate values | ||
|
@@ -177,6 +183,10 @@ def black_format(code): | |
|
||
|
||
def ensembl_stdpopsim_id(ensembl_id): | ||
if ensembl_id == "canis_lupus_familiaris": | ||
ensembl_id = "canis_familiaris" | ||
elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845": | ||
ensembl_id = "escherichia_coli" | ||
tmp = ensembl_id.split("_")[:2] | ||
sps_id = "".join([x[0:3].capitalize() for x in tmp]) | ||
if len(sps_id) != 6: | ||
|
@@ -296,15 +306,80 @@ def write_genome_data(self, ensembl_id): | |
raise ValueError( | ||
f"Directory {id} corresponding to {ensembl_id} does" + "not exist" | ||
) | ||
logger.info(f"Writing genome data for {sps_id} {ensembl_id}") | ||
path = path / "genome_data.py" | ||
|
||
genome_data_path = path / "genome_data.py" | ||
# Check if file exists and has non-Ensembl assembly source | ||
if genome_data_path.exists(): | ||
try: | ||
# Get existing data | ||
namespace = {} | ||
with open(genome_data_path) as f: | ||
exec(f.read(), namespace) | ||
existing_assembly_source = namespace["data"].get( | ||
"assembly_source", "ensembl" | ||
) | ||
if existing_assembly_source != "ensembl": | ||
logger.info( | ||
f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source" | ||
andrewkern marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
return ("manual", None) | ||
except Exception as e: | ||
logger.warning( | ||
f"Error checking assembly source for {sps_id}: {e}. " | ||
"Proceeding with update." | ||
) | ||
|
||
# Get new data from Ensembl | ||
data = self.ensembl_client.get_genome_data(ensembl_id) | ||
|
||
# Preserve existing assembly source or default to "ensembl" | ||
if genome_data_path.exists(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wait, this duplicates the code above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (and, it seems like it's doing the same thing, almost, as the code above?) |
||
try: | ||
namespace = {} | ||
with open(genome_data_path) as f: | ||
exec(f.read(), namespace) | ||
data["assembly_source"] = namespace["data"].get( | ||
"assembly_source", "ensembl" | ||
) | ||
except Exception: | ||
data["assembly_source"] = "ensembl" | ||
else: | ||
data["assembly_source"] = "ensembl" | ||
|
||
# Add Ensembl version number if assembly source is Ensembl | ||
if data["assembly_source"] == "ensembl": | ||
data["assembly_build_version"] = str(self.ensembl_client.get_release()) | ||
else: | ||
data["assembly_build_version"] = None | ||
|
||
# Check if existing genome data exists and compare chromosome names | ||
if genome_data_path.exists(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is doing something different to the code above but duplicates the "open genome_data_path if it exists" code; just do that once? |
||
try: | ||
# Get existing chromosome names | ||
namespace = {} | ||
with open(genome_data_path) as f: | ||
exec(f.read(), namespace) | ||
existing_chroms = set(namespace["data"]["chromosomes"].keys()) | ||
new_chroms = set(data["chromosomes"].keys()) | ||
|
||
if existing_chroms != new_chroms: | ||
logger.warning( | ||
f"Skipping {sps_id} ({ensembl_id}): chromosome names mismatch." | ||
petrelharp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
return ("chrom_mismatch", (existing_chroms, new_chroms)) | ||
except Exception as e: | ||
logger.warning( | ||
f"Error comparing chromosome names for {sps_id}: {e}. \ | ||
Proceeding with update." | ||
) | ||
|
||
logger.info(f"Writing genome data for {sps_id} {ensembl_id}") | ||
code = f"data = {data}" | ||
|
||
# Format the code with Black so we don't get noisy diffs | ||
with self.write(path) as f: | ||
with self.write(genome_data_path) as f: | ||
f.write(black_format(code)) | ||
return data | ||
return ("updated", None) | ||
petrelharp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def write_genome_data_ncbi(self, ncbi_id, sps_id): | ||
path = catalog_path(sps_id) | ||
|
@@ -371,15 +446,69 @@ def update_genome_data(species): | |
will update the genome data for humans. Multiple species can | ||
be specified. By default all species are updated. | ||
""" | ||
# TODO make this work for NCBI as well | ||
# Track warnings and errors | ||
skipped_species = [] | ||
|
||
# Original species processing logic | ||
if len(species) == 0: | ||
embl_ids = [s.ensembl_id for s in stdpopsim.all_species()] | ||
species_list = list(stdpopsim.all_species()) | ||
logger.info(f"Found {len(species_list)} species in catalog") | ||
embl_ids = [] | ||
for s in species_list: | ||
logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}") | ||
embl_ids.append((s.id, s.ensembl_id)) | ||
else: | ||
embl_ids = [s.lower() for s in species] | ||
embl_ids = [(s, s.lower()) for s in species] | ||
|
||
# Process each species, maintaining existing logging | ||
writer = DataWriter() | ||
for eid in embl_ids: | ||
writer.write_genome_data(eid) | ||
writer.write_ensembl_release() | ||
for species_id, eid in embl_ids: | ||
try: | ||
result = writer.write_genome_data(eid) | ||
if result is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see how that function could return |
||
status, details = result | ||
if status == "manual": | ||
skipped_species.append( | ||
(species_id, eid, "Manually created genome data file") | ||
) | ||
elif status == "chrom_mismatch": | ||
existing_chroms, new_chroms = details | ||
skipped_species.append( | ||
( | ||
species_id, | ||
eid, | ||
( | ||
f"Chromosome names mismatch.\n" | ||
f"Existing: {sorted(existing_chroms)}\n" | ||
f"New: {sorted(new_chroms)}" | ||
), | ||
) | ||
) | ||
except ValueError as e: | ||
logger.error(f"Error processing {eid}: {e}") | ||
skipped_species.append((species_id, eid, str(e))) | ||
except Exception as e: | ||
logger.error(f"Unexpected error processing {eid}: {e}") | ||
skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}")) | ||
|
||
# writer.write_ensembl_release() | ||
|
||
# Add summary report at the end | ||
if skipped_species: | ||
logger.warning("\n=== Species Update Summary ===") | ||
logger.warning("The following species were not updated:") | ||
for species_id, eid, reason in skipped_species: | ||
if "Chromosome names mismatch" in reason: | ||
# Split the chromosome mismatch message into multiple lines | ||
logger.warning(f" - {species_id} (Ensembl ID: {eid}):") | ||
logger.warning(" Chromosome names mismatch.") | ||
# Parse the chromosome lists from the new format | ||
existing = reason.split("Existing: ")[1].split("\n")[0] | ||
new = reason.split("New: ")[1] | ||
logger.warning(f" Existing chromosomes: {existing}") | ||
logger.warning(f" New chromosomes: {new}") | ||
else: | ||
logger.warning(f" - {species_id} (Ensembl ID: {eid}): {reason}") | ||
|
||
|
||
@cli.command() | ||
|
@@ -391,7 +520,7 @@ def add_species(ensembl_id, force): | |
""" | ||
writer = DataWriter() | ||
writer.add_species(ensembl_id.lower(), force=force) | ||
writer.write_ensembl_release() | ||
# writer.write_ensembl_release() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove? |
||
|
||
|
||
# TODO refactor this so that it's an option to add-species. By default | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,22 +3,24 @@ | |
"assembly_accession": "GCA_003254395.2", | ||
"assembly_name": "Amel_HAv3.1", | ||
"chromosomes": { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
"CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]}, | ||
"CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]}, | ||
"CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]}, | ||
"CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]}, | ||
"CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]}, | ||
"CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]}, | ||
"CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]}, | ||
"CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]}, | ||
"CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]}, | ||
"CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]}, | ||
"CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]}, | ||
"CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]}, | ||
"CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]}, | ||
"CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]}, | ||
"CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]}, | ||
"CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]}, | ||
"CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]}, | ||
petrelharp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"CM009931.2": {"length": 27754200, "synonyms": []}, | ||
"CM009932.2": {"length": 16089512, "synonyms": []}, | ||
"CM009933.2": {"length": 13619445, "synonyms": []}, | ||
"CM009934.2": {"length": 13404451, "synonyms": []}, | ||
"CM009935.2": {"length": 13896941, "synonyms": []}, | ||
"CM009936.2": {"length": 17789102, "synonyms": []}, | ||
"CM009937.2": {"length": 14198698, "synonyms": []}, | ||
"CM009938.2": {"length": 12717210, "synonyms": []}, | ||
"CM009939.2": {"length": 12354651, "synonyms": []}, | ||
"CM009940.2": {"length": 12360052, "synonyms": []}, | ||
"CM009941.2": {"length": 16352600, "synonyms": []}, | ||
"CM009942.2": {"length": 11514234, "synonyms": []}, | ||
"CM009943.2": {"length": 11279722, "synonyms": []}, | ||
"CM009944.2": {"length": 10670842, "synonyms": []}, | ||
"CM009945.2": {"length": 9534514, "synonyms": []}, | ||
"CM009946.2": {"length": 7238532, "synonyms": []}, | ||
"CM009947.2": {"length": 16343, "synonyms": []}, | ||
}, | ||
"assembly_source": "ensembl", | ||
"assembly_build_version": "113", | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,40 @@ | ||
# File autogenerated from Ensembl REST API. Do not edit. | ||
data = { | ||
"assembly_accession": "GCA_002263795.2", | ||
"assembly_name": "ARS-UCD1.2", | ||
"assembly_accession": "GCA_002263795.3", | ||
"assembly_name": "ARS-UCD1.3", | ||
"chromosomes": { | ||
"1": {"length": 158534110, "synonyms": []}, | ||
"2": {"length": 136231102, "synonyms": []}, | ||
"3": {"length": 121005158, "synonyms": []}, | ||
"4": {"length": 120000601, "synonyms": []}, | ||
"5": {"length": 120089316, "synonyms": []}, | ||
"6": {"length": 117806340, "synonyms": []}, | ||
"7": {"length": 110682743, "synonyms": []}, | ||
"8": {"length": 113319770, "synonyms": []}, | ||
"9": {"length": 105454467, "synonyms": []}, | ||
"10": {"length": 103308737, "synonyms": []}, | ||
"11": {"length": 106982474, "synonyms": []}, | ||
"12": {"length": 87216183, "synonyms": []}, | ||
"13": {"length": 83472345, "synonyms": []}, | ||
"14": {"length": 82403003, "synonyms": []}, | ||
"15": {"length": 85007780, "synonyms": []}, | ||
"16": {"length": 81013979, "synonyms": []}, | ||
"17": {"length": 73167244, "synonyms": []}, | ||
"18": {"length": 65820629, "synonyms": []}, | ||
"19": {"length": 63449741, "synonyms": []}, | ||
"20": {"length": 71974595, "synonyms": []}, | ||
"21": {"length": 69862954, "synonyms": []}, | ||
"22": {"length": 60773035, "synonyms": []}, | ||
"23": {"length": 52498615, "synonyms": []}, | ||
"24": {"length": 62317253, "synonyms": []}, | ||
"25": {"length": 42350435, "synonyms": []}, | ||
"26": {"length": 51992305, "synonyms": []}, | ||
"27": {"length": 45612108, "synonyms": []}, | ||
"28": {"length": 45940150, "synonyms": []}, | ||
"29": {"length": 51098607, "synonyms": []}, | ||
"X": {"length": 139009144, "synonyms": []}, | ||
"1": {"length": 158534110, "synonyms": ["chr1"]}, | ||
"2": {"length": 136231102, "synonyms": ["chr2"]}, | ||
"3": {"length": 121005158, "synonyms": ["chr3"]}, | ||
"4": {"length": 120000601, "synonyms": ["chr4"]}, | ||
"5": {"length": 120089316, "synonyms": ["chr5"]}, | ||
"6": {"length": 117806340, "synonyms": ["chr6"]}, | ||
"7": {"length": 110682743, "synonyms": ["chr7"]}, | ||
"8": {"length": 113319770, "synonyms": ["chr8"]}, | ||
"9": {"length": 105454467, "synonyms": ["chr9"]}, | ||
"10": {"length": 103308737, "synonyms": ["chr10"]}, | ||
"11": {"length": 106982474, "synonyms": ["chr11"]}, | ||
"12": {"length": 87216183, "synonyms": ["chr12"]}, | ||
"13": {"length": 83472345, "synonyms": ["chr13"]}, | ||
"14": {"length": 82403003, "synonyms": ["chr14"]}, | ||
"15": {"length": 85007780, "synonyms": ["chr15"]}, | ||
"16": {"length": 81013979, "synonyms": ["chr16"]}, | ||
"17": {"length": 73167244, "synonyms": ["chr17"]}, | ||
"18": {"length": 65820629, "synonyms": ["chr18"]}, | ||
"19": {"length": 63449741, "synonyms": ["chr19"]}, | ||
"20": {"length": 71974595, "synonyms": ["chr20"]}, | ||
"21": {"length": 69862954, "synonyms": ["chr21"]}, | ||
"22": {"length": 60773035, "synonyms": ["chr22"]}, | ||
"23": {"length": 52498615, "synonyms": ["chr23"]}, | ||
"24": {"length": 62317253, "synonyms": ["chr24"]}, | ||
"25": {"length": 42350435, "synonyms": ["chr25"]}, | ||
"26": {"length": 51992305, "synonyms": ["chr26"]}, | ||
"27": {"length": 45612108, "synonyms": ["chr27"]}, | ||
"28": {"length": 45940150, "synonyms": ["chr28"]}, | ||
"29": {"length": 51098607, "synonyms": ["chr29"]}, | ||
"X": {"length": 139009144, "synonyms": ["chrX"]}, | ||
"MT": {"length": 16338, "synonyms": []}, | ||
}, | ||
"assembly_source": "ensembl", | ||
"assembly_build_version": "113", | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
perhaps insert a comment explaining what this is doing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean, explaining why this is necessary
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And, actually, I don't understand why it's necessary. I see below that now
species.ensembl_id == "canis_lupus_familiaris"
, so where doesensembl_id
equal"canis_familiaris"
? Just missing something here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hah this is because of the transition that happened when running the maintenance script! now i bet i can take it out
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay - could you either take this out, or say in the comment exactly what needs to happen to take it out? (It says "once we have moved to the new names" - but: moved what names? where? I am confused. =)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added a larger comment now explaining the context that says:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't what I understood from talking to you - I thought that (a) we could remove this code right now if we wanted, since this
if
clause will never get triggered; but (b) we were going to leave the code in as an example of how to do this. Is that right?