popsim-consortium · andrewkern · Jan 17, 2025 · Jan 7, 2025 · Jan 8, 2025 · Jan 10, 2025
diff --git a/maintenance/main.py b/maintenance/main.py
@@ -132,6 +132,12 @@ def test_name(self):
     def test_common_name(self):
         assert self.species.common_name == "$common_name"
 
+    def test_assembly_source(self):
+        assert self.genome.assembly_source == "$assembly_source"
+
+    def test_assembly_build_version(self):
+        assert self.genome.assembly_build_version == "$assembly_build_version"
+
     # QC Tests. These tests are performed by another contributor
     # independently referring to the citations provided in the
     # species definition, filling in the appropriate values
@@ -177,6 +183,10 @@ def black_format(code):
 
 
 def ensembl_stdpopsim_id(ensembl_id):
+    if ensembl_id == "canis_lupus_familiaris":
+        ensembl_id = "canis_familiaris"
+    elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845":
+        ensembl_id = "escherichia_coli"
     tmp = ensembl_id.split("_")[:2]
     sps_id = "".join([x[0:3].capitalize() for x in tmp])
     if len(sps_id) != 6:
@@ -296,15 +306,80 @@ def write_genome_data(self, ensembl_id):
             raise ValueError(
                 f"Directory {id} corresponding to {ensembl_id} does" + "not exist"
             )
-        logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
-        path = path / "genome_data.py"
+
+        genome_data_path = path / "genome_data.py"
+        # Check if file exists and has non-Ensembl assembly source
+        if genome_data_path.exists():
+            try:
+                # Get existing data
+                namespace = {}
+                with open(genome_data_path) as f:
+                    exec(f.read(), namespace)
+                existing_assembly_source = namespace["data"].get(
+                    "assembly_source", "ensembl"
+                )
+                if existing_assembly_source != "ensembl":
+                    logger.info(
+                        f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source"
+                    )
+                    return ("manual", None)
+            except Exception as e:
+                logger.warning(
+                    f"Error checking assembly source for {sps_id}: {e}. "
+                    "Proceeding with update."
+                )
+
+        # Get new data from Ensembl
         data = self.ensembl_client.get_genome_data(ensembl_id)
+
+        # Preserve existing assembly source or default to "ensembl"
+        if genome_data_path.exists():
+            try:
+                namespace = {}
+                with open(genome_data_path) as f:
+                    exec(f.read(), namespace)
+                data["assembly_source"] = namespace["data"].get(
+                    "assembly_source", "ensembl"
+                )
+            except Exception:
+                data["assembly_source"] = "ensembl"
+        else:
+            data["assembly_source"] = "ensembl"
+
+        # Add Ensembl version number if assembly source is Ensembl
+        if data["assembly_source"] == "ensembl":
+            data["assembly_build_version"] = str(self.ensembl_client.get_release())
+        else:
+            data["assembly_build_version"] = None
+
+        # Check if existing genome data exists and compare chromosome names
+        if genome_data_path.exists():
+            try:
+                # Get existing chromosome names
+                namespace = {}
+                with open(genome_data_path) as f:
+                    exec(f.read(), namespace)
+                existing_chroms = set(namespace["data"]["chromosomes"].keys())
+                new_chroms = set(data["chromosomes"].keys())
+
+                if existing_chroms != new_chroms:
+                    logger.warning(
+                        f"Skipping {sps_id} ({ensembl_id}): chromosome names mismatch."
+                    )
+                    return ("chrom_mismatch", (existing_chroms, new_chroms))
+            except Exception as e:
+                logger.warning(
+                    f"Error comparing chromosome names for {sps_id}: {e}. \
+                        Proceeding with update."
+                )
+
+        logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
         code = f"data = {data}"
 
         # Format the code with Black so we don't get noisy diffs
-        with self.write(path) as f:
+        with self.write(genome_data_path) as f:
             f.write(black_format(code))
-        return data
+        return ("updated", None)
 
     def write_genome_data_ncbi(self, ncbi_id, sps_id):
         path = catalog_path(sps_id)
@@ -371,15 +446,69 @@ def update_genome_data(species):
     will update the genome data for humans. Multiple species can
     be specified. By default all species are updated.
     """
-    # TODO make this work for NCBI as well
+    # Track warnings and errors
+    skipped_species = []
+
+    # Original species processing logic
     if len(species) == 0:
-        embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
+        species_list = list(stdpopsim.all_species())
+        logger.info(f"Found {len(species_list)} species in catalog")
+        embl_ids = []
+        for s in species_list:
+            logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}")
+            embl_ids.append((s.id, s.ensembl_id))
     else:
-        embl_ids = [s.lower() for s in species]
+        embl_ids = [(s, s.lower()) for s in species]
+
+    # Process each species, maintaining existing logging
     writer = DataWriter()
-    for eid in embl_ids:
-        writer.write_genome_data(eid)
-    writer.write_ensembl_release()
+    for species_id, eid in embl_ids:
+        try:
+            result = writer.write_genome_data(eid)
+            if result is not None:
+                status, details = result
+                if status == "manual":
+                    skipped_species.append(
+                        (species_id, eid, "Manually created genome data file")
+                    )
+                elif status == "chrom_mismatch":
+                    existing_chroms, new_chroms = details
+                    skipped_species.append(
+                        (
+                            species_id,
+                            eid,
+                            (
+                                f"Chromosome names mismatch.\n"
+                                f"Existing: {sorted(existing_chroms)}\n"
+                                f"New: {sorted(new_chroms)}"
+                            ),
+                        )
+                    )
+        except ValueError as e:
+            logger.error(f"Error processing {eid}: {e}")
+            skipped_species.append((species_id, eid, str(e)))
+        except Exception as e:
+            logger.error(f"Unexpected error processing {eid}: {e}")
+            skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}"))
+
+    # writer.write_ensembl_release()
+
+    # Add summary report at the end
+    if skipped_species:
+        logger.warning("\n=== Species Update Summary ===")
+        logger.warning("The following species were not updated:")
+        for species_id, eid, reason in skipped_species:
+            if "Chromosome names mismatch" in reason:
+                # Split the chromosome mismatch message into multiple lines
+                logger.warning(f"  - {species_id} (Ensembl ID: {eid}):")
+                logger.warning("    Chromosome names mismatch.")
+                # Parse the chromosome lists from the new format
+                existing = reason.split("Existing: ")[1].split("\n")[0]
+                new = reason.split("New: ")[1]
+                logger.warning(f"    Existing chromosomes: {existing}")
+                logger.warning(f"    New chromosomes: {new}")
+            else:
+                logger.warning(f"  - {species_id} (Ensembl ID: {eid}): {reason}")
 
 
 @cli.command()
@@ -391,7 +520,7 @@ def add_species(ensembl_id, force):
     """
     writer = DataWriter()
     writer.add_species(ensembl_id.lower(), force=force)
-    writer.write_ensembl_release()
+    # writer.write_ensembl_release()
 
 
 # TODO refactor this so that it's an option to add-species. By default

diff --git a/stdpopsim/catalog/AedAeg/genome_data.py b/stdpopsim/catalog/AedAeg/genome_data.py
@@ -8,4 +8,6 @@
         "3": {"length": 409777670, "synonyms": []},
         "MT": {"length": 16790, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }
diff --git a/stdpopsim/catalog/AnaPla/genome_data.py b/stdpopsim/catalog/AnaPla/genome_data.py
@@ -45,4 +45,6 @@
         "39": {"length": 2018729, "synonyms": []},
         "40": {"length": 1354177, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }
diff --git a/stdpopsim/catalog/AnoCar/genome_data.py b/stdpopsim/catalog/AnoCar/genome_data.py
@@ -18,4 +18,6 @@
         "LGh": {"length": 248369, "synonyms": []},
         "MT": {"length": 17223, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "103",
 }
diff --git a/stdpopsim/catalog/AnoGam/genome_data.py b/stdpopsim/catalog/AnoGam/genome_data.py
@@ -10,4 +10,6 @@
         "X": {"length": 24393108, "synonyms": []},
         "Mt": {"length": 15363, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }
diff --git a/stdpopsim/catalog/ApiMel/genome_data.py b/stdpopsim/catalog/ApiMel/genome_data.py
@@ -3,22 +3,24 @@
     "assembly_accession": "GCA_003254395.2",
     "assembly_name": "Amel_HAv3.1",
     "chromosomes": {
-        "CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]},
-        "CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]},
-        "CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]},
-        "CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]},
-        "CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]},
-        "CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]},
-        "CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]},
-        "CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]},
-        "CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]},
-        "CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]},
-        "CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]},
-        "CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]},
-        "CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]},
-        "CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]},
-        "CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]},
-        "CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]},
-        "CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]},
+        "CM009931.2": {"length": 27754200, "synonyms": []},
+        "CM009932.2": {"length": 16089512, "synonyms": []},
+        "CM009933.2": {"length": 13619445, "synonyms": []},
+        "CM009934.2": {"length": 13404451, "synonyms": []},
+        "CM009935.2": {"length": 13896941, "synonyms": []},
+        "CM009936.2": {"length": 17789102, "synonyms": []},
+        "CM009937.2": {"length": 14198698, "synonyms": []},
+        "CM009938.2": {"length": 12717210, "synonyms": []},
+        "CM009939.2": {"length": 12354651, "synonyms": []},
+        "CM009940.2": {"length": 12360052, "synonyms": []},
+        "CM009941.2": {"length": 16352600, "synonyms": []},
+        "CM009942.2": {"length": 11514234, "synonyms": []},
+        "CM009943.2": {"length": 11279722, "synonyms": []},
+        "CM009944.2": {"length": 10670842, "synonyms": []},
+        "CM009945.2": {"length": 9534514, "synonyms": []},
+        "CM009946.2": {"length": 7238532, "synonyms": []},
+        "CM009947.2": {"length": 16343, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }
diff --git a/stdpopsim/catalog/AraTha/genome_data.py b/stdpopsim/catalog/AraTha/genome_data.py
@@ -11,4 +11,6 @@
         "Mt": {"length": 366924, "synonyms": []},
         "Pt": {"length": 154478, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }
diff --git a/stdpopsim/catalog/BosTau/genome_data.py b/stdpopsim/catalog/BosTau/genome_data.py
@@ -1,38 +1,40 @@
 # File autogenerated from Ensembl REST API. Do not edit.
 data = {
-    "assembly_accession": "GCA_002263795.2",
-    "assembly_name": "ARS-UCD1.2",
+    "assembly_accession": "GCA_002263795.3",
+    "assembly_name": "ARS-UCD1.3",
     "chromosomes": {
-        "1": {"length": 158534110, "synonyms": []},
-        "2": {"length": 136231102, "synonyms": []},
-        "3": {"length": 121005158, "synonyms": []},
-        "4": {"length": 120000601, "synonyms": []},
-        "5": {"length": 120089316, "synonyms": []},
-        "6": {"length": 117806340, "synonyms": []},
-        "7": {"length": 110682743, "synonyms": []},
-        "8": {"length": 113319770, "synonyms": []},
-        "9": {"length": 105454467, "synonyms": []},
-        "10": {"length": 103308737, "synonyms": []},
-        "11": {"length": 106982474, "synonyms": []},
-        "12": {"length": 87216183, "synonyms": []},
-        "13": {"length": 83472345, "synonyms": []},
-        "14": {"length": 82403003, "synonyms": []},
-        "15": {"length": 85007780, "synonyms": []},
-        "16": {"length": 81013979, "synonyms": []},
-        "17": {"length": 73167244, "synonyms": []},
-        "18": {"length": 65820629, "synonyms": []},
-        "19": {"length": 63449741, "synonyms": []},
-        "20": {"length": 71974595, "synonyms": []},
-        "21": {"length": 69862954, "synonyms": []},
-        "22": {"length": 60773035, "synonyms": []},
-        "23": {"length": 52498615, "synonyms": []},
-        "24": {"length": 62317253, "synonyms": []},
-        "25": {"length": 42350435, "synonyms": []},
-        "26": {"length": 51992305, "synonyms": []},
-        "27": {"length": 45612108, "synonyms": []},
-        "28": {"length": 45940150, "synonyms": []},
-        "29": {"length": 51098607, "synonyms": []},
-        "X": {"length": 139009144, "synonyms": []},
+        "1": {"length": 158534110, "synonyms": ["chr1"]},
+        "2": {"length": 136231102, "synonyms": ["chr2"]},
+        "3": {"length": 121005158, "synonyms": ["chr3"]},
+        "4": {"length": 120000601, "synonyms": ["chr4"]},
+        "5": {"length": 120089316, "synonyms": ["chr5"]},
+        "6": {"length": 117806340, "synonyms": ["chr6"]},
+        "7": {"length": 110682743, "synonyms": ["chr7"]},
+        "8": {"length": 113319770, "synonyms": ["chr8"]},
+        "9": {"length": 105454467, "synonyms": ["chr9"]},
+        "10": {"length": 103308737, "synonyms": ["chr10"]},
+        "11": {"length": 106982474, "synonyms": ["chr11"]},
+        "12": {"length": 87216183, "synonyms": ["chr12"]},
+        "13": {"length": 83472345, "synonyms": ["chr13"]},
+        "14": {"length": 82403003, "synonyms": ["chr14"]},
+        "15": {"length": 85007780, "synonyms": ["chr15"]},
+        "16": {"length": 81013979, "synonyms": ["chr16"]},
+        "17": {"length": 73167244, "synonyms": ["chr17"]},
+        "18": {"length": 65820629, "synonyms": ["chr18"]},
+        "19": {"length": 63449741, "synonyms": ["chr19"]},
+        "20": {"length": 71974595, "synonyms": ["chr20"]},
+        "21": {"length": 69862954, "synonyms": ["chr21"]},
+        "22": {"length": 60773035, "synonyms": ["chr22"]},
+        "23": {"length": 52498615, "synonyms": ["chr23"]},
+        "24": {"length": 62317253, "synonyms": ["chr24"]},
+        "25": {"length": 42350435, "synonyms": ["chr25"]},
+        "26": {"length": 51992305, "synonyms": ["chr26"]},
+        "27": {"length": 45612108, "synonyms": ["chr27"]},
+        "28": {"length": 45940150, "synonyms": ["chr28"]},
+        "29": {"length": 51098607, "synonyms": ["chr29"]},
+        "X": {"length": 139009144, "synonyms": ["chrX"]},
         "MT": {"length": 16338, "synonyms": []},
     },
+    "assembly_source": "ensembl",
+    "assembly_build_version": "113",
 }