From 88f9dca5fef95d13bcfc2edcfe39b2874515626a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 11:10:30 -0500 Subject: [PATCH 01/52] update attribute names --- src/AGR_data_retrieval_curation_disease.py | 113 ++++++++++----------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index c953b62..c140da6 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -89,45 +89,45 @@ def __init__(self, feature_cvterm, provenance_prop): """ # FlyBase data self.unique_key = '{}_{}'.format(feature_cvterm.feature_cvterm_id, provenance_prop.rank) - self.feature_cvterm = feature_cvterm # The FeatureCvterm object. - self.provenance = provenance_prop # The "provenance" FeatureCvtermprop. - self.evidence_code = None # Will be the "evidence_code" FeatureCvtermprop. - self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. - self.timestamps = [] # Will be a list of audit_chado timestamp lists. - # Derived attribures. - self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. - # Attributes for the Alliance AuditedObject. - self.obsolete = False # Never True. All FB annotations are deleted if no longer current. - self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. - self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. - # Attributes for the Alliance Association - self.subject = None # Provide allele curie (slot usage from AlleleDiseaseAnnotation) - self.predicate = 'is_implicated_in' # "Allele disease relations" CV (slot usage from AlleleDiseaseAnnotation) - self.object = None # Provide DOID (slot usage from DiseaseAnnotation). - # Attributes for the Alliance DiseaseAnnotation - self.data_provider = 'FB' - self.negated = False # Change to True for "NOT" annotations. - self.evidence_codes = [] # Set as appropriate. - self.single_reference = None # Provide FBrf ID. - self.annotation_type = 'manually_curated' # "Annotation types" CV. - self.disease_genetic_modifier = None # Gene, Allele or AGM curie. - self.disease_genetic_modifier_relation = None # "Disease genetic modifier relations" CV. - self.unique_id = self.unique_key # Use the unique_key (internal ID is ok). - self.mod_entity_id = None # N/A to FlyBase data. - self.inferred_gene = None # Gene asserted by curator to be associated with the disease annotation. - # self.with = None # N/A to FlyBase data. - self.disease_qualifiers = [] # N/A to FlyBase data. "Disease Qualifiers" CV. - self.condition_relations = [] # N/A to FlyBase data. - self.genetic_sex = None # N/A to FlyBase data. "Genetic sexes" CV. - self.related_notes = [] # N/A to FlyBase data. - self.secondary_data_provider = None # N/A to FlyBase data. + self.feature_cvterm = feature_cvterm # The FeatureCvterm object. + self.provenance = provenance_prop # The "provenance" FeatureCvtermprop. + self.evidence_code = None # Will be the "evidence_code" FeatureCvtermprop. + self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. 
+ self.timestamps = [] # Will be a list of audit_chado timestamp lists. + # Derived attributes. + self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. + # Attributes for the Alliance AuditedObjectDTO. + self.obsolete = False # Never True. All FB annotations are deleted if no longer current. + self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. + self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. + # Attributes for the Alliance DiseaseAnnotationDTO. + self.disease_relation_name = 'is_implicated_in' # "Allele disease relations" CV (slot usage from AlleleDiseaseAnnotation) + self.do_term_curie = None # Provide DOID (slot usage from DiseaseAnnotation). + self.mod_entity_id = None # N/A to FlyBase data. + self.negated = False # Change to True for "NOT" annotations. + self.evidence_curies = [] # Not sure what these are? + self.evidence_code_curies = [] # Set as appropriate. + self.reference_curie = None # Provide FBrf ID. + self.annotation_type_name = 'manually_curated' # "Annotation types" CV. + self.with_gene_curies = [] # N/A to FlyBase data. + self.disease_qualifier_names = [] # N/A to FlyBase data. "Disease Qualifiers" CV. + self.condition_relation_dtos = [] # N/A to FlyBase data. + self.genetic_sex_name = None # N/A to FlyBase data. "Genetic sexes" CV. + self.note_dtos = [] # N/A to FlyBase data. + self.data_provider_name = 'FB' + self.secondary_data_provider_name = None # N/A to FlyBase data. + self.disease_genetic_modifier_curie = None # Gene, Allele or AGM curie. + self.disease_genetic_modifier_relation_name = None # "Disease genetic modifier relations" CV. + # Attributes for the Alliance AlleleDiseaseAnnotationDTO. + self.allele_curie = None # Provide allele curie. + self.inferred_gene_curie = None # Gene inferred to be associated with the disease annotation based on curated allele. # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal. Will be exported but not necessarily displayed at Alliance. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal (exported but not displayed at Alliance). + self.export_warnings = [] # Reasons for suppressing an object from the export file. 
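# Illustrative aside (not part of the patch): the renamed attributes above follow the
# Alliance LinkML DTO field names.  A rough sketch of the record a single exported
# annotation might serialize to; every value below is made up for illustration only.
example_allele_disease_annotation_dto = {
    'allele_curie': 'FB:FBal0000001',             # hypothetical allele curie
    'disease_relation_name': 'is_implicated_in',
    'do_term_curie': 'DOID:0000001',              # hypothetical DO term
    'negated': False,
    'evidence_code_curies': ['ECO:0000000'],      # placeholder; real values come from the CEA/CEC mapping
    'reference_curie': 'FB:FBrf0000001',          # hypothetical reference
    'annotation_type_name': 'manually_curated',
    'data_provider_name': 'FB',
    'inferred_gene_curie': 'FB:FBgn0000001',      # hypothetical gene curie
    'created_by_curie': 'FB:FB_curator',
    'updated_by_curie': 'FB:FB_curator',
    'internal': False,
    'obsolete': False,
}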
def __str__(self): """Succinct text string describing the disease annotation.""" @@ -171,34 +171,33 @@ def __init__(self): } required_fields = [ - 'data_provider', - 'evidence_codes', + 'allele_curie', + 'data_provider_name', + 'disease_relation_name', + 'do_term_curie', + 'evidence_code_curies', 'internal', - 'object', - 'predicate', - 'single_reference' - 'subject' + 'reference_curie', ] output_fields = [ - 'annotation_type', - 'created_by', - 'data_provider', + 'allele_curie', + 'annotation_type_name', + 'created_by_curie', + 'data_provider_name', 'date_created', 'date_updated', - 'disease_genetic_modifier', - 'disease_genetic_modifier_relation', - 'evidence_codes', - 'inferred_gene', + 'disease_genetic_modifier_curie', + 'disease_genetic_modifier_relation_name', + 'disease_relation_name', + 'do_term_curie', + 'evidence_code_curies', + 'inferred_gene_curie', 'internal', - 'updated_by', 'negated', - 'object', 'obsolete', - 'predicate', - 'single_reference', - 'subject', - 'unique_id' # For derived annotations, feature_cvterm_id+rank changes each release. So, suppress. + 'reference_curie', + 'updated_by_curie', ] def get_disease_annotations(self, session): From b40a82d0c49b676aef071c25cd280efe1fb738e1 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 11:39:54 -0500 Subject: [PATCH 02/52] update attribute names in synthesis method --- src/AGR_data_retrieval_curation_disease.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index c140da6..ec4cc23 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -429,11 +429,11 @@ def synthesize_info(self, session): log.info('Synthesizing disease annotation info.') for dis_anno in self.dis_anno_dict.values(): log.debug('Evaluating annotation: {}'.format(dis_anno)) - # Get subject, object and pub. - dis_anno.subject = 'FB:{}'.format(dis_anno.feature_cvterm.feature.uniquename) - dis_anno.object = 'DOID:{}'.format(dis_anno.feature_cvterm.cvterm.dbxref.accession) - dis_anno.single_reference = self.get_pub_xref(session, dis_anno.feature_cvterm.pub.uniquename) - dis_anno.inferred_gene = self.get_inferred_gene(session, dis_anno.feature_cvterm.feature.feature_id) + # Get allele, DO term and pub. + dis_anno.allele_curie = 'FB:{}'.format(dis_anno.feature_cvterm.feature.uniquename) + dis_anno.do_term_curie = 'DOID:{}'.format(dis_anno.feature_cvterm.cvterm.dbxref.accession) + dis_anno.reference_curie = self.get_pub_xref(session, dis_anno.feature_cvterm.pub.uniquename) + dis_anno.inferred_gene_curie = self.get_inferred_gene(session, dis_anno.feature_cvterm.feature.feature_id) # Mark negative annotations. if dis_anno.qualifier.value == 'DOES NOT model': dis_anno.negated = True @@ -445,23 +445,23 @@ def synthesize_info(self, session): # timestamp_to_rfc3339_localoffset(datetime.datetime.timestamp(max(dis_anno.timestamps))) # Determine evidence_code if dis_anno.evidence_code.value.startswith('CEC'): - dis_anno.evidence_codes.append(self.evidence_code_xrefs['CEC']) + dis_anno.evidence_code_curies.append(self.evidence_code_xrefs['CEC']) else: - dis_anno.evidence_codes.append(self.evidence_code_xrefs['CEA']) + dis_anno.evidence_code_curies.append(self.evidence_code_xrefs['CEA']) # Find modifiers and their relations. 
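# Illustrative aside (not part of the patch): the modifier-handling step below scans the
# free-text evidence value for a FlyBase modifier phrase and an embedded FBal ID.  A
# minimal standalone sketch of the same idea; the phrase-to-relation mapping and the
# example evidence string are hypothetical.
import re

def sketch_find_modifier(evidence_value, modifier_terms):
    """Return (relation_name, allele_id) parsed from a free-text evidence string."""
    relation_name = None
    for fb_phrase, agr_relation in modifier_terms.items():
        if fb_phrase in evidence_value:
            relation_name = agr_relation
    match = re.search(r'FBal[0-9]{7}', evidence_value)
    allele_id = match.group(0) if match else None
    return relation_name, allele_id

# sketch_find_modifier('CEC (ameliorated by FBal0123456)', {'ameliorated by': 'ameliorated_by'})
# -> ('ameliorated_by', 'FBal0123456')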
allele_regex = r'FBal[0-9]{7}' for fb_term in self.disease_genetic_modifier_terms.keys(): if fb_term in dis_anno.evidence_code.value: - dis_anno.disease_genetic_modifier_relation = self.disease_genetic_modifier_terms[fb_term] + dis_anno.disease_genetic_modifier_relation_name = self.disease_genetic_modifier_terms[fb_term] if re.search(allele_regex, dis_anno.evidence_code.value): allele_id = re.search(allele_regex, dis_anno.evidence_code.value).group(0) if self.confirm_current_allele_by_uniquename(session, allele_id): - dis_anno.disease_genetic_modifier = 'FB:{}'.format(allele_id) + dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(allele_id) else: # Look up current allele by 2o ID. Use that. curr_allele_id = self.get_current_id_for_allele(session, allele_id) if curr_allele_id: - dis_anno.disease_genetic_modifier = 'FB:{}'.format(curr_allele_id) + dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(curr_allele_id) else: dis_anno.modifier_problem = True # Now check for conditions that prevent export. From 9e2b82eaf7315764a6ce14084d0f921b9889756f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:02:23 -0500 Subject: [PATCH 03/52] update attribute names and synonym handling --- src/AGR_data_retrieval_curation_gene.py | 383 +++++++++++++++--------- 1 file changed, 235 insertions(+), 148 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index df7f5ee..3bf47c2 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -31,8 +31,8 @@ # from sqlalchemy.orm.exc import NoResultFound from harvdev_utils.char_conversions import sub_sup_sgml_to_html from harvdev_utils.production import ( - Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, Featureloc, - Featureprop, OrganismDbxref, Synonym + Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, + Featureloc, Featureprop, Organism, OrganismDbxref, Pub, PubDbxref, Synonym ) from harvdev_utils.psycopg_functions import set_up_db_reading @@ -99,45 +99,45 @@ def __init__(self, feature): # 1. Gene.name is requested (not required), but not all genes have a fullname. # 2. Gene.taxon is required, but even after updating NCBITaxon info at FlyBase, not all genes will have NCBI taxon ID. # 3. GenomicLocation lacks strand info. - self.feature = feature # The Feature object corresponding to the FlyBase gene. - self.organism_abbr = None # Will be the organism.abbreviation for the gene's species of origin. - self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. - self.featureloc = None # Will be Featureloc object for the gene. - self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. - self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. - self.curr_fb_symbol = None # Will be the current symbol Synonym object. - self.curr_fb_fullname = None # Will be the current fullname Synonym object. - self.internal_synonyms = [] # Will be list of internal synonym names (and synonym_sgml if different). - self.public_synonyms = [] # Will be list of public synonym names (and synonym_sgml if different). - self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. - self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. - self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. - self.timestamps = [] # Add all timestamps here. - # Attributes for the Alliance AuditedObject. 
- self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if gene not intended for display at Alliance website. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Earliest timestamp. - self.date_updated = None # Latest timestamp. - # self.data_provider = 'FB' # The MOD abbreviation. - # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. + self.feature = feature # The Feature object corresponding to the FlyBase gene. + self.organism_abbr = None # Will be the organism.abbreviation for the gene's species of origin. + self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. + self.featureloc = None # Will be Featureloc object for the gene. + self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. + self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. + self.curr_anno_id = None # Will be current annotation ID for the gene (str). + self.curr_fb_symbol = [] # Will be all FeatureSynonym objects in support of the current symbol Synonym object. + self.curr_fb_fullname = [] # Will be all FeatureSynonym objects in support of the current fullname Synonym object. + self.systematic_name = [] # Will be all FeatureSynonym objects using the systematic name of the gene. + self.other_synonyms = [] # Will be all FeatureSynonym objects in support of non-current synonyms. + self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. + self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. + self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. + self.timestamps = [] # Add all timestamps here. + # Attributes for the Alliance AuditedObjectDTO. + self.obsolete = False # Never True. All FB annotations are deleted if no longer current. + self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. + self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. + # Attributes for the Alliance BiologicalEntityDTO. BiologicalEntityDTO is_a AuditedObjectDTO. self.curie = 'FB:{}'.format(feature.uniquename) - self.taxon = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 561 genes (72 species). - # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. - self.name = None # Will be current fullname synonym - report ascii or utf8 (sgml) version? - self.synonyms = [] # All current and non-current ASCII and SGML synonyms. - self.cross_references = [] # Report only select dbs, using AGR-accepted db_prefix. - self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. - # Attributes for the Alliance Gene. Gene is_a GenomicEntity. - self.genomic_locations = [] # Will need to be list of GenomicLocation objects. - self.symbol = None # Will be a string (ascii or utf8)? 
- self.gene_synopsis = None # Will be the gene's "gene_summary_text" featureprop value - remove "@" symbols. - self.gene_type = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. + self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 561 genes (72 species). + # Attributes for the Alliance GenomicEntityDTO. GenomicEntityDTO is_a BiologicalEntityDTO. + self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. + self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. + self.genomic_location_dtos = [] # Will need to be list of GenomicLocation objects. + # Attributes for the Alliance GeneDTO. GeneDTO is_a GenomicEntityDTO. + self.gene_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. + self.gene_full_name_dto = None # Will be a single GeneFullNameSlotAnnotation. + self.gene_systematic_name_dto = None # Will be a single GeneSystematicNameSlotAnnotation. + self.gene_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. + self.gene_type_curie = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal in the export file. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal in the export file. + self.export_warnings = [] # Reasons for suppressing an object from the export file. def __str__(self): """Succinct text string describing the AllianceGene object.""" @@ -150,6 +150,8 @@ class GeneHandler(object): def __init__(self): """Create the GeneHandler object.""" self.gene_dict = {} # An FBgnID-keyed dict of AllianceGene objects. + self.all_pubs_dict = {} # A pub_id-keyed dict of pub curies (PMID or FBrf). + self.all_synonyms_dict = {} # A synonym_id-keyed dict of Synonym objects. self.pthr_dict = {} # Will be an 1:1 FBgnID-PTHR xref dict. self.chr_dict = {} # Will be a feature_id-keyed dict of chr scaffold uniquenames. self.total_feat_cnt = 0 # Count of all genes found in starting query. @@ -159,26 +161,27 @@ def __init__(self): test_genes = ['wg', 'mt:ori', 'lncRNA:roX1', 'CG12656'] required_fields = [ 'curie', - 'taxon', - 'symbol', - 'internal' + 'gene_symbol_dto', + 'gene_full_name_dto', + 'internal', + 'taxon_curie', ] output_fields = [ - 'created_by', - 'cross_references', + 'created_by_curie', + 'cross_reference_dtos', 'curie', 'date_created', 'date_updated', - 'gene_type', - 'genomic_locations', + 'gene_full_name_dto', + 'gene_symbol_dto', + 'gene_synonym_dtos', + 'gene_type_curie', + 'genomic_location_dtos', 'internal', - 'updated_by', - 'name', 'obsolete', 'secondary_identifiers', - 'symbol', - 'synonyms', - 'taxon' + 'taxon_curie', + 'updated_by_curie', ] internal_gene_types = [ 'engineered_fusion_gene', @@ -224,6 +227,42 @@ def open_panther_file(self): self.pthr_dict[re.search(fb_regex, row[FB]).group(0)] = re.search(pthr_regex, row[PTHR]).group(0) return + def get_references(self, session): + """Get all references.""" + log.info('Get all references.') + # First get all current pubs having an FBrf uniquename. 
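# Illustrative aside (not part of the patch): get_references() builds a pub_id-keyed
# lookup in two passes.  Every current FBrf pub first gets an 'FB:FBrf...' curie, and the
# second query then overwrites that value with a 'PMID:...' curie wherever a current
# PubMed dbxref exists.  The end result looks roughly like this (pub_ids and curies are
# made up):
example_all_pubs_dict = {
    101: 'PMID:12345678',    # FBrf pub that also has a current PubMed xref
    102: 'FB:FBrf0000002',   # FBrf pub with no PubMed xref
    103: 'FB:unattributed',  # the special "unattributed" pub
}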
+ fbrf_regex = r'^(FBrf[0-9]{7}|unattributed)$' + filters = ( + Pub.uniquename.op('~')(fbrf_regex), + Pub.is_obsolete.is_(False) + ) + results = session.query(Pub).\ + filter(*filters).\ + distinct() + pub_counter = 0 + for pub in results: + self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + counter += 1 + # Next find PMIDs if available and replace the curie in the all_pubs_dict. + filters = ( + Pub.uniquename.op('~')(fbrf_regex), + Pub.is_obsolete.is_(False), + Db.name == 'pubmed', + PubDbxref.is_current.is_(True) + ) + pmid_xrefs = session.query(Pub, Dbxref).\ + join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ + join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + pmid_counter = 0 + for xref in pmid_xrefs: + self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' + pmid_counter += 1 + log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') + return + def get_genes(self, session): """Get all genes.""" log.info('Querying chado for genes.') @@ -262,11 +301,48 @@ def get_gene_taxons(self, session): organism_taxon_dict[result.OrganismDbxref.organism_id] = result.Dbxref.accession for gene in self.gene_dict.values(): try: - gene.taxon = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) + gene.taxon_curie = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(gene)) return + def get_gene_dbxrefs(self, session): + """Get all dbxrefs for genes. This will take 10-15 minutes.""" + log.info('Getting gene dbxrefs.') + gene_regex = r'^FBgn[0-9]{7}$' + filters = ( + Feature.uniquename.op('~')(gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene', + Db.name.in_((self.fb_agr_db_dict.keys())) + ) + gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ + join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in gene_dbxref_results: + counter += 1 + if counter % 100000 == 0: + log.debug('Processing xref #{}'.format(counter)) + # Skip current FlyBase accessions. + # If present, these are same as feature.uniquename. + # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. + if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': + pass + elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': + self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) + elif result.Db.name == 'FlyBase Annotation IDs': + self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) + if result.FeatureDbxref.is_current is True: + self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + else: + self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) + return + def get_synonyms(self, session): """Get current and non-current symbols and full names for genes.""" log.info('Getting gene synonyms.') @@ -286,18 +362,19 @@ def get_synonyms(self, session): filter(*filters).\ distinct() for result in gene_curr_symbol_results: + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = Synonym + # Second, collect FeatureSynonym objects by type. 
if result.FeatureSynonym.is_current is True: if result.synonym_type.name == 'symbol': - self.gene_dict[result.Feature.uniquename].curr_fb_symbol = result.Synonym + self.gene_dict[result.Feature.uniquename].curr_fb_symbol.append(result.FeatureSynonym) elif result.synonym_type.name == 'fullname': - self.gene_dict[result.Feature.uniquename].curr_fb_fullname = result.Synonym - elif result.FeatureSynonym.is_internal is True: - self.gene_dict[result.Feature.uniquename].internal_synonyms.append(result.Synonym.name) - self.gene_dict[result.Feature.uniquename].internal_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) + self.gene_dict[result.Feature.uniquename].curr_fb_fullname.append(result.FeatureSynonym) else: - self.gene_dict[result.Feature.uniquename].public_synonyms.append(result.Synonym.name) - self.gene_dict[result.Feature.uniquename].public_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - + self.gene_dict[result.Feature.uniquename].other_synonyms.append(result.FeatureSynonym) + # Third, catch synonyms that match the annotation ID (aka, systematic_name). + if result.Synonym.name == self.gene_dict[result.Feature.uniquename].curr_anno_id: + self.gene_dict[result.Feature.uniquename].systematic_name.append(result.FeatureSynonym) return def get_gene_snapshots(self, session): @@ -340,7 +417,7 @@ def get_gene_types(self, session): filter(*filters).\ distinct() for result in gene_type_results: - self.gene_dict[result.feature.uniquename].gene_type = result.value[1:10].replace('SO', 'SO:') + self.gene_dict[result.feature.uniquename].gene_type_curie = result.value[1:10].replace('SO', 'SO:') self.gene_dict[result.feature.uniquename].gene_type_name = result.value[11:-1] return @@ -375,41 +452,6 @@ def get_gene_timestamps(self, session): ######################################################################## return - def get_gene_dbxrefs(self, session): - """Get all dbxrefs for genes. This will take 10-15 minutes.""" - log.info('Getting gene dbxrefs.') - gene_regex = r'^FBgn[0-9]{7}$' - filters = ( - Feature.uniquename.op('~')(gene_regex), - Feature.is_analysis.is_(False), - Cvterm.name == 'gene', - Db.name.in_((self.fb_agr_db_dict.keys())) - ) - gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ - join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ - join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ - join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ - join(Db, (Db.db_id == Dbxref.db_id)).\ - filter(*filters).\ - distinct() - counter = 0 - for result in gene_dbxref_results: - counter += 1 - if counter % 100000 == 0: - log.debug('Processing xref #{}'.format(counter)) - # Skip current FlyBase accessions. - # If present, these are same as feature.uniquename. - # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. 
- if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': - pass - elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': - self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) - elif result.Db.name == 'FlyBase Annotation IDs': - self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) - else: - self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) - return - def get_gene_featureloc(self, session): """Getting gene featureloc.""" log.info('Getting gene genomic locations.') @@ -445,22 +487,103 @@ def get_gene_featureloc(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" self.open_panther_file() + self.get_references(session) self.get_genes(session) self.get_gene_taxons(session) + self.get_gene_dbxrefs(session) self.get_synonyms(session) self.get_gene_snapshots(session) self.get_gene_types(session) self.get_gene_timestamps(session) - self.get_gene_dbxrefs(session) self.get_gene_featureloc(session) return + # BOB: new method for synonyms. + def process_feature_synonyms(self, input, name_type, return_single_value): + """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. + + Args: + arg1 (input): (str or list) A string, or, a list of FeatureSynonym objects. + arg2 (name_type): (str) The type of name to return. If "unspecified" is given, go by Synonym type. + arg3 (return_single_value): (bool) True if output should be a single DTO, False if a list is to be returned. + + Returns: + A single or list of name DTO objects. + + Raises: + Raises error if in put is not a string/list. + Raises error if return_single_value set to True, but many synonyms found in the input list. + + """ + if type(input) is not str or type(input) is not list: + log.error('Input must be a string or list of FeatureSynonym objects.') + raise + # First handle the simplest case where a string is given as the input. + if type(input) is str: + output_synonym_dto = { + 'name_type_name': name_type, + 'format_text': input, + 'display_text': input, + 'synonym_scope': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + if return_single_value is False: + output_synonym_dto = [output_synonym_dto] + return output_synonym_dto + # Next handle a list of FeatureSynonym objects. + # Collect pub_ids for each synonym (keyed by synonym_id). 
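# Illustrative aside (not part of the patch): the try/except-KeyError pattern below groups
# pub_ids per synonym_id.  An equivalent sketch of that grouping using
# collections.defaultdict, shown here with plain (synonym_id, pub_id) tuples instead of
# FeatureSynonym objects; the inputs are made up.
from collections import defaultdict

def sketch_group_pubs(synonym_pub_pairs):
    """Group pub_ids by synonym_id from (synonym_id, pub_id) pairs."""
    grouped = defaultdict(list)
    for synonym_id, pub_id in synonym_pub_pairs:
        grouped[synonym_id].append(pub_id)
    return dict(grouped)

# sketch_group_pubs([(1, 101), (1, 102), (2, 101)])  ->  {1: [101, 102], 2: [101]}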
+ feature_synonym_dict = {} + output_synonym_dto_list = [] + for f_s in input: + try: + feature_synonym_dict[f_s.synonym_id].append(f_s.pub_id) + except KeyError: + feature_synonym_dict[f_s.synonym_id] = [f_s.pub_id] + for synonym_id, pub_list in feature_synonym_dict.items(): + synonym = self.all_synonyms_dict[synonym_id] + if name_type == 'unspecified': + name_type_to_use = synonym.type.name + else: + name_type_to_use = name_type + output_synonym_dto = { + 'name_type_name': name_type_to_use, + 'format_text': synonym.name, + 'display_text': synonym.synonym_sgml, + 'synonym_scope': 'exact', + 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], + 'internal': False, + 'obsolete': False + } + output_synonym_dto_list.append(output_synonym_dto) + if return_single_value is True and len(output_synonym_dto_list) != 1: + log.error('Found many synonyms but was expecting only one.') + raise + elif return_single_value is True and len(output_synonym_dto_list) == 1: + return output_synonym_dto_list[0] + else: + return output_synonym_dto_list + # Synthesis of initial db info. def synthesize_info(self): """Convert FlyBase gene data into an AllianceGene representation.""" log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): log.debug('Evaluating annotation: {}'.format(gene)) + # BOB: Handle synonyms. + if gene.curr_fb_symbol: + gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) + else: + gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) + if gene.curr_fb_fullname: + gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) + else: + gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) + if gene.systematic_name: + gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) + if gene.other_synonyms: + gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) # Get timestamps. if gene.timestamps: gene.date_created = strict_rfc3339.\ @@ -472,12 +595,12 @@ def synthesize_info(self): genomic_location_dict = { 'internal': False, 'obsolete': False, - 'created_by': 'FB:FB_curator', - 'updated_by': 'FB:FB_curator', - 'subject': gene.curie, + 'created_by_curie': 'FB:FB_curator', + 'updated_by_curie': 'FB:FB_curator', + 'genomic_entity_curie': gene.curie, 'predicate': 'localizes_to', - 'object': 'FB:{}'.format(self.chr_dict[gene.featureloc.srcfeature_id]), - 'has_assembly': reference_assembly + 'chromosome_curie': 'FB:{}'.format(self.chr_dict[gene.featureloc.srcfeature_id]), + 'assembly_curie': reference_assembly } if gene.featureloc.strand == -1: genomic_location_dict['start'] = str(gene.featureloc.fmax) @@ -485,51 +608,15 @@ def synthesize_info(self): else: genomic_location_dict['start'] = str(gene.featureloc.fmin + 1) genomic_location_dict['end'] = str(gene.featureloc.fmax) - gene.genomic_locations.append(genomic_location_dict) - # Get the symbol. - if gene.curr_fb_symbol: - gene.symbol = sub_sup_sgml_to_html(gene.curr_fb_symbol.synonym_sgml) - else: - gene.symbol = gene.feature.name - # Get the fullname. - if gene.curr_fb_fullname: - gene.name = sub_sup_sgml_to_html(gene.curr_fb_fullname.synonym_sgml) - else: - gene.name = gene.feature.name - # Get synonyms. 
- internal_synonym_set = set(gene.internal_synonyms) - for internal_synonym in internal_synonym_set: - internal_synonym_dict = { - 'name': internal_synonym, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': True - } - gene.synonyms.append(internal_synonym_dict) - public_synonym_set = set(gene.public_synonyms) - for public_synonym in public_synonym_set: - public_synonym_dict = { - 'name': public_synonym, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - gene.synonyms.append(public_synonym_dict) + gene.genomic_location_dtos.append(genomic_location_dict) # Add gene synopsis. if gene.gene_snapshot: gene.gene_synopsis = gene.gene_snapshot.value - # Get secondary IDs. + # Get secondary IDs (FBgn and annotation IDs). for fb_id in gene.alt_fb_ids: gene.secondary_identifiers.append('FB:{}'.format(fb_id.accession)) for anno_id in gene.annotation_ids: - # gene.secondary_identifiers.append('FB:{}'.format(anno_id.accession)) - public_synonym_dict = { - 'synonym': anno_id.accession, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - gene.synonyms.append(public_synonym_dict) + gene.secondary_identifiers.append('FB:{}'.format(anno_id.accession)) # Get crossreferences. # Start by adding gene uniquename as an xref. xref_dict = { @@ -537,11 +624,11 @@ def synthesize_info(self): 'display_name': 'FB:{}'.format(gene.feature.uniquename), 'prefix': 'FB', 'page_areas': ['gene'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - gene.cross_references.append(xref_dict) + gene.cross_reference_dtos.append(xref_dict) # Then add PANTHER xref (from file). if gene.feature.uniquename in self.pthr_dict.keys(): pthr_xref_dict = { @@ -552,7 +639,7 @@ def synthesize_info(self): 'obsolete': False, 'internal': False } - gene.cross_references.append(pthr_xref_dict) + gene.cross_reference_dtos.append(pthr_xref_dict) # Get other xrefs. for result in gene.dbxrefs: if result.Db.name in self.fb_agr_db_dict.keys(): @@ -561,13 +648,13 @@ def synthesize_info(self): 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), 'prefix': self.fb_agr_db_dict[result.Db.name], 'page_areas': ['gene'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } if result.FeatureDbxref.is_current is False: xref_dict['internal'] = True - gene.cross_references.append(xref_dict) + gene.cross_reference_dtos.append(xref_dict) # Flag internal features. 
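# Illustrative aside (not part of the patch): before the internal-feature flagging below,
# note the shape of the cross-reference DTOs assembled just above.  fb_agr_db_dict
# translates a chado db name into an Alliance prefix, so a (hypothetical) UniProt/Swiss-Prot
# accession would come through roughly as:
example_xref_dto = {
    'curie': 'UniProtKB:P00000',          # made-up accession
    'display_name': 'UniProtKB:P00000',
    'prefix': 'UniProtKB',
    'page_areas': ['gene'],
    'created_by_curie': 'FB:FB_curator',
    'obsolete': False,
    'internal': False,
}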
if gene.organism_abbr != 'Dmel': gene.internal = True @@ -575,12 +662,12 @@ def synthesize_info(self): if gene.obsolete is True: gene.internal = True gene.internal_reasons.append('Obsolete') - if gene.gene_type is None: + if gene.gene_type_curie is None: gene.internal = True gene.internal_reasons.append('Lacks gene type') if gene.gene_type_name in self.internal_gene_types: gene.internal = True - gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type)) + gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type_curie)) for attr in self.required_fields: if attr not in gene.__dict__.keys(): gene.for_alliance_export = False From 0469a5af92f8e72f84cdccb4d2b146024dc12eae Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:05:14 -0500 Subject: [PATCH 04/52] flake8 --- src/AGR_data_retrieval_curation_gene.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 3bf47c2..cb18e10 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -32,7 +32,7 @@ from harvdev_utils.char_conversions import sub_sup_sgml_to_html from harvdev_utils.production import ( Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, - Featureloc, Featureprop, Organism, OrganismDbxref, Pub, PubDbxref, Synonym + Featureloc, Featureprop, OrganismDbxref, Pub, PubDbxref, Synonym ) from harvdev_utils.psycopg_functions import set_up_db_reading @@ -242,7 +242,7 @@ def get_references(self, session): pub_counter = 0 for pub in results: self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' - counter += 1 + pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( Pub.uniquename.op('~')(fbrf_regex), @@ -550,7 +550,7 @@ def process_feature_synonyms(self, input, name_type, return_single_value): output_synonym_dto = { 'name_type_name': name_type_to_use, 'format_text': synonym.name, - 'display_text': synonym.synonym_sgml, + 'display_text': sub_sup_sgml_to_html(synonym.synonym_sgml), 'synonym_scope': 'exact', 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': False, From 98d6fb0b7eba86fdb926620a09a175f4a8791e5e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:36:45 -0500 Subject: [PATCH 05/52] fix type check for synonym method --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index cb18e10..8312752 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -515,7 +515,7 @@ def process_feature_synonyms(self, input, name_type, return_single_value): Raises error if return_single_value set to True, but many synonyms found in the input list. """ - if type(input) is not str or type(input) is not list: + if type(input) is not str and type(input) is not list: log.error('Input must be a string or list of FeatureSynonym objects.') raise # First handle the simplest case where a string is given as the input. 
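# Illustrative aside (not part of the patch): why the `or` -> `and` change above matters.
# With `or`, the guard fires for every possible input, because any object fails at least
# one of the two `is not` tests; with `and`, only inputs that are neither str nor list are
# rejected.  A quick check:
x = 'a string'
always_true = type(x) is not str or type(x) is not list      # True even for a str
correct_guard = type(x) is not str and type(x) is not list   # False for a str
assert always_true is True and correct_guard is False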
@@ -570,8 +570,9 @@ def synthesize_info(self): """Convert FlyBase gene data into an AllianceGene representation.""" log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): - log.debug('Evaluating annotation: {}'.format(gene)) + log.debug(f'Evaluating annotation: {gene}') # BOB: Handle synonyms. + log.debug(f'Handle synonyms for {gene}') if gene.curr_fb_symbol: gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) else: From bddb072ee9a941cfa8d10296295033c04ca6d552 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:47:30 -0500 Subject: [PATCH 06/52] temp suppress xrefs for faster dev --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 8312752..b1a9420 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -490,7 +490,7 @@ def query_chado(self, session): self.get_references(session) self.get_genes(session) self.get_gene_taxons(session) - self.get_gene_dbxrefs(session) + # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. self.get_synonyms(session) self.get_gene_snapshots(session) self.get_gene_types(session) From b9ff60f6f5ea0c74f501fe73c53cb24cd642c21b Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:53:08 -0500 Subject: [PATCH 07/52] fix synonym dict construction --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index b1a9420..4c93020 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -363,7 +363,7 @@ def get_synonyms(self, session): distinct() for result in gene_curr_symbol_results: # First, build the all_synonyms_dict. - self.all_synonyms_dict[result.Synonym.synonym_id] = Synonym + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym # Second, collect FeatureSynonym objects by type. if result.FeatureSynonym.is_current is True: if result.synonym_type.name == 'symbol': From 14adf409ab1ef2bd0a44ec492f21bd42af65ca3e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:58:49 -0500 Subject: [PATCH 08/52] debug syno --- src/AGR_data_retrieval_curation_gene.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 4c93020..1880e66 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -572,17 +572,20 @@ def synthesize_info(self): for gene in self.gene_dict.values(): log.debug(f'Evaluating annotation: {gene}') # BOB: Handle synonyms. 
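# Illustrative aside (not part of the patch): the change here only adjusts debug logging
# around the synonym handling, but it is a convenient place to show what those calls
# produce.  gene_symbol_dto ends up as a single name DTO shaped like the dict built in
# process_feature_synonyms(); all values below are made up.
example_gene_symbol_dto = {
    'name_type_name': 'nomenclature_symbol',
    'format_text': 'wg',
    'display_text': 'wg',
    'synonym_scope': 'exact',
    'evidence_curies': ['PMID:12345678'],   # made-up supporting reference
    'internal': False,
    'obsolete': False,
}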
- log.debug(f'Handle synonyms for {gene}') + log.debug(f'BOB: Handle symbol for {gene}') if gene.curr_fb_symbol: gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) else: gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) + log.debug(f'BOB: Handle full_name for {gene}') if gene.curr_fb_fullname: gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) else: gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) + log.debug(f'BOB: Handle systematic_name for {gene}') if gene.systematic_name: gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) + log.debug(f'BOB: Handle other synonyms for {gene}') if gene.other_synonyms: gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) # Get timestamps. From b50590e28c5853743813a99de8148a913424f684 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 12:45:33 -0500 Subject: [PATCH 09/52] group distinct chado synonyms by shared name/synonym_sgml --- src/AGR_data_retrieval_curation_gene.py | 73 +++++++++++++++++++------ 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 1880e66..0aa10d9 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -498,7 +498,6 @@ def query_chado(self, session): self.get_gene_featureloc(session) return - # BOB: new method for synonyms. def process_feature_synonyms(self, input, name_type, return_single_value): """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. @@ -515,10 +514,20 @@ def process_feature_synonyms(self, input, name_type, return_single_value): Raises error if return_single_value set to True, but many synonyms found in the input list. """ + # Dict for converting FB to AGR synonym types. + synonym_type_conversion = { + 'symbol': 'nomenclature_symbol', + 'fullname': 'full_name', + 'nickname': 'nomenclature_symbol', + 'synonym': 'nomenclature_symbol' + } + # Regex for FB systematic names (Dmel or other Dros species). + systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' + # Check for correct input. if type(input) is not str and type(input) is not list: log.error('Input must be a string or list of FeatureSynonym objects.') raise - # First handle the simplest case where a string is given as the input. + # Handle a simple string. if type(input) is str: output_synonym_dto = { 'name_type_name': name_type, @@ -532,28 +541,60 @@ def process_feature_synonyms(self, input, name_type, return_single_value): if return_single_value is False: output_synonym_dto = [output_synonym_dto] return output_synonym_dto - # Next handle a list of FeatureSynonym objects. - # Collect pub_ids for each synonym (keyed by synonym_id). + # Handle a list of FeatureSynonym objects. + # Group by each distinct name/synonym_sgml combination: for each, group pub_id by synonym type. + # Have a dict, keyed by tuple of (format_text, display_text) + # Value for each key is a dict where synonym type is key for a list of pubs, or, 'internal' key for feature_synonym.is_internal values. 
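# Illustrative aside (not part of the patch): a concrete instance of the structure the
# comments above describe, with made-up data.  The key is the distinct
# (synonym.name, synonym.synonym_sgml) pair; the value records which pubs used that name
# under each chado synonym type, plus every feature_synonym.is_internal flag seen for it.
example_feature_synonym_dict = {
    ('wg', 'wg'): {
        'symbol': [101, 102],              # pub_ids that used "wg" as a symbol
        'fullname': [103],                 # a pub that used it as a fullname
        'internal': [False, False, False],
    },
}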
feature_synonym_dict = {} - output_synonym_dto_list = [] for f_s in input: - try: - feature_synonym_dict[f_s.synonym_id].append(f_s.pub_id) - except KeyError: - feature_synonym_dict[f_s.synonym_id] = [f_s.pub_id] - for synonym_id, pub_list in feature_synonym_dict.items(): - synonym = self.all_synonyms_dict[synonym_id] - if name_type == 'unspecified': - name_type_to_use = synonym.type.name + synonym = self.all_synonyms_dict[f_s.synonym_id] + distinct_synonym_name = (synonym.name, synonym.synonym_sgml) + if distinct_synonym_name in feature_synonym_dict.keys(): + feature_synonym_dict[distinct_synonym_name]['internal'].append(f_s.is_internal) + if synonym.type.name in feature_synonym_dict[distinct_synonym_name].keys(): + feature_synonym_dict[distinct_synonym_name][synonym.type.name].append(f_s.pub_id) + else: + feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] else: + feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} + # Now convert to AGR DTO object. + output_synonym_dto_list = [] + FORMAT_TEXT = 0 + DISPLAY_TEXT = 1 + for syno_name, syno_types_pubs in feature_synonym_dict: + # Determine internal status. + if True in syno_types_pubs['internal']: + syno_internal = True + else: + syno_internal = False + # Collect all pubs. + pub_list = [] + for syno_type, syno_type_pub_list in syno_types_pubs.items(): + if syno_type == 'internal': + continue + pub_list.extend(syno_type_pub_list) + pub_list = list(set(pub_list)) + # Pick correct name type to apply. + if re.match(systematic_name_regex, syno_name[FORMAT_TEXT]) and name_type != 'full_name': + name_type_to_use = 'systematic_name' + elif name_type != 'unspecified': name_type_to_use = name_type + # If name_type is "unspecified", we need to figure this out. Same name can be used in diff ways. Pick most frequent use. + # e.g., "wingless" is stored as both symbol and fullname in chado, but more frequently curated as a fullname. + else: + type_tally = {} + for syno_type, syno_type_pub_list in syno_types_pubs.items(): + if syno_type == 'internal': + continue + type_tally[len(set(pub_list))] = syno_type + name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] output_synonym_dto = { 'name_type_name': name_type_to_use, - 'format_text': synonym.name, - 'display_text': sub_sup_sgml_to_html(synonym.synonym_sgml), + 'format_text': syno_name[FORMAT_TEXT], + 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope': 'exact', 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], - 'internal': False, + 'internal': syno_internal, 'obsolete': False } output_synonym_dto_list.append(output_synonym_dto) From f86aeef16454a847ef1662a7f690c9658f155f63 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:28:01 -0500 Subject: [PATCH 10/52] revise export of synonyms and anno IDs --- src/AGR_data_retrieval_curation_gene.py | 338 ++++++++++++------------ 1 file changed, 166 insertions(+), 172 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 0aa10d9..7085b1c 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -105,11 +105,10 @@ def __init__(self, feature): self.featureloc = None # Will be Featureloc object for the gene. self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. 
self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml. + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml. self.curr_anno_id = None # Will be current annotation ID for the gene (str). - self.curr_fb_symbol = [] # Will be all FeatureSynonym objects in support of the current symbol Synonym object. - self.curr_fb_fullname = [] # Will be all FeatureSynonym objects in support of the current fullname Synonym object. - self.systematic_name = [] # Will be all FeatureSynonym objects using the systematic name of the gene. - self.other_synonyms = [] # Will be all FeatureSynonym objects in support of non-current synonyms. + self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. @@ -158,7 +157,38 @@ def __init__(self): self.export_feat_cnt = 0 # Count of all genes exported to file. self.internal_feat_cnt = 0 # Count of all genes marked as internal=True in export file. + # Regexes. + gene_regex = r'^FBgn[0-9]{7}$' + pthr_regex = r'PTHR[0-9]{5}' + pub_regex = r'^(FBrf[0-9]{7}|unattributed)$' + systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' + # Reference dicts. + internal_gene_types = [ + 'engineered_fusion_gene', + 'engineered_region', + 'gene_group', + 'gene_with_polycistronic_transcript', + 'insulator', + 'mitochondrial_sequence', + 'origin_of_replication', + 'region', + 'regulatory_region', + 'repeat_region', + 'satellite_DNA', + 'transposable_element_gene' + ] + fb_agr_db_dict = { + 'EntrezGene': 'NCBI_Gene', + 'FlyBase': 'FB', + 'FlyBase Annotation IDs': 'FB', + 'RNAcentral': 'RNAcentral', + # 'UniProt/GCRP': 'UniProt/GCRP', + 'UniProt/Swiss-Prot': 'UniProtKB', + 'UniProt/TrEMBL': 'UniProtKB' + } + # Sample set. test_genes = ['wg', 'mt:ori', 'lncRNA:roX1', 'CG12656'] + # Export fields. 
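# Illustrative aside (not part of the patch): the export-field lists below are class-level
# metadata.  synthesize_info() earlier in this file checks required_fields against each
# object's __dict__ and flags failures; the write-out step itself is not shown in this
# diff, but a hypothetical sketch of how the two lists could drive it:
def sketch_export_entity(entity, required_fields, output_fields):
    """Return an export dict for an entity, or None if a required attribute is missing."""
    for attr in required_fields:
        if attr not in vars(entity):
            return None
    return {attr: getattr(entity, attr) for attr in output_fields if attr in vars(entity)}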
required_fields = [ 'curie', 'gene_symbol_dto', @@ -183,29 +213,6 @@ def __init__(self): 'taxon_curie', 'updated_by_curie', ] - internal_gene_types = [ - 'engineered_fusion_gene', - 'engineered_region', - 'gene_group', - 'gene_with_polycistronic_transcript', - 'insulator', - 'mitochondrial_sequence', - 'origin_of_replication', - 'region', - 'regulatory_region', - 'repeat_region', - 'satellite_DNA', - 'transposable_element_gene' - ] - fb_agr_db_dict = { - 'EntrezGene': 'NCBI_Gene', - 'FlyBase': 'FB', - 'FlyBase Annotation IDs': 'FB', - 'RNAcentral': 'RNAcentral', - # 'UniProt/GCRP': 'UniProt/GCRP', - 'UniProt/Swiss-Prot': 'UniProtKB', - 'UniProt/TrEMBL': 'UniProtKB' - } def open_panther_file(self): """Extract panther information from file.""" @@ -216,24 +223,21 @@ def open_panther_file(self): filepath = '/data/ortholog/panther/PTHR17.0_fruit_fly' tsv_file = open(filepath, "r") tsvin = csv.reader(tsv_file, delimiter='\t') - fb_regex = r'FBgn[0-9]{7}' - pthr_regex = r'PTHR[0-9]{5}' FB = 0 PTHR = 3 for row in tsvin: fields = len(row) if fields: # Ignore blank lines - if re.search(fb_regex, row[FB]) and re.search(pthr_regex, row[PTHR]): - self.pthr_dict[re.search(fb_regex, row[FB]).group(0)] = re.search(pthr_regex, row[PTHR]).group(0) + if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): + self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) return def get_references(self, session): """Get all references.""" log.info('Get all references.') # First get all current pubs having an FBrf uniquename. - fbrf_regex = r'^(FBrf[0-9]{7}|unattributed)$' filters = ( - Pub.uniquename.op('~')(fbrf_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False) ) results = session.query(Pub).\ @@ -245,7 +249,7 @@ def get_references(self, session): pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( - Pub.uniquename.op('~')(fbrf_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False), Db.name == 'pubmed', PubDbxref.is_current.is_(True) @@ -267,9 +271,8 @@ def get_genes(self, session): """Get all genes.""" log.info('Querying chado for genes.') # First get all gene features from chado. - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene' ) @@ -306,17 +309,68 @@ def get_gene_taxons(self, session): log.debug('No NCBI taxon ID available for: {}'.format(gene)) return + def get_synonyms(self, session): + """Get current and non-current symbols and full names for genes.""" + log.info('Get current and non-current symbols and full names for genes.') + filters = ( + Feature.uniquename.op('~')(self.gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene' + ) + results = session.query(Feature, FeatureSynonym, Synonym).\ + join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ + join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in results: + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym + # Second, collect FeatureSynonyms for each gene. + self.gene_dict[result.Feature.uniquename].feature_synonyms.append(result.FeatureSynonym) + # Catch current symbol and fullname strings. 
+ if result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'symbol': + self.gene_dict[result.Feature.uniquename].curr_symbol_name = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': + self.gene_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + counter += 1 + return + + def get_annotation_ids(self, session): + """Get current annotation IDs.""" + log.info('Get current annotation IDs.') + filters = ( + Feature.uniquename.op('~')(self.gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene', + FeatureDbxref.is_current.is_(True), + Db.name == 'FlyBase Annotation IDs' + ) + results = session.query(Feature, Dbxref).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ + join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in results: + self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + counter += 1 + log.info(f'Found {counter} current annotation IDs for FlyBase genes.') + return + def get_gene_dbxrefs(self, session): """Get all dbxrefs for genes. This will take 10-15 minutes.""" log.info('Getting gene dbxrefs.') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene', - Db.name.in_((self.fb_agr_db_dict.keys())) + Db.name.in_((self.fb_agr_db_dict.keys())), ) - gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ + results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ @@ -324,57 +378,22 @@ def get_gene_dbxrefs(self, session): filter(*filters).\ distinct() counter = 0 - for result in gene_dbxref_results: + for result in results: counter += 1 if counter % 100000 == 0: log.debug('Processing xref #{}'.format(counter)) - # Skip current FlyBase accessions. - # If present, these are same as feature.uniquename. - # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. + # Skip current FlyBase accessions because these are not comprehensive. + # When they exist, they're always equal to the feature.uniquename. 
+ # But they're not always present, so these dbxrefs can't be relied upon (e.g., FBgn0085177) if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': pass elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) elif result.Db.name == 'FlyBase Annotation IDs': self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) - if result.FeatureDbxref.is_current is True: - self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession else: self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) - return - - def get_synonyms(self, session): - """Get current and non-current symbols and full names for genes.""" - log.info('Getting gene synonyms.') - feature_type = aliased(Cvterm, name='feature_type') - synonym_type = aliased(Cvterm, name='synonym_type') - gene_regex = r'^FBgn[0-9]{7}$' - filters = ( - Feature.uniquename.op('~')(gene_regex), - Feature.is_analysis.is_(False), - feature_type.name == 'gene' - ) - gene_curr_symbol_results = session.query(synonym_type, Feature, FeatureSynonym, Synonym).\ - join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ - join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ - join(feature_type, (feature_type.cvterm_id == Feature.type_id)).\ - join(synonym_type, (synonym_type.cvterm_id == Synonym.type_id)).\ - filter(*filters).\ - distinct() - for result in gene_curr_symbol_results: - # First, build the all_synonyms_dict. - self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym - # Second, collect FeatureSynonym objects by type. - if result.FeatureSynonym.is_current is True: - if result.synonym_type.name == 'symbol': - self.gene_dict[result.Feature.uniquename].curr_fb_symbol.append(result.FeatureSynonym) - elif result.synonym_type.name == 'fullname': - self.gene_dict[result.Feature.uniquename].curr_fb_fullname.append(result.FeatureSynonym) - else: - self.gene_dict[result.Feature.uniquename].other_synonyms.append(result.FeatureSynonym) - # Third, catch synonyms that match the annotation ID (aka, systematic_name). - if result.Synonym.name == self.gene_dict[result.Feature.uniquename].curr_anno_id: - self.gene_dict[result.Feature.uniquename].systematic_name.append(result.FeatureSynonym) + log.info(f'Found {counter} gene dbxrefs.') return def get_gene_snapshots(self, session): @@ -382,9 +401,8 @@ def get_gene_snapshots(self, session): log.info('Getting gene snapshots.') feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='gene_summary_text') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), feature_type.name == 'gene', prop_type.name == 'gene_summary_text' @@ -404,9 +422,8 @@ def get_gene_types(self, session): log.info('Getting gene types.') feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='promoted_gene_type') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), prop_type.name == 'promoted_gene_type' ) @@ -469,9 +486,8 @@ def get_gene_featureloc(self, session): for result in chr_results: self.chr_dict[result.feature_id] = result.uniquename # Now get gene featureloc. 
- gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene' ) @@ -498,22 +514,8 @@ def query_chado(self, session): self.get_gene_featureloc(session) return - def process_feature_synonyms(self, input, name_type, return_single_value): - """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. - - Args: - arg1 (input): (str or list) A string, or, a list of FeatureSynonym objects. - arg2 (name_type): (str) The type of name to return. If "unspecified" is given, go by Synonym type. - arg3 (return_single_value): (bool) True if output should be a single DTO, False if a list is to be returned. - - Returns: - A single or list of name DTO objects. - - Raises: - Raises error if in put is not a string/list. - Raises error if return_single_value set to True, but many synonyms found in the input list. - - """ + def process_feature_synonyms(self, feature): + """Generate name/synonym DTOs for a feature that has a list of FeatureSynonym objects.""" # Dict for converting FB to AGR synonym types. synonym_type_conversion = { 'symbol': 'nomenclature_symbol', @@ -521,32 +523,21 @@ def process_feature_synonyms(self, input, name_type, return_single_value): 'nickname': 'nomenclature_symbol', 'synonym': 'nomenclature_symbol' } - # Regex for FB systematic names (Dmel or other Dros species). - systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' - # Check for correct input. - if type(input) is not str and type(input) is not list: - log.error('Input must be a string or list of FeatureSynonym objects.') - raise - # Handle a simple string. - if type(input) is str: - output_synonym_dto = { - 'name_type_name': name_type, - 'format_text': input, - 'display_text': input, - 'synonym_scope': 'exact', - 'evidence_curies': [], - 'internal': False, - 'obsolete': False - } - if return_single_value is False: - output_synonym_dto = [output_synonym_dto] - return output_synonym_dto - # Handle a list of FeatureSynonym objects. - # Group by each distinct name/synonym_sgml combination: for each, group pub_id by synonym type. - # Have a dict, keyed by tuple of (format_text, display_text) - # Value for each key is a dict where synonym type is key for a list of pubs, or, 'internal' key for feature_synonym.is_internal values. + default_name_dto = { + 'name_type_name': 'unspecified', + 'format_text': 'unspecified', + 'display_text': 'unspecified', + 'synonym_scope': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + # Create a dict of all distinct name/synonym_sgml combinations: for each, capture synonym type(s) an pub_ids. + # Keys are (synonym.name, synonym.synonym_sgml) tuples. + # Values are dicts too where keys are chado synonym types and values are lists of pub_ids. + # Value dict also has an "internal" key that stores list of FeatureSynonym.is_internal values. 
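For illustration only (not part of the patch), the intermediate dict described above might look like the following for a gene whose name has been curated both as a symbol ("wg") and as a fullname ("wingless"); the pub_id values are invented:

    feature_synonym_dict_example = {
        ('wg', 'wg'): {'symbol': [1001, 1002], 'internal': [False]},
        ('wingless', 'wingless'): {'fullname': [1001, 1003], 'symbol': [1002], 'internal': [False]},
    }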
feature_synonym_dict = {} - for f_s in input: + for f_s in feature.feature_synonyms: synonym = self.all_synonyms_dict[f_s.synonym_id] distinct_synonym_name = (synonym.name, synonym.synonym_sgml) if distinct_synonym_name in feature_synonym_dict.keys(): @@ -557,54 +548,73 @@ def process_feature_synonyms(self, input, name_type, return_single_value): feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] else: feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} - # Now convert to AGR DTO object. - output_synonym_dto_list = [] + # Convert to AGR name DTO objects. + name_dto_list = [] FORMAT_TEXT = 0 DISPLAY_TEXT = 1 - for syno_name, syno_types_pubs in feature_synonym_dict: - # Determine internal status. - if True in syno_types_pubs['internal']: - syno_internal = True - else: + for syno_name, syno_attributes in feature_synonym_dict.items(): + # Determine internal status. False trumps True. + if False in set(syno_attributes['internal']): syno_internal = False + else: + syno_internal = True # Collect all pubs. - pub_list = [] - for syno_type, syno_type_pub_list in syno_types_pubs.items(): + pub_id_list = [] + for syno_type, syno_type_pub_list in syno_attributes.items(): if syno_type == 'internal': continue - pub_list.extend(syno_type_pub_list) - pub_list = list(set(pub_list)) + pub_id_list.extend(syno_type_pub_list) + pub_id_list = list(set(pub_id_list)) # Pick correct name type to apply. - if re.match(systematic_name_regex, syno_name[FORMAT_TEXT]) and name_type != 'full_name': + if re.match(self.systematic_name_regex, syno_name[DISPLAY_TEXT]): name_type_to_use = 'systematic_name' - elif name_type != 'unspecified': - name_type_to_use = name_type - # If name_type is "unspecified", we need to figure this out. Same name can be used in diff ways. Pick most frequent use. - # e.g., "wingless" is stored as both symbol and fullname in chado, but more frequently curated as a fullname. else: type_tally = {} - for syno_type, syno_type_pub_list in syno_types_pubs.items(): + for syno_type, syno_type_pub_list in syno_attributes.items(): if syno_type == 'internal': continue - type_tally[len(set(pub_list))] = syno_type + type_tally[len(set(syno_type_pub_list))] = syno_type name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] output_synonym_dto = { 'name_type_name': name_type_to_use, - 'format_text': syno_name[FORMAT_TEXT], + 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope': 'exact', - 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': syno_internal, 'obsolete': False } - output_synonym_dto_list.append(output_synonym_dto) - if return_single_value is True and len(output_synonym_dto_list) != 1: - log.error('Found many synonyms but was expecting only one.') - raise - elif return_single_value is True and len(output_synonym_dto_list) == 1: - return output_synonym_dto_list[0] - else: - return output_synonym_dto_list + name_dto_list.append(output_synonym_dto) + # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
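Before the sifting step below, the type tally above can be worked through on a small, self-contained example (invented pub_ids; not part of the patch). It shows how a name curated under two chado synonym types is assigned the type it is most frequently attributed under:

    synonym_type_conversion = {'symbol': 'nomenclature_symbol', 'fullname': 'full_name'}
    syno_attributes = {'fullname': [1001, 1003], 'symbol': [1002], 'internal': [False]}
    type_tally = {}
    for syno_type, syno_type_pub_list in syno_attributes.items():
        if syno_type == 'internal':
            continue
        type_tally[len(set(syno_type_pub_list))] = syno_type
    # "wingless" is attributed as a fullname in two pubs but as a symbol in only one,
    # so the most frequent usage wins and the DTO gets name_type_name 'full_name'.
    assert synonym_type_conversion[type_tally[max(type_tally.keys())]] == 'full_name'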
+ for name_dto in name_dto_list: + if name_dto['display_text'] == feature.curr_anno_id: + feature.gene_systematic_name_dto = name_dto + if name_dto['name_type_name'] != 'systematic_name': + log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + if name_dto['display_text'] == feature.curr_symbol_name: + if name_dto['name_type_name'] not in ['systematic_name', 'nomenclature_symbol']: + name_dto['name_type_name'] = 'nomenclature_symbol' + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + feature.gene_symbol_dto = name_dto + elif name_dto['display_text'] == feature.curr_fullname: + feature.gene_full_name_dto = name_dto + if name_dto['name_type_name'] != 'full_name': + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + else: + feature.gene_synonym_dtos.append(name_dto) + # Symbol is required. If none, fill it in. + if feature.gene_symbol_dto is None: + placeholder_symbol_dto = default_name_dto.copy() + placeholder_symbol_dto['name_type_name'] = 'nomenclature_symbol' + placeholder_symbol_dto['format_text'] = feature.feature.name + placeholder_symbol_dto['display_text'] = feature.feature.name + feature.gene_symbol_dto = placeholder_symbol_dto + # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + if feature.gene_full_name_dto is None: + placeholder_full_name_dto = feature.gene_symbol_dto.copy() + placeholder_full_name_dto['name_type_name'] = 'full_name' + feature.gene_full_name_dto = placeholder_full_name_dto + return # Synthesis of initial db info. def synthesize_info(self): @@ -612,23 +622,7 @@ def synthesize_info(self): log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): log.debug(f'Evaluating annotation: {gene}') - # BOB: Handle synonyms. - log.debug(f'BOB: Handle symbol for {gene}') - if gene.curr_fb_symbol: - gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) - else: - gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) - log.debug(f'BOB: Handle full_name for {gene}') - if gene.curr_fb_fullname: - gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) - else: - gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) - log.debug(f'BOB: Handle systematic_name for {gene}') - if gene.systematic_name: - gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) - log.debug(f'BOB: Handle other synonyms for {gene}') - if gene.other_synonyms: - gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) + self.process_feature_synonyms(gene) # Get timestamps. 
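The timestamp step that follows converts the earliest and latest audit_chado timestamps into RFC 3339 strings; the exact strict_rfc3339 helper call is not shown in this excerpt. A minimal sketch, assuming gene.timestamps holds Python datetime objects and using timestamp_to_rfc3339_localoffset as one plausible choice:

    import datetime
    import strict_rfc3339

    timestamps = [datetime.datetime(2006, 3, 7, 12, 0), datetime.datetime(2023, 1, 25, 9, 30)]
    date_created = strict_rfc3339.timestamp_to_rfc3339_localoffset(min(timestamps).timestamp())
    date_updated = strict_rfc3339.timestamp_to_rfc3339_localoffset(max(timestamps).timestamp())
    print(date_created, date_updated)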
if gene.timestamps: gene.date_created = strict_rfc3339.\ From 55f999c5161aa763c1828c83e9952141db013773 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:36:57 -0500 Subject: [PATCH 11/52] skip feature_synonyms for non-curr pubs --- src/AGR_data_retrieval_curation_gene.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 7085b1c..a71c55e 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -325,6 +325,9 @@ def get_synonyms(self, session): distinct() counter = 0 for result in results: + # Skip any references to non-current pubs. + if result.FeatureSynonym.pub_id not in self.all_pubs_dict.keys(): + continue # First, build the all_synonyms_dict. self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym # Second, collect FeatureSynonyms for each gene. @@ -335,6 +338,7 @@ def get_synonyms(self, session): elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': self.gene_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) counter += 1 + log.info(f'Found {counter} feature_synonyms (current pubs) for genes.') return def get_annotation_ids(self, session): From 3640455ebe68ba79e7737ae5a2949ab1f96edee9 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:54:32 -0500 Subject: [PATCH 12/52] run anno id method; add placeholder systematic name --- src/AGR_data_retrieval_curation_gene.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index a71c55e..dff479f 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -512,6 +512,7 @@ def query_chado(self, session): self.get_gene_taxons(session) # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. self.get_synonyms(session) + self.get_annotation_ids(session) self.get_gene_snapshots(session) self.get_gene_types(session) self.get_gene_timestamps(session) @@ -606,6 +607,7 @@ def process_feature_synonyms(self, feature): log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") else: feature.gene_synonym_dtos.append(name_dto) + # LinkML change required: make gene_full_name_dto and gene_systematic_name_dto OPTIONAL. # Symbol is required. If none, fill it in. if feature.gene_symbol_dto is None: placeholder_symbol_dto = default_name_dto.copy() @@ -618,6 +620,11 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto + # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + if feature.gene_systematic_name_dto is None: + placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() + placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' + feature.gene_systematic_name_dto = placeholder_full_name_dto return # Synthesis of initial db info. 
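A note on the placeholder pattern introduced in the hunk above: each placeholder DTO is seeded from a shallow copy of the symbol DTO and then retyped. A minimal sketch of that pattern with invented values (not from the patch):

    symbol_dto = {'name_type_name': 'nomenclature_symbol', 'format_text': 'wg', 'display_text': 'wg'}
    systematic_name_dto = symbol_dto.copy()              # dict.copy() is shallow
    systematic_name_dto['name_type_name'] = 'systematic_name'
    assert symbol_dto['name_type_name'] == 'nomenclature_symbol'  # the symbol DTO is left untouched

Only the top-level 'name_type_name' string is overridden, so a shallow copy is enough to keep the two DTOs from interfering with each other here.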
From 2910e42ed52ffd94b6d8ae0503aa195aa9a7b79e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:58:28 -0500 Subject: [PATCH 13/52] fix scope attr name --- src/AGR_data_retrieval_curation_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index dff479f..7d02cdf 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -532,7 +532,7 @@ def process_feature_synonyms(self, feature): 'name_type_name': 'unspecified', 'format_text': 'unspecified', 'display_text': 'unspecified', - 'synonym_scope': 'exact', + 'synonym_scope_name': 'exact', 'evidence_curies': [], 'internal': False, 'obsolete': False @@ -584,7 +584,7 @@ def process_feature_synonyms(self, feature): 'name_type_name': name_type_to_use, 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), - 'synonym_scope': 'exact', + 'synonym_scope_name': 'exact', 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': syno_internal, 'obsolete': False From 8acd6042e237f7a1655347c40e92b0643c13d478 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:03:06 -0500 Subject: [PATCH 14/52] fix sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 7d02cdf..e6b3266 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -624,7 +624,7 @@ def process_feature_synonyms(self, feature): if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' - feature.gene_systematic_name_dto = placeholder_full_name_dto + feature.gene_systematic_name_dto = placeholder_systematic_name_dto return # Synthesis of initial db info. From 3a2cae0a4e19899a1f1ba0ed5f31ea37bafa1f36 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:18:44 -0500 Subject: [PATCH 15/52] debug sys name export --- src/AGR_data_retrieval_curation_gene.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index e6b3266..ccd4185 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -191,8 +191,9 @@ def __init__(self): # Export fields. 
required_fields = [ 'curie', - 'gene_symbol_dto', 'gene_full_name_dto', + 'gene_symbol_dto', + 'gene_systematic_name_dto', 'internal', 'taxon_curie', ] @@ -205,6 +206,7 @@ def __init__(self): 'gene_full_name_dto', 'gene_symbol_dto', 'gene_synonym_dtos', + 'gene_systematic_name_dto', 'gene_type_curie', 'genomic_location_dtos', 'internal', @@ -347,6 +349,7 @@ def get_annotation_ids(self, session): filters = ( Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), + Feature.is_obsolete.is_(False), Cvterm.name == 'gene', FeatureDbxref.is_current.is_(True), Db.name == 'FlyBase Annotation IDs' @@ -361,6 +364,7 @@ def get_annotation_ids(self, session): counter = 0 for result in results: self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + log.debug(f'For {self.gene_dict[result.Feature.uniquename]}, anno_id={result.Dbxref.accession}') counter += 1 log.info(f'Found {counter} current annotation IDs for FlyBase genes.') return @@ -620,10 +624,13 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # Systematic name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' + if feature.curr_anno_id: + placeholder_symbol_dto['format_text'] = feature.curr_anno_id + placeholder_symbol_dto['display_text'] = feature.curr_anno_id feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From c2484331f160d035bd3b11e2adb345e356c9ce9c Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:29:26 -0500 Subject: [PATCH 16/52] fix reporting of mis-typed synonyms in log --- src/AGR_data_retrieval_curation_gene.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index ccd4185..f534f97 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,18 +597,20 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - feature.gene_systematic_name_dto = name_dto if name_dto['name_type_name'] != 'systematic_name': - log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'systematic_name' + feature.gene_systematic_name_dto = name_dto if name_dto['display_text'] == feature.curr_symbol_name: if name_dto['name_type_name'] not in ['systematic_name', 'nomenclature_symbol']: + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'nomenclature_symbol' - log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: - feature.gene_full_name_dto = name_dto if name_dto['name_type_name'] != 'full_name': - log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'full_name' + feature.gene_full_name_dto = name_dto else: feature.gene_synonym_dtos.append(name_dto) # LinkML change required: make gene_full_name_dto and gene_systematic_name_dto OPTIONAL. From 4043733c8aed7544e6c515f748452f4e5e1e6eb8 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:33:57 -0500 Subject: [PATCH 17/52] fix sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index f534f97..e2bd680 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,6 +597,7 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: + log.debug(f"BOB: Found name_dto annotation match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -631,8 +632,8 @@ def process_feature_synonyms(self, feature): placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' if feature.curr_anno_id: - placeholder_symbol_dto['format_text'] = feature.curr_anno_id - placeholder_symbol_dto['display_text'] = feature.curr_anno_id + placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id + placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From e4115a278297bdebb126986dfd7f6950f7926d16 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:37:30 -0500 Subject: [PATCH 18/52] tweak comments about sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index e2bd680..6aad57d 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -627,7 +627,7 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Systematic name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # Systematic name is required. If none, fill it in. Could be because gene is unannotated, or annotation ID has never been used in pubs. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' From 9978a2b3082f082273c23439101d1b98ecfec789 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:46:22 -0500 Subject: [PATCH 19/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 6aad57d..ae62ea9 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,7 +597,7 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - log.debug(f"BOB: Found name_dto annotation match: {name_dto['display_text']}") + log.debug(f"BOB: Found synonym-annoID match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -609,7 +609,7 @@ def process_feature_synonyms(self, feature): feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: if name_dto['name_type_name'] != 'full_name': - log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + log.warning(f"BOB: {feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = name_dto else: @@ -634,6 +634,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id + log.warning(f"BOB: {feature}: Has anno ID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From 381a6b63b50791aa1c7ff9cf463201ac456f1073 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:47:33 -0500 Subject: [PATCH 20/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index ae62ea9..2e4bc49 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -634,7 +634,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id - log.warning(f"BOB: {feature}: Has anno ID never used as a synonym: {feature.curr_anno_id}") + log.warning(f"BOB: {feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From ec651e5b461ab2dc0eb519c484ad5619365236c0 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:50:31 -0500 Subject: [PATCH 21/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 2e4bc49..5c39025 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -364,7 +364,6 @@ def get_annotation_ids(self, session): counter = 0 for result in results: self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession - log.debug(f'For {self.gene_dict[result.Feature.uniquename]}, anno_id={result.Dbxref.accession}') counter += 1 log.info(f'Found {counter} current annotation IDs for FlyBase genes.') return From b4986b402612d37cda22dde72e9009918445ca30 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:14:41 -0500 Subject: [PATCH 22/52] add back method for getting gene xrefs, tidy log msg and code comments --- src/AGR_data_retrieval_curation_gene.py | 19 +++++++++---------- 1 file changed, 9 
insertions(+), 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 5c39025..86f4910 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -105,8 +105,8 @@ def __init__(self, feature): self.featureloc = None # Will be Featureloc object for the gene. self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. - self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml. - self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml. + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). self.curr_anno_id = None # Will be current annotation ID for the gene (str). self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. @@ -129,8 +129,8 @@ def __init__(self, feature): self.genomic_location_dtos = [] # Will need to be list of GenomicLocation objects. # Attributes for the Alliance GeneDTO. GeneDTO is_a GenomicEntityDTO. self.gene_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. - self.gene_full_name_dto = None # Will be a single GeneFullNameSlotAnnotation. - self.gene_systematic_name_dto = None # Will be a single GeneSystematicNameSlotAnnotation. + self.gene_full_name_dto = None # Will be a single FullNameSlotAnnotation. + self.gene_systematic_name_dto = None # Will be a single SystematicNameSlotAnnotation. self.gene_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. self.gene_type_curie = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. # Notes associated with the object. @@ -234,7 +234,7 @@ def open_panther_file(self): self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) return - def get_references(self, session): + def get_all_references(self, session): """Get all references.""" log.info('Get all references.') # First get all current pubs having an FBrf uniquename. @@ -510,10 +510,10 @@ def get_gene_featureloc(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" self.open_panther_file() - self.get_references(session) + self.get_all_references(session) self.get_genes(session) self.get_gene_taxons(session) - # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. + self.get_gene_dbxrefs(session) self.get_synonyms(session) self.get_annotation_ids(session) self.get_gene_snapshots(session) @@ -596,7 +596,6 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - log.debug(f"BOB: Found synonym-annoID match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -608,7 +607,7 @@ def process_feature_synonyms(self, feature): feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: if name_dto['name_type_name'] != 'full_name': - log.warning(f"BOB: {feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = name_dto else: @@ -633,7 +632,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id - log.warning(f"BOB: {feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") + log.warning(f"{feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From c0314de57ee243b224b72411764220c69b3533ac Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:15:09 -0500 Subject: [PATCH 23/52] update attr; update synonym, xref and 2o id handling --- src/AGR_data_retrieval_curation_allele.py | 521 +++++++++++++--------- 1 file changed, 322 insertions(+), 199 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 6e293eb..d7f7925 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -23,6 +23,7 @@ import argparse import datetime import json +import re import strict_rfc3339 from sqlalchemy import create_engine, inspect from sqlalchemy.orm import aliased, sessionmaker @@ -93,62 +94,64 @@ def __init__(self, feature): """ # Attributes representing unprocessed FlyBase data. # Note: use attribute names that do not match an Alliance LinkML slot name. - # For initial load, the Alliance A-Team just needs minimum info. - # ALLELE: curie, taxon, symbol, description, internal, obsolete. # Problems with Allele LinkML: - # 1. Allele.taxon is required, but even after updating NCBITaxon info at FlyBase, not all alleles will have NCBI taxon ID. - self.feature = feature # The Feature object corresponding to the FlyBase allele. - self.organism_abbr = None # Will be the organism.abbreviation for the allele's species of origin. - self.adj_organism_abbr = 'Dmel' # Assume allele is Dmel (classical or transgenic) unless allele is of classical type in another insect. - self.in_vitro = False # Change to True if allele associated with "in vitro%" cvterm. - self.constructs = [] # Will be a list of FBtp IDs for this allele's constructs. - self.dmel_insertions = [] # Will be a list of FBti IDs for this allele's Dmel insertions. - self.non_dmel_insertions = [] # Will be a list of FBti IDs for this allele's non-Dmel insertions. - self.args = [] # Will be a list of ARGs Features (variants). - self.parent_gene = None # Will be the FBgn ID of the allele's gene. 
- self.allele_of_internal_gene = False # Will change to True if is allele of Dmel internal-type gene (e.g., origin_of_replication). - self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. - self.curr_fb_symbol = None # Will be the current symbol Synonym object. - self.curr_fb_fullname = None # Will be the current fullname Synonym object. - self.internal_synonyms = [] # Will be list of internal synonym names (and synonym_sgml if different). - self.public_synonyms = [] # Will be list of public synonym names (and synonym_sgml if different). - self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. - self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. - self.timestamps = [] # Add all timestamps here. - self.fb_references = [] # Will be list of FBrf IDs related to an allele: directly and indirectly. - self.featureprops = {} # A CVterm-keyed dict of Featureprop lists. - self.phenotypes = [] # Will be a list of SQLAlchemy (Feature, Genotype, Phenotype, Cvterm) results. - self.direct_libraries = [] # Will be a list of Library objects directly related to the allele. - self.ins_libraries = [] # Will be a list of Library objects related to the allele via insertion (FBti). - self.cons_libraries = [] # Will be a list of Library objects related to the allele via construct (FBtp). - self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). + # 1. Allele.taxon_curie is required, but even after updating NCBITaxon info at FlyBase, not all alleles will have NCBI taxon ID. + # 2. Allele.inheritance_mode_name is singular, but some FB alleles have many documented modes. + self.feature = feature # The Feature object corresponding to the FlyBase allele. + self.organism_abbr = None # Will be the organism.abbreviation for the allele's species of origin. + self.adj_organism_abbr = 'Dmel' # Assume allele is Dmel (classical/transgenic) unless allele is of classical type in another insect. + self.in_vitro = False # Change to True if allele associated with "in vitro%" cvterm. + self.constructs = [] # Will be a list of FBtp IDs for this allele's constructs. + self.dmel_insertions = [] # Will be a list of FBti IDs for this allele's Dmel insertions. + self.non_dmel_insertions = [] # Will be a list of FBti IDs for this allele's non-Dmel insertions. + self.args = [] # Will be a list of ARGs Features (variants). + self.parent_gene = None # Will be the FBgn ID of the allele's gene. + self.allele_of_internal_gene = False # Will change to True if is allele of Dmel internal-type gene (e.g., origin_of_replication). + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. + self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. + self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. + self.timestamps = [] # Add all timestamps here. + self.fb_references = [] # Will be list of pub_ids from feature_pub, feature_synonym. + self.featureprops = {} # A CVterm-keyed dict of Featureprop lists. + self.phenotypes = [] # Will be a list of SQLAlchemy (Feature, Genotype, Phenotype, Cvterm) results. + self.direct_libraries = [] # Will be a list of Library objects directly related to the allele. 
+ self.ins_libraries = [] # Will be a list of Library objects related to the allele via insertion (FBti). + self.cons_libraries = [] # Will be a list of Library objects related to the allele via construct (FBtp). + self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). # Attributes for the Alliance AuditedObject. - self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if allele not intended for display at Alliance website. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Earliest timestamp. - self.date_updated = None # Latest timestamp. - # self.data_provider = 'FB' # The MOD abbreviation. + self.obsolete = feature.is_obsolete # Will be the FlyBase value here. + self.internal = False # Change to true if allele not intended for display at Alliance website. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Earliest timestamp. + self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. self.curie = 'FB:{}'.format(feature.uniquename) - self.taxon = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. + self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. - self.name = None # Will be current fullname synonym - report ascii or utf8 (sgml) version? - self.synonyms = [] # All current and non-current ASCII and SGML synonyms. - self.cross_references = [] # Report only select dbs, using AGR-accepted db_prefix. - self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. + self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. # Attributes for the Alliance Allele. Allele is_a GenomicEntity. - self.symbol = None # Will be a string (ascii or utf8)? - self.references = [] # KANBAN-237: READY: Will be a list of pubs (PMID or FB:FBrf IDs) for the allele. - self.is_extinct = None # KANBAN-237: READY: Change to true if extinction has been reported. Otherwise, leave blank. - self.inheritence_mode = [] # KANBAN-237: READY: Will be a list of CV terms. - self.in_collection = [] # KANBAN-237: TO DO: Will be a library names. - self.sequencing_status = None # KANBAN-237: TO DO: Will be a CV term? TBD. Might be dropped. + self.allele_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. + self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. + self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. + self.allele_database_status_dto = None # ToDo + self.allele_functional_impact_dtos = None # ToDo + self.allele_germline_transmission_status_dto = None # ToDo + self.allele_molecular_mutation_dtos = None # ToDo + self.allele_mutation_type_dtos = None # ToDo + self.allele_nomenclature_event_dtos = None # ToDo + self.allele_note_dtos = None # ToDo + self.allele_secondary_id_dtos = None # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) + self.in_collection_name = None # Will be library.name. 
+ self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. + self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. + self.reference_curies = None # Will be a list of reference curies (directly or indirectly related). # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal in the export file. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal in the export file. + self.export_warnings = [] # Reasons for suppressing an object from the export file. def __str__(self): """Succinct text string describing the AllianceAllele object.""" @@ -161,51 +164,103 @@ class AlleleHandler(object): def __init__(self): """Create the AlleleHandler object.""" self.allele_dict = {} # An FBalID-keyed dict of AllianceAllele objects. + self.all_pubs_dict = {} # A pub_id-keyed dict of pub curies (PMID or FBrf). + self.all_synonyms_dict = {} # A synonym_id-keyed dict of Synonym objects. self.drosophilid_list = [] # A list of organism_ids for "Drosophilid" species in chado. self.total_feat_cnt = 0 # Count of all alleles found in starting query. self.export_feat_cnt = 0 # Count of all alleles exported to file. self.internal_feat_cnt = 0 # Count of all alleles marked as internal=True in export file. - self.fbrf_pmid_dict = {} # Will be a dict of FBrf-to-PMID xrefs. + # Regexes. + gene_regex = r'^FBgn[0-9]{7}$' + allele_regex = r'^FBal[0-9]{7}$' + cons_regex = r'^FBtp[0-9]{7}$' + ins_regex = r'^FBti[0-9]{7}$' + seqfeat_regex = r'^FBsf[0-9]{10}$' + feature_regex = r'^FB(tp|ti)[0-9]{7}$' + lib_regex = r'^FBlc[0-9]{7}$' + pub_regex = r'^(FBrf[0-9]{7}|unattributed)$' + # Sample set. test_alleles = [] + # Export fields. required_fields = [ + 'allele_symbol_dto', 'curie', - 'taxon', - 'symbol', - 'internal' + 'internal', + 'taxon_curie', ] output_fields = [ - 'created_by', - 'cross_references', + 'allele_database_status_dto', + 'allele_full_name_dto', + 'allele_functional_impact_dtos', + 'allele_germline_transmission_status_dto', + 'allele_molecular_mutation_dtos', + 'allele_mutation_type_dtos', + 'allele_nomenclature_event_dtos', + 'allele_note_dtos', + 'allele_secondary_id_dtos', + 'allele_symbol_dto', + 'allele_synonym_dtos', + 'created_by_curie', + 'cross_reference_dtos', 'curie', 'date_created', 'date_updated', - 'in_collection', - 'inheritence_mode', + 'in_collection_name', + 'inheritance_mode_name', 'internal', 'is_extinct', - 'updated_by', - 'name', 'obsolete', - 'references', - 'secondary_identifiers', - # 'sequencing_status', # KANBAN-237: Not implemented yet in LinkML v1.2.4 - 'symbol', - 'synonyms', - 'taxon' + 'reference_curies', + 'taxon_curie', + 'updated_by_curie', ] fb_agr_db_dict = { 'FlyBase': 'FB' } + def get_all_references(self, session): + """Get all references.""" + log.info('Get all references.') + # First get all current pubs having an FBrf uniquename. 
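As an illustration of the two passes that follow (invented IDs, not part of the patch), the finished all_pubs_dict maps chado pub_id keys to a PMID curie when a current PubMed xref exists, and otherwise keeps the FBrf curie:

    all_pubs_dict_example = {
        101: 'PMID:1234567',      # current pub with a PubMed xref: PMID curie replaces the FBrf curie
        102: 'FB:FBrf0123456',    # current pub without a PubMed xref: FBrf curie is kept
    }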
+ filters = ( + Pub.uniquename.op('~')(self.pub_regex), + Pub.is_obsolete.is_(False) + ) + results = session.query(Pub).\ + filter(*filters).\ + distinct() + pub_counter = 0 + for pub in results: + self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + pub_counter += 1 + # Next find PMIDs if available and replace the curie in the all_pubs_dict. + filters = ( + Pub.uniquename.op('~')(self.pub_regex), + Pub.is_obsolete.is_(False), + Db.name == 'pubmed', + PubDbxref.is_current.is_(True) + ) + pmid_xrefs = session.query(Pub, Dbxref).\ + join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ + join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + pmid_counter = 0 + for xref in pmid_xrefs: + self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' + pmid_counter += 1 + log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') + return + def get_alleles(self, session): """Get all alleles.""" log.info('Querying chado for alleles.') # First get all allele features from chado. - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), Cvterm.name == 'allele' ) @@ -224,15 +279,13 @@ def get_allele_gene(self, session): """For current alleles, get the FBgn ID of allele's current gene.""" gene = aliased(Feature, name='gene') allele = aliased(Feature, name='allele') - gene_regex = r'^FBgn[0-9]{7}$' - allele_regex = r'^FBal[0-9]{7}$' filters = ( gene.is_obsolete.is_(False), gene.is_analysis.is_(False), - gene.uniquename.op('~')(gene_regex), + gene.uniquename.op('~')(self.gene_regex), allele.is_obsolete.is_(False), allele.is_analysis.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), Cvterm.name == 'alleleof' ) allele_gene_results = session.query(allele, gene).\ @@ -268,9 +321,8 @@ def flag_alleles_of_internal_genes(self, session): ] feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='promoted_gene_type') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_obsolete.is_(False), Feature.is_analysis.is_(False), Organism.abbreviation == 'Dmel', @@ -303,10 +355,9 @@ def flag_alleles_of_internal_genes(self, session): def flag_in_vitro_alleles(self, session): """Flag alleles associated with "in vitro" type CV term.""" log.info('Flag in vitro alleles.') - allele_regex = r'^FBal[0-9]{7}$' cvterm_name_regex = '^in vitro construct' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Cvterm.name.op('~')(cvterm_name_regex) ) ivt_alleles = session.query(Feature).\ @@ -326,13 +377,11 @@ def get_allele_constructs(self, session): log.info('Find constructs related to alleles.') allele = aliased(Feature, name='allele') construct = aliased(Feature, name='construct') - allele_regex = r'^FBal[0-9]{7}$' - construct_regex = r'^FBtp[0-9]{7}$' filters = ( allele.is_obsolete.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), construct.is_obsolete.is_(False), - construct.uniquename.op('~')(construct_regex) + construct.uniquename.op('~')(self.construct_regex) ) construct_results = session.query(allele, construct).\ join(FeatureRelationship, (FeatureRelationship.object_id == construct.feature_id)).\ @@ -351,13 +400,11 @@ def 
get_allele_insertions(self, session): log.info('Find insertions related to alleles.') allele = aliased(Feature, name='allele') insertion = aliased(Feature, name='insertion') - allele_regex = r'^FBal[0-9]{7}$' - insertion_regex = r'^FBti[0-9]{7}$' filters = ( allele.is_obsolete.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), insertion.is_obsolete.is_(False), - insertion.uniquename.op('~')(insertion_regex) + insertion.uniquename.op('~')(self.insertion_regex) ) insertion_results = session.query(Organism, allele, insertion).\ join(FeatureRelationship, (FeatureRelationship.object_id == insertion.feature_id)).\ @@ -425,6 +472,7 @@ def adjust_allele_org(self, session): def get_allele_taxons(self, session): """Get taxon IDs for alleles. Depends on all organisms for features having an abbreviation.""" log.info('Getting allele taxon IDs.') + # First make a dict of organism abbr to NCBI taxon IDs. filters = ( OrganismDbxref.is_current.is_(True), Db.name == 'NCBITaxon' @@ -438,46 +486,47 @@ def get_allele_taxons(self, session): organism_taxon_dict = {} for result in organism_dbxref_results: organism_taxon_dict[result.Organism.abbreviation] = result.Dbxref.accession + # Now fill in the info for alleles. for allele in self.allele_dict.values(): try: - allele.taxon = 'NCBITaxon:{}'.format(organism_taxon_dict[allele.adj_organism_abbr]) + allele.taxon_curie = f'NCBITaxon:{organism_taxon_dict[allele.adj_organism_abbr]}' except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(allele)) return def get_synonyms(self, session): """Get current and non-current symbols and full names for alleles.""" - log.info('Getting allele synonyms.') - feature_type = aliased(Cvterm, name='feature_type') - synonym_type = aliased(Cvterm, name='synonym_type') - allele_regex = r'^FBal[0-9]{7}$' + log.info('Get current and non-current symbols and full names for alleles.') filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), - feature_type.name == 'allele' + Cvterm.name == 'allele' ) - allele_curr_symbol_results = session.query(synonym_type, Feature, FeatureSynonym, Synonym).\ + results = session.query(Feature, FeatureSynonym, Synonym).\ join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ - join(feature_type, (feature_type.cvterm_id == Feature.type_id)).\ - join(synonym_type, (synonym_type.cvterm_id == Synonym.type_id)).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ filter(*filters).\ distinct() counter = 0 - for result in allele_curr_symbol_results: + for result in results: + # Skip any references to non-current pubs. + if result.FeatureSynonym.pub_id not in self.all_pubs_dict.keys(): + continue + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym + # Second, collect FeatureSynonyms for each allele. + self.allele_dict[result.Feature.uniquename].feature_synonyms.append(result.FeatureSynonym) + # Third, capture pub_ids. + self.allele_dict[result.Feature.uniquename].fb_references.append(result.FeatureSynonym.pub_id) + + # Finally, catch current symbol and fullname strings. 
+ if result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'symbol': + self.allele_dict[result.Feature.uniquename].curr_symbol_name = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': + self.allele_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) counter += 1 - if result.FeatureSynonym.is_current is True: - if result.synonym_type.name == 'symbol': - self.allele_dict[result.Feature.uniquename].curr_fb_symbol = result.Synonym - elif result.synonym_type.name == 'fullname': - self.allele_dict[result.Feature.uniquename].curr_fb_fullname = result.Synonym - elif result.FeatureSynonym.is_internal is True: - self.allele_dict[result.Feature.uniquename].internal_synonyms.append(result.Synonym.name) - self.allele_dict[result.Feature.uniquename].internal_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - else: - self.allele_dict[result.Feature.uniquename].public_synonyms.append(result.Synonym.name) - self.allele_dict[result.Feature.uniquename].public_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - log.info('Found {} allele synonyms.'.format(counter)) + log.info(f'Found {counter} feature_synonyms (current pubs) for alleles.') return def get_allele_timestamps(self, session): @@ -514,9 +563,8 @@ def get_allele_timestamps(self, session): def get_allele_dbxrefs(self, session): """Get all dbxrefs for alleles.""" log.info('Getting allele dbxrefs.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), Cvterm.name == 'allele', Db.name.in_((self.fb_agr_db_dict.keys())) @@ -537,20 +585,19 @@ def get_allele_dbxrefs(self, session): if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': pass elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': + self.allele_dict[result.Feature.uniquename].dbxrefs.append(result) self.allele_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) else: self.allele_dict[result.Feature.uniquename].dbxrefs.append(result) log.info('Found {} allele crossreferences.'.format(counter)) return - def get_references(self, session): + def get_allele_references(self, session): """Get references for alleles.""" log.info('Get allele references.') - allele_regex = r'^FBal[0-9]{7}$' - fbrf_regex = r'^FBrf[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), - Pub.uniquename.op('~')(fbrf_regex), + Feature.uniquename.op('~')(self.allele_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False) ) allele_pubs = session.query(Feature, Pub).\ @@ -560,36 +607,16 @@ def get_references(self, session): distinct() counter = 0 for result in allele_pubs: - self.allele_dict[result.Feature.uniquename].fb_references.append(result.Pub.uniquename) + self.allele_dict[result.Feature.uniquename].fb_references.append(result.Pub.pub_id) counter += 1 log.info(f'Found {counter} allele-pub relationships.') return - def get_pmid_xrefs(self, session): - """Create a dict of FBrf to PMID for publications.""" - log.info('Getting PMID IDs for FB publications.') - filters = ( - Db.name == 'pubmed', - Pub.is_obsolete.is_(False), - PubDbxref.is_current.is_(True) - ) - pmid_xrefs = session.query(Pub, Dbxref).\ - join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ - join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ - join(Db, 
(Db.db_id == Dbxref.db_id)).\ - filter(*filters).\ - distinct() - for xref in pmid_xrefs: - self.fbrf_pmid_dict[xref.Pub.uniquename] = xref.Dbxref.accession - log.info(f'Found {len(self.fbrf_pmid_dict.keys())} PMID IDs for FB publications.') - return - def get_allele_featureprops(self, session): """Get all allele featureprops.""" log.info('Get allele featureprops.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Cvterm.is_obsolete == 0 ) allele_fprops = session.query(Feature, Cvterm, Featureprop).\ @@ -612,7 +639,6 @@ def get_allele_featureprops(self, session): def get_args(self, session): """Get ARGs related to alleles.""" log.info('Get allele ARGs.') - allele_regex = r'^FBal[0-9]{7}$' arg_types = [ 'MNV', 'complex_substitution', @@ -629,7 +655,7 @@ def get_args(self, session): argtype = aliased(Cvterm, name='argtype') reltype = aliased(Cvterm, name='reltype') filters = ( - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), arg.is_obsolete.is_(False), argtype.name.in_((arg_types)), reltype.name == 'partof' @@ -651,9 +677,8 @@ def get_args(self, session): def get_phenotypes(self, session): """Get phenotypes related to alleles.""" log.info('Get phenotypes related to alleles.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Genotype.is_obsolete.is_(False) ) results = session.query(Feature, Genotype, Phenotype, Cvterm).\ @@ -676,15 +701,13 @@ def get_direct_collections(self, session): """Find library collections directly related to alleles.""" log.info('Get directly-related allele collections.') counter = 0 - allele_regex = r'^FBal[0-9]{7}$' - lib_regex = r'^FBlc[0-9]{7}$' libtype = aliased(Cvterm, name='libtype') libfeattype = aliased(Cvterm, name='libfeattype') # First, look for direct FBal-FBlc associations. 
filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection' ) @@ -706,20 +729,17 @@ def get_direct_collections(self, session): def get_indirect_collections(self, session): """Find library collections indirectly related to alleles via insertion or construct.""" log.info('Get indirectly-related allele collections (via insertion or construct).') - allele_regex = r'^FBal[0-9]{7}$' - feature_regex = r'^FB(tp|ti)[0-9]{7}$' - lib_regex = r'^FBlc[0-9]{7}$' allele = aliased(Feature, name='allele') feature = aliased(Feature, name='feature') libtype = aliased(Cvterm, name='libtype') libfeattype = aliased(Cvterm, name='libfeattype') featreltype = aliased(Cvterm, name='featreltype') filters = ( - allele.uniquename.op('~')(allele_regex), - feature.uniquename.op('~')(feature_regex), + allele.uniquename.op('~')(self.allele_regex), + feature.uniquename.op('~')(self.feature_regex), feature.is_obsolete.is_(False), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection', featreltype.name == 'associated_with' @@ -751,10 +771,6 @@ def get_indirect_collections(self, session): def get_sf_collections(self, session): """Find library collections indirectly related to alleles via sequence feature.""" log.info('Get indirectly-related allele collections (via equence feature).') - allele_regex = r'^FBal[0-9]{7}$' - cons_regex = r'^FBtp[0-9]{7}$' - sf_regex = r'^FBsf[0-9]{10}$' - lib_regex = r'^FBlc[0-9]{7}$' allele = aliased(Feature, name='allele') construct = aliased(Feature, name='construct') seqfeat = aliased(Feature, name='seqfeat') @@ -764,13 +780,13 @@ def get_sf_collections(self, session): allele_construct = aliased(FeatureRelationship, name='allele_construct') seqfeat_construct = aliased(FeatureRelationship, name='seqfeat_construct') filters = ( - allele.uniquename.op('~')(allele_regex), - construct.uniquename.op('~')(cons_regex), - seqfeat.uniquename.op('~')(sf_regex), + allele.uniquename.op('~')(self.allele_regex), + construct.uniquename.op('~')(self.cons_regex), + seqfeat.uniquename.op('~')(self.seqfeat_regex), construct.is_obsolete.is_(False), seqfeat.is_obsolete.is_(False), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection', featreltype.name == 'associated_with' @@ -797,6 +813,7 @@ def get_sf_collections(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" + self.get_all_references(session) self.get_alleles(session) self.get_direct_collections(session) self.get_indirect_collections(session) @@ -812,8 +829,7 @@ def query_chado(self, session): self.get_synonyms(session) self.get_allele_timestamps(session) self.get_allele_dbxrefs(session) - self.get_references(session) - self.get_pmid_xrefs(session) + self.get_allele_references(session) self.get_allele_featureprops(session) self.get_args(session) self.get_phenotypes(session) @@ -853,7 +869,7 @@ def synthesize_synonyms(self, allele): for internal_synonym in internal_synonym_set: internal_synonym_dict = { 'name': internal_synonym, - 'created_by': 'FB:FB_curator', 
+ 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': True } @@ -862,7 +878,7 @@ def synthesize_synonyms(self, allele): for public_synonym in public_synonym_set: public_synonym_dict = { 'name': public_synonym, - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } @@ -871,8 +887,15 @@ def synthesize_synonyms(self, allele): def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" - for fb_id in allele.alt_fb_ids: - allele.secondary_identifiers.append('FB:{}'.format(fb_id.accession)) + unique_fb_id_list = list(set(allele.alt_fb_ids)) + for fb_id in unique_fb_id_list: + secondary_id_dict = { + 'secondary_id': f'FB:{fb_id.accession}', + 'created_by_curie': 'FB:FB_curator', + 'obsolete': False, + 'internal': False + } + allele.allele_secondary_id_dtos.append(secondary_id_dict) return def synthesize_xrefs(self, allele): @@ -883,35 +906,32 @@ def synthesize_xrefs(self, allele): 'display_name': 'FB:{}'.format(allele.feature.uniquename), 'prefix': 'FB', 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - allele.cross_references.append(xref_dict) + allele.cross_reference_dtos.append(xref_dict) # Add other xrefs. for result in allele.dbxrefs: - if result.Db.name in self.fb_agr_db_dict.keys(): - xref_dict = { - 'curie': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), - 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), - 'prefix': self.fb_agr_db_dict[result.Db.name], - 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - if result.FeatureDbxref.is_current is False: - xref_dict['internal'] = True - allele.cross_references.append(xref_dict) + if result.Db.name not in self.fb_agr_db_dict.keys(): + continue + xref_dict = { + 'curie': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), + 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), + 'prefix': self.fb_agr_db_dict[result.Db.name], + 'page_areas': ['allele'], + 'created_by_curie': 'FB:FB_curator', + 'obsolete': False, + 'internal': False + } + if result.FeatureDbxref.is_current is False: + xref_dict['internal'] = True + allele.cross_reference_dtos.append(xref_dict) return def synthesize_references(self, allele): """Process pubs for allele.""" - for fbrf_id in allele.fb_references: - try: - allele.references.append(f'PMID:{self.fbrf_pmid_dict[fbrf_id]}') - except KeyError: - allele.references.append(f'FB:{fbrf_id}') + allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'unattributed'] return def synthesize_insertions(self, allele): @@ -922,11 +942,11 @@ def synthesize_insertions(self, allele): 'display_name': '{}:{}'.format('FB', insertion.uniquename), 'prefix': 'FB', 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - allele.cross_references.append(xref_dict) + allele.cross_reference_dtos.append(xref_dict) return def flag_internal_alleles(self, allele): @@ -956,13 +976,27 @@ def flag_unexportable_alleles(self, allele): return def synthesize_extinction(self, allele): - """Determine if allele is definitively extinct.""" + """Determine if allele definitively exists or is extinct.""" + has_stocks = False + reported_extinct = False + # First find evidence of 
extinction. try: for fprop in allele.featureprops['availability']: if fprop.value == 'Stated to be lost.': - allele.is_extinct = True + reported_extinct = True except KeyError: pass + # Second find evidence for existence. + for fprop_type in allele.featureprops.keys(): + if fprop_type.startswith('derived_stock_'): + has_stocks = True + # Synthesize these two pieces of info. + if reported_extinct is True: + allele.is_extinct = True + if has_stocks is True: + log.warning(f'{allele}: stated to be lost, but has stocks.') + elif has_stocks is True: + allele.is_extinct = False return def synthesize_inheritance_mode(self, allele): @@ -1007,17 +1041,15 @@ def synthesize_inheritance_mode(self, allele): pheno = phenotype.Phenotype.uniquename mode_context = f'{allele.curie}\t{cvterm}\t{geno}\t{pheno}' mode_context_list.append(mode_context) - if reported_modes: - reported_modes = list(set(reported_modes)) - allele.inheritence_mode = '|'.join(reported_modes) - log.debug(f'\tFound {len(reported_modes)} inheritance mode(s): {allele.curie}: {allele.inheritence_mode}') - # Log cases of multiple inheritance modes for curator review. - if len(reported_modes) > 1: - mode_context_list = list(set(mode_context_list)) - for i in mode_context_list: - log.warning(f'MULTIPLE_INHERITANCE_MODES:\t{i}') - else: - allele.inheritence_mode = 'unknown' + reported_modes = list(set(reported_modes)) + mode_context_list = list(set(mode_context_list)) + # Update inheritance_mode_name if unambiguous. + if len(reported_modes) == 1: + allele.inheritance_mode_name = reported_modes[0] + # If ambiguous, change from "unknown" to None. + elif len(reported_modes) > 1: + allele.inheritance_mode_name = None + log.warning(f"{allele}: Found {len(reported_modes)} inheritance modes: {'|'.join(reported_modes)}. Context: {mode_context_list}") return def synthesize_collections(self, allele): @@ -1033,11 +1065,102 @@ def synthesize_collections(self, allele): collection_names = allele.sf_libraries if collection_names: collection_names = list(set(collection_names)) - allele.in_collection = collection_names[0].name + allele.in_collection_name = collection_names[0].name if len(collection_names) > 1: log.warning(f'\tFound {len(collection_names)} collection(s) for {allele.curie}: {allele.in_collection}') return + def synthesize_synonyms(self, feature): + """Generate name/synonym DTOs for a feature that has a list of FeatureSynonym objects.""" + # Dict for converting FB to AGR synonym types. + synonym_type_conversion = { + 'symbol': 'nomenclature_symbol', + 'fullname': 'full_name', + 'nickname': 'nomenclature_symbol', + 'synonym': 'nomenclature_symbol' + } + default_name_dto = { + 'name_type_name': 'unspecified', + 'format_text': 'unspecified', + 'display_text': 'unspecified', + 'synonym_scope_name': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + # Create a dict of all distinct name/synonym_sgml combinations: for each, capture synonym type(s) an pub_ids. + # Keys are (synonym.name, synonym.synonym_sgml) tuples. + # Values are dicts too where keys are chado synonym types and values are lists of pub_ids. + # Value dict also has an "internal" key that stores list of FeatureSynonym.is_internal values. 
+ feature_synonym_dict = {} + for f_s in feature.feature_synonyms: + synonym = self.all_synonyms_dict[f_s.synonym_id] + distinct_synonym_name = (synonym.name, synonym.synonym_sgml) + if distinct_synonym_name in feature_synonym_dict.keys(): + feature_synonym_dict[distinct_synonym_name]['internal'].append(f_s.is_internal) + if synonym.type.name in feature_synonym_dict[distinct_synonym_name].keys(): + feature_synonym_dict[distinct_synonym_name][synonym.type.name].append(f_s.pub_id) + else: + feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] + else: + feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} + # Convert to AGR name DTO objects. + name_dto_list = [] + FORMAT_TEXT = 0 + DISPLAY_TEXT = 1 + for syno_name, syno_attributes in feature_synonym_dict.items(): + # Determine internal status. False trumps True. + if False in set(syno_attributes['internal']): + syno_internal = False + else: + syno_internal = True + # Collect all pubs. + pub_id_list = [] + for syno_type, syno_type_pub_list in syno_attributes.items(): + if syno_type == 'internal': + continue + pub_id_list.extend(syno_type_pub_list) + pub_id_list = list(set(pub_id_list)) + # Pick the best synonym type. + type_tally = {} + for syno_type, syno_type_pub_list in syno_attributes.items(): + if syno_type == 'internal': + continue + type_tally[len(set(syno_type_pub_list))] = syno_type + name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] + output_synonym_dto = { + 'name_type_name': name_type_to_use, + 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), + 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), + 'synonym_scope_name': 'exact', + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'internal': syno_internal, + 'obsolete': False + } + name_dto_list.append(output_synonym_dto) + # Sift through name DTOs for symbol, fullname, systematic_name, etc. + for name_dto in name_dto_list: + if name_dto['display_text'] == feature.curr_symbol_name: + if name_dto['name_type_name'] != 'nomenclature_symbol': + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'nomenclature_symbol' + feature.allele_symbol_dto = name_dto + elif name_dto['display_text'] == feature.curr_fullname: + if name_dto['name_type_name'] != 'full_name': + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'full_name' + feature.allele_full_name_dto = name_dto + else: + feature.allele_synonym_dtos.append(name_dto) + # Symbol is required. If none, fill it in. 
+ if feature.allele_symbol_dto is None: + placeholder_symbol_dto = default_name_dto.copy() + placeholder_symbol_dto['name_type_name'] = 'nomenclature_symbol' + placeholder_symbol_dto['format_text'] = feature.feature.name + placeholder_symbol_dto['display_text'] = feature.feature.name + feature.allele_symbol_dto = placeholder_symbol_dto + return + def synthesize_info(self): """Convert FlyBase allele data into an AllianceAllele representation.""" log.info('Synthesizing allele info.') From 37ff0c78623f4729684f7505e802cb16b3bc0010 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:16:28 -0500 Subject: [PATCH 24/52] remove redundant synonym synth method --- src/AGR_data_retrieval_curation_allele.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index d7f7925..98cbd10 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -23,7 +23,6 @@ import argparse import datetime import json -import re import strict_rfc3339 from sqlalchemy import create_engine, inspect from sqlalchemy.orm import aliased, sessionmaker @@ -863,28 +862,6 @@ def synthesize_fullname(self, allele): allele.name = allele.feature.name return - def synthesize_synonyms(self, allele): - """Process allele synonyms.""" - internal_synonym_set = set(allele.internal_synonyms) - for internal_synonym in internal_synonym_set: - internal_synonym_dict = { - 'name': internal_synonym, - 'created_by_curie': 'FB:FB_curator', - 'obsolete': False, - 'internal': True - } - allele.synonyms.append(internal_synonym_dict) - public_synonym_set = set(allele.public_synonyms) - for public_synonym in public_synonym_set: - public_synonym_dict = { - 'name': public_synonym, - 'created_by_curie': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - allele.synonyms.append(public_synonym_dict) - return - def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" unique_fb_id_list = list(set(allele.alt_fb_ids)) From 4560ed6d3dfb02681223546dcc9f8d639d881bb2 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 16:31:38 -0500 Subject: [PATCH 25/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 98cbd10..9db106f 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -173,7 +173,7 @@ def __init__(self): # Regexes. 
gene_regex = r'^FBgn[0-9]{7}$' allele_regex = r'^FBal[0-9]{7}$' - cons_regex = r'^FBtp[0-9]{7}$' + construct_regex = r'^FBtp[0-9]{7}$' ins_regex = r'^FBti[0-9]{7}$' seqfeat_regex = r'^FBsf[0-9]{10}$' feature_regex = r'^FB(tp|ti)[0-9]{7}$' @@ -780,7 +780,7 @@ def get_sf_collections(self, session): seqfeat_construct = aliased(FeatureRelationship, name='seqfeat_construct') filters = ( allele.uniquename.op('~')(self.allele_regex), - construct.uniquename.op('~')(self.cons_regex), + construct.uniquename.op('~')(self.construct_regex), seqfeat.uniquename.op('~')(self.seqfeat_regex), construct.is_obsolete.is_(False), seqfeat.is_obsolete.is_(False), From f0440efcf3b25069601e801a88f69a4b9127c622 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 17:27:50 -0500 Subject: [PATCH 26/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 9db106f..23cfce9 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -174,7 +174,7 @@ def __init__(self): gene_regex = r'^FBgn[0-9]{7}$' allele_regex = r'^FBal[0-9]{7}$' construct_regex = r'^FBtp[0-9]{7}$' - ins_regex = r'^FBti[0-9]{7}$' + insertion_regex = r'^FBti[0-9]{7}$' seqfeat_regex = r'^FBsf[0-9]{10}$' feature_regex = r'^FB(tp|ti)[0-9]{7}$' lib_regex = r'^FBlc[0-9]{7}$' From c79cbbaced8591c252b039eaba683f6d4c455f9c Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 19:26:57 -0500 Subject: [PATCH 27/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 23cfce9..f0ed033 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -844,24 +844,6 @@ def synthesize_timestamps(self, allele): timestamp_to_rfc3339_localoffset(datetime.datetime.timestamp(max(allele.timestamps))) return - def synthesize_symbol(self, allele): - """Process symbol for an allele.""" - if allele.curr_fb_symbol: - allele.symbol = sub_sup_sgml_to_html(allele.curr_fb_symbol.synonym_sgml) - else: - allele.symbol = allele.feature.name - return - - def synthesize_fullname(self, allele): - """Process allele fullname.""" - if allele.curr_fb_fullname: - allele.name = sub_sup_sgml_to_html(allele.curr_fb_fullname.synonym_sgml) - elif allele.curr_fb_symbol: - allele.name = sub_sup_sgml_to_html(allele.curr_fb_symbol.synonym_sgml) - else: - allele.name = allele.feature.name - return - def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" unique_fb_id_list = list(set(allele.alt_fb_ids)) @@ -1145,8 +1127,6 @@ def synthesize_info(self): log.debug('Evaluating annotation: {}'.format(allele)) self.synthesize_collections(allele) self.synthesize_timestamps(allele) - self.synthesize_symbol(allele) - self.synthesize_fullname(allele) self.synthesize_synonyms(allele) self.synthesize_secondary_ids(allele) self.synthesize_xrefs(allele) From 00ec15e1b35255e1d173a7e6561702f930804d03 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 19:45:58 -0500 Subject: [PATCH 28/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index f0ed033..a69948f 100644 --- 
a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -136,17 +136,17 @@ def __init__(self, feature): self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. self.allele_database_status_dto = None # ToDo - self.allele_functional_impact_dtos = None # ToDo + self.allele_functional_impact_dtos = [] # ToDo self.allele_germline_transmission_status_dto = None # ToDo - self.allele_molecular_mutation_dtos = None # ToDo - self.allele_mutation_type_dtos = None # ToDo - self.allele_nomenclature_event_dtos = None # ToDo - self.allele_note_dtos = None # ToDo - self.allele_secondary_id_dtos = None # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) + self.allele_molecular_mutation_dtos = [] # ToDo + self.allele_mutation_type_dtos = [] # ToDo + self.allele_nomenclature_event_dtos = [] # ToDo + self.allele_note_dtos = [] # ToDo + self.allele_secondary_id_dtos = [] # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) self.in_collection_name = None # Will be library.name. self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. - self.reference_curies = None # Will be a list of reference curies (directly or indirectly related). + self.reference_curies = [] # Will be a list of reference curies (directly or indirectly related). # Notes associated with the object. self.for_alliance_export = True # Change to False if object should be excluded from export. self.internal_reasons = [] # Reasons for marking an object as internal in the export file. From 643d4c667d42642ab0fc71e2d8608ce10349cb4a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 09:50:40 -0500 Subject: [PATCH 29/52] fix obsolete eval --- src/AGR_data_retrieval_curation_gene.py | 40 +++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 86f4910..aad68cc 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -36,7 +36,7 @@ ) from harvdev_utils.psycopg_functions import set_up_db_reading -# Now proceed with generic setup. +# Generic setup. report_label = 'gene_curation' set_up_dict = set_up_db_reading(report_label) server = set_up_dict['server'] @@ -66,10 +66,8 @@ # The main process. def main(): """Run the steps for exporting LinkML-compliant FlyBase Genes.""" - log.info('Running script "{}"'.format(__file__)) - log.info('Started main function.') - log.info('Output JSON file corresponds to "agr_curation_schema" release: {}'.format(linkml_release)) - + log.info('Running main() for script "{}"'.format(__file__)) + log.info('Output corresponds to "agr_curation_schema" release: {}'.format(linkml_release)) # Instantiate the object, get the data, synthesize it, export it. 
gene_handler = GeneHandler() db_query_transaction(gene_handler) @@ -227,11 +225,14 @@ def open_panther_file(self): tsvin = csv.reader(tsv_file, delimiter='\t') FB = 0 PTHR = 3 + counter = 0 for row in tsvin: fields = len(row) if fields: # Ignore blank lines if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) + counter += 1 + log.info(f'Processed {counter} lines from the panther orthology file.') return def get_all_references(self, session): @@ -302,13 +303,18 @@ def get_gene_taxons(self, session): filter(*filters).\ distinct() organism_taxon_dict = {} + org_counter = 0 + gene_counter = 0 for result in organism_dbxref_results: organism_taxon_dict[result.OrganismDbxref.organism_id] = result.Dbxref.accession + org_counter += 1 for gene in self.gene_dict.values(): try: gene.taxon_curie = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) + gene_counter += 1 except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(gene)) + log.info(f'Found {org_counter} distinct NCBITaxon IDs for {gene_counter} genes.') return def get_synonyms(self, session): @@ -420,8 +426,11 @@ def get_gene_snapshots(self, session): join(prop_type, (prop_type.cvterm_id == Featureprop.type_id)).\ filter(*filters).\ distinct() + counter = 0 for result in gene_snapshot_results: self.gene_dict[result.feature.uniquename].gene_snapshot = result + counter += 1 + log.info(f'Found {counter} gene snapshots.') return def get_gene_types(self, session): @@ -440,9 +449,12 @@ def get_gene_types(self, session): join(prop_type, (prop_type.cvterm_id == Featureprop.type_id)).\ filter(*filters).\ distinct() + counter = 0 for result in gene_type_results: self.gene_dict[result.feature.uniquename].gene_type_curie = result.value[1:10].replace('SO', 'SO:') self.gene_dict[result.feature.uniquename].gene_type_name = result.value[11:-1] + counter += 1 + log.info(f'Found {counter} gene types for genes.') return def get_gene_timestamps(self, session): @@ -490,8 +502,11 @@ def get_gene_featureloc(self, session): filter(*filters).\ distinct() self.chr_dict = {} + chr_counter = 0 for result in chr_results: self.chr_dict[result.feature_id] = result.uniquename + chr_counter += 1 + log.info(f'Got basic info for {chr_counter} chr scaffolds.') # Now get gene featureloc. filters = ( Feature.uniquename.op('~')(self.gene_regex), @@ -503,8 +518,11 @@ def get_gene_featureloc(self, session): join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ filter(*filters).\ distinct() + gene_counter = 0 for result in gene_featureloc_results: self.gene_dict[result.feature.uniquename].featureloc = result + gene_counter += 1 + log.info(f'Found {gene_counter} genomic locations for genes.') return def query_chado(self, session): @@ -715,18 +733,10 @@ def synthesize_info(self): xref_dict['internal'] = True gene.cross_reference_dtos.append(xref_dict) # Flag internal features. 
- if gene.organism_abbr != 'Dmel': - gene.internal = True - gene.internal_reasons.append('Non-Dmel') - if gene.obsolete is True: + if gene.feature.is_obsolete is True: + gene.obsolete = True gene.internal = True gene.internal_reasons.append('Obsolete') - if gene.gene_type_curie is None: - gene.internal = True - gene.internal_reasons.append('Lacks gene type') - if gene.gene_type_name in self.internal_gene_types: - gene.internal = True - gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type_curie)) for attr in self.required_fields: if attr not in gene.__dict__.keys(): gene.for_alliance_export = False From 46ef9c329217b81c0c40e67c318594f40740832e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:18:05 -0500 Subject: [PATCH 30/52] fix gene regex for pthr file --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index aad68cc..eed3230 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -226,11 +226,12 @@ def open_panther_file(self): FB = 0 PTHR = 3 counter = 0 + gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. for row in tsvin: fields = len(row) if fields: # Ignore blank lines - if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): - self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) + if re.search(gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): + self.pthr_dict[re.search(gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) counter += 1 log.info(f'Processed {counter} lines from the panther orthology file.') return From 147685a8d6c964cb11edac2d9337a69bb593bce6 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:18:33 -0500 Subject: [PATCH 31/52] flake8 --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index eed3230..6d8e866 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -226,7 +226,7 @@ def open_panther_file(self): FB = 0 PTHR = 3 counter = 0 - gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. + gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. 
for row in tsvin: fields = len(row) if fields: # Ignore blank lines From b51c3285e4d93cc7501fb71a292963aa72676b22 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:24:28 -0500 Subject: [PATCH 32/52] get only Dmel chr scaffolds --- src/AGR_data_retrieval_curation_gene.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 6d8e866..189ee13 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -505,9 +505,11 @@ def get_gene_featureloc(self, session): self.chr_dict = {} chr_counter = 0 for result in chr_results: + if result.organism.abbreviation != 'Dmel': + continue self.chr_dict[result.feature_id] = result.uniquename chr_counter += 1 - log.info(f'Got basic info for {chr_counter} chr scaffolds.') + log.info(f'Got basic info for {chr_counter} current Dmel chr scaffolds.') # Now get gene featureloc. filters = ( Feature.uniquename.op('~')(self.gene_regex), From d6a68326bad04f123ee7999b9743d7d85862cc0f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 12:19:59 -0500 Subject: [PATCH 33/52] reduce reasons for marking genes, alleles as internal --- src/AGR_data_retrieval_curation_allele.py | 16 ++++++++-------- src/AGR_data_retrieval_curation_gene.py | 7 +++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index a69948f..0051594 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -121,7 +121,7 @@ def __init__(self, feature): self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). # Attributes for the Alliance AuditedObject. self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if allele not intended for display at Alliance website. + self.internal = False # Change to true if not public on FlyBase. self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.date_created = None # Earliest timestamp. @@ -909,16 +909,16 @@ def synthesize_insertions(self, allele): return def flag_internal_alleles(self, allele): - """Flag alleles as internal and/or obsolete, or not.""" - if allele.organism_abbr != 'Dmel': - allele.internal = True - allele.internal_reasons.append('Non-Dmel') + """Flag alleles as internal.""" if allele.obsolete is True: allele.internal = True allele.internal_reasons.append('Obsolete') - if allele.allele_of_internal_gene is True: - allele.internal = True - allele.internal_reasons.append('Allele of internal Dmel gene type.') + # if allele.organism_abbr != 'Dmel': + # allele.internal = True + # allele.internal_reasons.append('Non-Dmel') + # if allele.allele_of_internal_gene is True: + # allele.internal = True + # allele.internal_reasons.append('Allele of internal Dmel gene type.') return def flag_unexportable_alleles(self, allele): diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 189ee13..32c62ca 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -112,8 +112,8 @@ def __init__(self, feature): self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. 
self.timestamps = [] # Add all timestamps here. # Attributes for the Alliance AuditedObjectDTO. - self.obsolete = False # Never True. All FB annotations are deleted if no longer current. - self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.obsolete = feature.is_obsolete # Will be the FlyBase value here. + self.internal = False # Change to true if not public on FlyBase. self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. @@ -736,8 +736,7 @@ def synthesize_info(self): xref_dict['internal'] = True gene.cross_reference_dtos.append(xref_dict) # Flag internal features. - if gene.feature.is_obsolete is True: - gene.obsolete = True + if gene.obsolete is True: gene.internal = True gene.internal_reasons.append('Obsolete') for attr in self.required_fields: From 82b55a80ec6070838b536ce43a4bcb2e545cafcd Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 12:37:42 -0500 Subject: [PATCH 34/52] code comments on emerging allele attr --- src/AGR_data_retrieval_curation_allele.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 0051594..171d453 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -135,18 +135,18 @@ def __init__(self, feature): self.allele_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. - self.allele_database_status_dto = None # ToDo - self.allele_functional_impact_dtos = [] # ToDo - self.allele_germline_transmission_status_dto = None # ToDo - self.allele_molecular_mutation_dtos = [] # ToDo - self.allele_mutation_type_dtos = [] # ToDo - self.allele_nomenclature_event_dtos = [] # ToDo - self.allele_note_dtos = [] # ToDo self.allele_secondary_id_dtos = [] # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) self.in_collection_name = None # Will be library.name. self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. self.reference_curies = [] # Will be a list of reference curies (directly or indirectly related). + self.allele_database_status_dto = None # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_functional_impact_dtos = [] # ToDo - must be CV term: e.g., amorph - CV not settled yet? + self.allele_germline_transmission_status_dto = None # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_molecular_mutation_dtos = [] # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_mutation_type_dtos = [] # ToDo - must be SO term curies: e.g., ?. + self.allele_nomenclature_event_dtos = [] # ToDo - must be CV term: e.g., named, renamed - CV not settled yet? + self.allele_note_dtos = [] # ToDo - must have CV term for note_type_name: e.g., ? - CV not settled yet? # Notes associated with the object. 
self.for_alliance_export = True # Change to False if object should be excluded from export. self.internal_reasons = [] # Reasons for marking an object as internal in the export file. From 39ae78eb376fcef448340e098e19f7cf182bf08a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 10:32:20 -0500 Subject: [PATCH 35/52] fix filtering out of unattributed pub --- src/AGR_data_retrieval_curation_allele.py | 4 ++-- src/AGR_data_retrieval_curation_gene.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 171d453..1d6db31 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -127,7 +127,7 @@ def __init__(self, feature): self.date_created = None # Earliest timestamp. self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. - self.curie = 'FB:{}'.format(feature.uniquename) + self.curie = 'FB:{}'.format(feature.uniquename)f self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. @@ -890,7 +890,7 @@ def synthesize_xrefs(self, allele): def synthesize_references(self, allele): """Process pubs for allele.""" - allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'unattributed'] + allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'FB:unattributed'] return def synthesize_insertions(self, allele): diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 32c62ca..4b4f804 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -609,7 +609,7 @@ def process_feature_synonyms(self, feature): 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope_name': 'exact', - 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'FB:unattributed'], 'internal': syno_internal, 'obsolete': False } From ce56b7d1bb86bff1775daa8a1cd1b325dce88774 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 10:44:03 -0500 Subject: [PATCH 36/52] fix filtering out of unattributed pub --- src/AGR_data_retrieval_curation_allele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 1d6db31..863f5cf 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -1092,7 +1092,7 @@ def synthesize_synonyms(self, feature): 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope_name': 'exact', - 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'FB:unattributed'], 'internal': syno_internal, 'obsolete': False } From 058e2c921768cb0269612628a87b6dec6875e203 Mon Sep 17 00:00:00 
2001 From: gildossantos Date: Mon, 30 Jan 2023 11:16:35 -0500 Subject: [PATCH 37/52] debug unattr issue --- src/AGR_data_retrieval_curation_allele.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 863f5cf..bff04bb 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -127,7 +127,7 @@ def __init__(self, feature): self.date_created = None # Earliest timestamp. self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. - self.curie = 'FB:{}'.format(feature.uniquename)f + self.curie = 'FB:{}'.format(feature.uniquename) self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. @@ -232,6 +232,9 @@ def get_all_references(self, session): pub_counter = 0 for pub in results: self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + # BOB: DEBUG unattr issue + if pub.uniquename == 'unattributed': + log.debug(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( @@ -250,6 +253,10 @@ def get_all_references(self, session): for xref in pmid_xrefs: self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 + # BOB: DEBUG unattr issue: + for pub_id, curie in self.all_pubs_dict.items(): + if 'unattributed' in curie: + log.debug(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 7d991b0872c05c77e96bca38c4d28f961f60894f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 11:32:05 -0500 Subject: [PATCH 38/52] fix debug of unattr issue --- src/AGR_data_retrieval_curation_allele.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index bff04bb..5c81962 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -231,10 +231,12 @@ def get_all_references(self, session): distinct() pub_counter = 0 for pub in results: + if pub.uniquename == 'unattributed': + log.info('BOB: Found unattributed pub.') self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' # BOB: DEBUG unattr issue if pub.uniquename == 'unattributed': - log.debug(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') + log.info(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. 
filters = ( @@ -254,9 +256,10 @@ def get_all_references(self, session): self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 # BOB: DEBUG unattr issue: + log.info('BOB: DEBUB UNATTRIBUTED ISSUE') for pub_id, curie in self.all_pubs_dict.items(): if 'unattributed' in curie: - log.debug(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') + log.info(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 82967680cf09a776cedb9c4b9773a82be3359748 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 12:39:50 -0500 Subject: [PATCH 39/52] remove debug stuff --- src/AGR_data_retrieval_curation_allele.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 5c81962..a37bf37 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -231,12 +231,7 @@ def get_all_references(self, session): distinct() pub_counter = 0 for pub in results: - if pub.uniquename == 'unattributed': - log.info('BOB: Found unattributed pub.') self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' - # BOB: DEBUG unattr issue - if pub.uniquename == 'unattributed': - log.info(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( @@ -255,11 +250,6 @@ def get_all_references(self, session): for xref in pmid_xrefs: self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 - # BOB: DEBUG unattr issue: - log.info('BOB: DEBUB UNATTRIBUTED ISSUE') - for pub_id, curie in self.all_pubs_dict.items(): - if 'unattributed' in curie: - log.info(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 8abf70787e924c4a375b11efa108bc950ccd02ee Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 31 Jan 2023 10:45:29 -0500 Subject: [PATCH 40/52] unique list of fb refs for alleles --- src/AGR_data_retrieval_curation_allele.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index a37bf37..92c993c 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -890,6 +890,7 @@ def synthesize_xrefs(self, allele): def synthesize_references(self, allele): """Process pubs for allele.""" + allele.fb_references = list(set(allele.fb_references)) allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'FB:unattributed'] return From ad2c5d12d5a95826f5e1c2852d716450882c9907 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:27:16 -0500 Subject: [PATCH 41/52] look for redundant dis annos --- src/AGR_data_retrieval_curation_disease.py | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index ec4cc23..62630d9 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -96,6 +96,7 @@ def __init__(self, feature_cvterm, provenance_prop): self.timestamps = [] # Will be a list of audit_chado timestamp lists. 
# Derived attributes. self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. + self.agr_uniq_key = None # Will be unique key based on Alliance defining features. # Attributes for the Alliance AuditedObjectDTO. self.obsolete = False # Never True. All FB annotations are deleted if no longer current. self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. @@ -149,6 +150,7 @@ class DAFMaker(object): def __init__(self): """Create the DAFMaker object.""" self.dis_anno_dict = {} # A dict of DiseaseAnnotations keyed by feature_cvterm_id plus rank (e.g., 1234567_0). + self.uniq_dis_dict = {} # A dict of DiseaseAnnotations keyed by AGR defining features. self.total_anno_cnt = 0 # Count of all disease annotations found in starting query. self.export_anno_cnt = 0 # Count of all disease annotations exported to file. self.internal_anno_cnt = 0 # Count of all disease annotations marked as internal=True in export file. @@ -466,6 +468,9 @@ def synthesize_info(self, session): dis_anno.modifier_problem = True # Now check for conditions that prevent export. self.evaluate_annot(dis_anno) + # Generate the unique AGR key based on AGR defining features for FB disease annotations. + self.derive_agr_uniq_key(dis_anno) + self.group_dis_annos() log.info('Done synthesizing disease annotation info.') return @@ -487,6 +492,32 @@ def evaluate_annot(self, dis_anno): log.debug(msg) return + def derive_agr_uniq_key(self, dis_anno): + """Derive the AGR unique key based on defining features of FB disease annotations.""" + dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' + dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' + evi_codes = list(set(dis_anno.evidence_code_curies)).sorted + evi_code_str = '|'.join(evi_codes) + dis_anno.agr_uniq_key += f'||{evi_code_str}' + dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' + dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_relation_name}' + log.debug(f'{dis_anno} HAS AGR_UNIQ_KEY: {dis_anno.agr_uniq_key}') + return + + def group_dis_annos(self): + """Group redundant disease annotations.""" + log.info('Group redundant disease annotations.') + input_counter = 0 + for dis_anno in self.dis_anno_dict.values(): + input_counter += 1 + try: + self.uniq_dis_dict[dis_anno.agr_uniq_key].append(dis_anno) + except KeyError: + self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] + grouped_counter = len(self.uniq_dis_dict.keys()) + log.info(f'Found {grouped_counter} unique keys for {input_counter} disease annotations.') + return + def generate_export_file(self): """Process disease annotations and print to a LinkML-compliant JSON file.""" log.info('Generating output JSON file of disease annotations.') From 209f3e2bfba008bbb1748e21ffe62f65905e5082 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:33:09 -0500 Subject: [PATCH 42/52] fix typo --- src/AGR_data_retrieval_curation_disease.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 62630d9..23f7569 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -496,7 +496,8 @@ def derive_agr_uniq_key(self, dis_anno): """Derive the AGR unique key based on defining features of FB disease annotations.""" dis_anno.agr_uniq_key = 
f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' - evi_codes = list(set(dis_anno.evidence_code_curies)).sorted + evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) + log.debug(f'BOB: {evi_codes}') evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' From 6c5497d5e3da53b89a7f19e5622a9a3b1e406f19 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:42:36 -0500 Subject: [PATCH 43/52] restrict uniq key counts to exportable dis_annos, print out redundant ones for assessment --- src/AGR_data_retrieval_curation_disease.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 23f7569..d953777 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -510,13 +510,20 @@ def group_dis_annos(self): log.info('Group redundant disease annotations.') input_counter = 0 for dis_anno in self.dis_anno_dict.values(): + if dis_anno.for_alliance_export is False: + continue input_counter += 1 try: self.uniq_dis_dict[dis_anno.agr_uniq_key].append(dis_anno) except KeyError: self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] grouped_counter = len(self.uniq_dis_dict.keys()) - log.info(f'Found {grouped_counter} unique keys for {input_counter} disease annotations.') + log.info(f'Found {grouped_counter} unique keys for {input_counter} exportable disease annotations.') + for uniq_key, anno_list in self.uniq_dis_dict.items(): + if len(anno_list) > 1: + log.warning(f'REDUNDANT: {uniq_key}:') + for i in anno_list: + log.warning(f'\t{i}') return def generate_export_file(self): From f0edbea53005a0311ba7d7413b18f9fc1b5124d9 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:02:14 -0500 Subject: [PATCH 44/52] tidy warnings, report/count non-redundant dis_anno requiring modifier ID update --- src/AGR_data_retrieval_curation_disease.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index d953777..8fac8c3 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -95,6 +95,7 @@ def __init__(self, feature_cvterm, provenance_prop): self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. self.timestamps = [] # Will be a list of audit_chado timestamp lists. # Derived attributes. + self.modifier_id_was_updated = False # Change to true if modifier ID in evidence text was updated. self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. self.agr_uniq_key = None # Will be unique key based on Alliance defining features. # Attributes for the Alliance AuditedObjectDTO. 
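For reference, the de-duplication built up in patches 41 through 45 reduces to two steps: derive a composite key from the annotation's defining fields (allele, DO term, relation, negation, reference, sorted evidence codes, and the modifier fields), then bucket annotations by that key so that only the first member of each bucket is exported while the rest are logged as redundant. A minimal standalone sketch of that idea, using plain dicts instead of the script's DiseaseAnnotation objects and invented sample values:

# Standalone sketch only; field names mirror the patch, the data is made up.
def derive_key(anno):
    evi = '|'.join(sorted(set(anno['evidence_code_curies'])))
    parts = [anno['allele_curie'], anno['do_term_curie'],
             anno['disease_relation_name'], str(anno['negated']),
             anno['reference_curie'], evi,
             str(anno['disease_genetic_modifier_curie']),
             str(anno['disease_genetic_modifier_relation_name'])]
    return '||'.join(parts)

def group_annotations(annotations):
    uniq = {}
    for anno in annotations:
        uniq.setdefault(derive_key(anno), []).append(anno)
    return uniq

annos = [
    {'allele_curie': 'FB:FBal0000001', 'do_term_curie': 'DOID:14330',
     'disease_relation_name': 'is_implicated_in', 'negated': False,
     'reference_curie': 'PMID:123456', 'evidence_code_curies': ['ECO:0000315'],
     'disease_genetic_modifier_curie': None,
     'disease_genetic_modifier_relation_name': None},
]
for key, group in group_annotations(annos).items():
    exported = group[0]  # only the first annotation per key is exported
    if len(group) > 1:
        print(f'REDUNDANT: {key} ({len(group)} annotations)')

Sorting the de-duplicated evidence codes before joining keeps the key stable regardless of the order in which chado returns them, which is what the patch 42 change to sorted(list(set(...))) guarantees.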
@@ -416,13 +417,13 @@ def get_current_id_for_allele(self, session, old_uniquename): distinct() curr_uniquenames = [i.uniquename for i in curr_alleles] if len(curr_uniquenames) == 1: - log.warning('For obsolete {}, found one current allele: {}'.format(old_uniquename, curr_uniquenames[0])) + log.debug('For obsolete {}, found one current allele: {}'.format(old_uniquename, curr_uniquenames[0])) curr_allele_id = curr_uniquenames[0] elif len(curr_uniquenames) > 1: - log.warning('For obsolete {}, found many current alleles: {}'.format(old_uniquename, curr_uniquenames)) + log.debug('For obsolete {}, found many current alleles: {}'.format(old_uniquename, curr_uniquenames)) curr_allele_id = None else: - log.warning('For obsolete {}, found no current alleles.'.format(old_uniquename)) + log.debug('For obsolete {}, found no current alleles.'.format(old_uniquename)) curr_allele_id = None return curr_allele_id @@ -464,6 +465,7 @@ def synthesize_info(self, session): curr_allele_id = self.get_current_id_for_allele(session, allele_id) if curr_allele_id: dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(curr_allele_id) + dis_anno.modifier_id_was_updated = True else: dis_anno.modifier_problem = True # Now check for conditions that prevent export. @@ -497,7 +499,6 @@ def derive_agr_uniq_key(self, dis_anno): dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) - log.debug(f'BOB: {evi_codes}') evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' @@ -519,11 +520,18 @@ def group_dis_annos(self): self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] grouped_counter = len(self.uniq_dis_dict.keys()) log.info(f'Found {grouped_counter} unique keys for {input_counter} exportable disease annotations.') + # Report redundant disease annotations in detail. + # Also report non-redundant disease annotations that required modifier ID update. 
+ update_allele_id_counter = 0 for uniq_key, anno_list in self.uniq_dis_dict.items(): if len(anno_list) > 1: - log.warning(f'REDUNDANT: {uniq_key}:') + log.warning(f'REDUNDANT: AGR_UNIQ_KEY: {uniq_key}') for i in anno_list: - log.warning(f'\t{i}') + log.warning(f'REDUNDANT:\t{i}') + elif anno_list[0].modifier_id_was_updated is True: + log.warning(f'UPDATED DIS_ANNO: {anno_list[0]}') + update_allele_id_counter +=1 + log.info(f'Found {update_allele_id_counter} non-redundant exportable disease annotations that required modifier ID update.') return def generate_export_file(self): From 65cee0802fa269306c43fb88ca923dbb87af50e5 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:27:21 -0500 Subject: [PATCH 45/52] add negated to agr_uniq_key, do not export dis annos requiring modifier ID update --- src/AGR_data_retrieval_curation_disease.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 8fac8c3..f0b604f 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -485,7 +485,8 @@ def evaluate_annot(self, dis_anno): 'Only "model of|DOES NOT model" is exportable', ' with FLYBASE' in dis_anno.evidence_code.value: 'Only disease annotations modeled by a single allele are exportable', - dis_anno.modifier_problem is True: 'Cannot find current feature for disease modifier.' + dis_anno.modifier_problem is True: 'Cannot find current feature for disease modifier.', + dis_anno.modifier_id_was_updated is True: 'Modifier referenced by non-current allele ID.' } for check, msg in export_checks.items(): if check: @@ -497,7 +498,7 @@ def evaluate_annot(self, dis_anno): def derive_agr_uniq_key(self, dis_anno): """Derive the AGR unique key based on defining features of FB disease annotations.""" dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' - dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' + dis_anno.agr_uniq_key += f'||{dis_anno.negated}||{dis_anno.reference_curie}' evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' From 0ecfc7cca9ab3f088d86218e1fc0b2667f2f0181 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:43:30 -0500 Subject: [PATCH 46/52] export non-redundant dis_annos --- src/AGR_data_retrieval_curation_disease.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index f0b604f..5ead74f 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -542,7 +542,9 @@ def generate_export_file(self): 'linkml_version': linkml_release, 'disease_allele_ingest_set': [] } - for dis_anno in self.dis_anno_dict.values(): + # For each AGR unique key, just process the 1st disease annotation in the list of redundant FB annotations. + for dis_anno_list in self.uniq_dis_dict.values(): + dis_anno = dis_anno_list[0] if dis_anno.for_alliance_export is False: log.debug('Suppress disease annotation from export: {}. 
Reasons: {}'.format(dis_anno, '; '.join(dis_anno.export_warnings))) continue @@ -561,9 +563,8 @@ def generate_export_file(self): outfile.close() log.info('Done writing data to output file.') total_public_anno_cnt = self.export_anno_cnt - self.internal_anno_cnt - log.info('Exported {} of {} disease annotations ({} are public).'. - format(self.export_anno_cnt, self.total_anno_cnt, total_public_anno_cnt)) - log.info('Suppressed {} disease annotations from export.'.format(self.total_anno_cnt - self.export_anno_cnt)) + log.info(f'Exported {self.export_anno_cnt} of {self.total_anno_cnt} disease annotations ({total_public_anno_cnt} are public).') + log.info(f'Suppressed {self.total_anno_cnt - self.export_anno_cnt} disease annotations from export.') return From fd308b6861598a0678437cd01c36d785c5b89f12 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 10:48:26 -0500 Subject: [PATCH 47/52] update req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d07faf..0fa5691 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -# sqlalchemy +sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From a4039bb5b47473a4b6388b67d92dd7a81424a47f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 11:07:16 -0500 Subject: [PATCH 48/52] update sqlalchemy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0fa5691..9b218dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -sqlalchemy +sqlalchemy>=2.0 svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 9eb33a88cf227ccb187ba1a31eaf988a8ba8b3d8 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 11:15:02 -0500 Subject: [PATCH 49/52] try updating h-utils --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9b218dc..dc869bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ nested_dict==1.61 psycopg2>=2.6.2 strict_rfc3339==0.7 tqdm>=4.29.0 -git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils +git+https://github.com/FlyBase/harvdev-utils.git@test#egg=harvdev_utils # Below are additional requirements for harvdev-utils itself (may not be automatically installed by cmd above). # bioservices # flake8>=3.5.0 From 1f48fa4c14cb62f1fee49e311ff8ed1e360438c0 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 12:53:17 -0500 Subject: [PATCH 50/52] revert to working form --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index dc869bc..8d07faf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,14 +4,14 @@ nested_dict==1.61 psycopg2>=2.6.2 strict_rfc3339==0.7 tqdm>=4.29.0 -git+https://github.com/FlyBase/harvdev-utils.git@test#egg=harvdev_utils +git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # Below are additional requirements for harvdev-utils itself (may not be automatically installed by cmd above). 
# bioservices # flake8>=3.5.0 # pubchempy retry # requests>=2.21.0 -sqlalchemy>=2.0 +# sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 4ec8c9eb1dabc624c2ea74898fd13f15685256de Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 13:24:29 -0500 Subject: [PATCH 51/52] try no-cache pip install in docker build --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index b279fea..f56a062 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,6 @@ ADD git_agr_curation_schema/util/validate_agr_schema.py /src/validat ADD git_agr_curation_schema/jsonschema/allianceModel.schema.json jsonschema/allianceModel.schema.json # Install required modules. -RUN pip3 install -r requirements.txt +RUN pip3 install -r requirements.txt --no-cache-dir ENTRYPOINT [ "/bin/bash" ] diff --git a/requirements.txt b/requirements.txt index 8d07faf..0fa5691 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -# sqlalchemy +sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 2eb2668f49250b19738b492105264d165e37cc48 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 7 Feb 2023 09:41:27 -0500 Subject: [PATCH 52/52] flake8 --- src/AGR_data_retrieval_curation_disease.py | 2 +- src/AGR_data_retrieval_curation_gene.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 5ead74f..d51a41d 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -531,7 +531,7 @@ def group_dis_annos(self): log.warning(f'REDUNDANT:\t{i}') elif anno_list[0].modifier_id_was_updated is True: log.warning(f'UPDATED DIS_ANNO: {anno_list[0]}') - update_allele_id_counter +=1 + update_allele_id_counter += 1 log.info(f'Found {update_allele_id_counter} non-redundant exportable disease annotations that required modifier ID update.') return diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 4b4f804..5c26fbc 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -641,12 +641,12 @@ def process_feature_synonyms(self, feature): placeholder_symbol_dto['format_text'] = feature.feature.name placeholder_symbol_dto['display_text'] = feature.feature.name feature.gene_symbol_dto = placeholder_symbol_dto - # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # TEMPORARY: Full name is required for now. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. if feature.gene_full_name_dto is None: placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Systematic name is required. If none, fill it in. Could be because gene is unannotated, or annotation ID has never been used in pubs. + # TEMPORARY: Systematic name is required for now. If none, fill it in. Could be because gene is unannotated, or annotation ID never used in pubs. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name'
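The closing hunk documents a fallback these scripts share: when a name DTO that the schema still requires is missing, a copy of the symbol DTO is reused with only the name type changed. A simplified sketch of that pattern, using a bare stand-in object rather than the script's gene class and an invented sample symbol:

# Simplified stand-in; attribute and key names mirror the patch, the class
# and sample values are hypothetical.
class GeneStub:
    def __init__(self, symbol_dto):
        self.gene_symbol_dto = symbol_dto
        self.gene_full_name_dto = None
        self.gene_systematic_name_dto = None

def fill_placeholder_names(gene):
    # Full name and systematic name are still required for now, so fill them
    # from the symbol DTO when FlyBase has nothing better to offer.
    if gene.gene_full_name_dto is None:
        placeholder = gene.gene_symbol_dto.copy()
        placeholder['name_type_name'] = 'full_name'
        gene.gene_full_name_dto = placeholder
    if gene.gene_systematic_name_dto is None:
        placeholder = gene.gene_symbol_dto.copy()
        placeholder['name_type_name'] = 'systematic_name'
        gene.gene_systematic_name_dto = placeholder

gene = GeneStub({'name_type_name': 'nomenclature_symbol', 'display_text': 'wg',
                 'format_text': 'wg', 'synonym_scope_name': 'exact',
                 'evidence_curies': [], 'internal': False, 'obsolete': False})
fill_placeholder_names(gene)

The "TEMPORARY" comments in the final hunk mark these placeholders as stopgaps while the Alliance schema still requires the fields, so the fallback can be removed once the requirement is relaxed.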