From 88f9dca5fef95d13bcfc2edcfe39b2874515626a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 11:10:30 -0500 Subject: [PATCH 01/52] update attribute names --- src/AGR_data_retrieval_curation_disease.py | 113 ++++++++++----------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index c953b62..c140da6 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -89,45 +89,45 @@ def __init__(self, feature_cvterm, provenance_prop): """ # FlyBase data self.unique_key = '{}_{}'.format(feature_cvterm.feature_cvterm_id, provenance_prop.rank) - self.feature_cvterm = feature_cvterm # The FeatureCvterm object. - self.provenance = provenance_prop # The "provenance" FeatureCvtermprop. - self.evidence_code = None # Will be the "evidence_code" FeatureCvtermprop. - self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. - self.timestamps = [] # Will be a list of audit_chado timestamp lists. - # Derived attribures. - self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. - # Attributes for the Alliance AuditedObject. - self.obsolete = False # Never True. All FB annotations are deleted if no longer current. - self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. - self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. - # Attributes for the Alliance Association - self.subject = None # Provide allele curie (slot usage from AlleleDiseaseAnnotation) - self.predicate = 'is_implicated_in' # "Allele disease relations" CV (slot usage from AlleleDiseaseAnnotation) - self.object = None # Provide DOID (slot usage from DiseaseAnnotation). - # Attributes for the Alliance DiseaseAnnotation - self.data_provider = 'FB' - self.negated = False # Change to True for "NOT" annotations. - self.evidence_codes = [] # Set as appropriate. - self.single_reference = None # Provide FBrf ID. - self.annotation_type = 'manually_curated' # "Annotation types" CV. - self.disease_genetic_modifier = None # Gene, Allele or AGM curie. - self.disease_genetic_modifier_relation = None # "Disease genetic modifier relations" CV. - self.unique_id = self.unique_key # Use the unique_key (internal ID is ok). - self.mod_entity_id = None # N/A to FlyBase data. - self.inferred_gene = None # Gene asserted by curator to be associated with the disease annotation. - # self.with = None # N/A to FlyBase data. - self.disease_qualifiers = [] # N/A to FlyBase data. "Disease Qualifiers" CV. - self.condition_relations = [] # N/A to FlyBase data. - self.genetic_sex = None # N/A to FlyBase data. "Genetic sexes" CV. - self.related_notes = [] # N/A to FlyBase data. - self.secondary_data_provider = None # N/A to FlyBase data. + self.feature_cvterm = feature_cvterm # The FeatureCvterm object. + self.provenance = provenance_prop # The "provenance" FeatureCvtermprop. + self.evidence_code = None # Will be the "evidence_code" FeatureCvtermprop. + self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. 
+ self.timestamps = [] # Will be a list of audit_chado timestamp lists. + # Derived attributes. + self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. + # Attributes for the Alliance AuditedObjectDTO. + self.obsolete = False # Never True. All FB annotations are deleted if no longer current. + self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. + self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. + # Attributes for the Alliance DiseaseAnnotationDTO. + self.disease_relation_name = 'is_implicated_in' # "Allele disease relations" CV (slot usage from AlleleDiseaseAnnotation) + self.do_term_curie = None # Provide DOID (slot usage from DiseaseAnnotation). + self.mod_entity_id = None # N/A to FlyBase data. + self.negated = False # Change to True for "NOT" annotations. + self.evidence_curies = [] # Not sure what these are? + self.evidence_code_curies = [] # Set as appropriate. + self.reference_curie = None # Provide FBrf ID. + self.annotation_type_name = 'manually_curated' # "Annotation types" CV. + self.with_gene_curies = [] # N/A to FlyBase data. + self.disease_qualifier_names = [] # N/A to FlyBase data. "Disease Qualifiers" CV. + self.condition_relation_dtos = [] # N/A to FlyBase data. + self.genetic_sex_name = None # N/A to FlyBase data. "Genetic sexes" CV. + self.note_dtos = [] # N/A to FlyBase data. + self.data_provider_name = 'FB' + self.secondary_data_provider_name = None # N/A to FlyBase data. + self.disease_genetic_modifier_curie = None # Gene, Allele or AGM curie. + self.disease_genetic_modifier_relation_name = None # "Disease genetic modifier relations" CV. + # Attributes for the Alliance AlleleDiseaseAnnotationDTO. + self.allele_curie = None # Provide allele curie. + self.inferred_gene_curie = None # Gene inferred to be associated with the disease annotation based on curated allele. # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal. Will be exported but not necessarily displayed at Alliance. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal (exported but not displayed at Alliance). + self.export_warnings = [] # Reasons for suppressing an object from the export file. 
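# Illustrative aside (not part of the patch): the renamed attributes above follow the
# Alliance LinkML DTO field names.  A rough sketch of the record a single exported
# annotation might serialize to; every value below is made up for illustration only.
example_allele_disease_annotation_dto = {
    'allele_curie': 'FB:FBal0000001',             # hypothetical allele curie
    'disease_relation_name': 'is_implicated_in',
    'do_term_curie': 'DOID:0000001',              # hypothetical DO term
    'negated': False,
    'evidence_code_curies': ['ECO:0000000'],      # placeholder; real values come from the CEA/CEC mapping
    'reference_curie': 'FB:FBrf0000001',          # hypothetical reference
    'annotation_type_name': 'manually_curated',
    'data_provider_name': 'FB',
    'inferred_gene_curie': 'FB:FBgn0000001',      # hypothetical gene curie
    'created_by_curie': 'FB:FB_curator',
    'updated_by_curie': 'FB:FB_curator',
    'internal': False,
    'obsolete': False,
}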
def __str__(self): """Succinct text string describing the disease annotation.""" @@ -171,34 +171,33 @@ def __init__(self): } required_fields = [ - 'data_provider', - 'evidence_codes', + 'allele_curie', + 'data_provider_name', + 'disease_relation_name', + 'do_term_curie', + 'evidence_code_curies', 'internal', - 'object', - 'predicate', - 'single_reference' - 'subject' + 'reference_curie', ] output_fields = [ - 'annotation_type', - 'created_by', - 'data_provider', + 'allele_curie', + 'annotation_type_name', + 'created_by_curie', + 'data_provider_name', 'date_created', 'date_updated', - 'disease_genetic_modifier', - 'disease_genetic_modifier_relation', - 'evidence_codes', - 'inferred_gene', + 'disease_genetic_modifier_curie', + 'disease_genetic_modifier_relation_name', + 'disease_relation_name', + 'do_term_curie', + 'evidence_code_curies', + 'inferred_gene_curie', 'internal', - 'updated_by', 'negated', - 'object', 'obsolete', - 'predicate', - 'single_reference', - 'subject', - 'unique_id' # For derived annotations, feature_cvterm_id+rank changes each release. So, suppress. + 'reference_curie', + 'updated_by_curie', ] def get_disease_annotations(self, session): From b40a82d0c49b676aef071c25cd280efe1fb738e1 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 11:39:54 -0500 Subject: [PATCH 02/52] update attribute names in synthesis method --- src/AGR_data_retrieval_curation_disease.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index c140da6..ec4cc23 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -429,11 +429,11 @@ def synthesize_info(self, session): log.info('Synthesizing disease annotation info.') for dis_anno in self.dis_anno_dict.values(): log.debug('Evaluating annotation: {}'.format(dis_anno)) - # Get subject, object and pub. - dis_anno.subject = 'FB:{}'.format(dis_anno.feature_cvterm.feature.uniquename) - dis_anno.object = 'DOID:{}'.format(dis_anno.feature_cvterm.cvterm.dbxref.accession) - dis_anno.single_reference = self.get_pub_xref(session, dis_anno.feature_cvterm.pub.uniquename) - dis_anno.inferred_gene = self.get_inferred_gene(session, dis_anno.feature_cvterm.feature.feature_id) + # Get allele, DO term and pub. + dis_anno.allele_curie = 'FB:{}'.format(dis_anno.feature_cvterm.feature.uniquename) + dis_anno.do_term_curie = 'DOID:{}'.format(dis_anno.feature_cvterm.cvterm.dbxref.accession) + dis_anno.reference_curie = self.get_pub_xref(session, dis_anno.feature_cvterm.pub.uniquename) + dis_anno.inferred_gene_curie = self.get_inferred_gene(session, dis_anno.feature_cvterm.feature.feature_id) # Mark negative annotations. if dis_anno.qualifier.value == 'DOES NOT model': dis_anno.negated = True @@ -445,23 +445,23 @@ def synthesize_info(self, session): # timestamp_to_rfc3339_localoffset(datetime.datetime.timestamp(max(dis_anno.timestamps))) # Determine evidence_code if dis_anno.evidence_code.value.startswith('CEC'): - dis_anno.evidence_codes.append(self.evidence_code_xrefs['CEC']) + dis_anno.evidence_code_curies.append(self.evidence_code_xrefs['CEC']) else: - dis_anno.evidence_codes.append(self.evidence_code_xrefs['CEA']) + dis_anno.evidence_code_curies.append(self.evidence_code_xrefs['CEA']) # Find modifiers and their relations. 
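# Illustrative aside (not part of the patch): the modifier-handling step below scans the
# free-text evidence value for a FlyBase modifier phrase and an embedded FBal ID.  A
# minimal standalone sketch of the same idea; the phrase-to-relation mapping and the
# example evidence string are hypothetical.
import re

def sketch_find_modifier(evidence_value, modifier_terms):
    """Return (relation_name, allele_id) parsed from a free-text evidence string."""
    relation_name = None
    for fb_phrase, agr_relation in modifier_terms.items():
        if fb_phrase in evidence_value:
            relation_name = agr_relation
    match = re.search(r'FBal[0-9]{7}', evidence_value)
    allele_id = match.group(0) if match else None
    return relation_name, allele_id

# sketch_find_modifier('CEC (ameliorated by FBal0123456)', {'ameliorated by': 'ameliorated_by'})
# -> ('ameliorated_by', 'FBal0123456')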
allele_regex = r'FBal[0-9]{7}' for fb_term in self.disease_genetic_modifier_terms.keys(): if fb_term in dis_anno.evidence_code.value: - dis_anno.disease_genetic_modifier_relation = self.disease_genetic_modifier_terms[fb_term] + dis_anno.disease_genetic_modifier_relation_name = self.disease_genetic_modifier_terms[fb_term] if re.search(allele_regex, dis_anno.evidence_code.value): allele_id = re.search(allele_regex, dis_anno.evidence_code.value).group(0) if self.confirm_current_allele_by_uniquename(session, allele_id): - dis_anno.disease_genetic_modifier = 'FB:{}'.format(allele_id) + dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(allele_id) else: # Look up current allele by 2o ID. Use that. curr_allele_id = self.get_current_id_for_allele(session, allele_id) if curr_allele_id: - dis_anno.disease_genetic_modifier = 'FB:{}'.format(curr_allele_id) + dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(curr_allele_id) else: dis_anno.modifier_problem = True # Now check for conditions that prevent export. From 9e2b82eaf7315764a6ce14084d0f921b9889756f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:02:23 -0500 Subject: [PATCH 03/52] update attribute names and synonym handling --- src/AGR_data_retrieval_curation_gene.py | 383 +++++++++++++++--------- 1 file changed, 235 insertions(+), 148 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index df7f5ee..3bf47c2 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -31,8 +31,8 @@ # from sqlalchemy.orm.exc import NoResultFound from harvdev_utils.char_conversions import sub_sup_sgml_to_html from harvdev_utils.production import ( - Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, Featureloc, - Featureprop, OrganismDbxref, Synonym + Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, + Featureloc, Featureprop, Organism, OrganismDbxref, Pub, PubDbxref, Synonym ) from harvdev_utils.psycopg_functions import set_up_db_reading @@ -99,45 +99,45 @@ def __init__(self, feature): # 1. Gene.name is requested (not required), but not all genes have a fullname. # 2. Gene.taxon is required, but even after updating NCBITaxon info at FlyBase, not all genes will have NCBI taxon ID. # 3. GenomicLocation lacks strand info. - self.feature = feature # The Feature object corresponding to the FlyBase gene. - self.organism_abbr = None # Will be the organism.abbreviation for the gene's species of origin. - self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. - self.featureloc = None # Will be Featureloc object for the gene. - self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. - self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. - self.curr_fb_symbol = None # Will be the current symbol Synonym object. - self.curr_fb_fullname = None # Will be the current fullname Synonym object. - self.internal_synonyms = [] # Will be list of internal synonym names (and synonym_sgml if different). - self.public_synonyms = [] # Will be list of public synonym names (and synonym_sgml if different). - self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. - self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. - self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. - self.timestamps = [] # Add all timestamps here. - # Attributes for the Alliance AuditedObject. 
- self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if gene not intended for display at Alliance website. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Earliest timestamp. - self.date_updated = None # Latest timestamp. - # self.data_provider = 'FB' # The MOD abbreviation. - # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. + self.feature = feature # The Feature object corresponding to the FlyBase gene. + self.organism_abbr = None # Will be the organism.abbreviation for the gene's species of origin. + self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. + self.featureloc = None # Will be Featureloc object for the gene. + self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. + self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. + self.curr_anno_id = None # Will be current annotation ID for the gene (str). + self.curr_fb_symbol = [] # Will be all FeatureSynonym objects in support of the current symbol Synonym object. + self.curr_fb_fullname = [] # Will be all FeatureSynonym objects in support of the current fullname Synonym object. + self.systematic_name = [] # Will be all FeatureSynonym objects using the systematic name of the gene. + self.other_synonyms = [] # Will be all FeatureSynonym objects in support of non-current synonyms. + self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. + self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. + self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. + self.timestamps = [] # Add all timestamps here. + # Attributes for the Alliance AuditedObjectDTO. + self.obsolete = False # Never True. All FB annotations are deleted if no longer current. + self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. + self.date_updated = None # Not straightforward as half of relevant annotations are derived in the reporting build. + # Attributes for the Alliance BiologicalEntityDTO. BiologicalEntityDTO is_a AuditedObjectDTO. self.curie = 'FB:{}'.format(feature.uniquename) - self.taxon = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 561 genes (72 species). - # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. - self.name = None # Will be current fullname synonym - report ascii or utf8 (sgml) version? - self.synonyms = [] # All current and non-current ASCII and SGML synonyms. - self.cross_references = [] # Report only select dbs, using AGR-accepted db_prefix. - self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. - # Attributes for the Alliance Gene. Gene is_a GenomicEntity. - self.genomic_locations = [] # Will need to be list of GenomicLocation objects. - self.symbol = None # Will be a string (ascii or utf8)? 
- self.gene_synopsis = None # Will be the gene's "gene_summary_text" featureprop value - remove "@" symbols. - self.gene_type = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. + self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 561 genes (72 species). + # Attributes for the Alliance GenomicEntityDTO. GenomicEntityDTO is_a BiologicalEntityDTO. + self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. + self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. + self.genomic_location_dtos = [] # Will need to be list of GenomicLocation objects. + # Attributes for the Alliance GeneDTO. GeneDTO is_a GenomicEntityDTO. + self.gene_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. + self.gene_full_name_dto = None # Will be a single GeneFullNameSlotAnnotation. + self.gene_systematic_name_dto = None # Will be a single GeneSystematicNameSlotAnnotation. + self.gene_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. + self.gene_type_curie = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal in the export file. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal in the export file. + self.export_warnings = [] # Reasons for suppressing an object from the export file. def __str__(self): """Succinct text string describing the AllianceGene object.""" @@ -150,6 +150,8 @@ class GeneHandler(object): def __init__(self): """Create the GeneHandler object.""" self.gene_dict = {} # An FBgnID-keyed dict of AllianceGene objects. + self.all_pubs_dict = {} # A pub_id-keyed dict of pub curies (PMID or FBrf). + self.all_synonyms_dict = {} # A synonym_id-keyed dict of Synonym objects. self.pthr_dict = {} # Will be an 1:1 FBgnID-PTHR xref dict. self.chr_dict = {} # Will be a feature_id-keyed dict of chr scaffold uniquenames. self.total_feat_cnt = 0 # Count of all genes found in starting query. @@ -159,26 +161,27 @@ def __init__(self): test_genes = ['wg', 'mt:ori', 'lncRNA:roX1', 'CG12656'] required_fields = [ 'curie', - 'taxon', - 'symbol', - 'internal' + 'gene_symbol_dto', + 'gene_full_name_dto', + 'internal', + 'taxon_curie', ] output_fields = [ - 'created_by', - 'cross_references', + 'created_by_curie', + 'cross_reference_dtos', 'curie', 'date_created', 'date_updated', - 'gene_type', - 'genomic_locations', + 'gene_full_name_dto', + 'gene_symbol_dto', + 'gene_synonym_dtos', + 'gene_type_curie', + 'genomic_location_dtos', 'internal', - 'updated_by', - 'name', 'obsolete', 'secondary_identifiers', - 'symbol', - 'synonyms', - 'taxon' + 'taxon_curie', + 'updated_by_curie', ] internal_gene_types = [ 'engineered_fusion_gene', @@ -224,6 +227,42 @@ def open_panther_file(self): self.pthr_dict[re.search(fb_regex, row[FB]).group(0)] = re.search(pthr_regex, row[PTHR]).group(0) return + def get_references(self, session): + """Get all references.""" + log.info('Get all references.') + # First get all current pubs having an FBrf uniquename. 
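# Illustrative aside (not part of the patch): get_references() builds a pub_id-keyed
# lookup in two passes.  Every current FBrf pub first gets an 'FB:FBrf...' curie, and the
# second query then overwrites that value with a 'PMID:...' curie wherever a current
# PubMed dbxref exists.  The end result looks roughly like this (pub_ids and curies are
# made up):
example_all_pubs_dict = {
    101: 'PMID:12345678',    # FBrf pub that also has a current PubMed xref
    102: 'FB:FBrf0000002',   # FBrf pub with no PubMed xref
    103: 'FB:unattributed',  # the special "unattributed" pub
}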
+ fbrf_regex = r'^(FBrf[0-9]{7}|unattributed)$' + filters = ( + Pub.uniquename.op('~')(fbrf_regex), + Pub.is_obsolete.is_(False) + ) + results = session.query(Pub).\ + filter(*filters).\ + distinct() + pub_counter = 0 + for pub in results: + self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + counter += 1 + # Next find PMIDs if available and replace the curie in the all_pubs_dict. + filters = ( + Pub.uniquename.op('~')(fbrf_regex), + Pub.is_obsolete.is_(False), + Db.name == 'pubmed', + PubDbxref.is_current.is_(True) + ) + pmid_xrefs = session.query(Pub, Dbxref).\ + join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ + join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + pmid_counter = 0 + for xref in pmid_xrefs: + self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' + pmid_counter += 1 + log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') + return + def get_genes(self, session): """Get all genes.""" log.info('Querying chado for genes.') @@ -262,11 +301,48 @@ def get_gene_taxons(self, session): organism_taxon_dict[result.OrganismDbxref.organism_id] = result.Dbxref.accession for gene in self.gene_dict.values(): try: - gene.taxon = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) + gene.taxon_curie = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(gene)) return + def get_gene_dbxrefs(self, session): + """Get all dbxrefs for genes. This will take 10-15 minutes.""" + log.info('Getting gene dbxrefs.') + gene_regex = r'^FBgn[0-9]{7}$' + filters = ( + Feature.uniquename.op('~')(gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene', + Db.name.in_((self.fb_agr_db_dict.keys())) + ) + gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ + join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in gene_dbxref_results: + counter += 1 + if counter % 100000 == 0: + log.debug('Processing xref #{}'.format(counter)) + # Skip current FlyBase accessions. + # If present, these are same as feature.uniquename. + # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. + if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': + pass + elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': + self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) + elif result.Db.name == 'FlyBase Annotation IDs': + self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) + if result.FeatureDbxref.is_current is True: + self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + else: + self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) + return + def get_synonyms(self, session): """Get current and non-current symbols and full names for genes.""" log.info('Getting gene synonyms.') @@ -286,18 +362,19 @@ def get_synonyms(self, session): filter(*filters).\ distinct() for result in gene_curr_symbol_results: + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = Synonym + # Second, collect FeatureSynonym objects by type. 
if result.FeatureSynonym.is_current is True: if result.synonym_type.name == 'symbol': - self.gene_dict[result.Feature.uniquename].curr_fb_symbol = result.Synonym + self.gene_dict[result.Feature.uniquename].curr_fb_symbol.append(result.FeatureSynonym) elif result.synonym_type.name == 'fullname': - self.gene_dict[result.Feature.uniquename].curr_fb_fullname = result.Synonym - elif result.FeatureSynonym.is_internal is True: - self.gene_dict[result.Feature.uniquename].internal_synonyms.append(result.Synonym.name) - self.gene_dict[result.Feature.uniquename].internal_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) + self.gene_dict[result.Feature.uniquename].curr_fb_fullname.append(result.FeatureSynonym) else: - self.gene_dict[result.Feature.uniquename].public_synonyms.append(result.Synonym.name) - self.gene_dict[result.Feature.uniquename].public_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - + self.gene_dict[result.Feature.uniquename].other_synonyms.append(result.FeatureSynonym) + # Third, catch synonyms that match the annotation ID (aka, systematic_name). + if result.Synonym.name == self.gene_dict[result.Feature.uniquename].curr_anno_id: + self.gene_dict[result.Feature.uniquename].systematic_name.append(result.FeatureSynonym) return def get_gene_snapshots(self, session): @@ -340,7 +417,7 @@ def get_gene_types(self, session): filter(*filters).\ distinct() for result in gene_type_results: - self.gene_dict[result.feature.uniquename].gene_type = result.value[1:10].replace('SO', 'SO:') + self.gene_dict[result.feature.uniquename].gene_type_curie = result.value[1:10].replace('SO', 'SO:') self.gene_dict[result.feature.uniquename].gene_type_name = result.value[11:-1] return @@ -375,41 +452,6 @@ def get_gene_timestamps(self, session): ######################################################################## return - def get_gene_dbxrefs(self, session): - """Get all dbxrefs for genes. This will take 10-15 minutes.""" - log.info('Getting gene dbxrefs.') - gene_regex = r'^FBgn[0-9]{7}$' - filters = ( - Feature.uniquename.op('~')(gene_regex), - Feature.is_analysis.is_(False), - Cvterm.name == 'gene', - Db.name.in_((self.fb_agr_db_dict.keys())) - ) - gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ - join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ - join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ - join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ - join(Db, (Db.db_id == Dbxref.db_id)).\ - filter(*filters).\ - distinct() - counter = 0 - for result in gene_dbxref_results: - counter += 1 - if counter % 100000 == 0: - log.debug('Processing xref #{}'.format(counter)) - # Skip current FlyBase accessions. - # If present, these are same as feature.uniquename. - # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. 
- if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': - pass - elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': - self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) - elif result.Db.name == 'FlyBase Annotation IDs': - self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) - else: - self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) - return - def get_gene_featureloc(self, session): """Getting gene featureloc.""" log.info('Getting gene genomic locations.') @@ -445,22 +487,103 @@ def get_gene_featureloc(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" self.open_panther_file() + self.get_references(session) self.get_genes(session) self.get_gene_taxons(session) + self.get_gene_dbxrefs(session) self.get_synonyms(session) self.get_gene_snapshots(session) self.get_gene_types(session) self.get_gene_timestamps(session) - self.get_gene_dbxrefs(session) self.get_gene_featureloc(session) return + # BOB: new method for synonyms. + def process_feature_synonyms(self, input, name_type, return_single_value): + """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. + + Args: + arg1 (input): (str or list) A string, or, a list of FeatureSynonym objects. + arg2 (name_type): (str) The type of name to return. If "unspecified" is given, go by Synonym type. + arg3 (return_single_value): (bool) True if output should be a single DTO, False if a list is to be returned. + + Returns: + A single or list of name DTO objects. + + Raises: + Raises error if in put is not a string/list. + Raises error if return_single_value set to True, but many synonyms found in the input list. + + """ + if type(input) is not str or type(input) is not list: + log.error('Input must be a string or list of FeatureSynonym objects.') + raise + # First handle the simplest case where a string is given as the input. + if type(input) is str: + output_synonym_dto = { + 'name_type_name': name_type, + 'format_text': input, + 'display_text': input, + 'synonym_scope': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + if return_single_value is False: + output_synonym_dto = [output_synonym_dto] + return output_synonym_dto + # Next handle a list of FeatureSynonym objects. + # Collect pub_ids for each synonym (keyed by synonym_id). 
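# Illustrative aside (not part of the patch): the try/except-KeyError pattern below groups
# pub_ids per synonym_id.  An equivalent sketch of that grouping using
# collections.defaultdict, shown here with plain (synonym_id, pub_id) tuples instead of
# FeatureSynonym objects; the inputs are made up.
from collections import defaultdict

def sketch_group_pubs(synonym_pub_pairs):
    """Group pub_ids by synonym_id from (synonym_id, pub_id) pairs."""
    grouped = defaultdict(list)
    for synonym_id, pub_id in synonym_pub_pairs:
        grouped[synonym_id].append(pub_id)
    return dict(grouped)

# sketch_group_pubs([(1, 101), (1, 102), (2, 101)])  ->  {1: [101, 102], 2: [101]}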
+ feature_synonym_dict = {} + output_synonym_dto_list = [] + for f_s in input: + try: + feature_synonym_dict[f_s.synonym_id].append(f_s.pub_id) + except KeyError: + feature_synonym_dict[f_s.synonym_id] = [f_s.pub_id] + for synonym_id, pub_list in feature_synonym_dict.items(): + synonym = self.all_synonyms_dict[synonym_id] + if name_type == 'unspecified': + name_type_to_use = synonym.type.name + else: + name_type_to_use = name_type + output_synonym_dto = { + 'name_type_name': name_type_to_use, + 'format_text': synonym.name, + 'display_text': synonym.synonym_sgml, + 'synonym_scope': 'exact', + 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], + 'internal': False, + 'obsolete': False + } + output_synonym_dto_list.append(output_synonym_dto) + if return_single_value is True and len(output_synonym_dto_list) != 1: + log.error('Found many synonyms but was expecting only one.') + raise + elif return_single_value is True and len(output_synonym_dto_list) == 1: + return output_synonym_dto_list[0] + else: + return output_synonym_dto_list + # Synthesis of initial db info. def synthesize_info(self): """Convert FlyBase gene data into an AllianceGene representation.""" log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): log.debug('Evaluating annotation: {}'.format(gene)) + # BOB: Handle synonyms. + if gene.curr_fb_symbol: + gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) + else: + gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) + if gene.curr_fb_fullname: + gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) + else: + gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) + if gene.systematic_name: + gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) + if gene.other_synonyms: + gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) # Get timestamps. if gene.timestamps: gene.date_created = strict_rfc3339.\ @@ -472,12 +595,12 @@ def synthesize_info(self): genomic_location_dict = { 'internal': False, 'obsolete': False, - 'created_by': 'FB:FB_curator', - 'updated_by': 'FB:FB_curator', - 'subject': gene.curie, + 'created_by_curie': 'FB:FB_curator', + 'updated_by_curie': 'FB:FB_curator', + 'genomic_entity_curie': gene.curie, 'predicate': 'localizes_to', - 'object': 'FB:{}'.format(self.chr_dict[gene.featureloc.srcfeature_id]), - 'has_assembly': reference_assembly + 'chromosome_curie': 'FB:{}'.format(self.chr_dict[gene.featureloc.srcfeature_id]), + 'assembly_curie': reference_assembly } if gene.featureloc.strand == -1: genomic_location_dict['start'] = str(gene.featureloc.fmax) @@ -485,51 +608,15 @@ def synthesize_info(self): else: genomic_location_dict['start'] = str(gene.featureloc.fmin + 1) genomic_location_dict['end'] = str(gene.featureloc.fmax) - gene.genomic_locations.append(genomic_location_dict) - # Get the symbol. - if gene.curr_fb_symbol: - gene.symbol = sub_sup_sgml_to_html(gene.curr_fb_symbol.synonym_sgml) - else: - gene.symbol = gene.feature.name - # Get the fullname. - if gene.curr_fb_fullname: - gene.name = sub_sup_sgml_to_html(gene.curr_fb_fullname.synonym_sgml) - else: - gene.name = gene.feature.name - # Get synonyms. 
- internal_synonym_set = set(gene.internal_synonyms) - for internal_synonym in internal_synonym_set: - internal_synonym_dict = { - 'name': internal_synonym, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': True - } - gene.synonyms.append(internal_synonym_dict) - public_synonym_set = set(gene.public_synonyms) - for public_synonym in public_synonym_set: - public_synonym_dict = { - 'name': public_synonym, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - gene.synonyms.append(public_synonym_dict) + gene.genomic_location_dtos.append(genomic_location_dict) # Add gene synopsis. if gene.gene_snapshot: gene.gene_synopsis = gene.gene_snapshot.value - # Get secondary IDs. + # Get secondary IDs (FBgn and annotation IDs). for fb_id in gene.alt_fb_ids: gene.secondary_identifiers.append('FB:{}'.format(fb_id.accession)) for anno_id in gene.annotation_ids: - # gene.secondary_identifiers.append('FB:{}'.format(anno_id.accession)) - public_synonym_dict = { - 'synonym': anno_id.accession, - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - gene.synonyms.append(public_synonym_dict) + gene.secondary_identifiers.append('FB:{}'.format(anno_id.accession)) # Get crossreferences. # Start by adding gene uniquename as an xref. xref_dict = { @@ -537,11 +624,11 @@ def synthesize_info(self): 'display_name': 'FB:{}'.format(gene.feature.uniquename), 'prefix': 'FB', 'page_areas': ['gene'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - gene.cross_references.append(xref_dict) + gene.cross_reference_dtos.append(xref_dict) # Then add PANTHER xref (from file). if gene.feature.uniquename in self.pthr_dict.keys(): pthr_xref_dict = { @@ -552,7 +639,7 @@ def synthesize_info(self): 'obsolete': False, 'internal': False } - gene.cross_references.append(pthr_xref_dict) + gene.cross_reference_dtos.append(pthr_xref_dict) # Get other xrefs. for result in gene.dbxrefs: if result.Db.name in self.fb_agr_db_dict.keys(): @@ -561,13 +648,13 @@ def synthesize_info(self): 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), 'prefix': self.fb_agr_db_dict[result.Db.name], 'page_areas': ['gene'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } if result.FeatureDbxref.is_current is False: xref_dict['internal'] = True - gene.cross_references.append(xref_dict) + gene.cross_reference_dtos.append(xref_dict) # Flag internal features. 
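# Illustrative aside (not part of the patch): before the internal-feature flagging below,
# note the shape of the cross-reference DTOs assembled just above.  fb_agr_db_dict
# translates a chado db name into an Alliance prefix, so a (hypothetical) UniProt/Swiss-Prot
# accession would come through roughly as:
example_xref_dto = {
    'curie': 'UniProtKB:P00000',          # made-up accession
    'display_name': 'UniProtKB:P00000',
    'prefix': 'UniProtKB',
    'page_areas': ['gene'],
    'created_by_curie': 'FB:FB_curator',
    'obsolete': False,
    'internal': False,
}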
if gene.organism_abbr != 'Dmel': gene.internal = True @@ -575,12 +662,12 @@ def synthesize_info(self): if gene.obsolete is True: gene.internal = True gene.internal_reasons.append('Obsolete') - if gene.gene_type is None: + if gene.gene_type_curie is None: gene.internal = True gene.internal_reasons.append('Lacks gene type') if gene.gene_type_name in self.internal_gene_types: gene.internal = True - gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type)) + gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type_curie)) for attr in self.required_fields: if attr not in gene.__dict__.keys(): gene.for_alliance_export = False From 0469a5af92f8e72f84cdccb4d2b146024dc12eae Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:05:14 -0500 Subject: [PATCH 04/52] flake8 --- src/AGR_data_retrieval_curation_gene.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 3bf47c2..cb18e10 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -32,7 +32,7 @@ from harvdev_utils.char_conversions import sub_sup_sgml_to_html from harvdev_utils.production import ( Cvterm, Db, Dbxref, Feature, FeatureDbxref, FeatureSynonym, - Featureloc, Featureprop, Organism, OrganismDbxref, Pub, PubDbxref, Synonym + Featureloc, Featureprop, OrganismDbxref, Pub, PubDbxref, Synonym ) from harvdev_utils.psycopg_functions import set_up_db_reading @@ -242,7 +242,7 @@ def get_references(self, session): pub_counter = 0 for pub in results: self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' - counter += 1 + pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( Pub.uniquename.op('~')(fbrf_regex), @@ -550,7 +550,7 @@ def process_feature_synonyms(self, input, name_type, return_single_value): output_synonym_dto = { 'name_type_name': name_type_to_use, 'format_text': synonym.name, - 'display_text': synonym.synonym_sgml, + 'display_text': sub_sup_sgml_to_html(synonym.synonym_sgml), 'synonym_scope': 'exact', 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': False, From 98d6fb0b7eba86fdb926620a09a175f4a8791e5e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:36:45 -0500 Subject: [PATCH 05/52] fix type check for synonym method --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index cb18e10..8312752 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -515,7 +515,7 @@ def process_feature_synonyms(self, input, name_type, return_single_value): Raises error if return_single_value set to True, but many synonyms found in the input list. """ - if type(input) is not str or type(input) is not list: + if type(input) is not str and type(input) is not list: log.error('Input must be a string or list of FeatureSynonym objects.') raise # First handle the simplest case where a string is given as the input. 
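# Illustrative aside (not part of the patch): why the `or` -> `and` change above matters.
# With `or`, the guard fires for every possible input, because any object fails at least
# one of the two `is not` tests; with `and`, only inputs that are neither str nor list are
# rejected.  A quick check:
x = 'a string'
always_true = type(x) is not str or type(x) is not list      # True even for a str
correct_guard = type(x) is not str and type(x) is not list   # False for a str
assert always_true is True and correct_guard is False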
@@ -570,8 +570,9 @@ def synthesize_info(self): """Convert FlyBase gene data into an AllianceGene representation.""" log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): - log.debug('Evaluating annotation: {}'.format(gene)) + log.debug(f'Evaluating annotation: {gene}') # BOB: Handle synonyms. + log.debug(f'Handle synonyms for {gene}') if gene.curr_fb_symbol: gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) else: From bddb072ee9a941cfa8d10296295033c04ca6d552 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:47:30 -0500 Subject: [PATCH 06/52] temp suppress xrefs for faster dev --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 8312752..b1a9420 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -490,7 +490,7 @@ def query_chado(self, session): self.get_references(session) self.get_genes(session) self.get_gene_taxons(session) - self.get_gene_dbxrefs(session) + # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. self.get_synonyms(session) self.get_gene_snapshots(session) self.get_gene_types(session) From b9ff60f6f5ea0c74f501fe73c53cb24cd642c21b Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:53:08 -0500 Subject: [PATCH 07/52] fix synonym dict construction --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index b1a9420..4c93020 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -363,7 +363,7 @@ def get_synonyms(self, session): distinct() for result in gene_curr_symbol_results: # First, build the all_synonyms_dict. - self.all_synonyms_dict[result.Synonym.synonym_id] = Synonym + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym # Second, collect FeatureSynonym objects by type. if result.FeatureSynonym.is_current is True: if result.synonym_type.name == 'symbol': From 14adf409ab1ef2bd0a44ec492f21bd42af65ca3e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 24 Jan 2023 15:58:49 -0500 Subject: [PATCH 08/52] debug syno --- src/AGR_data_retrieval_curation_gene.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 4c93020..1880e66 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -572,17 +572,20 @@ def synthesize_info(self): for gene in self.gene_dict.values(): log.debug(f'Evaluating annotation: {gene}') # BOB: Handle synonyms. 
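# Illustrative aside (not part of the patch): the change here only adjusts debug logging
# around the synonym handling, but it is a convenient place to show what those calls
# produce.  gene_symbol_dto ends up as a single name DTO shaped like the dict built in
# process_feature_synonyms(); all values below are made up.
example_gene_symbol_dto = {
    'name_type_name': 'nomenclature_symbol',
    'format_text': 'wg',
    'display_text': 'wg',
    'synonym_scope': 'exact',
    'evidence_curies': ['PMID:12345678'],   # made-up supporting reference
    'internal': False,
    'obsolete': False,
}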
- log.debug(f'Handle synonyms for {gene}') + log.debug(f'BOB: Handle symbol for {gene}') if gene.curr_fb_symbol: gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) else: gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) + log.debug(f'BOB: Handle full_name for {gene}') if gene.curr_fb_fullname: gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) else: gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) + log.debug(f'BOB: Handle systematic_name for {gene}') if gene.systematic_name: gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) + log.debug(f'BOB: Handle other synonyms for {gene}') if gene.other_synonyms: gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) # Get timestamps. From b50590e28c5853743813a99de8148a913424f684 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 12:45:33 -0500 Subject: [PATCH 09/52] group distinct chado synonyms by shared name/synonym_sgml --- src/AGR_data_retrieval_curation_gene.py | 73 +++++++++++++++++++------ 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 1880e66..0aa10d9 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -498,7 +498,6 @@ def query_chado(self, session): self.get_gene_featureloc(session) return - # BOB: new method for synonyms. def process_feature_synonyms(self, input, name_type, return_single_value): """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. @@ -515,10 +514,20 @@ def process_feature_synonyms(self, input, name_type, return_single_value): Raises error if return_single_value set to True, but many synonyms found in the input list. """ + # Dict for converting FB to AGR synonym types. + synonym_type_conversion = { + 'symbol': 'nomenclature_symbol', + 'fullname': 'full_name', + 'nickname': 'nomenclature_symbol', + 'synonym': 'nomenclature_symbol' + } + # Regex for FB systematic names (Dmel or other Dros species). + systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' + # Check for correct input. if type(input) is not str and type(input) is not list: log.error('Input must be a string or list of FeatureSynonym objects.') raise - # First handle the simplest case where a string is given as the input. + # Handle a simple string. if type(input) is str: output_synonym_dto = { 'name_type_name': name_type, @@ -532,28 +541,60 @@ def process_feature_synonyms(self, input, name_type, return_single_value): if return_single_value is False: output_synonym_dto = [output_synonym_dto] return output_synonym_dto - # Next handle a list of FeatureSynonym objects. - # Collect pub_ids for each synonym (keyed by synonym_id). + # Handle a list of FeatureSynonym objects. + # Group by each distinct name/synonym_sgml combination: for each, group pub_id by synonym type. + # Have a dict, keyed by tuple of (format_text, display_text) + # Value for each key is a dict where synonym type is key for a list of pubs, or, 'internal' key for feature_synonym.is_internal values. 
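# Illustrative aside (not part of the patch): a concrete instance of the structure the
# comments above describe, with made-up data.  The key is the distinct
# (synonym.name, synonym.synonym_sgml) pair; the value records which pubs used that name
# under each chado synonym type, plus every feature_synonym.is_internal flag seen for it.
example_feature_synonym_dict = {
    ('wg', 'wg'): {
        'symbol': [101, 102],              # pub_ids that used "wg" as a symbol
        'fullname': [103],                 # a pub that used it as a fullname
        'internal': [False, False, False],
    },
}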
feature_synonym_dict = {} - output_synonym_dto_list = [] for f_s in input: - try: - feature_synonym_dict[f_s.synonym_id].append(f_s.pub_id) - except KeyError: - feature_synonym_dict[f_s.synonym_id] = [f_s.pub_id] - for synonym_id, pub_list in feature_synonym_dict.items(): - synonym = self.all_synonyms_dict[synonym_id] - if name_type == 'unspecified': - name_type_to_use = synonym.type.name + synonym = self.all_synonyms_dict[f_s.synonym_id] + distinct_synonym_name = (synonym.name, synonym.synonym_sgml) + if distinct_synonym_name in feature_synonym_dict.keys(): + feature_synonym_dict[distinct_synonym_name]['internal'].append(f_s.is_internal) + if synonym.type.name in feature_synonym_dict[distinct_synonym_name].keys(): + feature_synonym_dict[distinct_synonym_name][synonym.type.name].append(f_s.pub_id) + else: + feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] else: + feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} + # Now convert to AGR DTO object. + output_synonym_dto_list = [] + FORMAT_TEXT = 0 + DISPLAY_TEXT = 1 + for syno_name, syno_types_pubs in feature_synonym_dict: + # Determine internal status. + if True in syno_types_pubs['internal']: + syno_internal = True + else: + syno_internal = False + # Collect all pubs. + pub_list = [] + for syno_type, syno_type_pub_list in syno_types_pubs.items(): + if syno_type == 'internal': + continue + pub_list.extend(syno_type_pub_list) + pub_list = list(set(pub_list)) + # Pick correct name type to apply. + if re.match(systematic_name_regex, syno_name[FORMAT_TEXT]) and name_type != 'full_name': + name_type_to_use = 'systematic_name' + elif name_type != 'unspecified': name_type_to_use = name_type + # If name_type is "unspecified", we need to figure this out. Same name can be used in diff ways. Pick most frequent use. + # e.g., "wingless" is stored as both symbol and fullname in chado, but more frequently curated as a fullname. + else: + type_tally = {} + for syno_type, syno_type_pub_list in syno_types_pubs.items(): + if syno_type == 'internal': + continue + type_tally[len(set(pub_list))] = syno_type + name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] output_synonym_dto = { 'name_type_name': name_type_to_use, - 'format_text': synonym.name, - 'display_text': sub_sup_sgml_to_html(synonym.synonym_sgml), + 'format_text': syno_name[FORMAT_TEXT], + 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope': 'exact', 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], - 'internal': False, + 'internal': syno_internal, 'obsolete': False } output_synonym_dto_list.append(output_synonym_dto) From f86aeef16454a847ef1662a7f690c9658f155f63 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:28:01 -0500 Subject: [PATCH 10/52] revise export of synonyms and anno IDs --- src/AGR_data_retrieval_curation_gene.py | 338 ++++++++++++------------ 1 file changed, 166 insertions(+), 172 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 0aa10d9..7085b1c 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -105,11 +105,10 @@ def __init__(self, feature): self.featureloc = None # Will be Featureloc object for the gene. self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. 
self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml. + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml. self.curr_anno_id = None # Will be current annotation ID for the gene (str). - self.curr_fb_symbol = [] # Will be all FeatureSynonym objects in support of the current symbol Synonym object. - self.curr_fb_fullname = [] # Will be all FeatureSynonym objects in support of the current fullname Synonym object. - self.systematic_name = [] # Will be all FeatureSynonym objects using the systematic name of the gene. - self.other_synonyms = [] # Will be all FeatureSynonym objects in support of non-current synonyms. + self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. @@ -158,7 +157,38 @@ def __init__(self): self.export_feat_cnt = 0 # Count of all genes exported to file. self.internal_feat_cnt = 0 # Count of all genes marked as internal=True in export file. + # Regexes. + gene_regex = r'^FBgn[0-9]{7}$' + pthr_regex = r'PTHR[0-9]{5}' + pub_regex = r'^(FBrf[0-9]{7}|unattributed)$' + systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' + # Reference dicts. + internal_gene_types = [ + 'engineered_fusion_gene', + 'engineered_region', + 'gene_group', + 'gene_with_polycistronic_transcript', + 'insulator', + 'mitochondrial_sequence', + 'origin_of_replication', + 'region', + 'regulatory_region', + 'repeat_region', + 'satellite_DNA', + 'transposable_element_gene' + ] + fb_agr_db_dict = { + 'EntrezGene': 'NCBI_Gene', + 'FlyBase': 'FB', + 'FlyBase Annotation IDs': 'FB', + 'RNAcentral': 'RNAcentral', + # 'UniProt/GCRP': 'UniProt/GCRP', + 'UniProt/Swiss-Prot': 'UniProtKB', + 'UniProt/TrEMBL': 'UniProtKB' + } + # Sample set. test_genes = ['wg', 'mt:ori', 'lncRNA:roX1', 'CG12656'] + # Export fields. 
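# Illustrative aside (not part of the patch): the export-field lists below are class-level
# metadata.  synthesize_info() earlier in this file checks required_fields against each
# object's __dict__ and flags failures; the write-out step itself is not shown in this
# diff, but a hypothetical sketch of how the two lists could drive it:
def sketch_export_entity(entity, required_fields, output_fields):
    """Return an export dict for an entity, or None if a required attribute is missing."""
    for attr in required_fields:
        if attr not in vars(entity):
            return None
    return {attr: getattr(entity, attr) for attr in output_fields if attr in vars(entity)}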
required_fields = [ 'curie', 'gene_symbol_dto', @@ -183,29 +213,6 @@ def __init__(self): 'taxon_curie', 'updated_by_curie', ] - internal_gene_types = [ - 'engineered_fusion_gene', - 'engineered_region', - 'gene_group', - 'gene_with_polycistronic_transcript', - 'insulator', - 'mitochondrial_sequence', - 'origin_of_replication', - 'region', - 'regulatory_region', - 'repeat_region', - 'satellite_DNA', - 'transposable_element_gene' - ] - fb_agr_db_dict = { - 'EntrezGene': 'NCBI_Gene', - 'FlyBase': 'FB', - 'FlyBase Annotation IDs': 'FB', - 'RNAcentral': 'RNAcentral', - # 'UniProt/GCRP': 'UniProt/GCRP', - 'UniProt/Swiss-Prot': 'UniProtKB', - 'UniProt/TrEMBL': 'UniProtKB' - } def open_panther_file(self): """Extract panther information from file.""" @@ -216,24 +223,21 @@ def open_panther_file(self): filepath = '/data/ortholog/panther/PTHR17.0_fruit_fly' tsv_file = open(filepath, "r") tsvin = csv.reader(tsv_file, delimiter='\t') - fb_regex = r'FBgn[0-9]{7}' - pthr_regex = r'PTHR[0-9]{5}' FB = 0 PTHR = 3 for row in tsvin: fields = len(row) if fields: # Ignore blank lines - if re.search(fb_regex, row[FB]) and re.search(pthr_regex, row[PTHR]): - self.pthr_dict[re.search(fb_regex, row[FB]).group(0)] = re.search(pthr_regex, row[PTHR]).group(0) + if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): + self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) return def get_references(self, session): """Get all references.""" log.info('Get all references.') # First get all current pubs having an FBrf uniquename. - fbrf_regex = r'^(FBrf[0-9]{7}|unattributed)$' filters = ( - Pub.uniquename.op('~')(fbrf_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False) ) results = session.query(Pub).\ @@ -245,7 +249,7 @@ def get_references(self, session): pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( - Pub.uniquename.op('~')(fbrf_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False), Db.name == 'pubmed', PubDbxref.is_current.is_(True) @@ -267,9 +271,8 @@ def get_genes(self, session): """Get all genes.""" log.info('Querying chado for genes.') # First get all gene features from chado. - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene' ) @@ -306,17 +309,68 @@ def get_gene_taxons(self, session): log.debug('No NCBI taxon ID available for: {}'.format(gene)) return + def get_synonyms(self, session): + """Get current and non-current symbols and full names for genes.""" + log.info('Get current and non-current symbols and full names for genes.') + filters = ( + Feature.uniquename.op('~')(self.gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene' + ) + results = session.query(Feature, FeatureSynonym, Synonym).\ + join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ + join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in results: + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym + # Second, collect FeatureSynonyms for each gene. + self.gene_dict[result.Feature.uniquename].feature_synonyms.append(result.FeatureSynonym) + # Catch current symbol and fullname strings. 
+ if result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'symbol': + self.gene_dict[result.Feature.uniquename].curr_symbol_name = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': + self.gene_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + counter += 1 + return + + def get_annotation_ids(self, session): + """Get current annotation IDs.""" + log.info('Get current annotation IDs.') + filters = ( + Feature.uniquename.op('~')(self.gene_regex), + Feature.is_analysis.is_(False), + Cvterm.name == 'gene', + FeatureDbxref.is_current.is_(True), + Db.name == 'FlyBase Annotation IDs' + ) + results = session.query(Feature, Dbxref).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ + join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ + join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + counter = 0 + for result in results: + self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + counter += 1 + log.info(f'Found {counter} current annotation IDs for FlyBase genes.') + return + def get_gene_dbxrefs(self, session): """Get all dbxrefs for genes. This will take 10-15 minutes.""" log.info('Getting gene dbxrefs.') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene', - Db.name.in_((self.fb_agr_db_dict.keys())) + Db.name.in_((self.fb_agr_db_dict.keys())), ) - gene_dbxref_results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ + results = session.query(Feature, FeatureDbxref, Dbxref, Db).\ join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ join(FeatureDbxref, (FeatureDbxref.feature_id == Feature.feature_id)).\ join(Dbxref, (Dbxref.dbxref_id == FeatureDbxref.dbxref_id)).\ @@ -324,57 +378,22 @@ def get_gene_dbxrefs(self, session): filter(*filters).\ distinct() counter = 0 - for result in gene_dbxref_results: + for result in results: counter += 1 if counter % 100000 == 0: log.debug('Processing xref #{}'.format(counter)) - # Skip current FlyBase accessions. - # If present, these are same as feature.uniquename. - # However, not present for all genes (e.g., FBgn0085177), so cannot be relied upon. + # Skip current FlyBase accessions because these are not comprehensive. + # When they exist, they're always equal to the feature.uniquename. 
+ # But they're not always present, so these dbxrefs can't be relied upon (e.g., FBgn0085177) if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': pass elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': self.gene_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) elif result.Db.name == 'FlyBase Annotation IDs': self.gene_dict[result.Feature.uniquename].annotation_ids.append(result.Dbxref) - if result.FeatureDbxref.is_current is True: - self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession else: self.gene_dict[result.Feature.uniquename].dbxrefs.append(result) - return - - def get_synonyms(self, session): - """Get current and non-current symbols and full names for genes.""" - log.info('Getting gene synonyms.') - feature_type = aliased(Cvterm, name='feature_type') - synonym_type = aliased(Cvterm, name='synonym_type') - gene_regex = r'^FBgn[0-9]{7}$' - filters = ( - Feature.uniquename.op('~')(gene_regex), - Feature.is_analysis.is_(False), - feature_type.name == 'gene' - ) - gene_curr_symbol_results = session.query(synonym_type, Feature, FeatureSynonym, Synonym).\ - join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ - join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ - join(feature_type, (feature_type.cvterm_id == Feature.type_id)).\ - join(synonym_type, (synonym_type.cvterm_id == Synonym.type_id)).\ - filter(*filters).\ - distinct() - for result in gene_curr_symbol_results: - # First, build the all_synonyms_dict. - self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym - # Second, collect FeatureSynonym objects by type. - if result.FeatureSynonym.is_current is True: - if result.synonym_type.name == 'symbol': - self.gene_dict[result.Feature.uniquename].curr_fb_symbol.append(result.FeatureSynonym) - elif result.synonym_type.name == 'fullname': - self.gene_dict[result.Feature.uniquename].curr_fb_fullname.append(result.FeatureSynonym) - else: - self.gene_dict[result.Feature.uniquename].other_synonyms.append(result.FeatureSynonym) - # Third, catch synonyms that match the annotation ID (aka, systematic_name). - if result.Synonym.name == self.gene_dict[result.Feature.uniquename].curr_anno_id: - self.gene_dict[result.Feature.uniquename].systematic_name.append(result.FeatureSynonym) + log.info(f'Found {counter} gene dbxrefs.') return def get_gene_snapshots(self, session): @@ -382,9 +401,8 @@ def get_gene_snapshots(self, session): log.info('Getting gene snapshots.') feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='gene_summary_text') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), feature_type.name == 'gene', prop_type.name == 'gene_summary_text' @@ -404,9 +422,8 @@ def get_gene_types(self, session): log.info('Getting gene types.') feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='promoted_gene_type') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), prop_type.name == 'promoted_gene_type' ) @@ -469,9 +486,8 @@ def get_gene_featureloc(self, session): for result in chr_results: self.chr_dict[result.feature_id] = result.uniquename # Now get gene featureloc. 
- gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), Cvterm.name == 'gene' ) @@ -498,22 +514,8 @@ def query_chado(self, session): self.get_gene_featureloc(session) return - def process_feature_synonyms(self, input, name_type, return_single_value): - """Convert a string or list of FeatureSynonym objects into single or many DTO objects for export. - - Args: - arg1 (input): (str or list) A string, or, a list of FeatureSynonym objects. - arg2 (name_type): (str) The type of name to return. If "unspecified" is given, go by Synonym type. - arg3 (return_single_value): (bool) True if output should be a single DTO, False if a list is to be returned. - - Returns: - A single or list of name DTO objects. - - Raises: - Raises error if in put is not a string/list. - Raises error if return_single_value set to True, but many synonyms found in the input list. - - """ + def process_feature_synonyms(self, feature): + """Generate name/synonym DTOs for a feature that has a list of FeatureSynonym objects.""" # Dict for converting FB to AGR synonym types. synonym_type_conversion = { 'symbol': 'nomenclature_symbol', @@ -521,32 +523,21 @@ def process_feature_synonyms(self, input, name_type, return_single_value): 'nickname': 'nomenclature_symbol', 'synonym': 'nomenclature_symbol' } - # Regex for FB systematic names (Dmel or other Dros species). - systematic_name_regex = r'^(D[a-z]{3}\\|)(CG|CR|G[A-Z])[0-9]{4,5}$' - # Check for correct input. - if type(input) is not str and type(input) is not list: - log.error('Input must be a string or list of FeatureSynonym objects.') - raise - # Handle a simple string. - if type(input) is str: - output_synonym_dto = { - 'name_type_name': name_type, - 'format_text': input, - 'display_text': input, - 'synonym_scope': 'exact', - 'evidence_curies': [], - 'internal': False, - 'obsolete': False - } - if return_single_value is False: - output_synonym_dto = [output_synonym_dto] - return output_synonym_dto - # Handle a list of FeatureSynonym objects. - # Group by each distinct name/synonym_sgml combination: for each, group pub_id by synonym type. - # Have a dict, keyed by tuple of (format_text, display_text) - # Value for each key is a dict where synonym type is key for a list of pubs, or, 'internal' key for feature_synonym.is_internal values. + default_name_dto = { + 'name_type_name': 'unspecified', + 'format_text': 'unspecified', + 'display_text': 'unspecified', + 'synonym_scope': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + # Create a dict of all distinct name/synonym_sgml combinations: for each, capture synonym type(s) an pub_ids. + # Keys are (synonym.name, synonym.synonym_sgml) tuples. + # Values are dicts too where keys are chado synonym types and values are lists of pub_ids. + # Value dict also has an "internal" key that stores list of FeatureSynonym.is_internal values. 
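For illustration only (not part of the patch), the intermediate dict described above might look like the following for a gene whose name has been curated both as a symbol ("wg") and as a fullname ("wingless"); the pub_id values are invented:

    feature_synonym_dict_example = {
        ('wg', 'wg'): {'symbol': [1001, 1002], 'internal': [False]},
        ('wingless', 'wingless'): {'fullname': [1001, 1003], 'symbol': [1002], 'internal': [False]},
    }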
feature_synonym_dict = {} - for f_s in input: + for f_s in feature.feature_synonyms: synonym = self.all_synonyms_dict[f_s.synonym_id] distinct_synonym_name = (synonym.name, synonym.synonym_sgml) if distinct_synonym_name in feature_synonym_dict.keys(): @@ -557,54 +548,73 @@ def process_feature_synonyms(self, input, name_type, return_single_value): feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] else: feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} - # Now convert to AGR DTO object. - output_synonym_dto_list = [] + # Convert to AGR name DTO objects. + name_dto_list = [] FORMAT_TEXT = 0 DISPLAY_TEXT = 1 - for syno_name, syno_types_pubs in feature_synonym_dict: - # Determine internal status. - if True in syno_types_pubs['internal']: - syno_internal = True - else: + for syno_name, syno_attributes in feature_synonym_dict.items(): + # Determine internal status. False trumps True. + if False in set(syno_attributes['internal']): syno_internal = False + else: + syno_internal = True # Collect all pubs. - pub_list = [] - for syno_type, syno_type_pub_list in syno_types_pubs.items(): + pub_id_list = [] + for syno_type, syno_type_pub_list in syno_attributes.items(): if syno_type == 'internal': continue - pub_list.extend(syno_type_pub_list) - pub_list = list(set(pub_list)) + pub_id_list.extend(syno_type_pub_list) + pub_id_list = list(set(pub_id_list)) # Pick correct name type to apply. - if re.match(systematic_name_regex, syno_name[FORMAT_TEXT]) and name_type != 'full_name': + if re.match(self.systematic_name_regex, syno_name[DISPLAY_TEXT]): name_type_to_use = 'systematic_name' - elif name_type != 'unspecified': - name_type_to_use = name_type - # If name_type is "unspecified", we need to figure this out. Same name can be used in diff ways. Pick most frequent use. - # e.g., "wingless" is stored as both symbol and fullname in chado, but more frequently curated as a fullname. else: type_tally = {} - for syno_type, syno_type_pub_list in syno_types_pubs.items(): + for syno_type, syno_type_pub_list in syno_attributes.items(): if syno_type == 'internal': continue - type_tally[len(set(pub_list))] = syno_type + type_tally[len(set(syno_type_pub_list))] = syno_type name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] output_synonym_dto = { 'name_type_name': name_type_to_use, - 'format_text': syno_name[FORMAT_TEXT], + 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope': 'exact', - 'evidence_curies': [f'{self.all_pubs_dict[i]}' for i in pub_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': syno_internal, 'obsolete': False } - output_synonym_dto_list.append(output_synonym_dto) - if return_single_value is True and len(output_synonym_dto_list) != 1: - log.error('Found many synonyms but was expecting only one.') - raise - elif return_single_value is True and len(output_synonym_dto_list) == 1: - return output_synonym_dto_list[0] - else: - return output_synonym_dto_list + name_dto_list.append(output_synonym_dto) + # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
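Before the sifting step below, the type tally above can be worked through on a small, self-contained example (invented pub_ids; not part of the patch). It shows how a name curated under two chado synonym types is assigned the type it is most frequently attributed under:

    synonym_type_conversion = {'symbol': 'nomenclature_symbol', 'fullname': 'full_name'}
    syno_attributes = {'fullname': [1001, 1003], 'symbol': [1002], 'internal': [False]}
    type_tally = {}
    for syno_type, syno_type_pub_list in syno_attributes.items():
        if syno_type == 'internal':
            continue
        type_tally[len(set(syno_type_pub_list))] = syno_type
    # "wingless" is attributed as a fullname in two pubs but as a symbol in only one,
    # so the most frequent usage wins and the DTO gets name_type_name 'full_name'.
    assert synonym_type_conversion[type_tally[max(type_tally.keys())]] == 'full_name'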
+ for name_dto in name_dto_list: + if name_dto['display_text'] == feature.curr_anno_id: + feature.gene_systematic_name_dto = name_dto + if name_dto['name_type_name'] != 'systematic_name': + log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + if name_dto['display_text'] == feature.curr_symbol_name: + if name_dto['name_type_name'] not in ['systematic_name', 'nomenclature_symbol']: + name_dto['name_type_name'] = 'nomenclature_symbol' + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + feature.gene_symbol_dto = name_dto + elif name_dto['display_text'] == feature.curr_fullname: + feature.gene_full_name_dto = name_dto + if name_dto['name_type_name'] != 'full_name': + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + else: + feature.gene_synonym_dtos.append(name_dto) + # Symbol is required. If none, fill it in. + if feature.gene_symbol_dto is None: + placeholder_symbol_dto = default_name_dto.copy() + placeholder_symbol_dto['name_type_name'] = 'nomenclature_symbol' + placeholder_symbol_dto['format_text'] = feature.feature.name + placeholder_symbol_dto['display_text'] = feature.feature.name + feature.gene_symbol_dto = placeholder_symbol_dto + # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + if feature.gene_full_name_dto is None: + placeholder_full_name_dto = feature.gene_symbol_dto.copy() + placeholder_full_name_dto['name_type_name'] = 'full_name' + feature.gene_full_name_dto = placeholder_full_name_dto + return # Synthesis of initial db info. def synthesize_info(self): @@ -612,23 +622,7 @@ def synthesize_info(self): log.info('Synthesizing gene info.') for gene in self.gene_dict.values(): log.debug(f'Evaluating annotation: {gene}') - # BOB: Handle synonyms. - log.debug(f'BOB: Handle symbol for {gene}') - if gene.curr_fb_symbol: - gene.gene_symbol_dto = self.process_feature_synonyms(gene.curr_fb_symbol, 'nomenclature_symbol', True) - else: - gene.gene_symbol_dto = self.process_feature_synonyms(gene.feature.name, 'nomenclature_symbol', True) - log.debug(f'BOB: Handle full_name for {gene}') - if gene.curr_fb_fullname: - gene.gene_full_name_dto = self.process_feature_synonyms(gene.curr_fb_fullname, 'full_name', True) - else: - gene.gene_full_name_dto = self.process_feature_synonyms(gene.feature.name, 'full_name', True) - log.debug(f'BOB: Handle systematic_name for {gene}') - if gene.systematic_name: - gene.gene_systematic_name_dto = self.process_feature_synonyms(gene.systematic_name, 'systematic_name', True) - log.debug(f'BOB: Handle other synonyms for {gene}') - if gene.other_synonyms: - gene.gene_synonym_dtos = self.process_feature_synonyms(gene.other_synonyms, 'unspecified', False) + self.process_feature_synonyms(gene) # Get timestamps. 
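The timestamp step that follows converts the earliest and latest audit_chado timestamps into RFC 3339 strings; the exact strict_rfc3339 helper call is not shown in this excerpt. A minimal sketch, assuming gene.timestamps holds Python datetime objects and using timestamp_to_rfc3339_localoffset as one plausible choice:

    import datetime
    import strict_rfc3339

    timestamps = [datetime.datetime(2006, 3, 7, 12, 0), datetime.datetime(2023, 1, 25, 9, 30)]
    date_created = strict_rfc3339.timestamp_to_rfc3339_localoffset(min(timestamps).timestamp())
    date_updated = strict_rfc3339.timestamp_to_rfc3339_localoffset(max(timestamps).timestamp())
    print(date_created, date_updated)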
if gene.timestamps: gene.date_created = strict_rfc3339.\ From 55f999c5161aa763c1828c83e9952141db013773 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:36:57 -0500 Subject: [PATCH 11/52] skip feature_synonyms for non-curr pubs --- src/AGR_data_retrieval_curation_gene.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 7085b1c..a71c55e 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -325,6 +325,9 @@ def get_synonyms(self, session): distinct() counter = 0 for result in results: + # Skip any references to non-current pubs. + if result.FeatureSynonym.pub_id not in self.all_pubs_dict.keys(): + continue # First, build the all_synonyms_dict. self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym # Second, collect FeatureSynonyms for each gene. @@ -335,6 +338,7 @@ def get_synonyms(self, session): elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': self.gene_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) counter += 1 + log.info(f'Found {counter} feature_synonyms (current pubs) for genes.') return def get_annotation_ids(self, session): From 3640455ebe68ba79e7737ae5a2949ab1f96edee9 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:54:32 -0500 Subject: [PATCH 12/52] run anno id method; add placeholder systematic name --- src/AGR_data_retrieval_curation_gene.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index a71c55e..dff479f 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -512,6 +512,7 @@ def query_chado(self, session): self.get_gene_taxons(session) # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. self.get_synonyms(session) + self.get_annotation_ids(session) self.get_gene_snapshots(session) self.get_gene_types(session) self.get_gene_timestamps(session) @@ -606,6 +607,7 @@ def process_feature_synonyms(self, feature): log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") else: feature.gene_synonym_dtos.append(name_dto) + # LinkML change required: make gene_full_name_dto and gene_systematic_name_dto OPTIONAL. # Symbol is required. If none, fill it in. if feature.gene_symbol_dto is None: placeholder_symbol_dto = default_name_dto.copy() @@ -618,6 +620,11 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto + # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + if feature.gene_systematic_name_dto is None: + placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() + placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' + feature.gene_systematic_name_dto = placeholder_full_name_dto return # Synthesis of initial db info. 
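A note on the placeholder pattern introduced in the hunk above: each placeholder DTO is seeded from a shallow copy of the symbol DTO and then retyped. A minimal sketch of that pattern with invented values (not from the patch):

    symbol_dto = {'name_type_name': 'nomenclature_symbol', 'format_text': 'wg', 'display_text': 'wg'}
    systematic_name_dto = symbol_dto.copy()              # dict.copy() is shallow
    systematic_name_dto['name_type_name'] = 'systematic_name'
    assert symbol_dto['name_type_name'] == 'nomenclature_symbol'  # the symbol DTO is left untouched

Only the top-level 'name_type_name' string is overridden, so a shallow copy is enough to keep the two DTOs from interfering with each other here.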
From 2910e42ed52ffd94b6d8ae0503aa195aa9a7b79e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 14:58:28 -0500 Subject: [PATCH 13/52] fix scope attr name --- src/AGR_data_retrieval_curation_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index dff479f..7d02cdf 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -532,7 +532,7 @@ def process_feature_synonyms(self, feature): 'name_type_name': 'unspecified', 'format_text': 'unspecified', 'display_text': 'unspecified', - 'synonym_scope': 'exact', + 'synonym_scope_name': 'exact', 'evidence_curies': [], 'internal': False, 'obsolete': False @@ -584,7 +584,7 @@ def process_feature_synonyms(self, feature): 'name_type_name': name_type_to_use, 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), - 'synonym_scope': 'exact', + 'synonym_scope_name': 'exact', 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], 'internal': syno_internal, 'obsolete': False From 8acd6042e237f7a1655347c40e92b0643c13d478 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:03:06 -0500 Subject: [PATCH 14/52] fix sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 7d02cdf..e6b3266 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -624,7 +624,7 @@ def process_feature_synonyms(self, feature): if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' - feature.gene_systematic_name_dto = placeholder_full_name_dto + feature.gene_systematic_name_dto = placeholder_systematic_name_dto return # Synthesis of initial db info. From 3a2cae0a4e19899a1f1ba0ed5f31ea37bafa1f36 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:18:44 -0500 Subject: [PATCH 15/52] debug sys name export --- src/AGR_data_retrieval_curation_gene.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index e6b3266..ccd4185 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -191,8 +191,9 @@ def __init__(self): # Export fields. 
required_fields = [ 'curie', - 'gene_symbol_dto', 'gene_full_name_dto', + 'gene_symbol_dto', + 'gene_systematic_name_dto', 'internal', 'taxon_curie', ] @@ -205,6 +206,7 @@ def __init__(self): 'gene_full_name_dto', 'gene_symbol_dto', 'gene_synonym_dtos', + 'gene_systematic_name_dto', 'gene_type_curie', 'genomic_location_dtos', 'internal', @@ -347,6 +349,7 @@ def get_annotation_ids(self, session): filters = ( Feature.uniquename.op('~')(self.gene_regex), Feature.is_analysis.is_(False), + Feature.is_obsolete.is_(False), Cvterm.name == 'gene', FeatureDbxref.is_current.is_(True), Db.name == 'FlyBase Annotation IDs' @@ -361,6 +364,7 @@ def get_annotation_ids(self, session): counter = 0 for result in results: self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession + log.debug(f'For {self.gene_dict[result.Feature.uniquename]}, anno_id={result.Dbxref.accession}') counter += 1 log.info(f'Found {counter} current annotation IDs for FlyBase genes.') return @@ -620,10 +624,13 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # Systematic name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' + if feature.curr_anno_id: + placeholder_symbol_dto['format_text'] = feature.curr_anno_id + placeholder_symbol_dto['display_text'] = feature.curr_anno_id feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From c2484331f160d035bd3b11e2adb345e356c9ce9c Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:29:26 -0500 Subject: [PATCH 16/52] fix reporting of mis-typed synonyms in log --- src/AGR_data_retrieval_curation_gene.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index ccd4185..f534f97 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,18 +597,20 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - feature.gene_systematic_name_dto = name_dto if name_dto['name_type_name'] != 'systematic_name': - log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'systematic_name' + feature.gene_systematic_name_dto = name_dto if name_dto['display_text'] == feature.curr_symbol_name: if name_dto['name_type_name'] not in ['systematic_name', 'nomenclature_symbol']: + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'nomenclature_symbol' - log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: - feature.gene_full_name_dto = name_dto if name_dto['name_type_name'] != 'full_name': - log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, anno_id={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'full_name' + feature.gene_full_name_dto = name_dto else: feature.gene_synonym_dtos.append(name_dto) # LinkML change required: make gene_full_name_dto and gene_systematic_name_dto OPTIONAL. From 4043733c8aed7544e6c515f748452f4e5e1e6eb8 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:33:57 -0500 Subject: [PATCH 17/52] fix sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index f534f97..e2bd680 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,6 +597,7 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: + log.debug(f"BOB: Found name_dto annotation match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -631,8 +632,8 @@ def process_feature_synonyms(self, feature): placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' if feature.curr_anno_id: - placeholder_symbol_dto['format_text'] = feature.curr_anno_id - placeholder_symbol_dto['display_text'] = feature.curr_anno_id + placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id + placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From e4115a278297bdebb126986dfd7f6950f7926d16 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:37:30 -0500 Subject: [PATCH 18/52] tweak comments about sys name placeholder --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index e2bd680..6aad57d 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -627,7 +627,7 @@ def process_feature_synonyms(self, feature): placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Systematic name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # Systematic name is required. If none, fill it in. Could be because gene is unannotated, or annotation ID has never been used in pubs. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name' From 9978a2b3082f082273c23439101d1b98ecfec789 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:46:22 -0500 Subject: [PATCH 19/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 6aad57d..ae62ea9 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -597,7 +597,7 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - log.debug(f"BOB: Found name_dto annotation match: {name_dto['display_text']}") + log.debug(f"BOB: Found synonym-annoID match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -609,7 +609,7 @@ def process_feature_synonyms(self, feature): feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: if name_dto['name_type_name'] != 'full_name': - log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + log.warning(f"BOB: {feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = name_dto else: @@ -634,6 +634,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id + log.warning(f"BOB: {feature}: Has anno ID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From 381a6b63b50791aa1c7ff9cf463201ac456f1073 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:47:33 -0500 Subject: [PATCH 20/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index ae62ea9..2e4bc49 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -634,7 +634,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id - log.warning(f"BOB: {feature}: Has anno ID never used as a synonym: {feature.curr_anno_id}") + log.warning(f"BOB: {feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From ec651e5b461ab2dc0eb519c484ad5619365236c0 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 25 Jan 2023 15:50:31 -0500 Subject: [PATCH 21/52] tweak log of anno ID matches --- src/AGR_data_retrieval_curation_gene.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 2e4bc49..5c39025 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -364,7 +364,6 @@ def get_annotation_ids(self, session): counter = 0 for result in results: self.gene_dict[result.Feature.uniquename].curr_anno_id = result.Dbxref.accession - log.debug(f'For {self.gene_dict[result.Feature.uniquename]}, anno_id={result.Dbxref.accession}') counter += 1 log.info(f'Found {counter} current annotation IDs for FlyBase genes.') return From b4986b402612d37cda22dde72e9009918445ca30 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:14:41 -0500 Subject: [PATCH 22/52] add back method for getting gene xrefs, tidy log msg and code comments --- src/AGR_data_retrieval_curation_gene.py | 19 +++++++++---------- 1 file changed, 9 
insertions(+), 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 5c39025..86f4910 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -105,8 +105,8 @@ def __init__(self, feature): self.featureloc = None # Will be Featureloc object for the gene. self.gene_type_name = None # Will be the cvterm.name for "promoted_gene_type" featureprop. self.gene_snapshot = None # Will be the "gene_summary_text" Featureprop object. - self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml. - self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml. + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). self.curr_anno_id = None # Will be current annotation ID for the gene (str). self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. @@ -129,8 +129,8 @@ def __init__(self, feature): self.genomic_location_dtos = [] # Will need to be list of GenomicLocation objects. # Attributes for the Alliance GeneDTO. GeneDTO is_a GenomicEntityDTO. self.gene_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. - self.gene_full_name_dto = None # Will be a single GeneFullNameSlotAnnotation. - self.gene_systematic_name_dto = None # Will be a single GeneSystematicNameSlotAnnotation. + self.gene_full_name_dto = None # Will be a single FullNameSlotAnnotation. + self.gene_systematic_name_dto = None # Will be a single SystematicNameSlotAnnotation. self.gene_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. self.gene_type_curie = None # Will be the SO term ID corresponding to the gene's promoted_gene_type. # Notes associated with the object. @@ -234,7 +234,7 @@ def open_panther_file(self): self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) return - def get_references(self, session): + def get_all_references(self, session): """Get all references.""" log.info('Get all references.') # First get all current pubs having an FBrf uniquename. @@ -510,10 +510,10 @@ def get_gene_featureloc(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" self.open_panther_file() - self.get_references(session) + self.get_all_references(session) self.get_genes(session) self.get_gene_taxons(session) - # self.get_gene_dbxrefs(session) # BOB - suppress for faster dev. + self.get_gene_dbxrefs(session) self.get_synonyms(session) self.get_annotation_ids(session) self.get_gene_snapshots(session) @@ -596,7 +596,6 @@ def process_feature_synonyms(self, feature): # Sift through name DTOs for symbol, fullname, systematic_name, etc. 
for name_dto in name_dto_list: if name_dto['display_text'] == feature.curr_anno_id: - log.debug(f"BOB: Found synonym-annoID match: {name_dto['display_text']}") if name_dto['name_type_name'] != 'systematic_name': log.warning(f"{feature}: Found mistyped curr anno ID: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'systematic_name' @@ -608,7 +607,7 @@ def process_feature_synonyms(self, feature): feature.gene_symbol_dto = name_dto elif name_dto['display_text'] == feature.curr_fullname: if name_dto['name_type_name'] != 'full_name': - log.warning(f"BOB: {feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = name_dto else: @@ -633,7 +632,7 @@ def process_feature_synonyms(self, feature): if feature.curr_anno_id: placeholder_systematic_name_dto['format_text'] = feature.curr_anno_id placeholder_systematic_name_dto['display_text'] = feature.curr_anno_id - log.warning(f"BOB: {feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") + log.warning(f"{feature}: Has annoID never used as a synonym: {feature.curr_anno_id}") feature.gene_systematic_name_dto = placeholder_systematic_name_dto return From c0314de57ee243b224b72411764220c69b3533ac Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:15:09 -0500 Subject: [PATCH 23/52] update attr; update synonym, xref and 2o id handling --- src/AGR_data_retrieval_curation_allele.py | 521 +++++++++++++--------- 1 file changed, 322 insertions(+), 199 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 6e293eb..d7f7925 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -23,6 +23,7 @@ import argparse import datetime import json +import re import strict_rfc3339 from sqlalchemy import create_engine, inspect from sqlalchemy.orm import aliased, sessionmaker @@ -93,62 +94,64 @@ def __init__(self, feature): """ # Attributes representing unprocessed FlyBase data. # Note: use attribute names that do not match an Alliance LinkML slot name. - # For initial load, the Alliance A-Team just needs minimum info. - # ALLELE: curie, taxon, symbol, description, internal, obsolete. # Problems with Allele LinkML: - # 1. Allele.taxon is required, but even after updating NCBITaxon info at FlyBase, not all alleles will have NCBI taxon ID. - self.feature = feature # The Feature object corresponding to the FlyBase allele. - self.organism_abbr = None # Will be the organism.abbreviation for the allele's species of origin. - self.adj_organism_abbr = 'Dmel' # Assume allele is Dmel (classical or transgenic) unless allele is of classical type in another insect. - self.in_vitro = False # Change to True if allele associated with "in vitro%" cvterm. - self.constructs = [] # Will be a list of FBtp IDs for this allele's constructs. - self.dmel_insertions = [] # Will be a list of FBti IDs for this allele's Dmel insertions. - self.non_dmel_insertions = [] # Will be a list of FBti IDs for this allele's non-Dmel insertions. - self.args = [] # Will be a list of ARGs Features (variants). - self.parent_gene = None # Will be the FBgn ID of the allele's gene. 
- self.allele_of_internal_gene = False # Will change to True if is allele of Dmel internal-type gene (e.g., origin_of_replication). - self.taxon_dbxref = None # Will be the NCBITaxon (Db, Dbxref) tuple for the organism. - self.curr_fb_symbol = None # Will be the current symbol Synonym object. - self.curr_fb_fullname = None # Will be the current fullname Synonym object. - self.internal_synonyms = [] # Will be list of internal synonym names (and synonym_sgml if different). - self.public_synonyms = [] # Will be list of public synonym names (and synonym_sgml if different). - self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. - self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. - self.timestamps = [] # Add all timestamps here. - self.fb_references = [] # Will be list of FBrf IDs related to an allele: directly and indirectly. - self.featureprops = {} # A CVterm-keyed dict of Featureprop lists. - self.phenotypes = [] # Will be a list of SQLAlchemy (Feature, Genotype, Phenotype, Cvterm) results. - self.direct_libraries = [] # Will be a list of Library objects directly related to the allele. - self.ins_libraries = [] # Will be a list of Library objects related to the allele via insertion (FBti). - self.cons_libraries = [] # Will be a list of Library objects related to the allele via construct (FBtp). - self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). + # 1. Allele.taxon_curie is required, but even after updating NCBITaxon info at FlyBase, not all alleles will have NCBI taxon ID. + # 2. Allele.inheritance_mode_name is singular, but some FB alleles have many documented modes. + self.feature = feature # The Feature object corresponding to the FlyBase allele. + self.organism_abbr = None # Will be the organism.abbreviation for the allele's species of origin. + self.adj_organism_abbr = 'Dmel' # Assume allele is Dmel (classical/transgenic) unless allele is of classical type in another insect. + self.in_vitro = False # Change to True if allele associated with "in vitro%" cvterm. + self.constructs = [] # Will be a list of FBtp IDs for this allele's constructs. + self.dmel_insertions = [] # Will be a list of FBti IDs for this allele's Dmel insertions. + self.non_dmel_insertions = [] # Will be a list of FBti IDs for this allele's non-Dmel insertions. + self.args = [] # Will be a list of ARGs Features (variants). + self.parent_gene = None # Will be the FBgn ID of the allele's gene. + self.allele_of_internal_gene = False # Will change to True if is allele of Dmel internal-type gene (e.g., origin_of_replication). + self.curr_symbol_name = None # Will be the current symbol synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.curr_fullname = None # Will be the current fullname synonym.synonym_sgml, processed by sub_sup_sgml_to_html(). + self.feature_synonyms = [] # Will be list of all FeatureSynonym objects. + self.dbxrefs = [] # Will be list of dbxrefs as sql result groupings: Db, Dbxref, FeatureDbxref. + self.alt_fb_ids = [] # Will be list of Dbxrefs for 2o FlyBase IDs. + self.timestamps = [] # Add all timestamps here. + self.fb_references = [] # Will be list of pub_ids from feature_pub, feature_synonym. + self.featureprops = {} # A CVterm-keyed dict of Featureprop lists. + self.phenotypes = [] # Will be a list of SQLAlchemy (Feature, Genotype, Phenotype, Cvterm) results. + self.direct_libraries = [] # Will be a list of Library objects directly related to the allele. 
+ self.ins_libraries = [] # Will be a list of Library objects related to the allele via insertion (FBti). + self.cons_libraries = [] # Will be a list of Library objects related to the allele via construct (FBtp). + self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). # Attributes for the Alliance AuditedObject. - self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if allele not intended for display at Alliance website. - self.created_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.updated_by = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. - self.date_created = None # Earliest timestamp. - self.date_updated = None # Latest timestamp. - # self.data_provider = 'FB' # The MOD abbreviation. + self.obsolete = feature.is_obsolete # Will be the FlyBase value here. + self.internal = False # Change to true if allele not intended for display at Alliance website. + self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. + self.date_created = None # Earliest timestamp. + self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. self.curie = 'FB:{}'.format(feature.uniquename) - self.taxon = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. + self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. - self.name = None # Will be current fullname synonym - report ascii or utf8 (sgml) version? - self.synonyms = [] # All current and non-current ASCII and SGML synonyms. - self.cross_references = [] # Report only select dbs, using AGR-accepted db_prefix. - self.secondary_identifiers = [] # Annotation IDs and 2o FlyBase IDs. + self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. # Attributes for the Alliance Allele. Allele is_a GenomicEntity. - self.symbol = None # Will be a string (ascii or utf8)? - self.references = [] # KANBAN-237: READY: Will be a list of pubs (PMID or FB:FBrf IDs) for the allele. - self.is_extinct = None # KANBAN-237: READY: Change to true if extinction has been reported. Otherwise, leave blank. - self.inheritence_mode = [] # KANBAN-237: READY: Will be a list of CV terms. - self.in_collection = [] # KANBAN-237: TO DO: Will be a library names. - self.sequencing_status = None # KANBAN-237: TO DO: Will be a CV term? TBD. Might be dropped. + self.allele_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. + self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. + self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. + self.allele_database_status_dto = None # ToDo + self.allele_functional_impact_dtos = None # ToDo + self.allele_germline_transmission_status_dto = None # ToDo + self.allele_molecular_mutation_dtos = None # ToDo + self.allele_mutation_type_dtos = None # ToDo + self.allele_nomenclature_event_dtos = None # ToDo + self.allele_note_dtos = None # ToDo + self.allele_secondary_id_dtos = None # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) + self.in_collection_name = None # Will be library.name. 
+ self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. + self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. + self.reference_curies = None # Will be a list of reference curies (directly or indirectly related). # Notes associated with the object. - self.for_alliance_export = True # Change to False if object should be excluded from export. - self.internal_reasons = [] # Reasons for marking an object as internal in the export file. - self.export_warnings = [] # Reasons for suppressing an object from the export file. + self.for_alliance_export = True # Change to False if object should be excluded from export. + self.internal_reasons = [] # Reasons for marking an object as internal in the export file. + self.export_warnings = [] # Reasons for suppressing an object from the export file. def __str__(self): """Succinct text string describing the AllianceAllele object.""" @@ -161,51 +164,103 @@ class AlleleHandler(object): def __init__(self): """Create the AlleleHandler object.""" self.allele_dict = {} # An FBalID-keyed dict of AllianceAllele objects. + self.all_pubs_dict = {} # A pub_id-keyed dict of pub curies (PMID or FBrf). + self.all_synonyms_dict = {} # A synonym_id-keyed dict of Synonym objects. self.drosophilid_list = [] # A list of organism_ids for "Drosophilid" species in chado. self.total_feat_cnt = 0 # Count of all alleles found in starting query. self.export_feat_cnt = 0 # Count of all alleles exported to file. self.internal_feat_cnt = 0 # Count of all alleles marked as internal=True in export file. - self.fbrf_pmid_dict = {} # Will be a dict of FBrf-to-PMID xrefs. + # Regexes. + gene_regex = r'^FBgn[0-9]{7}$' + allele_regex = r'^FBal[0-9]{7}$' + cons_regex = r'^FBtp[0-9]{7}$' + ins_regex = r'^FBti[0-9]{7}$' + seqfeat_regex = r'^FBsf[0-9]{10}$' + feature_regex = r'^FB(tp|ti)[0-9]{7}$' + lib_regex = r'^FBlc[0-9]{7}$' + pub_regex = r'^(FBrf[0-9]{7}|unattributed)$' + # Sample set. test_alleles = [] + # Export fields. required_fields = [ + 'allele_symbol_dto', 'curie', - 'taxon', - 'symbol', - 'internal' + 'internal', + 'taxon_curie', ] output_fields = [ - 'created_by', - 'cross_references', + 'allele_database_status_dto', + 'allele_full_name_dto', + 'allele_functional_impact_dtos', + 'allele_germline_transmission_status_dto', + 'allele_molecular_mutation_dtos', + 'allele_mutation_type_dtos', + 'allele_nomenclature_event_dtos', + 'allele_note_dtos', + 'allele_secondary_id_dtos', + 'allele_symbol_dto', + 'allele_synonym_dtos', + 'created_by_curie', + 'cross_reference_dtos', 'curie', 'date_created', 'date_updated', - 'in_collection', - 'inheritence_mode', + 'in_collection_name', + 'inheritance_mode_name', 'internal', 'is_extinct', - 'updated_by', - 'name', 'obsolete', - 'references', - 'secondary_identifiers', - # 'sequencing_status', # KANBAN-237: Not implemented yet in LinkML v1.2.4 - 'symbol', - 'synonyms', - 'taxon' + 'reference_curies', + 'taxon_curie', + 'updated_by_curie', ] fb_agr_db_dict = { 'FlyBase': 'FB' } + def get_all_references(self, session): + """Get all references.""" + log.info('Get all references.') + # First get all current pubs having an FBrf uniquename. 
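As an illustration of the two passes that follow (invented IDs, not part of the patch), the finished all_pubs_dict maps chado pub_id keys to a PMID curie when a current PubMed xref exists, and otherwise keeps the FBrf curie:

    all_pubs_dict_example = {
        101: 'PMID:1234567',      # current pub with a PubMed xref: PMID curie replaces the FBrf curie
        102: 'FB:FBrf0123456',    # current pub without a PubMed xref: FBrf curie is kept
    }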
+ filters = ( + Pub.uniquename.op('~')(self.pub_regex), + Pub.is_obsolete.is_(False) + ) + results = session.query(Pub).\ + filter(*filters).\ + distinct() + pub_counter = 0 + for pub in results: + self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + pub_counter += 1 + # Next find PMIDs if available and replace the curie in the all_pubs_dict. + filters = ( + Pub.uniquename.op('~')(self.pub_regex), + Pub.is_obsolete.is_(False), + Db.name == 'pubmed', + PubDbxref.is_current.is_(True) + ) + pmid_xrefs = session.query(Pub, Dbxref).\ + join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ + join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ + join(Db, (Db.db_id == Dbxref.db_id)).\ + filter(*filters).\ + distinct() + pmid_counter = 0 + for xref in pmid_xrefs: + self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' + pmid_counter += 1 + log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') + return + def get_alleles(self, session): """Get all alleles.""" log.info('Querying chado for alleles.') # First get all allele features from chado. - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), Cvterm.name == 'allele' ) @@ -224,15 +279,13 @@ def get_allele_gene(self, session): """For current alleles, get the FBgn ID of allele's current gene.""" gene = aliased(Feature, name='gene') allele = aliased(Feature, name='allele') - gene_regex = r'^FBgn[0-9]{7}$' - allele_regex = r'^FBal[0-9]{7}$' filters = ( gene.is_obsolete.is_(False), gene.is_analysis.is_(False), - gene.uniquename.op('~')(gene_regex), + gene.uniquename.op('~')(self.gene_regex), allele.is_obsolete.is_(False), allele.is_analysis.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), Cvterm.name == 'alleleof' ) allele_gene_results = session.query(allele, gene).\ @@ -268,9 +321,8 @@ def flag_alleles_of_internal_genes(self, session): ] feature_type = aliased(Cvterm, name='feature_type') prop_type = aliased(Cvterm, name='promoted_gene_type') - gene_regex = r'^FBgn[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(gene_regex), + Feature.uniquename.op('~')(self.gene_regex), Feature.is_obsolete.is_(False), Feature.is_analysis.is_(False), Organism.abbreviation == 'Dmel', @@ -303,10 +355,9 @@ def flag_alleles_of_internal_genes(self, session): def flag_in_vitro_alleles(self, session): """Flag alleles associated with "in vitro" type CV term.""" log.info('Flag in vitro alleles.') - allele_regex = r'^FBal[0-9]{7}$' cvterm_name_regex = '^in vitro construct' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Cvterm.name.op('~')(cvterm_name_regex) ) ivt_alleles = session.query(Feature).\ @@ -326,13 +377,11 @@ def get_allele_constructs(self, session): log.info('Find constructs related to alleles.') allele = aliased(Feature, name='allele') construct = aliased(Feature, name='construct') - allele_regex = r'^FBal[0-9]{7}$' - construct_regex = r'^FBtp[0-9]{7}$' filters = ( allele.is_obsolete.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), construct.is_obsolete.is_(False), - construct.uniquename.op('~')(construct_regex) + construct.uniquename.op('~')(self.construct_regex) ) construct_results = session.query(allele, construct).\ join(FeatureRelationship, (FeatureRelationship.object_id == construct.feature_id)).\ @@ -351,13 +400,11 @@ def 
get_allele_insertions(self, session): log.info('Find insertions related to alleles.') allele = aliased(Feature, name='allele') insertion = aliased(Feature, name='insertion') - allele_regex = r'^FBal[0-9]{7}$' - insertion_regex = r'^FBti[0-9]{7}$' filters = ( allele.is_obsolete.is_(False), - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), insertion.is_obsolete.is_(False), - insertion.uniquename.op('~')(insertion_regex) + insertion.uniquename.op('~')(self.insertion_regex) ) insertion_results = session.query(Organism, allele, insertion).\ join(FeatureRelationship, (FeatureRelationship.object_id == insertion.feature_id)).\ @@ -425,6 +472,7 @@ def adjust_allele_org(self, session): def get_allele_taxons(self, session): """Get taxon IDs for alleles. Depends on all organisms for features having an abbreviation.""" log.info('Getting allele taxon IDs.') + # First make a dict of organism abbr to NCBI taxon IDs. filters = ( OrganismDbxref.is_current.is_(True), Db.name == 'NCBITaxon' @@ -438,46 +486,47 @@ def get_allele_taxons(self, session): organism_taxon_dict = {} for result in organism_dbxref_results: organism_taxon_dict[result.Organism.abbreviation] = result.Dbxref.accession + # Now fill in the info for alleles. for allele in self.allele_dict.values(): try: - allele.taxon = 'NCBITaxon:{}'.format(organism_taxon_dict[allele.adj_organism_abbr]) + allele.taxon_curie = f'NCBITaxon:{organism_taxon_dict[allele.adj_organism_abbr]}' except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(allele)) return def get_synonyms(self, session): """Get current and non-current symbols and full names for alleles.""" - log.info('Getting allele synonyms.') - feature_type = aliased(Cvterm, name='feature_type') - synonym_type = aliased(Cvterm, name='synonym_type') - allele_regex = r'^FBal[0-9]{7}$' + log.info('Get current and non-current symbols and full names for alleles.') filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), - feature_type.name == 'allele' + Cvterm.name == 'allele' ) - allele_curr_symbol_results = session.query(synonym_type, Feature, FeatureSynonym, Synonym).\ + results = session.query(Feature, FeatureSynonym, Synonym).\ join(FeatureSynonym, (FeatureSynonym.synonym_id == Synonym.synonym_id)).\ join(Feature, (Feature.feature_id == FeatureSynonym.feature_id)).\ - join(feature_type, (feature_type.cvterm_id == Feature.type_id)).\ - join(synonym_type, (synonym_type.cvterm_id == Synonym.type_id)).\ + join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ filter(*filters).\ distinct() counter = 0 - for result in allele_curr_symbol_results: + for result in results: + # Skip any references to non-current pubs. + if result.FeatureSynonym.pub_id not in self.all_pubs_dict.keys(): + continue + # First, build the all_synonyms_dict. + self.all_synonyms_dict[result.Synonym.synonym_id] = result.Synonym + # Second, collect FeatureSynonyms for each allele. + self.allele_dict[result.Feature.uniquename].feature_synonyms.append(result.FeatureSynonym) + # Third, capture pub_ids. + self.allele_dict[result.Feature.uniquename].fb_references.append(result.FeatureSynonym.pub_id) + + # Finally, catch current symbol and fullname strings. 
+ if result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'symbol': + self.allele_dict[result.Feature.uniquename].curr_symbol_name = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) + elif result.FeatureSynonym.is_current is True and result.Synonym.type.name == 'fullname': + self.allele_dict[result.Feature.uniquename].curr_fullname = sub_sup_sgml_to_html(result.Synonym.synonym_sgml) counter += 1 - if result.FeatureSynonym.is_current is True: - if result.synonym_type.name == 'symbol': - self.allele_dict[result.Feature.uniquename].curr_fb_symbol = result.Synonym - elif result.synonym_type.name == 'fullname': - self.allele_dict[result.Feature.uniquename].curr_fb_fullname = result.Synonym - elif result.FeatureSynonym.is_internal is True: - self.allele_dict[result.Feature.uniquename].internal_synonyms.append(result.Synonym.name) - self.allele_dict[result.Feature.uniquename].internal_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - else: - self.allele_dict[result.Feature.uniquename].public_synonyms.append(result.Synonym.name) - self.allele_dict[result.Feature.uniquename].public_synonyms.append(sub_sup_sgml_to_html(result.Synonym.synonym_sgml)) - log.info('Found {} allele synonyms.'.format(counter)) + log.info(f'Found {counter} feature_synonyms (current pubs) for alleles.') return def get_allele_timestamps(self, session): @@ -514,9 +563,8 @@ def get_allele_timestamps(self, session): def get_allele_dbxrefs(self, session): """Get all dbxrefs for alleles.""" log.info('Getting allele dbxrefs.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Feature.is_analysis.is_(False), Cvterm.name == 'allele', Db.name.in_((self.fb_agr_db_dict.keys())) @@ -537,20 +585,19 @@ def get_allele_dbxrefs(self, session): if result.FeatureDbxref.is_current is True and result.Db.name == 'FlyBase': pass elif result.FeatureDbxref.is_current is False and result.Db.name == 'FlyBase': + self.allele_dict[result.Feature.uniquename].dbxrefs.append(result) self.allele_dict[result.Feature.uniquename].alt_fb_ids.append(result.Dbxref) else: self.allele_dict[result.Feature.uniquename].dbxrefs.append(result) log.info('Found {} allele crossreferences.'.format(counter)) return - def get_references(self, session): + def get_allele_references(self, session): """Get references for alleles.""" log.info('Get allele references.') - allele_regex = r'^FBal[0-9]{7}$' - fbrf_regex = r'^FBrf[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), - Pub.uniquename.op('~')(fbrf_regex), + Feature.uniquename.op('~')(self.allele_regex), + Pub.uniquename.op('~')(self.pub_regex), Pub.is_obsolete.is_(False) ) allele_pubs = session.query(Feature, Pub).\ @@ -560,36 +607,16 @@ def get_references(self, session): distinct() counter = 0 for result in allele_pubs: - self.allele_dict[result.Feature.uniquename].fb_references.append(result.Pub.uniquename) + self.allele_dict[result.Feature.uniquename].fb_references.append(result.Pub.pub_id) counter += 1 log.info(f'Found {counter} allele-pub relationships.') return - def get_pmid_xrefs(self, session): - """Create a dict of FBrf to PMID for publications.""" - log.info('Getting PMID IDs for FB publications.') - filters = ( - Db.name == 'pubmed', - Pub.is_obsolete.is_(False), - PubDbxref.is_current.is_(True) - ) - pmid_xrefs = session.query(Pub, Dbxref).\ - join(PubDbxref, (PubDbxref.pub_id == Pub.pub_id)).\ - join(Dbxref, (Dbxref.dbxref_id == PubDbxref.dbxref_id)).\ - join(Db, 
(Db.db_id == Dbxref.db_id)).\ - filter(*filters).\ - distinct() - for xref in pmid_xrefs: - self.fbrf_pmid_dict[xref.Pub.uniquename] = xref.Dbxref.accession - log.info(f'Found {len(self.fbrf_pmid_dict.keys())} PMID IDs for FB publications.') - return - def get_allele_featureprops(self, session): """Get all allele featureprops.""" log.info('Get allele featureprops.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Cvterm.is_obsolete == 0 ) allele_fprops = session.query(Feature, Cvterm, Featureprop).\ @@ -612,7 +639,6 @@ def get_allele_featureprops(self, session): def get_args(self, session): """Get ARGs related to alleles.""" log.info('Get allele ARGs.') - allele_regex = r'^FBal[0-9]{7}$' arg_types = [ 'MNV', 'complex_substitution', @@ -629,7 +655,7 @@ def get_args(self, session): argtype = aliased(Cvterm, name='argtype') reltype = aliased(Cvterm, name='reltype') filters = ( - allele.uniquename.op('~')(allele_regex), + allele.uniquename.op('~')(self.allele_regex), arg.is_obsolete.is_(False), argtype.name.in_((arg_types)), reltype.name == 'partof' @@ -651,9 +677,8 @@ def get_args(self, session): def get_phenotypes(self, session): """Get phenotypes related to alleles.""" log.info('Get phenotypes related to alleles.') - allele_regex = r'^FBal[0-9]{7}$' filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Genotype.is_obsolete.is_(False) ) results = session.query(Feature, Genotype, Phenotype, Cvterm).\ @@ -676,15 +701,13 @@ def get_direct_collections(self, session): """Find library collections directly related to alleles.""" log.info('Get directly-related allele collections.') counter = 0 - allele_regex = r'^FBal[0-9]{7}$' - lib_regex = r'^FBlc[0-9]{7}$' libtype = aliased(Cvterm, name='libtype') libfeattype = aliased(Cvterm, name='libfeattype') # First, look for direct FBal-FBlc associations. 
filters = ( - Feature.uniquename.op('~')(allele_regex), + Feature.uniquename.op('~')(self.allele_regex), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection' ) @@ -706,20 +729,17 @@ def get_direct_collections(self, session): def get_indirect_collections(self, session): """Find library collections indirectly related to alleles via insertion or construct.""" log.info('Get indirectly-related allele collections (via insertion or construct).') - allele_regex = r'^FBal[0-9]{7}$' - feature_regex = r'^FB(tp|ti)[0-9]{7}$' - lib_regex = r'^FBlc[0-9]{7}$' allele = aliased(Feature, name='allele') feature = aliased(Feature, name='feature') libtype = aliased(Cvterm, name='libtype') libfeattype = aliased(Cvterm, name='libfeattype') featreltype = aliased(Cvterm, name='featreltype') filters = ( - allele.uniquename.op('~')(allele_regex), - feature.uniquename.op('~')(feature_regex), + allele.uniquename.op('~')(self.allele_regex), + feature.uniquename.op('~')(self.feature_regex), feature.is_obsolete.is_(False), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection', featreltype.name == 'associated_with' @@ -751,10 +771,6 @@ def get_indirect_collections(self, session): def get_sf_collections(self, session): """Find library collections indirectly related to alleles via sequence feature.""" log.info('Get indirectly-related allele collections (via equence feature).') - allele_regex = r'^FBal[0-9]{7}$' - cons_regex = r'^FBtp[0-9]{7}$' - sf_regex = r'^FBsf[0-9]{10}$' - lib_regex = r'^FBlc[0-9]{7}$' allele = aliased(Feature, name='allele') construct = aliased(Feature, name='construct') seqfeat = aliased(Feature, name='seqfeat') @@ -764,13 +780,13 @@ def get_sf_collections(self, session): allele_construct = aliased(FeatureRelationship, name='allele_construct') seqfeat_construct = aliased(FeatureRelationship, name='seqfeat_construct') filters = ( - allele.uniquename.op('~')(allele_regex), - construct.uniquename.op('~')(cons_regex), - seqfeat.uniquename.op('~')(sf_regex), + allele.uniquename.op('~')(self.allele_regex), + construct.uniquename.op('~')(self.cons_regex), + seqfeat.uniquename.op('~')(self.seqfeat_regex), construct.is_obsolete.is_(False), seqfeat.is_obsolete.is_(False), Library.is_obsolete.is_(False), - Library.uniquename.op('~')(lib_regex), + Library.uniquename.op('~')(self.lib_regex), libtype.name == 'reagent collection', libfeattype.name == 'member_of_reagent_collection', featreltype.name == 'associated_with' @@ -797,6 +813,7 @@ def get_sf_collections(self, session): def query_chado(self, session): """A wrapper method that runs initial db queries.""" + self.get_all_references(session) self.get_alleles(session) self.get_direct_collections(session) self.get_indirect_collections(session) @@ -812,8 +829,7 @@ def query_chado(self, session): self.get_synonyms(session) self.get_allele_timestamps(session) self.get_allele_dbxrefs(session) - self.get_references(session) - self.get_pmid_xrefs(session) + self.get_allele_references(session) self.get_allele_featureprops(session) self.get_args(session) self.get_phenotypes(session) @@ -853,7 +869,7 @@ def synthesize_synonyms(self, allele): for internal_synonym in internal_synonym_set: internal_synonym_dict = { 'name': internal_synonym, - 'created_by': 'FB:FB_curator', 
+ 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': True } @@ -862,7 +878,7 @@ def synthesize_synonyms(self, allele): for public_synonym in public_synonym_set: public_synonym_dict = { 'name': public_synonym, - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } @@ -871,8 +887,15 @@ def synthesize_synonyms(self, allele): def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" - for fb_id in allele.alt_fb_ids: - allele.secondary_identifiers.append('FB:{}'.format(fb_id.accession)) + unique_fb_id_list = list(set(allele.alt_fb_ids)) + for fb_id in unique_fb_id_list: + secondary_id_dict = { + 'secondary_id': f'FB:{fb_id.accession}', + 'created_by_curie': 'FB:FB_curator', + 'obsolete': False, + 'internal': False + } + allele.allele_secondary_id_dtos.append(secondary_id_dict) return def synthesize_xrefs(self, allele): @@ -883,35 +906,32 @@ def synthesize_xrefs(self, allele): 'display_name': 'FB:{}'.format(allele.feature.uniquename), 'prefix': 'FB', 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - allele.cross_references.append(xref_dict) + allele.cross_reference_dtos.append(xref_dict) # Add other xrefs. for result in allele.dbxrefs: - if result.Db.name in self.fb_agr_db_dict.keys(): - xref_dict = { - 'curie': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), - 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), - 'prefix': self.fb_agr_db_dict[result.Db.name], - 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - if result.FeatureDbxref.is_current is False: - xref_dict['internal'] = True - allele.cross_references.append(xref_dict) + if result.Db.name not in self.fb_agr_db_dict.keys(): + continue + xref_dict = { + 'curie': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), + 'display_name': '{}:{}'.format(self.fb_agr_db_dict[result.Db.name], result.Dbxref.accession), + 'prefix': self.fb_agr_db_dict[result.Db.name], + 'page_areas': ['allele'], + 'created_by_curie': 'FB:FB_curator', + 'obsolete': False, + 'internal': False + } + if result.FeatureDbxref.is_current is False: + xref_dict['internal'] = True + allele.cross_reference_dtos.append(xref_dict) return def synthesize_references(self, allele): """Process pubs for allele.""" - for fbrf_id in allele.fb_references: - try: - allele.references.append(f'PMID:{self.fbrf_pmid_dict[fbrf_id]}') - except KeyError: - allele.references.append(f'FB:{fbrf_id}') + allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'unattributed'] return def synthesize_insertions(self, allele): @@ -922,11 +942,11 @@ def synthesize_insertions(self, allele): 'display_name': '{}:{}'.format('FB', insertion.uniquename), 'prefix': 'FB', 'page_areas': ['allele'], - 'created_by': 'FB:FB_curator', + 'created_by_curie': 'FB:FB_curator', 'obsolete': False, 'internal': False } - allele.cross_references.append(xref_dict) + allele.cross_reference_dtos.append(xref_dict) return def flag_internal_alleles(self, allele): @@ -956,13 +976,27 @@ def flag_unexportable_alleles(self, allele): return def synthesize_extinction(self, allele): - """Determine if allele is definitively extinct.""" + """Determine if allele definitively exists or is extinct.""" + has_stocks = False + reported_extinct = False + # First find evidence of 
extinction. try: for fprop in allele.featureprops['availability']: if fprop.value == 'Stated to be lost.': - allele.is_extinct = True + reported_extinct = True except KeyError: pass + # Second find evidence for existence. + for fprop_type in allele.featureprops.keys(): + if fprop_type.startswith('derived_stock_'): + has_stocks = True + # Synthesize these two pieces of info. + if reported_extinct is True: + allele.is_extinct = True + if has_stocks is True: + log.warning(f'{allele}: stated to be lost, but has stocks.') + elif has_stocks is True: + allele.is_extinct = False return def synthesize_inheritance_mode(self, allele): @@ -1007,17 +1041,15 @@ def synthesize_inheritance_mode(self, allele): pheno = phenotype.Phenotype.uniquename mode_context = f'{allele.curie}\t{cvterm}\t{geno}\t{pheno}' mode_context_list.append(mode_context) - if reported_modes: - reported_modes = list(set(reported_modes)) - allele.inheritence_mode = '|'.join(reported_modes) - log.debug(f'\tFound {len(reported_modes)} inheritance mode(s): {allele.curie}: {allele.inheritence_mode}') - # Log cases of multiple inheritance modes for curator review. - if len(reported_modes) > 1: - mode_context_list = list(set(mode_context_list)) - for i in mode_context_list: - log.warning(f'MULTIPLE_INHERITANCE_MODES:\t{i}') - else: - allele.inheritence_mode = 'unknown' + reported_modes = list(set(reported_modes)) + mode_context_list = list(set(mode_context_list)) + # Update inheritance_mode_name if unambiguous. + if len(reported_modes) == 1: + allele.inheritance_mode_name = reported_modes[0] + # If ambiguous, change from "unknown" to None. + elif len(reported_modes) > 1: + allele.inheritance_mode_name = None + log.warning(f"{allele}: Found {len(reported_modes)} inheritance modes: {'|'.join(reported_modes)}. Context: {mode_context_list}") return def synthesize_collections(self, allele): @@ -1033,11 +1065,102 @@ def synthesize_collections(self, allele): collection_names = allele.sf_libraries if collection_names: collection_names = list(set(collection_names)) - allele.in_collection = collection_names[0].name + allele.in_collection_name = collection_names[0].name if len(collection_names) > 1: log.warning(f'\tFound {len(collection_names)} collection(s) for {allele.curie}: {allele.in_collection}') return + def synthesize_synonyms(self, feature): + """Generate name/synonym DTOs for a feature that has a list of FeatureSynonym objects.""" + # Dict for converting FB to AGR synonym types. + synonym_type_conversion = { + 'symbol': 'nomenclature_symbol', + 'fullname': 'full_name', + 'nickname': 'nomenclature_symbol', + 'synonym': 'nomenclature_symbol' + } + default_name_dto = { + 'name_type_name': 'unspecified', + 'format_text': 'unspecified', + 'display_text': 'unspecified', + 'synonym_scope_name': 'exact', + 'evidence_curies': [], + 'internal': False, + 'obsolete': False + } + # Create a dict of all distinct name/synonym_sgml combinations: for each, capture synonym type(s) an pub_ids. + # Keys are (synonym.name, synonym.synonym_sgml) tuples. + # Values are dicts too where keys are chado synonym types and values are lists of pub_ids. + # Value dict also has an "internal" key that stores list of FeatureSynonym.is_internal values. 
+ feature_synonym_dict = {} + for f_s in feature.feature_synonyms: + synonym = self.all_synonyms_dict[f_s.synonym_id] + distinct_synonym_name = (synonym.name, synonym.synonym_sgml) + if distinct_synonym_name in feature_synonym_dict.keys(): + feature_synonym_dict[distinct_synonym_name]['internal'].append(f_s.is_internal) + if synonym.type.name in feature_synonym_dict[distinct_synonym_name].keys(): + feature_synonym_dict[distinct_synonym_name][synonym.type.name].append(f_s.pub_id) + else: + feature_synonym_dict[distinct_synonym_name][synonym.type.name] = [f_s.pub_id] + else: + feature_synonym_dict[distinct_synonym_name] = {synonym.type.name: [f_s.pub_id], 'internal': [f_s.is_internal]} + # Convert to AGR name DTO objects. + name_dto_list = [] + FORMAT_TEXT = 0 + DISPLAY_TEXT = 1 + for syno_name, syno_attributes in feature_synonym_dict.items(): + # Determine internal status. False trumps True. + if False in set(syno_attributes['internal']): + syno_internal = False + else: + syno_internal = True + # Collect all pubs. + pub_id_list = [] + for syno_type, syno_type_pub_list in syno_attributes.items(): + if syno_type == 'internal': + continue + pub_id_list.extend(syno_type_pub_list) + pub_id_list = list(set(pub_id_list)) + # Pick the best synonym type. + type_tally = {} + for syno_type, syno_type_pub_list in syno_attributes.items(): + if syno_type == 'internal': + continue + type_tally[len(set(syno_type_pub_list))] = syno_type + name_type_to_use = synonym_type_conversion[type_tally[max(type_tally.keys())]] + output_synonym_dto = { + 'name_type_name': name_type_to_use, + 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), + 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), + 'synonym_scope_name': 'exact', + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'internal': syno_internal, + 'obsolete': False + } + name_dto_list.append(output_synonym_dto) + # Sift through name DTOs for symbol, fullname, systematic_name, etc. + for name_dto in name_dto_list: + if name_dto['display_text'] == feature.curr_symbol_name: + if name_dto['name_type_name'] != 'nomenclature_symbol': + log.warning(f"{feature}: Found mistyped curr symbol: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'nomenclature_symbol' + feature.allele_symbol_dto = name_dto + elif name_dto['display_text'] == feature.curr_fullname: + if name_dto['name_type_name'] != 'full_name': + log.warning(f"{feature}: Found mistyped curr full_name: type={name_dto['name_type_name']}, name={name_dto['display_text']}") + name_dto['name_type_name'] = 'full_name' + feature.allele_full_name_dto = name_dto + else: + feature.allele_synonym_dtos.append(name_dto) + # Symbol is required. If none, fill it in. 
+ if feature.allele_symbol_dto is None: + placeholder_symbol_dto = default_name_dto.copy() + placeholder_symbol_dto['name_type_name'] = 'nomenclature_symbol' + placeholder_symbol_dto['format_text'] = feature.feature.name + placeholder_symbol_dto['display_text'] = feature.feature.name + feature.allele_symbol_dto = placeholder_symbol_dto + return + def synthesize_info(self): """Convert FlyBase allele data into an AllianceAllele representation.""" log.info('Synthesizing allele info.') From 37ff0c78623f4729684f7505e802cb16b3bc0010 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 15:16:28 -0500 Subject: [PATCH 24/52] remove redundant synonym synth method --- src/AGR_data_retrieval_curation_allele.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index d7f7925..98cbd10 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -23,7 +23,6 @@ import argparse import datetime import json -import re import strict_rfc3339 from sqlalchemy import create_engine, inspect from sqlalchemy.orm import aliased, sessionmaker @@ -863,28 +862,6 @@ def synthesize_fullname(self, allele): allele.name = allele.feature.name return - def synthesize_synonyms(self, allele): - """Process allele synonyms.""" - internal_synonym_set = set(allele.internal_synonyms) - for internal_synonym in internal_synonym_set: - internal_synonym_dict = { - 'name': internal_synonym, - 'created_by_curie': 'FB:FB_curator', - 'obsolete': False, - 'internal': True - } - allele.synonyms.append(internal_synonym_dict) - public_synonym_set = set(allele.public_synonyms) - for public_synonym in public_synonym_set: - public_synonym_dict = { - 'name': public_synonym, - 'created_by_curie': 'FB:FB_curator', - 'obsolete': False, - 'internal': False - } - allele.synonyms.append(public_synonym_dict) - return - def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" unique_fb_id_list = list(set(allele.alt_fb_ids)) From 4560ed6d3dfb02681223546dcc9f8d639d881bb2 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 16:31:38 -0500 Subject: [PATCH 25/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 98cbd10..9db106f 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -173,7 +173,7 @@ def __init__(self): # Regexes. 
gene_regex = r'^FBgn[0-9]{7}$' allele_regex = r'^FBal[0-9]{7}$' - cons_regex = r'^FBtp[0-9]{7}$' + construct_regex = r'^FBtp[0-9]{7}$' ins_regex = r'^FBti[0-9]{7}$' seqfeat_regex = r'^FBsf[0-9]{10}$' feature_regex = r'^FB(tp|ti)[0-9]{7}$' @@ -780,7 +780,7 @@ def get_sf_collections(self, session): seqfeat_construct = aliased(FeatureRelationship, name='seqfeat_construct') filters = ( allele.uniquename.op('~')(self.allele_regex), - construct.uniquename.op('~')(self.cons_regex), + construct.uniquename.op('~')(self.construct_regex), seqfeat.uniquename.op('~')(self.seqfeat_regex), construct.is_obsolete.is_(False), seqfeat.is_obsolete.is_(False), From f0440efcf3b25069601e801a88f69a4b9127c622 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 17:27:50 -0500 Subject: [PATCH 26/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 9db106f..23cfce9 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -174,7 +174,7 @@ def __init__(self): gene_regex = r'^FBgn[0-9]{7}$' allele_regex = r'^FBal[0-9]{7}$' construct_regex = r'^FBtp[0-9]{7}$' - ins_regex = r'^FBti[0-9]{7}$' + insertion_regex = r'^FBti[0-9]{7}$' seqfeat_regex = r'^FBsf[0-9]{10}$' feature_regex = r'^FB(tp|ti)[0-9]{7}$' lib_regex = r'^FBlc[0-9]{7}$' From c79cbbaced8591c252b039eaba683f6d4c455f9c Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 19:26:57 -0500 Subject: [PATCH 27/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 23cfce9..f0ed033 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -844,24 +844,6 @@ def synthesize_timestamps(self, allele): timestamp_to_rfc3339_localoffset(datetime.datetime.timestamp(max(allele.timestamps))) return - def synthesize_symbol(self, allele): - """Process symbol for an allele.""" - if allele.curr_fb_symbol: - allele.symbol = sub_sup_sgml_to_html(allele.curr_fb_symbol.synonym_sgml) - else: - allele.symbol = allele.feature.name - return - - def synthesize_fullname(self, allele): - """Process allele fullname.""" - if allele.curr_fb_fullname: - allele.name = sub_sup_sgml_to_html(allele.curr_fb_fullname.synonym_sgml) - elif allele.curr_fb_symbol: - allele.name = sub_sup_sgml_to_html(allele.curr_fb_symbol.synonym_sgml) - else: - allele.name = allele.feature.name - return - def synthesize_secondary_ids(self, allele): """Process 2o IDs.""" unique_fb_id_list = list(set(allele.alt_fb_ids)) @@ -1145,8 +1127,6 @@ def synthesize_info(self): log.debug('Evaluating annotation: {}'.format(allele)) self.synthesize_collections(allele) self.synthesize_timestamps(allele) - self.synthesize_symbol(allele) - self.synthesize_fullname(allele) self.synthesize_synonyms(allele) self.synthesize_secondary_ids(allele) self.synthesize_xrefs(allele) From 00ec15e1b35255e1d173a7e6561702f930804d03 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 26 Jan 2023 19:45:58 -0500 Subject: [PATCH 28/52] fix typo --- src/AGR_data_retrieval_curation_allele.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index f0ed033..a69948f 100644 --- 
a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -136,17 +136,17 @@ def __init__(self, feature): self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. self.allele_database_status_dto = None # ToDo - self.allele_functional_impact_dtos = None # ToDo + self.allele_functional_impact_dtos = [] # ToDo self.allele_germline_transmission_status_dto = None # ToDo - self.allele_molecular_mutation_dtos = None # ToDo - self.allele_mutation_type_dtos = None # ToDo - self.allele_nomenclature_event_dtos = None # ToDo - self.allele_note_dtos = None # ToDo - self.allele_secondary_id_dtos = None # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) + self.allele_molecular_mutation_dtos = [] # ToDo + self.allele_mutation_type_dtos = [] # ToDo + self.allele_nomenclature_event_dtos = [] # ToDo + self.allele_note_dtos = [] # ToDo + self.allele_secondary_id_dtos = [] # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) self.in_collection_name = None # Will be library.name. self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. - self.reference_curies = None # Will be a list of reference curies (directly or indirectly related). + self.reference_curies = [] # Will be a list of reference curies (directly or indirectly related). # Notes associated with the object. self.for_alliance_export = True # Change to False if object should be excluded from export. self.internal_reasons = [] # Reasons for marking an object as internal in the export file. From 643d4c667d42642ab0fc71e2d8608ce10349cb4a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 09:50:40 -0500 Subject: [PATCH 29/52] fix obsolete eval --- src/AGR_data_retrieval_curation_gene.py | 40 +++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 86f4910..aad68cc 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -36,7 +36,7 @@ ) from harvdev_utils.psycopg_functions import set_up_db_reading -# Now proceed with generic setup. +# Generic setup. report_label = 'gene_curation' set_up_dict = set_up_db_reading(report_label) server = set_up_dict['server'] @@ -66,10 +66,8 @@ # The main process. def main(): """Run the steps for exporting LinkML-compliant FlyBase Genes.""" - log.info('Running script "{}"'.format(__file__)) - log.info('Started main function.') - log.info('Output JSON file corresponds to "agr_curation_schema" release: {}'.format(linkml_release)) - + log.info('Running main() for script "{}"'.format(__file__)) + log.info('Output corresponds to "agr_curation_schema" release: {}'.format(linkml_release)) # Instantiate the object, get the data, synthesize it, export it. 
gene_handler = GeneHandler() db_query_transaction(gene_handler) @@ -227,11 +225,14 @@ def open_panther_file(self): tsvin = csv.reader(tsv_file, delimiter='\t') FB = 0 PTHR = 3 + counter = 0 for row in tsvin: fields = len(row) if fields: # Ignore blank lines if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) + counter += 1 + log.info(f'Processed {counter} lines from the panther orthology file.') return def get_all_references(self, session): @@ -302,13 +303,18 @@ def get_gene_taxons(self, session): filter(*filters).\ distinct() organism_taxon_dict = {} + org_counter = 0 + gene_counter = 0 for result in organism_dbxref_results: organism_taxon_dict[result.OrganismDbxref.organism_id] = result.Dbxref.accession + org_counter += 1 for gene in self.gene_dict.values(): try: gene.taxon_curie = 'NCBITaxon:{}'.format(organism_taxon_dict[gene.feature.organism_id]) + gene_counter += 1 except KeyError: log.debug('No NCBI taxon ID available for: {}'.format(gene)) + log.info(f'Found {org_counter} distinct NCBITaxon IDs for {gene_counter} genes.') return def get_synonyms(self, session): @@ -420,8 +426,11 @@ def get_gene_snapshots(self, session): join(prop_type, (prop_type.cvterm_id == Featureprop.type_id)).\ filter(*filters).\ distinct() + counter = 0 for result in gene_snapshot_results: self.gene_dict[result.feature.uniquename].gene_snapshot = result + counter += 1 + log.info(f'Found {counter} gene snapshots.') return def get_gene_types(self, session): @@ -440,9 +449,12 @@ def get_gene_types(self, session): join(prop_type, (prop_type.cvterm_id == Featureprop.type_id)).\ filter(*filters).\ distinct() + counter = 0 for result in gene_type_results: self.gene_dict[result.feature.uniquename].gene_type_curie = result.value[1:10].replace('SO', 'SO:') self.gene_dict[result.feature.uniquename].gene_type_name = result.value[11:-1] + counter += 1 + log.info(f'Found {counter} gene types for genes.') return def get_gene_timestamps(self, session): @@ -490,8 +502,11 @@ def get_gene_featureloc(self, session): filter(*filters).\ distinct() self.chr_dict = {} + chr_counter = 0 for result in chr_results: self.chr_dict[result.feature_id] = result.uniquename + chr_counter += 1 + log.info(f'Got basic info for {chr_counter} chr scaffolds.') # Now get gene featureloc. filters = ( Feature.uniquename.op('~')(self.gene_regex), @@ -503,8 +518,11 @@ def get_gene_featureloc(self, session): join(Cvterm, (Cvterm.cvterm_id == Feature.type_id)).\ filter(*filters).\ distinct() + gene_counter = 0 for result in gene_featureloc_results: self.gene_dict[result.feature.uniquename].featureloc = result + gene_counter += 1 + log.info(f'Found {gene_counter} genomic locations for genes.') return def query_chado(self, session): @@ -715,18 +733,10 @@ def synthesize_info(self): xref_dict['internal'] = True gene.cross_reference_dtos.append(xref_dict) # Flag internal features. 
- if gene.organism_abbr != 'Dmel': - gene.internal = True - gene.internal_reasons.append('Non-Dmel') - if gene.obsolete is True: + if gene.feature.is_obsolete is True: + gene.obsolete = True gene.internal = True gene.internal_reasons.append('Obsolete') - if gene.gene_type_curie is None: - gene.internal = True - gene.internal_reasons.append('Lacks gene type') - if gene.gene_type_name in self.internal_gene_types: - gene.internal = True - gene.internal_reasons.append('Internal gene type {} ({})'.format(gene.gene_type_name, gene.gene_type_curie)) for attr in self.required_fields: if attr not in gene.__dict__.keys(): gene.for_alliance_export = False From 46ef9c329217b81c0c40e67c318594f40740832e Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:18:05 -0500 Subject: [PATCH 30/52] fix gene regex for pthr file --- src/AGR_data_retrieval_curation_gene.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index aad68cc..eed3230 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -226,11 +226,12 @@ def open_panther_file(self): FB = 0 PTHR = 3 counter = 0 + gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. for row in tsvin: fields = len(row) if fields: # Ignore blank lines - if re.search(self.gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): - self.pthr_dict[re.search(self.gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) + if re.search(gene_regex, row[FB]) and re.search(self.pthr_regex, row[PTHR]): + self.pthr_dict[re.search(gene_regex, row[FB]).group(0)] = re.search(self.pthr_regex, row[PTHR]).group(0) counter += 1 log.info(f'Processed {counter} lines from the panther orthology file.') return From 147685a8d6c964cb11edac2d9337a69bb593bce6 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:18:33 -0500 Subject: [PATCH 31/52] flake8 --- src/AGR_data_retrieval_curation_gene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index eed3230..6d8e866 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -226,7 +226,7 @@ def open_panther_file(self): FB = 0 PTHR = 3 counter = 0 - gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. + gene_regex = r'FBgn[0-9]{7}' # Since the FBgn ID does not represent the entire column entry, do not use self.gene_regex here. 
for row in tsvin: fields = len(row) if fields: # Ignore blank lines From b51c3285e4d93cc7501fb71a292963aa72676b22 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 10:24:28 -0500 Subject: [PATCH 32/52] get only Dmel chr scaffolds --- src/AGR_data_retrieval_curation_gene.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 6d8e866..189ee13 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -505,9 +505,11 @@ def get_gene_featureloc(self, session): self.chr_dict = {} chr_counter = 0 for result in chr_results: + if result.organism.abbreviation != 'Dmel': + continue self.chr_dict[result.feature_id] = result.uniquename chr_counter += 1 - log.info(f'Got basic info for {chr_counter} chr scaffolds.') + log.info(f'Got basic info for {chr_counter} current Dmel chr scaffolds.') # Now get gene featureloc. filters = ( Feature.uniquename.op('~')(self.gene_regex), From d6a68326bad04f123ee7999b9743d7d85862cc0f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 12:19:59 -0500 Subject: [PATCH 33/52] reduce reasons for marking genes, alleles as internal --- src/AGR_data_retrieval_curation_allele.py | 16 ++++++++-------- src/AGR_data_retrieval_curation_gene.py | 7 +++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index a69948f..0051594 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -121,7 +121,7 @@ def __init__(self, feature): self.sf_libraries = [] # Will be a list of Library objects related to the allele via seq. feature (FBsf). # Attributes for the Alliance AuditedObject. self.obsolete = feature.is_obsolete # Will be the FlyBase value here. - self.internal = False # Change to true if allele not intended for display at Alliance website. + self.internal = False # Change to true if not public on FlyBase. self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.date_created = None # Earliest timestamp. @@ -909,16 +909,16 @@ def synthesize_insertions(self, allele): return def flag_internal_alleles(self, allele): - """Flag alleles as internal and/or obsolete, or not.""" - if allele.organism_abbr != 'Dmel': - allele.internal = True - allele.internal_reasons.append('Non-Dmel') + """Flag alleles as internal.""" if allele.obsolete is True: allele.internal = True allele.internal_reasons.append('Obsolete') - if allele.allele_of_internal_gene is True: - allele.internal = True - allele.internal_reasons.append('Allele of internal Dmel gene type.') + # if allele.organism_abbr != 'Dmel': + # allele.internal = True + # allele.internal_reasons.append('Non-Dmel') + # if allele.allele_of_internal_gene is True: + # allele.internal = True + # allele.internal_reasons.append('Allele of internal Dmel gene type.') return def flag_unexportable_alleles(self, allele): diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 189ee13..32c62ca 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -112,8 +112,8 @@ def __init__(self, feature): self.annotation_ids = [] # Will be list of Dbxrefs for annotation IDs. 
self.timestamps = [] # Add all timestamps here. # Attributes for the Alliance AuditedObjectDTO. - self.obsolete = False # Never True. All FB annotations are deleted if no longer current. - self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. + self.obsolete = feature.is_obsolete # Will be the FlyBase value here. + self.internal = False # Change to true if not public on FlyBase. self.created_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.updated_by_curie = 'FB:FB_curator' # Use placeholder value since no Person object at FlyBase. self.date_created = None # Not straightforward as half of relevant annotations are derived in the reporting build. @@ -736,8 +736,7 @@ def synthesize_info(self): xref_dict['internal'] = True gene.cross_reference_dtos.append(xref_dict) # Flag internal features. - if gene.feature.is_obsolete is True: - gene.obsolete = True + if gene.obsolete is True: gene.internal = True gene.internal_reasons.append('Obsolete') for attr in self.required_fields: From 82b55a80ec6070838b536ce43a4bcb2e545cafcd Mon Sep 17 00:00:00 2001 From: gildossantos Date: Fri, 27 Jan 2023 12:37:42 -0500 Subject: [PATCH 34/52] code comments on emerging allele attr --- src/AGR_data_retrieval_curation_allele.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 0051594..171d453 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -135,18 +135,18 @@ def __init__(self, feature): self.allele_symbol_dto = None # Will be a single SymbolSlotAnnotationDTO. self.allele_full_name_dto = None # Will be a single FullNameSlotAnnotation. self.allele_synonym_dtos = [] # Will be list of NameSlotAnnotationDTO objects. - self.allele_database_status_dto = None # ToDo - self.allele_functional_impact_dtos = [] # ToDo - self.allele_germline_transmission_status_dto = None # ToDo - self.allele_molecular_mutation_dtos = [] # ToDo - self.allele_mutation_type_dtos = [] # ToDo - self.allele_nomenclature_event_dtos = [] # ToDo - self.allele_note_dtos = [] # ToDo self.allele_secondary_id_dtos = [] # Only 2o FlyBase IDs (redundant with GenomicEntity.secondary_identifiers?) self.in_collection_name = None # Will be library.name. self.inheritance_mode_name = 'unknown' # Change to one of: dominant, semi-dominant, recessive. If many apply, leave as unknown. self.is_extinct = None # Make True if extinction reported; make False is stock exists; leave as None otherwise. self.reference_curies = [] # Will be a list of reference curies (directly or indirectly related). + self.allele_database_status_dto = None # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_functional_impact_dtos = [] # ToDo - must be CV term: e.g., amorph - CV not settled yet? + self.allele_germline_transmission_status_dto = None # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_molecular_mutation_dtos = [] # ToDo - must be CV term: e.g., ? - CV not settled yet? + self.allele_mutation_type_dtos = [] # ToDo - must be SO term curies: e.g., ?. + self.allele_nomenclature_event_dtos = [] # ToDo - must be CV term: e.g., named, renamed - CV not settled yet? + self.allele_note_dtos = [] # ToDo - must have CV term for note_type_name: e.g., ? - CV not settled yet? # Notes associated with the object. 
self.for_alliance_export = True # Change to False if object should be excluded from export. self.internal_reasons = [] # Reasons for marking an object as internal in the export file. From 39ae78eb376fcef448340e098e19f7cf182bf08a Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 10:32:20 -0500 Subject: [PATCH 35/52] fix filtering out of unattributed pub --- src/AGR_data_retrieval_curation_allele.py | 4 ++-- src/AGR_data_retrieval_curation_gene.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 171d453..1d6db31 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -127,7 +127,7 @@ def __init__(self, feature): self.date_created = None # Earliest timestamp. self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. - self.curie = 'FB:{}'.format(feature.uniquename) + self.curie = 'FB:{}'.format(feature.uniquename)f self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. @@ -890,7 +890,7 @@ def synthesize_xrefs(self, allele): def synthesize_references(self, allele): """Process pubs for allele.""" - allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'unattributed'] + allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'FB:unattributed'] return def synthesize_insertions(self, allele): diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 32c62ca..4b4f804 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -609,7 +609,7 @@ def process_feature_synonyms(self, feature): 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope_name': 'exact', - 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'FB:unattributed'], 'internal': syno_internal, 'obsolete': False } From ce56b7d1bb86bff1775daa8a1cd1b325dce88774 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 10:44:03 -0500 Subject: [PATCH 36/52] fix filtering out of unattributed pub --- src/AGR_data_retrieval_curation_allele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 1d6db31..863f5cf 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -1092,7 +1092,7 @@ def synthesize_synonyms(self, feature): 'format_text': sub_sup_sgml_to_html(syno_name[FORMAT_TEXT]), 'display_text': sub_sup_sgml_to_html(syno_name[DISPLAY_TEXT]), 'synonym_scope_name': 'exact', - 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'unattributed'], + 'evidence_curies': [self.all_pubs_dict[i] for i in pub_id_list if self.all_pubs_dict[i] != 'FB:unattributed'], 'internal': syno_internal, 'obsolete': False } From 058e2c921768cb0269612628a87b6dec6875e203 Mon Sep 17 00:00:00 
2001 From: gildossantos Date: Mon, 30 Jan 2023 11:16:35 -0500 Subject: [PATCH 37/52] debug unattr issue --- src/AGR_data_retrieval_curation_allele.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 863f5cf..bff04bb 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -127,7 +127,7 @@ def __init__(self, feature): self.date_created = None # Earliest timestamp. self.date_updated = None # Latest timestamp. # Attributes for the Alliance BiologicalEntity. BiologicalEntity is_a AuditedObject. - self.curie = 'FB:{}'.format(feature.uniquename)f + self.curie = 'FB:{}'.format(feature.uniquename) self.taxon_curie = None # A string representing the NCBI taxon ID. We have no NCBI taxonID for 223 alleles. # Attributes for the Alliance GenomicEntity. GenomicEntity is_a BiologicalEntity. self.cross_reference_dtos = [] # Report only select dbs, using AGR-accepted db_prefix. @@ -232,6 +232,9 @@ def get_all_references(self, session): pub_counter = 0 for pub in results: self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' + # BOB: DEBUG unattr issue + if pub.uniquename == 'unattributed': + log.debug(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( @@ -250,6 +253,10 @@ def get_all_references(self, session): for xref in pmid_xrefs: self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 + # BOB: DEBUG unattr issue: + for pub_id, curie in self.all_pubs_dict.items(): + if 'unattributed' in curie: + log.debug(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 7d991b0872c05c77e96bca38c4d28f961f60894f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 11:32:05 -0500 Subject: [PATCH 38/52] fix debug of unattr issue --- src/AGR_data_retrieval_curation_allele.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index bff04bb..5c81962 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -231,10 +231,12 @@ def get_all_references(self, session): distinct() pub_counter = 0 for pub in results: + if pub.uniquename == 'unattributed': + log.info('BOB: Found unattributed pub.') self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' # BOB: DEBUG unattr issue if pub.uniquename == 'unattributed': - log.debug(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') + log.info(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. 
filters = ( @@ -254,9 +256,10 @@ def get_all_references(self, session): self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 # BOB: DEBUG unattr issue: + log.info('BOB: DEBUB UNATTRIBUTED ISSUE') for pub_id, curie in self.all_pubs_dict.items(): if 'unattributed' in curie: - log.debug(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') + log.info(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 82967680cf09a776cedb9c4b9773a82be3359748 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Mon, 30 Jan 2023 12:39:50 -0500 Subject: [PATCH 39/52] remove debug stuff --- src/AGR_data_retrieval_curation_allele.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index 5c81962..a37bf37 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -231,12 +231,7 @@ def get_all_references(self, session): distinct() pub_counter = 0 for pub in results: - if pub.uniquename == 'unattributed': - log.info('BOB: Found unattributed pub.') self.all_pubs_dict[pub.pub_id] = f'FB:{pub.uniquename}' - # BOB: DEBUG unattr issue - if pub.uniquename == 'unattributed': - log.info(f'BOB: Found unattributed pub: pub_id={pub.pub_id}, dict_value={self.all_pubs_dict[pub.pub_id]}') pub_counter += 1 # Next find PMIDs if available and replace the curie in the all_pubs_dict. filters = ( @@ -255,11 +250,6 @@ def get_all_references(self, session): for xref in pmid_xrefs: self.all_pubs_dict[xref.Pub.pub_id] = f'PMID:{xref.Dbxref.accession}' pmid_counter += 1 - # BOB: DEBUG unattr issue: - log.info('BOB: DEBUB UNATTRIBUTED ISSUE') - for pub_id, curie in self.all_pubs_dict.items(): - if 'unattributed' in curie: - log.info(f'BOB: Found unattributed pub: pub_id={pub_id}, dict_value={curie}') log.info(f'Found {pmid_counter} PMID IDs for {pub_counter} current FB publications.') return From 8abf70787e924c4a375b11efa108bc950ccd02ee Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 31 Jan 2023 10:45:29 -0500 Subject: [PATCH 40/52] unique list of fb refs for alleles --- src/AGR_data_retrieval_curation_allele.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AGR_data_retrieval_curation_allele.py b/src/AGR_data_retrieval_curation_allele.py index a37bf37..92c993c 100644 --- a/src/AGR_data_retrieval_curation_allele.py +++ b/src/AGR_data_retrieval_curation_allele.py @@ -890,6 +890,7 @@ def synthesize_xrefs(self, allele): def synthesize_references(self, allele): """Process pubs for allele.""" + allele.fb_references = list(set(allele.fb_references)) allele.reference_curies = [self.all_pubs_dict[i] for i in allele.fb_references if self.all_pubs_dict[i] != 'FB:unattributed'] return From ad2c5d12d5a95826f5e1c2852d716450882c9907 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:27:16 -0500 Subject: [PATCH 41/52] look for redundant dis annos --- src/AGR_data_retrieval_curation_disease.py | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index ec4cc23..62630d9 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -96,6 +96,7 @@ def __init__(self, feature_cvterm, provenance_prop): self.timestamps = [] # Will be a list of audit_chado timestamp lists. 
# Derived attributes. self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. + self.agr_uniq_key = None # Will be unique key based on Alliance defining features. # Attributes for the Alliance AuditedObjectDTO. self.obsolete = False # Never True. All FB annotations are deleted if no longer current. self.internal = False # Will be internal if annotation should not be exported to Alliance for some reason. @@ -149,6 +150,7 @@ class DAFMaker(object): def __init__(self): """Create the DAFMaker object.""" self.dis_anno_dict = {} # A dict of DiseaseAnnotations keyed by feature_cvterm_id plus rank (e.g., 1234567_0). + self.uniq_dis_dict = {} # A dict of DiseaseAnnotations keyed by AGR defining features. self.total_anno_cnt = 0 # Count of all disease annotations found in starting query. self.export_anno_cnt = 0 # Count of all disease annotations exported to file. self.internal_anno_cnt = 0 # Count of all disease annotations marked as internal=True in export file. @@ -466,6 +468,9 @@ def synthesize_info(self, session): dis_anno.modifier_problem = True # Now check for conditions that prevent export. self.evaluate_annot(dis_anno) + # Generate the unique AGR key based on AGR defining features for FB disease annotations. + self.derive_agr_uniq_key(dis_anno) + self.group_dis_annos() log.info('Done synthesizing disease annotation info.') return @@ -487,6 +492,32 @@ def evaluate_annot(self, dis_anno): log.debug(msg) return + def derive_agr_uniq_key(self, dis_anno): + """Derive the AGR unique key based on defining features of FB disease annotations.""" + dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' + dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' + evi_codes = list(set(dis_anno.evidence_code_curies)).sorted + evi_code_str = '|'.join(evi_codes) + dis_anno.agr_uniq_key += f'||{evi_code_str}' + dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' + dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_relation_name}' + log.debug(f'{dis_anno} HAS AGR_UNIQ_KEY: {dis_anno.agr_uniq_key}') + return + + def group_dis_annos(self): + """Group redundant disease annotations.""" + log.info('Group redundant disease annotations.') + input_counter = 0 + for dis_anno in self.dis_anno_dict.values(): + input_counter += 1 + try: + self.uniq_dis_dict[dis_anno.agr_uniq_key].append(dis_anno) + except KeyError: + self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] + grouped_counter = len(self.uniq_dis_dict.keys()) + log.info(f'Found {grouped_counter} unique keys for {input_counter} disease annotations.') + return + def generate_export_file(self): """Process disease annotations and print to a LinkML-compliant JSON file.""" log.info('Generating output JSON file of disease annotations.') From 209f3e2bfba008bbb1748e21ffe62f65905e5082 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:33:09 -0500 Subject: [PATCH 42/52] fix typo --- src/AGR_data_retrieval_curation_disease.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 62630d9..23f7569 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -496,7 +496,8 @@ def derive_agr_uniq_key(self, dis_anno): """Derive the AGR unique key based on defining features of FB disease annotations.""" dis_anno.agr_uniq_key = 
f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' - evi_codes = list(set(dis_anno.evidence_code_curies)).sorted + evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) + log.debug(f'BOB: {evi_codes}') evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' From 6c5497d5e3da53b89a7f19e5622a9a3b1e406f19 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 12:42:36 -0500 Subject: [PATCH 43/52] restrict uniq key counts to exportable dis_annos, print out redundant ones for assessment --- src/AGR_data_retrieval_curation_disease.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 23f7569..d953777 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -510,13 +510,20 @@ def group_dis_annos(self): log.info('Group redundant disease annotations.') input_counter = 0 for dis_anno in self.dis_anno_dict.values(): + if dis_anno.for_alliance_export is False: + continue input_counter += 1 try: self.uniq_dis_dict[dis_anno.agr_uniq_key].append(dis_anno) except KeyError: self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] grouped_counter = len(self.uniq_dis_dict.keys()) - log.info(f'Found {grouped_counter} unique keys for {input_counter} disease annotations.') + log.info(f'Found {grouped_counter} unique keys for {input_counter} exportable disease annotations.') + for uniq_key, anno_list in self.uniq_dis_dict.items(): + if len(anno_list) > 1: + log.warning(f'REDUNDANT: {uniq_key}:') + for i in anno_list: + log.warning(f'\t{i}') return def generate_export_file(self): From f0edbea53005a0311ba7d7413b18f9fc1b5124d9 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:02:14 -0500 Subject: [PATCH 44/52] tidy warnings, report/count non-redundant dis_anno requiring modifier ID update --- src/AGR_data_retrieval_curation_disease.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index d953777..8fac8c3 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -95,6 +95,7 @@ def __init__(self, feature_cvterm, provenance_prop): self.qualifier = None # Will be the "qualifier" FeatureCvtermprop. self.timestamps = [] # Will be a list of audit_chado timestamp lists. # Derived attributes. + self.modifier_id_was_updated = False # Change to true if modifier ID in evidence text was updated. self.modifier_problem = False # Change to true if there's a problem finding the modifier allele. self.agr_uniq_key = None # Will be unique key based on Alliance defining features. # Attributes for the Alliance AuditedObjectDTO. 
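For reference, the de-duplication built up in patches 41 through 45 reduces to two steps: derive a composite key from the annotation's defining fields (allele, DO term, relation, negation, reference, sorted evidence codes, and the modifier fields), then bucket annotations by that key so that only the first member of each bucket is exported while the rest are logged as redundant. A minimal standalone sketch of that idea, using plain dicts instead of the script's DiseaseAnnotation objects and invented sample values:

# Standalone sketch only; field names mirror the patch, the data is made up.
def derive_key(anno):
    evi = '|'.join(sorted(set(anno['evidence_code_curies'])))
    parts = [anno['allele_curie'], anno['do_term_curie'],
             anno['disease_relation_name'], str(anno['negated']),
             anno['reference_curie'], evi,
             str(anno['disease_genetic_modifier_curie']),
             str(anno['disease_genetic_modifier_relation_name'])]
    return '||'.join(parts)

def group_annotations(annotations):
    uniq = {}
    for anno in annotations:
        uniq.setdefault(derive_key(anno), []).append(anno)
    return uniq

annos = [
    {'allele_curie': 'FB:FBal0000001', 'do_term_curie': 'DOID:14330',
     'disease_relation_name': 'is_implicated_in', 'negated': False,
     'reference_curie': 'PMID:123456', 'evidence_code_curies': ['ECO:0000315'],
     'disease_genetic_modifier_curie': None,
     'disease_genetic_modifier_relation_name': None},
]
for key, group in group_annotations(annos).items():
    exported = group[0]  # only the first annotation per key is exported
    if len(group) > 1:
        print(f'REDUNDANT: {key} ({len(group)} annotations)')

Sorting the de-duplicated evidence codes before joining keeps the key stable regardless of the order in which chado returns them, which is what the patch 42 change to sorted(list(set(...))) guarantees.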
@@ -416,13 +417,13 @@ def get_current_id_for_allele(self, session, old_uniquename): distinct() curr_uniquenames = [i.uniquename for i in curr_alleles] if len(curr_uniquenames) == 1: - log.warning('For obsolete {}, found one current allele: {}'.format(old_uniquename, curr_uniquenames[0])) + log.debug('For obsolete {}, found one current allele: {}'.format(old_uniquename, curr_uniquenames[0])) curr_allele_id = curr_uniquenames[0] elif len(curr_uniquenames) > 1: - log.warning('For obsolete {}, found many current alleles: {}'.format(old_uniquename, curr_uniquenames)) + log.debug('For obsolete {}, found many current alleles: {}'.format(old_uniquename, curr_uniquenames)) curr_allele_id = None else: - log.warning('For obsolete {}, found no current alleles.'.format(old_uniquename)) + log.debug('For obsolete {}, found no current alleles.'.format(old_uniquename)) curr_allele_id = None return curr_allele_id @@ -464,6 +465,7 @@ def synthesize_info(self, session): curr_allele_id = self.get_current_id_for_allele(session, allele_id) if curr_allele_id: dis_anno.disease_genetic_modifier_curie = 'FB:{}'.format(curr_allele_id) + dis_anno.modifier_id_was_updated = True else: dis_anno.modifier_problem = True # Now check for conditions that prevent export. @@ -497,7 +499,6 @@ def derive_agr_uniq_key(self, dis_anno): dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) - log.debug(f'BOB: {evi_codes}') evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' dis_anno.agr_uniq_key += f'||{dis_anno.disease_genetic_modifier_curie}' @@ -519,11 +520,18 @@ def group_dis_annos(self): self.uniq_dis_dict[dis_anno.agr_uniq_key] = [dis_anno] grouped_counter = len(self.uniq_dis_dict.keys()) log.info(f'Found {grouped_counter} unique keys for {input_counter} exportable disease annotations.') + # Report redundant disease annotations in detail. + # Also report non-redundant disease annotations that required modifier ID update. 
+ update_allele_id_counter = 0 for uniq_key, anno_list in self.uniq_dis_dict.items(): if len(anno_list) > 1: - log.warning(f'REDUNDANT: {uniq_key}:') + log.warning(f'REDUNDANT: AGR_UNIQ_KEY: {uniq_key}') for i in anno_list: - log.warning(f'\t{i}') + log.warning(f'REDUNDANT:\t{i}') + elif anno_list[0].modifier_id_was_updated is True: + log.warning(f'UPDATED DIS_ANNO: {anno_list[0]}') + update_allele_id_counter +=1 + log.info(f'Found {update_allele_id_counter} non-redundant exportable disease annotations that required modifier ID update.') return def generate_export_file(self): From 65cee0802fa269306c43fb88ca923dbb87af50e5 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:27:21 -0500 Subject: [PATCH 45/52] add negated to agr_uniq_key, do not export dis annos requiring modifier ID update --- src/AGR_data_retrieval_curation_disease.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 8fac8c3..f0b604f 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -485,7 +485,8 @@ def evaluate_annot(self, dis_anno): 'Only "model of|DOES NOT model" is exportable', ' with FLYBASE' in dis_anno.evidence_code.value: 'Only disease annotations modeled by a single allele are exportable', - dis_anno.modifier_problem is True: 'Cannot find current feature for disease modifier.' + dis_anno.modifier_problem is True: 'Cannot find current feature for disease modifier.', + dis_anno.modifier_id_was_updated is True: 'Modifier referenced by non-current allele ID.' } for check, msg in export_checks.items(): if check: @@ -497,7 +498,7 @@ def evaluate_annot(self, dis_anno): def derive_agr_uniq_key(self, dis_anno): """Derive the AGR unique key based on defining features of FB disease annotations.""" dis_anno.agr_uniq_key = f'{dis_anno.allele_curie}||{dis_anno.do_term_curie}||{dis_anno.disease_relation_name}' - dis_anno.agr_uniq_key += f'||{dis_anno.reference_curie}' + dis_anno.agr_uniq_key += f'||{dis_anno.negated}||{dis_anno.reference_curie}' evi_codes = sorted(list(set(dis_anno.evidence_code_curies))) evi_code_str = '|'.join(evi_codes) dis_anno.agr_uniq_key += f'||{evi_code_str}' From 0ecfc7cca9ab3f088d86218e1fc0b2667f2f0181 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Wed, 1 Feb 2023 13:43:30 -0500 Subject: [PATCH 46/52] export non-redundant dis_annos --- src/AGR_data_retrieval_curation_disease.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index f0b604f..5ead74f 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -542,7 +542,9 @@ def generate_export_file(self): 'linkml_version': linkml_release, 'disease_allele_ingest_set': [] } - for dis_anno in self.dis_anno_dict.values(): + # For each AGR unique key, just process the 1st disease annotation in the list of redundant FB annotations. + for dis_anno_list in self.uniq_dis_dict.values(): + dis_anno = dis_anno_list[0] if dis_anno.for_alliance_export is False: log.debug('Suppress disease annotation from export: {}. 
Reasons: {}'.format(dis_anno, '; '.join(dis_anno.export_warnings))) continue @@ -561,9 +563,8 @@ def generate_export_file(self): outfile.close() log.info('Done writing data to output file.') total_public_anno_cnt = self.export_anno_cnt - self.internal_anno_cnt - log.info('Exported {} of {} disease annotations ({} are public).'. - format(self.export_anno_cnt, self.total_anno_cnt, total_public_anno_cnt)) - log.info('Suppressed {} disease annotations from export.'.format(self.total_anno_cnt - self.export_anno_cnt)) + log.info(f'Exported {self.export_anno_cnt} of {self.total_anno_cnt} disease annotations ({total_public_anno_cnt} are public).') + log.info(f'Suppressed {self.total_anno_cnt - self.export_anno_cnt} disease annotations from export.') return From fd308b6861598a0678437cd01c36d785c5b89f12 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 10:48:26 -0500 Subject: [PATCH 47/52] update req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d07faf..0fa5691 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -# sqlalchemy +sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From a4039bb5b47473a4b6388b67d92dd7a81424a47f Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 11:07:16 -0500 Subject: [PATCH 48/52] update sqlalchemy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0fa5691..9b218dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -sqlalchemy +sqlalchemy>=2.0 svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 9eb33a88cf227ccb187ba1a31eaf988a8ba8b3d8 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 11:15:02 -0500 Subject: [PATCH 49/52] try updating h-utils --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9b218dc..dc869bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ nested_dict==1.61 psycopg2>=2.6.2 strict_rfc3339==0.7 tqdm>=4.29.0 -git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils +git+https://github.com/FlyBase/harvdev-utils.git@test#egg=harvdev_utils # Below are additional requirements for harvdev-utils itself (may not be automatically installed by cmd above). # bioservices # flake8>=3.5.0 From 1f48fa4c14cb62f1fee49e311ff8ed1e360438c0 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 12:53:17 -0500 Subject: [PATCH 50/52] revert to working form --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index dc869bc..8d07faf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,14 +4,14 @@ nested_dict==1.61 psycopg2>=2.6.2 strict_rfc3339==0.7 tqdm>=4.29.0 -git+https://github.com/FlyBase/harvdev-utils.git@test#egg=harvdev_utils +git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # Below are additional requirements for harvdev-utils itself (may not be automatically installed by cmd above). 
# bioservices # flake8>=3.5.0 # pubchempy retry # requests>=2.21.0 -sqlalchemy>=2.0 +# sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 4ec8c9eb1dabc624c2ea74898fd13f15685256de Mon Sep 17 00:00:00 2001 From: gildossantos Date: Thu, 2 Feb 2023 13:24:29 -0500 Subject: [PATCH 51/52] try no-cache pip install in docker build --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index b279fea..f56a062 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,6 @@ ADD git_agr_curation_schema/util/validate_agr_schema.py /src/validat ADD git_agr_curation_schema/jsonschema/allianceModel.schema.json jsonschema/allianceModel.schema.json # Install required modules. -RUN pip3 install -r requirements.txt +RUN pip3 install -r requirements.txt --no-cache-dir ENTRYPOINT [ "/bin/bash" ] diff --git a/requirements.txt b/requirements.txt index 8d07faf..0fa5691 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ git+https://github.com/FlyBase/harvdev-utils.git@master#egg=harvdev_utils # pubchempy retry # requests>=2.21.0 -# sqlalchemy +sqlalchemy svn # urllib3>=1.24.1 # virtualenv>=16.2.0 From 2eb2668f49250b19738b492105264d165e37cc48 Mon Sep 17 00:00:00 2001 From: gildossantos Date: Tue, 7 Feb 2023 09:41:27 -0500 Subject: [PATCH 52/52] flake8 --- src/AGR_data_retrieval_curation_disease.py | 2 +- src/AGR_data_retrieval_curation_gene.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AGR_data_retrieval_curation_disease.py b/src/AGR_data_retrieval_curation_disease.py index 5ead74f..d51a41d 100644 --- a/src/AGR_data_retrieval_curation_disease.py +++ b/src/AGR_data_retrieval_curation_disease.py @@ -531,7 +531,7 @@ def group_dis_annos(self): log.warning(f'REDUNDANT:\t{i}') elif anno_list[0].modifier_id_was_updated is True: log.warning(f'UPDATED DIS_ANNO: {anno_list[0]}') - update_allele_id_counter +=1 + update_allele_id_counter += 1 log.info(f'Found {update_allele_id_counter} non-redundant exportable disease annotations that required modifier ID update.') return diff --git a/src/AGR_data_retrieval_curation_gene.py b/src/AGR_data_retrieval_curation_gene.py index 4b4f804..5c26fbc 100644 --- a/src/AGR_data_retrieval_curation_gene.py +++ b/src/AGR_data_retrieval_curation_gene.py @@ -641,12 +641,12 @@ def process_feature_synonyms(self, feature): placeholder_symbol_dto['format_text'] = feature.feature.name placeholder_symbol_dto['display_text'] = feature.feature.name feature.gene_symbol_dto = placeholder_symbol_dto - # Full name is required. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. + # TEMPORARY: Full name is required for now. If none, fill it in. Could be because FB has none, or, it's the same as the symbol. if feature.gene_full_name_dto is None: placeholder_full_name_dto = feature.gene_symbol_dto.copy() placeholder_full_name_dto['name_type_name'] = 'full_name' feature.gene_full_name_dto = placeholder_full_name_dto - # Systematic name is required. If none, fill it in. Could be because gene is unannotated, or annotation ID has never been used in pubs. + # TEMPORARY: Systematic name is required for now. If none, fill it in. Could be because gene is unannotated, or annotation ID never used in pubs. if feature.gene_systematic_name_dto is None: placeholder_systematic_name_dto = feature.gene_symbol_dto.copy() placeholder_systematic_name_dto['name_type_name'] = 'systematic_name'
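The closing hunk documents a fallback these scripts share: when a name DTO that the schema still requires is missing, a copy of the symbol DTO is reused with only the name type changed. A simplified sketch of that pattern, using a bare stand-in object rather than the script's gene class and an invented sample symbol:

# Simplified stand-in; attribute and key names mirror the patch, the class
# and sample values are hypothetical.
class GeneStub:
    def __init__(self, symbol_dto):
        self.gene_symbol_dto = symbol_dto
        self.gene_full_name_dto = None
        self.gene_systematic_name_dto = None

def fill_placeholder_names(gene):
    # Full name and systematic name are still required for now, so fill them
    # from the symbol DTO when FlyBase has nothing better to offer.
    if gene.gene_full_name_dto is None:
        placeholder = gene.gene_symbol_dto.copy()
        placeholder['name_type_name'] = 'full_name'
        gene.gene_full_name_dto = placeholder
    if gene.gene_systematic_name_dto is None:
        placeholder = gene.gene_symbol_dto.copy()
        placeholder['name_type_name'] = 'systematic_name'
        gene.gene_systematic_name_dto = placeholder

gene = GeneStub({'name_type_name': 'nomenclature_symbol', 'display_text': 'wg',
                 'format_text': 'wg', 'synonym_scope_name': 'exact',
                 'evidence_curies': [], 'internal': False, 'obsolete': False})
fill_placeholder_names(gene)

The "TEMPORARY" comments in the final hunk mark these placeholders as stopgaps while the Alliance schema still requires the fields, so the fallback can be removed once the requirement is relaxed.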