diff --git a/Load/add_dbxref_to_cvterms.py b/Load/add_dbxref_to_cvterms.py
index 238598a..723b1a9 100644
--- a/Load/add_dbxref_to_cvterms.py
+++ b/Load/add_dbxref_to_cvterms.py
@@ -3,9 +3,9 @@ def add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id):
     # NOTE: these may not be correct but can be used to create some for testing.
     # The FBxx may not match the actual production ones
     cv_to_FB = {
-        "FlyBase anatomy CV": "FBbt",
-        "FlyBase development CV": "FBdv",
-        "FlyBase miscellaneous CV": "FBcv"
+        #"FlyBase anatomy CV": "FBbt",
+        "FlyBase development CV": "FBdv"
+        # "FlyBase miscellaneous CV": "FBcv"
     }
     dbx_sql = """ INSERT INTO dbxref (db_id, accession) VALUES (%s, %s) RETURNING dbxref_id """
     add_to_cvterm = """ UPDATE cvterm SET dbxref_id = %s where cvterm_id = %s """
diff --git a/add-test_data.py b/add-test_data.py
index bfd1fd2..5449635 100755
--- a/add-test_data.py
+++ b/add-test_data.py
@@ -73,6 +73,7 @@ def yaml_parse_and_dispatch():
     dispatch_dictionary = {
         'db_dbxref.yaml': load_db_dbxref,
         'cv_cvterm.yaml': load_cv_cvterm,
+        'cvterm_dbxref.yaml': load_cvterm_dbxref,
         'pub_author_pubprop.yaml': load_pub_author_pubprop
     }
 
@@ -81,6 +82,7 @@ def yaml_parse_and_dispatch():
     # Need to load in a specific order due to CV term reliance.
     files_to_load = [
         'db_dbxref.yaml',
+        'cvterm_dbxref.yaml',
         'cv_cvterm.yaml',
         'pub_author_pubprop.yaml'
     ]
@@ -136,11 +138,13 @@ def load_cv_cvterm(parsed_yaml):
                         'transcriptome': 3034, 'umbrella project': 3030}
 
     for cv_name in (cv_cvterm.keys()):
-        cursor.execute(db_sql, (cv_name,))
-        db_id[cv_name] = cursor.fetchone()[0]
+        if cv_name not in db_id:
+            cursor.execute(db_sql, (cv_name,))
+            db_id[cv_name] = cursor.fetchone()[0]
 
-        cursor.execute(cv_sql, (cv_name,))
-        cv_id[cv_name] = cursor.fetchone()[0]
+        if cv_name not in cv_id:
+            cursor.execute(cv_sql, (cv_name,))
+            cv_id[cv_name] = cursor.fetchone()[0]
         print("adding cv {} [{}] and db [{}]".format(cv_name, cv_id[cv_name], db_id[cv_name]))
 
         # for specific cvterm we want to unique numbers as dbxrefs.
@@ -171,6 +175,60 @@ def load_cv_cvterm(parsed_yaml):
     add_cvterm_namespace(cv_cvterm_id)
 
 
+def load_cvterm_dbxref(parsed_yaml):
+    """Load cvterms and their dbxrefs from the parsed cvterm_dbxref.yaml.
+
+    Args:
+        parsed_yaml: mapping of cv name to a list of rows, each row being
+            [cvterm.name, dbxref.accession, db.name].
+
+    For every cv, cv_sql is executed and the returned id is stored in
+    cv_id.  For every row, db_sql is executed only when the db name is not
+    yet in db_id; dbxref_sql and cvterm_sql are then executed and the
+    returned ids stored in dbxref_id, cvterm_id and cv_cvterm_id.
+    """
+    # cvterm.name, dbxref.accession, db.name
+    cvterm_name_idx = 0
+    accession_idx = 1
+    db_name_idx = 2
+
+    # The cv name is bound as a query parameter instead of being formatted
+    # into the statement, so a name containing a quote cannot break the SQL.
+    check_sql = """
+        SELECT cv.name, cvterm.name, dbxref.accession, db.name
+          FROM cv, cvterm, dbxref, db
+          WHERE cv.cv_id = cvterm.cv_id AND
+                cvterm.dbxref_id = dbxref.dbxref_id AND
+                dbxref.db_id = db.db_id AND
+                cv.name = %s"""
+
+    for cv_name, rows in parsed_yaml.items():
+        print(f"BOB: {cv_name} populate.")
+        cursor.execute(cv_sql, (cv_name,))
+        cv_id[cv_name] = cursor.fetchone()[0]
+        cv_cvterm_id[cv_name] = {}
+        print("adding cvterm and dbxrefs for cv {} [{}]".format(cv_name, cv_id[cv_name]))
+        for row in rows:
+            db_name = row[db_name_idx]
+            cvterm_name = row[cvterm_name_idx]
+            dbxref_name = row[accession_idx]
+            if db_name not in db_id:
+                cursor.execute(db_sql, (db_name,))
+                db_id[db_name] = cursor.fetchone()[0]
+            cursor.execute(dbxref_sql, (db_id[db_name], dbxref_name))
+            dbxref_id[dbxref_name] = cursor.fetchone()[0]
+            cursor.execute(cvterm_sql, (dbxref_id[dbxref_name], cv_id[cv_name], cvterm_name))
+            cvterm_id[cvterm_name] = cursor.fetchone()[0]
+            cv_cvterm_id[cv_name][cvterm_name] = cvterm_id[cvterm_name]
+            print(f"\tBOB: cvterm {cvterm_name} and dbxref {dbxref_name}")
+
+        # NOTE(review): per-row trace above / per-cv check below are assumed placements - confirm.
+        print(check_sql)
+        cursor.execute(check_sql, (cv_name,))
+        for checked in cursor.fetchall():
+            print(checked)
+
+
 def add_cvterm_namespace(cv_cvterm_id):
     """Add namespace cvterm props.
 
@@ -216,6 +274,7 @@ def add_cvterm_namespace(cv_cvterm_id):
     for value in namespaces.keys():
         rank = 0
         for item in namespaces[value]:
+            print(f"BOB: looking up cv {item[0]} and cvterm {item[1]}")
             cvterm_id = cv_cvterm_id[item[0]][item[1]]
             type_id = cv_cvterm_id[item[2]][item[3]]
             # add cvtermprop
@@ -567,7 +626,19 @@ def load_pub_author_pubprop(parsed_yaml):
     # transposable_element_insertion_site (teis)
     create_teis(cursor, organism_id, feature_id, cvterm_id, dbxref_id, db_id, pub_id)
 
-add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id)
+# add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id)
+
+check_sql = """
+SELECT cv.name, cvterm.name, dbxref.accession, db.name
+  FROM cv, cvterm, dbxref, db
+  WHERE cv.cv_id = cvterm.cv_id AND
+        cvterm.dbxref_id = dbxref.dbxref_id AND
+        dbxref.db_id = db.db_id AND
+        cv.name = 'FlyBase miscellaneous CV'"""
+print(check_sql)
+cursor.execute(check_sql)
+for row in cursor.fetchall():
+    print(row)
 
 conn.commit()
 conn.close()
diff --git a/data/cv_cvterm.yaml b/data/cv_cvterm.yaml
index ecb4a60..125dc35 100644
--- a/data/cv_cvterm.yaml
+++ b/data/cv_cvterm.yaml
@@ -3,22 +3,22 @@
 # order dependent cv/cvterms. i.e. accession are specific and numbered.
 #######################################################################
 # Order is important only add to end of SO list. Tests rely on this!!
-SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRNA_gene', - 'regulatory_region', 'chromosome_structure_variation', 'chromosomal_inversion', - 'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element', - 'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element', - 'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA', - 'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence', 'RNA', - 'missense_variant', 'wild_type', 'transposable_element_flanking_region'] +#SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRNA_gene', +# 'regulatory_region', 'chromosome_structure_variation', 'chromosomal_inversion', +# 'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element', +# 'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element', +# 'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA', +# 'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence', 'RNA', +# 'missense_variant', 'wild_type', 'transposable_element_flanking_region'] molecular_function: ['mRNA binding'] cellular_component: ['nucleolus', 'something' ,'extracellular space', 'endoplasmic reticulum', 'mitochondrial crista', 'mitochondrion'] biological_process: ['activation of immune response', 'defense response to other organism', 'rRNA processing'] -FlyBase anatomy CV: ['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', 'dissociated larval fat cell', - 'embryonic/larval hemolymph', - 'anatomy 1', 'anatomy 2', 'anatomy 3', 'mesoderm', - 'indirect flight muscle', 'macrochaeta', 'scutellar bristle', - 'increased number'] +#FlyBase anatomy CV: 
['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', 'dissociated larval fat cell', +# 'embryonic/larval hemolymph', +# 'anatomy 1', 'anatomy 2', 'anatomy 3', 'mesoderm', +# 'indirect flight muscle', 'macrochaeta', 'scutellar bristle', +# 'increased number'] ####### End of order matters cv/cvterms cell_line_cvtermprop type: ['basis'] @@ -45,21 +45,21 @@ feature_relationshipprop type: ['fly_disease-implication_change', 'comment', 're FlyBase: ['FlyBase analysis'] FlyBase_internal: ['pubprop type:curated_by'] -FlyBase development CV: ['late embryonic stage', 'embryonic stage', 'adult stage', 'development 1', 'development 2', 'development 3', - 'wandering third instar larval stage', 'laval stage'] -FlyBase miscellaneous CV: [ - 'CRISPR/Cas9', 'amorphic allele - molecular evidence', 'assay', 'biosample', 'biotic stimulus study', 'cell isolation', 'chemical entity', - 'conditional', 'colocalizes_with', 'comment', - 'contributes_to', 'disease implicated variant', 'evidence_code', - 'environ1', 'environ2', 'environ3','environ4', 'environ5', 'faint', 'functional group', 'female', - 'in vitro construct', - 'inferred from direct assay', 'inferred from mutant phenotype', 'isolated cells', 'natural population', - 'male', 'misc 1', 'misc 2', 'misc 3', 'multi-individual sample', 'partially', - 'pheno1', 'pheno2', 'pheno3', 'pheno4', 'pheno5', - 'photoactivatable fluorescent protein', 'protein detection tool', 'project', - 'qualifier', 'reagent collection', 'RNA detection tool', 'single balancer', 'spontaneous', - 'split system combination', 'suppressible', - 'transcriptome', 'umbrella project', 'unspecified'] +#FlyBase development CV: ['late embryonic stage', 'embryonic stage', 'adult stage', 'development 1', 'development 2', 'development 3', +# 'wandering third instar larval stage', 'laval stage'] +#FlyBase miscellaneous CV: [ +# 'CRISPR/Cas9', 'amorphic allele - molecular evidence', 'assay', 'biosample', 'biotic stimulus study', 'cell isolation', 'chemical 
entity', +# 'conditional', 'colocalizes_with', 'comment', +# 'contributes_to', 'disease implicated variant', 'evidence_code', +# 'environ1', 'environ2', 'environ3','environ4', 'environ5', 'faint', 'functional group', 'female', +# 'in vitro construct', +# 'inferred from direct assay', 'inferred from mutant phenotype', 'isolated cells', 'natural population', +# 'male', 'misc 1', 'misc 2', 'misc 3', 'multi-individual sample', 'partially', +# 'pheno1', 'pheno2', 'pheno3', 'pheno4', 'pheno5', +# 'photoactivatable fluorescent protein', 'protein detection tool', 'project', +# 'qualifier', 'reagent collection', 'RNA detection tool', 'single balancer', 'spontaneous', +# 'split system combination', 'suppressible', +# 'transcriptome', 'umbrella project', 'unspecified'] GenBank feature qualifier: [ 'comment', 'linked_to', 'bound_moiety', 'na_change', 'pr_change', diff --git a/data/cvterm_dbxref.yaml b/data/cvterm_dbxref.yaml new file mode 100644 index 0000000..286c211 --- /dev/null +++ b/data/cvterm_dbxref.yaml @@ -0,0 +1,110 @@ +# cvterm.name, dbxref.accession, db.name +SO: [ + [ 'DNA', '0000352', 'SO'], + [ 'RNA', '0000356', 'SO'], + [ 'TSS', '0000315', 'SO'], + [ 'allele', '0001023', 'SO'], + [ 'cDNA_clone', '0000317', 'SO'], + [ 'chromosomal_inversion', '1000030', 'SO'], + [ 'chromosome', '0000340', 'SO'], + [ 'chromosome_arm', '0000105', 'SO'], + [ 'chromosome_band', '0000341', 'SO'], + [ 'chromosome_breakpoint', '0001021', 'SO'], + [ 'chromosome_structure_variation', '1000183', 'SO'], + [ 'cloned_region', '0000785', 'SO'], + [ 'engineered_foreign_gene', '0000281', 'SO'], + [ 'engineered_plasmid', '0000637', 'SO'], + [ 'engineered_region', '0000804', 'SO'], + [ 'gene', '0000704', 'SO'], + [ 'gene_group', '0005855', 'SO'], + [ 'golden_path', '0000688', 'SO'], + [ 'insertion_site', '0000366', 'SO'], + [ 'mRNA', '0000234', 'SO'], + [ 'missense_variant', '0001583', 'SO'], + [ 'natural_transposable_element', '0000797', 'SO'], + [ 'ncRNA_gene', '0001263', 'SO'], + [ 'oligo', 
'0000696', 'SO'], + [ 'point_mutation', '1000008', 'SO'], + [ 'polypeptide', '0000104', 'SO'], + [ 'regulatory_region', '0005836', 'SO'], + [ 'rescue_region', '0000411', 'SO'], + [ 'sgRNA', '0001998', 'SO'], + [ 'synthetic_sequence', '0000351', 'SO'], + [ 'transgenic_transposable_element', '0000796', 'SO'], + [ 'transposable_element', '0000101', 'SO'], + [ 'transposable_element_flanking_region', '0000364', 'SO'], + [ 'transposable_element_insertion_site', '0000368', 'SO'] + ] +FlyBase miscellaneous CV: [ + ['CRISPR/Cas9', '0003008', 'FBcv'], + ['RNA detection tool', '0005003', 'FBcv'], + ['amorphic allele - molecular evidence', '0000689', 'FBcv'], + ['assay', '0003025', 'FBcv'], + ['biosample', '0003024', 'FBcv'], + ['biotic stimulus study', '0003134', 'FBcv'], + ['cell isolation', '0003170', 'FBcv'], + ['chemical entity', 'chemical entity', 'FlyBase'], + ['colocalizes_with', 'colocalizes_with', 'FlyBase'], + ['comment', '000100', 'FlyBase'], + ['conditional', '0000309', 'FBcv'], + ['contributes_to', 'contributes_to', 'FlyBase'], + ['disease implicated variant', 'div', 'FlyBase'], + [ 'environ1', '0200001', 'FBcv' ], + [ 'environ2', '0200002', 'FBcv' ], + ['evidence_code', 'evidence_code', 'FlyBase'], + ['faint', '0000167', 'FBcv'], + ['female', '0000334', 'FBcv'], + ['functional group', '0003014', 'FBcv'], + ['in vitro construct', '0000455', 'FBcv'], + ['inferred from direct assay', 'inferred from direct assay', 'FlyBase'], + ['inferred from mutant phenotype', 'inferred from mutant phenotype', 'FlyBase'], + ['isolated cells', '0003047', 'FBcv'], + ['madeupstuff', '007', 'FBcv'], + ['male', '0000333', 'FBcv'], + ['multi-individual sample', '0003141', 'FBcv'], + ['natural population', '0000465', 'FBcv'], + ['partially', '0000340', 'FBcv'], + ['pheno1', '0100001', 'FBcv'], + ['pheno2', '0100002', 'FBcv'], + ['pheno3', '0100003', 'FBcv'], + ['pheno4', '0100004', 'FBcv'], + ['pheno5', '0100005', 'FBcv'], + ['photoactivatable fluorescent protein', '0005017', 'FBcv'], + 
['project', '0003023', 'FBcv'], + ['protein detection tool', '0005004', 'FBcv'], + ['qualifier', '0000005', 'FBcv'], + ['reagent collection', '0003027', 'FBcv'], + ['single balancer', '0000155', 'FBcv'], + ['split system combination', '0009026', 'FBcv'], + ['spontaneous', '0000469', 'FBcv'], + ['suppressible', '0000622', 'FBcv'], + ['transcriptome', '0003034', 'FBcv'], + ['umbrella project', '0003030', 'FBcv'], + ['unspecified', 'unspecified', 'FlyBase'] + ] + +FlyBase anatomy CV: [ + ['anatomy 1', '00000001', 'FBbt'], + ['anatomy 2', '00000002', 'FBbt'], + ['anatomy 3', '00000003', 'FBbt'], + ['dissociated larval fat cell', '00049951', 'FBbt'], + ['dopaminergic PAM neuron 1', '00111015', 'FBbt'], + ['dopaminergic PAM neuron 5', '00111017', 'FBbt'], + ['embryo', '00000052', 'FBbt'], + ['embryonic/larval hemolymph', '00001683', 'FBbt'], + ['increased number', '00000004', 'FBbt'], + ['indirect flight muscle', '00058882', 'FBbt'], + ['macrochaeta', '00005179', 'FBbt'], + ['mesoderm', '00000126', 'FBbt'], + ['scutellar bristle', '00004312', 'FBbt'] + ] +FlyBase development CV: [ + ['adult stage', '00005369', 'FBdv'], + ['development 1', '00000001', 'FBdv'], + ['development 2', '00000002', 'FBdv'], + ['development 3', '00000003', 'FBdv'], + ['embryonic stage', '00000004', 'FBdv'], + ['late embryonic stage', '00005333', 'FBdv'], + ['laval stage' , '00000005', 'FBdv'], + ['wandering third instar larval stage', '00005341', 'FBdv'] + ] \ No newline at end of file