Skip to content

Commit

Permalink
New format for controlled dbxref cvterms
Browse files Browse the repository at this point in the history
  • Loading branch information
ianlongden committed Oct 4, 2024
1 parent f55c4f3 commit ce2d035
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 35 deletions.
6 changes: 3 additions & 3 deletions Load/add_dbxref_to_cvterms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ def add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id):
# NOTE: these may not be correct but can be used to create some for testing.
# The FBxx may not match the actual production ones
cv_to_FB = {
"FlyBase anatomy CV": "FBbt",
"FlyBase development CV": "FBdv",
"FlyBase miscellaneous CV": "FBcv"
#"FlyBase anatomy CV": "FBbt",
"FlyBase development CV": "FBdv"
# "FlyBase miscellaneous CV": "FBcv"
}
dbx_sql = """ INSERT INTO dbxref (db_id, accession) VALUES (%s, %s) RETURNING dbxref_id """
add_to_cvterm = """ UPDATE cvterm SET dbxref_id = %s where cvterm_id = %s """
Expand Down
68 changes: 63 additions & 5 deletions add-test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def yaml_parse_and_dispatch():
dispatch_dictionary = {
'db_dbxref.yaml': load_db_dbxref,
'cv_cvterm.yaml': load_cv_cvterm,
'cvterm_dbxref.yaml': load_cvterm_dbxref,
'pub_author_pubprop.yaml': load_pub_author_pubprop
}

Expand All @@ -81,6 +82,7 @@ def yaml_parse_and_dispatch():
# Need to load in a specific order due to CV term reliance.
files_to_load = [
'db_dbxref.yaml',
'cvterm_dbxref.yaml',
'cv_cvterm.yaml',
'pub_author_pubprop.yaml'
]
Expand Down Expand Up @@ -136,11 +138,13 @@ def load_cv_cvterm(parsed_yaml):
'transcriptome': 3034,
'umbrella project': 3030}
for cv_name in (cv_cvterm.keys()):
cursor.execute(db_sql, (cv_name,))
db_id[cv_name] = cursor.fetchone()[0]
if cv_name not in db_id:
cursor.execute(db_sql, (cv_name,))
db_id[cv_name] = cursor.fetchone()[0]

cursor.execute(cv_sql, (cv_name,))
cv_id[cv_name] = cursor.fetchone()[0]
if cv_name not in cv_id:
cursor.execute(cv_sql, (cv_name,))
cv_id[cv_name] = cursor.fetchone()[0]

print("adding cv {} [{}] and db [{}]".format(cv_name, cv_id[cv_name], db_id[cv_name]))
# For specific cvterms we want unique numbers as dbxref accessions.
Expand Down Expand Up @@ -171,6 +175,47 @@ def load_cv_cvterm(parsed_yaml):
add_cvterm_namespace(cv_cvterm_id)


def load_cvterm_dbxref(parsed_yaml):
    """Load cvterms together with their controlled dbxref accessions.

    ``parsed_yaml`` maps a cv name to a list of rows; each row is
    ``[cvterm.name, dbxref.accession, db.name]``.

    For every cv the cv record is created (unless its id is already cached),
    then for every row the db (if not yet cached), the dbxref and the cvterm
    are created.  The ids returned by the database are stored in the shared
    ``cv_id``, ``db_id``, ``dbxref_id``, ``cvterm_id`` and ``cv_cvterm_id``
    caches.  Finally the rows just written for the cv are read back and
    printed as a sanity check.

    The function relies on names it does not define itself (``cursor``, the
    ``*_sql`` statements and the id caches); they are expected to be provided
    by the enclosing module.
    """
    # Read-back query.  The cv name is bound as a parameter rather than being
    # formatted into the SQL text, matching how every other statement in this
    # function is executed and keeping names containing quotes safe.
    check_sql = """
        SELECT cv.name, cvterm.name, dbxref.accession, db.name
          FROM cv, cvterm, dbxref, db
         WHERE cv.cv_id = cvterm.cv_id AND
               cvterm.dbxref_id = dbxref.dbxref_id AND
               dbxref.db_id = db.db_id AND
               cv.name = %s"""

    for cv_name, rows in parsed_yaml.items():
        print(f"BOB: {cv_name} populate.")
        # Same guard as load_cv_cvterm: do not execute cv_sql a second time
        # for a cv whose id is already cached.
        if cv_name not in cv_id:
            cursor.execute(cv_sql, (cv_name,))
            cv_id[cv_name] = cursor.fetchone()[0]
        # Keep any cvterm ids already registered for this cv.
        cv_cvterm_id.setdefault(cv_name, {})
        print("adding cvterm and dbxrefs for cv {} [{}]".format(cv_name, cv_id[cv_name]))

        for row in rows:
            # Column order: cvterm.name, dbxref.accession, db.name
            cvterm_name, dbxref_name, db_name = row[:3]

            if db_name not in db_id:
                cursor.execute(db_sql, (db_name,))
                db_id[db_name] = cursor.fetchone()[0]

            # NOTE(review): dbxref_id is keyed by accession only, so equal
            # accessions from different dbs (e.g. '00000001' under both FBbt
            # and FBdv) overwrite each other in the cache.  The cvterm below
            # uses the id immediately, so this load is unaffected - confirm
            # no later lookup depends on the overwritten entry.
            cursor.execute(dbxref_sql, (db_id[db_name], dbxref_name))
            dbxref_id[dbxref_name] = cursor.fetchone()[0]

            cursor.execute(cvterm_sql, (dbxref_id[dbxref_name], cv_id[cv_name], cvterm_name))
            cvterm_id[cvterm_name] = cursor.fetchone()[0]
            cv_cvterm_id[cv_name][cvterm_name] = cvterm_id[cvterm_name]

            print(f"\tBOB: cvterm {cvterm_name} and dbxref {dbxref_name}")

        # NOTE(review): read-back is run once per cv, after all of its rows
        # have been loaded - confirm this matches the intended placement.
        print(check_sql)
        cursor.execute(check_sql, (cv_name,))
        for loaded in cursor.fetchall():
            print(loaded)

def add_cvterm_namespace(cv_cvterm_id):
"""Add namespace cvterm props.
Expand Down Expand Up @@ -216,6 +261,7 @@ def add_cvterm_namespace(cv_cvterm_id):
for value in namespaces.keys():
rank = 0
for item in namespaces[value]:
print(f"BOB: looking up cv {item[0]} and cvterm {item[1]}")
cvterm_id = cv_cvterm_id[item[0]][item[1]]
type_id = cv_cvterm_id[item[2]][item[3]]
# add cvtermprop
Expand Down Expand Up @@ -567,7 +613,19 @@ def load_pub_author_pubprop(parsed_yaml):
# transposable_element_insertion_site (teis)
create_teis(cursor, organism_id, feature_id, cvterm_id, dbxref_id, db_id, pub_id)

add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id)
# add_dbxref_data_to_cvterms(cursor, cv_cvterm_id, db_id, dbxref_id)

check_sql = f"""
SELECT cv.name, cvterm.name, dbxref.accession, db.name
FROM cv, cvterm, dbxref, db
WHERE cv.cv_id = cvterm.cv_id AND
cvterm.dbxref_id = dbxref.dbxref_id AND
dbxref.db_id = db.db_id AND
cv.name = 'FlyBase miscellaneous CV'"""
print(check_sql)
cursor.execute(check_sql)
for row in cursor.fetchall():
print(row)

conn.commit()
conn.close()
Expand Down
54 changes: 27 additions & 27 deletions data/cv_cvterm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@
# Order-dependent cv/cvterms, i.e. accessions are specific and numbered.
#######################################################################
# Order is important: only add to the end of the SO list. Tests rely on this!!
SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRNA_gene',
'regulatory_region', 'chromosome_structure_variation', 'chromosomal_inversion',
'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element',
'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element',
'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA',
'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence', 'RNA',
'missense_variant', 'wild_type', 'transposable_element_flanking_region']
#SO: ['chromosome_arm', 'chromosome', 'gene', 'mRNA', 'DNA', 'golden_path', 'ncRNA_gene',
# 'regulatory_region', 'chromosome_structure_variation', 'chromosomal_inversion',
# 'natural population', 'cloned_region', 'engineered_region', 'transgenic_transposable_element',
# 'transposable_element_insertion_site', 'chromosome_band', 'allele', 'transposable_element',
# 'natural_transposable_element', 'gene_group', 'polypeptide', 'chromosome_breakpoint', 'engineered_plasmid', 'sgRNA',
# 'oligo', 'engineered_foreign_gene', 'point_mutation', 'cDNA_clone', 'TSS', 'rescue_region', 'insertion_site', 'synthetic_sequence', 'RNA',
# 'missense_variant', 'wild_type', 'transposable_element_flanking_region']
molecular_function: ['mRNA binding']
cellular_component: ['nucleolus', 'something' ,'extracellular space', 'endoplasmic reticulum',
'mitochondrial crista', 'mitochondrion']
biological_process: ['activation of immune response', 'defense response to other organism', 'rRNA processing']
FlyBase anatomy CV: ['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', 'dissociated larval fat cell',
'embryonic/larval hemolymph',
'anatomy 1', 'anatomy 2', 'anatomy 3', 'mesoderm',
'indirect flight muscle', 'macrochaeta', 'scutellar bristle',
'increased number']
#FlyBase anatomy CV: ['embryo','dopaminergic PAM neuron 1', 'dopaminergic PAM neuron 5', 'dissociated larval fat cell',
# 'embryonic/larval hemolymph',
# 'anatomy 1', 'anatomy 2', 'anatomy 3', 'mesoderm',
# 'indirect flight muscle', 'macrochaeta', 'scutellar bristle',
# 'increased number']
####### End of order matters cv/cvterms

cell_line_cvtermprop type: ['basis']
Expand All @@ -45,21 +45,21 @@ feature_relationshipprop type: ['fly_disease-implication_change', 'comment', 're
FlyBase: ['FlyBase analysis']
FlyBase_internal: ['pubprop type:curated_by']

FlyBase development CV: ['late embryonic stage', 'embryonic stage', 'adult stage', 'development 1', 'development 2', 'development 3',
'wandering third instar larval stage', 'laval stage']
FlyBase miscellaneous CV: [
'CRISPR/Cas9', 'amorphic allele - molecular evidence', 'assay', 'biosample', 'biotic stimulus study', 'cell isolation', 'chemical entity',
'conditional', 'colocalizes_with', 'comment',
'contributes_to', 'disease implicated variant', 'evidence_code',
'environ1', 'environ2', 'environ3','environ4', 'environ5', 'faint', 'functional group', 'female',
'in vitro construct',
'inferred from direct assay', 'inferred from mutant phenotype', 'isolated cells', 'natural population',
'male', 'misc 1', 'misc 2', 'misc 3', 'multi-individual sample', 'partially',
'pheno1', 'pheno2', 'pheno3', 'pheno4', 'pheno5',
'photoactivatable fluorescent protein', 'protein detection tool', 'project',
'qualifier', 'reagent collection', 'RNA detection tool', 'single balancer', 'spontaneous',
'split system combination', 'suppressible',
'transcriptome', 'umbrella project', 'unspecified']
#FlyBase development CV: ['late embryonic stage', 'embryonic stage', 'adult stage', 'development 1', 'development 2', 'development 3',
# 'wandering third instar larval stage', 'laval stage']
#FlyBase miscellaneous CV: [
# 'CRISPR/Cas9', 'amorphic allele - molecular evidence', 'assay', 'biosample', 'biotic stimulus study', 'cell isolation', 'chemical entity',
# 'conditional', 'colocalizes_with', 'comment',
# 'contributes_to', 'disease implicated variant', 'evidence_code',
# 'environ1', 'environ2', 'environ3','environ4', 'environ5', 'faint', 'functional group', 'female',
# 'in vitro construct',
# 'inferred from direct assay', 'inferred from mutant phenotype', 'isolated cells', 'natural population',
# 'male', 'misc 1', 'misc 2', 'misc 3', 'multi-individual sample', 'partially',
# 'pheno1', 'pheno2', 'pheno3', 'pheno4', 'pheno5',
# 'photoactivatable fluorescent protein', 'protein detection tool', 'project',
# 'qualifier', 'reagent collection', 'RNA detection tool', 'single balancer', 'spontaneous',
# 'split system combination', 'suppressible',
# 'transcriptome', 'umbrella project', 'unspecified']

GenBank feature qualifier: [
'comment', 'linked_to', 'bound_moiety', 'na_change', 'pr_change',
Expand Down
110 changes: 110 additions & 0 deletions data/cvterm_dbxref.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# cvterm.name, dbxref.accession, db.name
SO: [
[ 'DNA', '0000352', 'SO'],
[ 'RNA', '0000356', 'SO'],
[ 'TSS', '0000315', 'SO'],
[ 'allele', '0001023', 'SO'],
[ 'cDNA_clone', '0000317', 'SO'],
[ 'chromosomal_inversion', '1000030', 'SO'],
[ 'chromosome', '0000340', 'SO'],
[ 'chromosome_arm', '0000105', 'SO'],
[ 'chromosome_band', '0000341', 'SO'],
[ 'chromosome_breakpoint', '0001021', 'SO'],
[ 'chromosome_structure_variation', '1000183', 'SO'],
[ 'cloned_region', '0000785', 'SO'],
[ 'engineered_foreign_gene', '0000281', 'SO'],
[ 'engineered_plasmid', '0000637', 'SO'],
[ 'engineered_region', '0000804', 'SO'],
[ 'gene', '0000704', 'SO'],
[ 'gene_group', '0005855', 'SO'],
[ 'golden_path', '0000688', 'SO'],
[ 'insertion_site', '0000366', 'SO'],
[ 'mRNA', '0000234', 'SO'],
[ 'missense_variant', '0001583', 'SO'],
[ 'natural_transposable_element', '0000797', 'SO'],
[ 'ncRNA_gene', '0001263', 'SO'],
[ 'oligo', '0000696', 'SO'],
[ 'point_mutation', '1000008', 'SO'],
[ 'polypeptide', '0000104', 'SO'],
[ 'regulatory_region', '0005836', 'SO'],
[ 'rescue_region', '0000411', 'SO'],
[ 'sgRNA', '0001998', 'SO'],
[ 'synthetic_sequence', '0000351', 'SO'],
[ 'transgenic_transposable_element', '0000796', 'SO'],
[ 'transposable_element', '0000101', 'SO'],
[ 'transposable_element_flanking_region', '0000364', 'SO'],
[ 'transposable_element_insertion_site', '0000368', 'SO']
]
FlyBase miscellaneous CV: [
['CRISPR/Cas9', '0003008', 'FBcv'],
['RNA detection tool', '0005003', 'FBcv'],
['amorphic allele - molecular evidence', '0000689', 'FBcv'],
['assay', '0003025', 'FBcv'],
['biosample', '0003024', 'FBcv'],
['biotic stimulus study', '0003134', 'FBcv'],
['cell isolation', '0003170', 'FBcv'],
['chemical entity', 'chemical entity', 'FlyBase'],
['colocalizes_with', 'colocalizes_with', 'FlyBase'],
['comment', '000100', 'FlyBase'],
['conditional', '0000309', 'FBcv'],
['contributes_to', 'contributes_to', 'FlyBase'],
['disease implicated variant', 'div', 'FlyBase'],
[ 'environ1', '0200001', 'FBcv' ],
[ 'environ2', '0200002', 'FBcv' ],
['evidence_code', 'evidence_code', 'FlyBase'],
['faint', '0000167', 'FBcv'],
['female', '0000334', 'FBcv'],
['functional group', '0003014', 'FBcv'],
['in vitro construct', '0000455', 'FBcv'],
['inferred from direct assay', 'inferred from direct assay', 'FlyBase'],
['inferred from mutant phenotype', 'inferred from mutant phenotype', 'FlyBase'],
['isolated cells', '0003047', 'FBcv'],
['madeupstuff', '007', 'FBcv'],
['male', '0000333', 'FBcv'],
['multi-individual sample', '0003141', 'FBcv'],
['natural population', '0000465', 'FBcv'],
['partially', '0000340', 'FBcv'],
['pheno1', '0100001', 'FBcv'],
['pheno2', '0100002', 'FBcv'],
['pheno3', '0100003', 'FBcv'],
['pheno4', '0100004', 'FBcv'],
['pheno5', '0100005', 'FBcv'],
['photoactivatable fluorescent protein', '0005017', 'FBcv'],
['project', '0003023', 'FBcv'],
['protein detection tool', '0005004', 'FBcv'],
['qualifier', '0000005', 'FBcv'],
['reagent collection', '0003027', 'FBcv'],
['single balancer', '0000155', 'FBcv'],
['split system combination', '0009026', 'FBcv'],
['spontaneous', '0000469', 'FBcv'],
['suppressible', '0000622', 'FBcv'],
['transcriptome', '0003034', 'FBcv'],
['umbrella project', '0003030', 'FBcv'],
['unspecified', 'unspecified', 'FlyBase']
]

FlyBase anatomy CV: [
['anatomy 1', '00000001', 'FBbt'],
['anatomy 2', '00000002', 'FBbt'],
['anatomy 3', '00000003', 'FBbt'],
['dissociated larval fat cell', '00049951', 'FBbt'],
['dopaminergic PAM neuron 1', '00111015', 'FBbt'],
['dopaminergic PAM neuron 5', '00111017', 'FBbt'],
['embryo', '00000052', 'FBbt'],
['embryonic/larval hemolymph', '00001683', 'FBbt'],
['increased number', '00000004', 'FBbt'],
['indirect flight muscle', '00058882', 'FBbt'],
['macrochaeta', '00005179', 'FBbt'],
['mesoderm', '00000126', 'FBbt'],
['scutellar bristle', '00004312', 'FBbt']
]
FlyBase development CV: [
['adult stage', '00005369', 'FBdv'],
['development 1', '00000001', 'FBdv'],
['development 2', '00000002', 'FBdv'],
['development 3', '00000003', 'FBdv'],
['embryonic stage', '00000004', 'FBdv'],
['late embryonic stage', '00005333', 'FBdv'],
['laval stage' , '00000005', 'FBdv'],
['wandering third instar larval stage', '00005341', 'FBdv']
]

0 comments on commit ce2d035

Please sign in to comment.