Skip to content

Commit

Permalink
Merge pull request #24 from alliance-genome/SCRUM-1969
Browse files Browse the repository at this point in the history
Scrum 1969
  • Loading branch information
christabone authored Sep 9, 2022
2 parents 6a9f700 + 4061ddc commit 20664be
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 51 deletions.
94 changes: 67 additions & 27 deletions src/processor/interaction_genetic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def __init__(self, configs):
self.master_crossreference_dictionary['UniProtKB'] = dict()
self.master_crossreference_dictionary['ENSEMBL'] = dict()
self.master_crossreference_dictionary['NCBI_Gene'] = dict()
self.taxon_used_dict = dict()
self.output_dir = '/usr/src/app/output/'
self.download_dir = '/usr/src/app/download_genetic/'

Expand Down Expand Up @@ -359,6 +360,8 @@ def get_data(self):
open(self.output_dir + 'alliance_genetic_interactions_fly.tsv', 'w', encoding='utf-8') as fb_out, \
open(self.output_dir + 'alliance_genetic_interactions_worm.tsv', 'w', encoding='utf-8') as wb_out, \
open(self.output_dir + 'alliance_genetic_interactions_xenopus.tsv', 'w', encoding='utf-8') as xb_out, \
open(self.output_dir + 'alliance_genetic_interactions_xenopus_laevis.tsv', 'w', encoding='utf-8') as xbxl_out, \
open(self.output_dir + 'alliance_genetic_interactions_xenopus_tropicalis.tsv', 'w', encoding='utf-8') as xbxt_out, \
open(self.output_dir + 'alliance_genetic_interactions_zebrafish.tsv', 'w', encoding='utf-8') as zfin_out, \
open(self.output_dir + 'alliance_genetic_interactions_yeast.tsv', 'w', encoding='utf-8') as sgd_out, \
open(self.output_dir + 'alliance_genetic_interactions_rat.tsv', 'w', encoding='utf-8') as rgd_out, \
Expand All @@ -371,6 +374,8 @@ def get_data(self):
fb_out = csv.writer(fb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
wb_out = csv.writer(wb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xb_out = csv.writer(xb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xbxl_out = csv.writer(xbxl_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xbxt_out = csv.writer(xbxt_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
zfin_out = csv.writer(zfin_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
sgd_out = csv.writer(sgd_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
rgd_out = csv.writer(rgd_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
Expand All @@ -380,33 +385,38 @@ def get_data(self):
mapped_out = csv.writer(mapped_out, quotechar = '', quoting=csv.QUOTE_NONE, delimiter='\t')

# This list is sorted phylogenetically so that the header comes out sorted as well
out_write_list = [human_out, rgd_out, mgi_out, xb_out, zfin_out, fb_out, wb_out, sgd_out]
out_write_list = [human_out, rgd_out, mgi_out, xb_out, xbxl_out, xbxt_out, zfin_out, fb_out, wb_out, sgd_out]

taxon_file_dispatch_dict = {
'taxid:10116': rgd_out,
'taxid:9606': human_out,
'taxid:10090': mgi_out,
'taxid:6239': wb_out,
'taxid:559292': sgd_out,
'taxid:7955': zfin_out,
'taxid:7227': fb_out,
'taxid:8355': xb_out,
'taxid:8364': xb_out,
'taxid:4932': sgd_out,
'taxid:307796': sgd_out,
'taxid:643680': sgd_out,
'taxid:574961': sgd_out,
'taxid:285006': sgd_out,
'taxid:545124': sgd_out,
'taxid:764097': sgd_out
'taxid:10116': [rgd_out],
'taxid:9606': [human_out],
'taxid:10090': [mgi_out],
'taxid:6239': [wb_out],
'taxid:559292': [sgd_out],
'taxid:7955': [zfin_out],
'taxid:7227': [fb_out],
'taxid:8355': [xbxl_out, xb_out],
'taxid:8364': [xbxt_out, xb_out],
'taxid:4932': [sgd_out],
'taxid:307796': [sgd_out],
'taxid:643680': [sgd_out],
'taxid:574961': [sgd_out],
'taxid:285006': [sgd_out],
'taxid:545124': [sgd_out],
'taxid:764097': [sgd_out]
}

for taxon in taxon_file_dispatch_dict.keys():
self.taxon_used_dict[taxon] = 0

out_to_species_name_dict = {
rgd_out: 'Rattus norvegicus',
human_out: 'Homo sapiens',
mgi_out: 'Mus musculus',
wb_out: 'Caenorhabditis elegans',
xb_out: 'Xenopus laevis',
xb_out: 'Xenopus laevis, Xenopus tropicalis',
xbxl_out: 'Xenopus laevis',
xbxt_out: 'Xenopus tropicalis',
sgd_out: 'Saccharomyces cerevisiae',
zfin_out: 'Danio rerio',
fb_out: 'Drosophila melanogaster'
Expand All @@ -417,7 +427,9 @@ def get_data(self):
human_out: 'NCBI:txid9606',
mgi_out: 'NCBI:txid10090',
wb_out: 'NCBI:txid6239',
xb_out: 'NCBI:txid8355',
xb_out: 'NCBI:txid8355, NCBI:txid8364',
xbxl_out: 'NCBI:txid8355',
xbxt_out: 'NCBI:txid8364',
sgd_out: 'NCBI:txid559292',
zfin_out: 'NCBI:txid7955',
fb_out: 'NCBI:txid7227'
Expand Down Expand Up @@ -628,14 +640,18 @@ def get_data(self):
self.wrote_to_file_already = False

try:
taxon_file_dispatch_dict[taxon1].writerow(row)
self.taxon_used_dict[taxon1] += 1
for filehandle in taxon_file_dispatch_dict[taxon1]:
filehandle.writerow(row)
self.wrote_to_file_already = True
except KeyError:
pass

try:
if self.wrote_to_file_already is False:
taxon_file_dispatch_dict[taxon2].writerow(row)
self.taxon_used_dict[taxon2] += 1
for filehandle in taxon_file_dispatch_dict[taxon2]:
filehandle.writerow(row)
except KeyError:
pass

Expand All @@ -644,11 +660,26 @@ def validate_and_upload_files_to_fms(self):
logger.info('Summary of files created:')
logger.info(os.system("ls -alh {}*".format(self.output_dir)))

file_taxon_dict = {
'alliance_genetic_interactions_fly.tsv': ['taxid:7227'],
'alliance_genetic_interactions_worm.tsv': ['taxid:6239'],
'alliance_genetic_interactions_xenopus.tsv': ['taxid:8355', 'taxid:8364'],
'alliance_genetic_interactions_xenopus_laevis.tsv': ['taxid:8355'],
'alliance_genetic_interactions_xenopus_tropicalis.tsv': ['taxid:8364'],
'alliance_genetic_interactions_zebrafish.tsv': ['taxid:7955'],
'alliance_genetic_interactions_yeast.tsv': ['taxid:559292', 'taxid:307796', 'taxid:643680', 'taxid:574961', 'taxid:285006', 'taxid:545124', 'taxid:764097'],
'alliance_genetic_interactions_rat.tsv': ['taxid:10116'],
'alliance_genetic_interactions_mouse.tsv': ['taxid:10090'],
'alliance_genetic_interactions_human.tsv': ['taxid:9606'],
}

upload_location_dict = {
'alliance_genetic_interactions.tsv': 'COMBINED',
'alliance_genetic_interactions_fly.tsv': 'FB',
'alliance_genetic_interactions_worm.tsv': 'WB',
'alliance_genetic_interactions_xenopus.tsv': 'XB',
'alliance_genetic_interactions_xenopus_laevis.tsv': 'XBXL',
'alliance_genetic_interactions_xenopus_tropicalis.tsv': 'XBXT',
'alliance_genetic_interactions_zebrafish.tsv': 'ZFIN',
'alliance_genetic_interactions_yeast.tsv': 'SGD',
'alliance_genetic_interactions_rat.tsv': 'RGD',
Expand All @@ -657,13 +688,22 @@ def validate_and_upload_files_to_fms(self):
}

thread_pool = []

for filename in upload_location_dict.keys():
dataSubType = upload_location_dict[filename]

p = multiprocessing.Process(target=super().fms_upload, args=("INTERACTION-GEN", dataSubType, filename))
p.start()
thread_pool.append(p)
upload_flag = False
if filename in file_taxon_dict:
for taxon in file_taxon_dict[filename]:
if self.taxon_used_dict[taxon] > 0:
upload_flag = True
logger.info("filename %s taxon %s count %s" % (filename, taxon, self.taxon_used_dict[taxon]))
if filename == 'alliance_genetic_interactions.tsv':
upload_flag = True
if upload_flag:
logger.info("upload %s" % (upload_location_dict[filename]))
dataSubType = upload_location_dict[filename]

p = multiprocessing.Process(target=super().fms_upload, args=("INTERACTION-GEN", dataSubType, filename))
p.start()
thread_pool.append(p)

Processor.wait_for_threads(thread_pool)

94 changes: 70 additions & 24 deletions src/processor/interaction_molecular_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, configs):
self.master_crossreference_dictionary['ENSEMBL'] = dict()
self.master_crossreference_dictionary['NCBI_Gene'] = dict()
self.master_crossreference_dictionary['RefSeq'] = dict()
self.taxon_used_dict = dict()
self.biogrid_rna_set = set()
self.biogrid_genetic_set = set()
self.biogrid_doi_dict = dict()
Expand Down Expand Up @@ -439,6 +440,8 @@ def get_data(self):
open(self.output_dir + 'alliance_molecular_interactions_fly.tsv', 'w', encoding='utf-8') as fb_out, \
open(self.output_dir + 'alliance_molecular_interactions_worm.tsv', 'w', encoding='utf-8') as wb_out, \
open(self.output_dir + 'alliance_molecular_interactions_xenopus.tsv', 'w', encoding='utf-8') as xb_out, \
open(self.output_dir + 'alliance_molecular_interactions_xenopus_laevis.tsv', 'w', encoding='utf-8') as xbxl_out, \
open(self.output_dir + 'alliance_molecular_interactions_xenopus_tropicalis.tsv', 'w', encoding='utf-8') as xbxt_out, \
open(self.output_dir + 'alliance_molecular_interactions_zebrafish.tsv', 'w', encoding='utf-8') as zfin_out, \
open(self.output_dir + 'alliance_molecular_interactions_yeast.tsv', 'w', encoding='utf-8') as sgd_out, \
open(self.output_dir + 'alliance_molecular_interactions_rat.tsv', 'w', encoding='utf-8') as rgd_out, \
Expand All @@ -454,41 +457,48 @@ def get_data(self):
fb_out = csv.writer(fb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
wb_out = csv.writer(wb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xb_out = csv.writer(xb_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xbxl_out = csv.writer(xbxl_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
xbxt_out = csv.writer(xbxt_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
zfin_out = csv.writer(zfin_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
sgd_out = csv.writer(sgd_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
rgd_out = csv.writer(rgd_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
mgi_out = csv.writer(mgi_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')
human_out = csv.writer(human_out, quotechar='', quoting=csv.QUOTE_NONE, delimiter='\t')

# This list is sorted phylogenetically so that the header comes out sorted as well
out_write_list = [human_out, rgd_out, mgi_out, xb_out, zfin_out, fb_out, wb_out, sgd_out, sarscov2_out]
out_write_list = [human_out, rgd_out, mgi_out, xb_out, xbxl_out, xbxt_out, zfin_out, fb_out, wb_out, sgd_out, sarscov2_out]

taxon_file_dispatch_dict = {
'taxid:10116': rgd_out,
'taxid:9606': human_out,
'taxid:10090': mgi_out,
'taxid:6239': wb_out,
'taxid:559292': sgd_out,
'taxid:7955': zfin_out,
'taxid:7227': fb_out,
'taxid:2697049': sarscov2_out,
'taxid:8355': xb_out,
'taxid:8364': xb_out,
'taxid:4932': sgd_out,
'taxid:307796': sgd_out,
'taxid:643680': sgd_out,
'taxid:574961': sgd_out,
'taxid:285006': sgd_out,
'taxid:545124': sgd_out,
'taxid:764097': sgd_out
'taxid:10116': [rgd_out],
'taxid:9606': [human_out],
'taxid:10090': [mgi_out],
'taxid:6239': [wb_out],
'taxid:559292': [sgd_out],
'taxid:7955': [zfin_out],
'taxid:7227': [fb_out],
'taxid:2697049': [sarscov2_out],
'taxid:8355': [xbxl_out, xb_out],
'taxid:8364': [xbxt_out, xb_out],
'taxid:4932': [sgd_out],
'taxid:307796': [sgd_out],
'taxid:643680': [sgd_out],
'taxid:574961': [sgd_out],
'taxid:285006': [sgd_out],
'taxid:545124': [sgd_out],
'taxid:764097': [sgd_out]
}

for taxon in taxon_file_dispatch_dict.keys():
self.taxon_used_dict[taxon] = 0

out_to_species_name_dict = {
rgd_out: 'Rattus norvegicus',
human_out: 'Homo sapiens',
mgi_out: 'Mus musculus',
wb_out: 'Caenorhabditis elegans',
xb_out: 'Xenopus laevis, Xenopus tropicalis',
xbxl_out: 'Xenopus laevis',
xbxt_out: 'Xenopus tropicalis',
sgd_out: 'Saccharomyces cerevisiae',
zfin_out: 'Danio rerio',
sarscov2_out: 'Severe acute respiratory syndrome coronavirus 2',
Expand All @@ -501,6 +511,8 @@ def get_data(self):
mgi_out: 'NCBI:txid10090',
wb_out: 'NCBI:txid6239',
xb_out: 'NCBI:txid8355, NCBI:txid8364',
xbxl_out: 'NCBI:txid8355',
xbxt_out: 'NCBI:txid8364',
sgd_out: 'NCBI:txid559292',
zfin_out: 'NCBI:txid7955',
sarscov2_out: 'NCBI:txid2697049',
Expand Down Expand Up @@ -745,18 +757,24 @@ def get_data(self):

if taxon1 == taxon2:
try:
taxon_file_dispatch_dict[taxon1].writerow(row)
self.taxon_used_dict[taxon1] += 1
for filehandle in taxon_file_dispatch_dict[taxon1]:
filehandle.writerow(row)
except KeyError:
pass

else:
try:
taxon_file_dispatch_dict[taxon1].writerow(row)
self.taxon_used_dict[taxon1] += 1
for filehandle in taxon_file_dispatch_dict[taxon1]:
filehandle.writerow(row)
except KeyError:
pass

try:
taxon_file_dispatch_dict[taxon2].writerow(row)
self.taxon_used_dict[taxon2] += 1
for filehandle in taxon_file_dispatch_dict[taxon2]:
filehandle.writerow(row)
except KeyError:
pass

Expand All @@ -765,12 +783,28 @@ def validate_and_upload_files_to_fms(self):
logger.info('Summary of files created:')
logger.info(os.system("ls -alh {}*".format(self.output_dir)))

file_taxon_dict = {
'alliance_molecular_interactions_fly.tsv': ['taxid:7227'],
'alliance_molecular_interactions_sarscov2.tsv': ['taxid:2697049'],
'alliance_molecular_interactions_worm.tsv': ['taxid:6239'],
'alliance_molecular_interactions_xenopus.tsv': ['taxid:8355', 'taxid:8364'],
'alliance_molecular_interactions_xenopus_laevis.tsv': ['taxid:8355'],
'alliance_molecular_interactions_xenopus_tropicalis.tsv': ['taxid:8364'],
'alliance_molecular_interactions_zebrafish.tsv': ['taxid:7955'],
'alliance_molecular_interactions_yeast.tsv': ['taxid:559292', 'taxid:307796', 'taxid:643680', 'taxid:574961', 'taxid:285006', 'taxid:545124', 'taxid:764097'],
'alliance_molecular_interactions_rat.tsv': ['taxid:10116'],
'alliance_molecular_interactions_mouse.tsv': ['taxid:10090'],
'alliance_molecular_interactions_human.tsv': ['taxid:9606'],
}

upload_location_dict = {
'alliance_molecular_interactions.tsv': 'COMBINED',
'alliance_molecular_interactions_fly.tsv': 'FB',
'alliance_molecular_interactions_sarscov2.tsv': 'SARS-CoV-2',
'alliance_molecular_interactions_worm.tsv': 'WB',
'alliance_molecular_interactions_xenopus.tsv': 'XB',
'alliance_molecular_interactions_xenopus_laevis.tsv': 'XBXL',
'alliance_molecular_interactions_xenopus_tropicalis.tsv': 'XBXT',
'alliance_molecular_interactions_zebrafish.tsv': 'ZFIN',
'alliance_molecular_interactions_yeast.tsv': 'SGD',
'alliance_molecular_interactions_rat.tsv': 'RGD',
Expand All @@ -783,9 +817,21 @@ def validate_and_upload_files_to_fms(self):
for filename in upload_location_dict.keys():
dataSubType = upload_location_dict[filename]

p = multiprocessing.Process(target=super().fms_upload, args=("INTERACTION-MOL", dataSubType, filename))
p.start()
thread_pool.append(p)
upload_flag = False
if filename in file_taxon_dict:
for taxon in file_taxon_dict[filename]:
if self.taxon_used_dict[taxon] > 0:
upload_flag = True
logger.info("filename %s taxon %s count %s" % (filename, taxon, self.taxon_used_dict[taxon]))
if filename == 'alliance_molecular_interactions.tsv':
upload_flag = True
if upload_flag:
logger.info("upload %s" % (upload_location_dict[filename]))
dataSubType = upload_location_dict[filename]

p = multiprocessing.Process(target=super().fms_upload, args=("INTERACTION-MOL", dataSubType, filename))
p.start()
thread_pool.append(p)

Processor.wait_for_threads(thread_pool)

0 comments on commit 20664be

Please sign in to comment.