-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinsert_OMIM.py
146 lines (132 loc) · 5.04 KB
/
insert_OMIM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# (c) 2023, Charles VAN GOETHEM <c-vangoethem (at) chu-montpellier (dot) fr>
#
# This file is part of SEAL
#
# SEAL db - Simple, Efficient And Lite database for NGS
# Copyright (C) 2023 Charles VAN GOETHEM - MoBiDiC - CHU Montpellier
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import re
from seal import db
from seal.models import Phenotype, Omim
import pprint
pp = pprint.PrettyPrinter(indent=4)
# Abbreviations for inheritance (transmission) terms, keyed by the full term.
# Every term also exists in a "?"-prefixed form whose abbreviation carries the
# same "?" prefix; those entries are derived from the plain ones below.
shorter_transmission = dict(
    [
        ("Autosomal dominant", "AD"),
        ("Autosomal recessive", "AR"),
        ("Digenic dominant", "DD"),
        ("Digenic recessive", "DR"),
        ("Isolated cases", "IC"),
        ("Mitochondrial", "Mito"),
        ("Multifactorial", "MF"),
        ("Pseudoautosomal dominant", "PAD"),
        ("Pseudoautosomal recessive", "PAR"),
        ("Somatic mosaicism", "SMos"),
        ("Somatic mutation", "SMut"),
        ("X-linked", "XL"),
        ("X-linked dominant", "XLD"),
        ("X-linked recessive", "XLR"),
        ("Y-linked", "YL"),
    ]
)
# Snapshot the plain entries first so the mapping is not mutated while it is
# being iterated; insertion order stays "plain terms, then '?' terms".
shorter_transmission.update(
    [("?" + term, "?" + abbr) for term, abbr in list(shorter_transmission.items())]
)
# Load genes and their phenotypes from the tab-separated file "genemap2.txt"
# (opened relative to the current working directory) and store them as Omim
# and Phenotype records.
#
# Columns of each data line, by 0-based index:
#    0 chromosome             1 genomicPositionStart   2 genomicPositionEnd
#    3 cytoLocation           4 computedCytoLocation   5 mimNumber
#    6 geneSymbols            7 geneName               8 approvedGeneSymbol
#    9 entrezGeneID          10 ensemblGeneID         11 comments
#   12 phenotypeString       13 mouse
#
# NOTE(review): phenotypes are inserted without looking for an existing row,
# so re-running the script on a populated database presumably duplicates
# them - confirm against the Phenotype model.

# Phenotype entry that carries a six-digit phenotype MIM number:
#   "<name>, <mim number> (<mapping key>)[, <inheritances>]"
_PHENOTYPE_LONG_RE = re.compile(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$')
# Phenotype entry without a phenotype MIM number:
#   "<name> (<mapping key>)[, <inheritances>]"
_PHENOTYPE_SHORT_RE = re.compile(r'^(.*)\((\d)\)(|, (.*))$')

with open("genemap2.txt") as fd:
    for line in fd:
        # Skip comments
        if line.startswith('#'):
            continue

        # Strip the trailing new line and ignore blank lines
        line = line.rstrip('\n')
        if not line:
            continue

        # Get the fields that are actually stored
        valueList = line.split('\t')
        cytoLocation = valueList[3]
        computedCytoLocation = valueList[4]
        mimNumber = valueList[5]
        geneSymbols = valueList[6]
        approvedGeneSymbol = valueList[8]
        entrezGeneID = valueList[9]
        ensemblGeneID = valueList[10]
        comments = valueList[11]
        phenotypeString = valueList[12]

        # Skip genes without phenotype, Entrez ID or approved symbol
        if not phenotypeString or not entrezGeneID or not approvedGeneSymbol:
            continue

        # Reuse the gene record when it already exists, create it otherwise
        omim = Omim.query.get(mimNumber)
        if not omim:
            omim = Omim(
                mimNumber=mimNumber,
                approvedGeneSymbol=approvedGeneSymbol,
                comments=comments,
                cytoLocation=cytoLocation,
                computedCytoLocation=computedCytoLocation,
                entrezGeneID=entrezGeneID,
                ensemblGeneID=ensemblGeneID,
                geneSymbols=geneSymbols.split(",")
            )
            db.session.add(omim)
            db.session.commit()

        # Parse the phenotypes (separated by ';')
        for phenotypeEntry in phenotypeString.split(';'):
            phenotypeEntry = phenotypeEntry.strip()

            # Try the long form first: the short pattern also matches long
            # entries, but would leave the MIM number inside the name.
            matcher_long = _PHENOTYPE_LONG_RE.match(phenotypeEntry)
            if matcher_long:
                phenotype = matcher_long.group(1).strip('" ')
                phenotypeMimNumber = matcher_long.group(2)
                phenotypeMappingKey = matcher_long.group(3)
                inheritances = matcher_long.group(5)
            else:
                matcher_short = _PHENOTYPE_SHORT_RE.match(phenotypeEntry)
                if not matcher_short:
                    # Entry in neither known format: nothing to store
                    continue
                phenotype = matcher_short.group(1).strip('" ')
                # Short entries have no MIM number; set it explicitly so the
                # value of a previous phenotype is never reused.
                phenotypeMimNumber = None
                phenotypeMappingKey = matcher_short.group(2)
                # Group 4 is the inheritance text itself; group 3 would also
                # include the leading ", " separator.
                inheritances = matcher_short.group(4)

            if inheritances:
                # Abbreviate each term; an unknown term raises KeyError
                inheritances = [
                    shorter_transmission[term.strip('" ')]
                    for term in inheritances.split(',')
                ]
            else:
                # The inheritance part is optional
                inheritances = ['Unknown']

            pheno = Phenotype(
                phenotypeMimNumber=phenotypeMimNumber,
                phenotype=phenotype,
                inheritances=inheritances,
                phenotypeMappingKey=phenotypeMappingKey
            )
            db.session.add(pheno)
            # Link the phenotype to its gene BEFORE committing, so that the
            # link created for the last phenotype of the file is saved too.
            omim.phenotypes.append(pheno)
            db.session.commit()