-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_ground_truth.py
94 lines (74 loc) · 2.99 KB
/
gen_ground_truth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import enum
import os
import re
import json
from pathlib import Path
def get_raw_file_list(dir_to_parse):
raw_results = re.compile(r'\d{1,3}_?\d{1,3}?\.txt')
raw_files: list[str] = []
for root, dirs, files in os.walk(dir_to_parse):
for file in files:
if raw_results.match(file):
raw_files.append(os.path.join(root, file))
return raw_files
def find_matching_result_files(dir_to_parse, raw_files):
raw_to_result: list[tuple[str, str]] = []
processed = re.compile(r'BRCA_.*\.txt')
for root, dirs, files in os.walk(dir_to_parse):
for file in files:
for raw_file in raw_files:
processed_with_raw = re.compile(
f'BRCA_?{Path(raw_file).stem}.*\\.txt')
if processed_with_raw.match(file):
raw_to_result.append((raw_file, os.path.join(root, file)))
return raw_to_result
def file_len_match(path1, path2):
with open(f'{path1}') as file1:
file1_line_count = len(file1.readlines()) - 1 # first line is header
with open(f'{path2}') as file2:
file2_line_count = len(file2.readlines())
return file1_line_count == file2_line_count
def create_record(result_line, raw_line) -> dict | None:
raw_tokens = raw_line.split('\t')
# "NM_007294.4(BRCA1):c.4308T>C (p.Ser1436=),VCV000125703.29 45.8%"
# "00125703.29 45.8%"
result_tokens = result_line.split('CV0')
try:
# "00125703.29 45.8%"
# "VCV000125703.29"
VCV_id = "VCV0" + result_tokens[1].split('\t')[0]
# "VCV000125703.29"
# "VCV000125703"
VCV_id = VCV_id.split('.')[0]
except:
# if a result line cannot be parsed, it has no vcv_id, so it's unusable
return None
record = {}
record['chromosome'] = raw_tokens[0]
record['position'] = raw_tokens[1]
record['ref'] = raw_tokens[2]
record['var'] = raw_tokens[3]
record['vcv'] = VCV_id
return record
def dump_records(raw_lines, result_lines, matched_file):
records = []
for idx, raw_line in enumerate(raw_lines):
result_line = result_lines[idx]
if record := create_record(result_line, raw_line):
records.append(record)
json.dump(records, matched_file)
if __name__ == '__main__':
dir_to_parse = './test/BRCA_2022'
result_dir = os.path.join(dir_to_parse, "matched")
if not os.path.exists(result_dir):
os.mkdir(os.path.join(dir_to_parse, "matched"))
raw_files = get_raw_file_list(dir_to_parse)
raw_to_result = find_matching_result_files(dir_to_parse, raw_files)
for raw, result in raw_to_result:
if file_len_match(raw, result):
raw_lines = open(f'{raw}').readlines()[1:]
result_lines = open(f'{result}').readlines()
result_file = os.path.join(
result_dir, f'matched_{Path(raw).stem}.json')
with open(result_file, 'w') as matched_file:
dump_records(raw_lines, result_lines, matched_file)