# Semi-Automation of the Ancestral Sequence Reconstruction Workflow
# This program constructs rough estimates of ancestral sequence reconstruction from an amino acid sequence.
# It requires installation of the Biopython (Bio) module.
# Written by James VanAntwerp from September 2020 through May 2023 - contact vanantj @ udel . edu
# Written by Pattrick Finneran, Menten AI, Palo Alto, California, United States of America
# Written for the Woldring Lab, Michigan State University in East Lansing,
# Michigan, USA.
import sys
from Bio.Blast import NCBIWWW
import xml.etree.ElementTree as ET
import os
import argparse
Directory_Structure = '''The program will make the following directory structure, where ROOT is the directory the script is placed in:
ROOT/
|AP-LASR.py This script
|--Input.fasta The input fasta file, if you used one.
|--ASR/...................................The directory containing all work from the most recent run. The software WILL overwrite old runs.
| |--BlastP_Results.fasta Fasta file containing all BlastP results.
| |--BlastP_XML_ The raw XML returned by NCBI's BlastP, if you want to find more information about the BlastP results.
| |--HitsInfo.csv Data about the BlastP hits - Sequence IDs and notes.
| |--Final_Sequences.fasta The set of modern sequences used for the ASR and the foundation of all the later calculation.
| |--Final_Tree.treefile The final tree which is best for reading, as it has the most information in one place.
| |--Concesus_Ancestors_with_Gaps.fasta The set of likely ancestors, aligned and with gaps where the binary gap analysis predicts them.
| |--Concesus_Ancestors_without_Gaps.fasta The set of likely ancestors, unaligned and without gaps.
| |--IQTree_Phylo/..............................The directory for IQTree files from the phylogeny.
| | |--Phylo.* All IQTree files.
| | |--Supports.* Data about the confidence of tree topology - .csv holds all data, .txt is summary.
| | |--UFB_Confidences.png Histogram of UFB node support values (if made).
| | |--SHaLRT_Confidences.png Histogram of SHaLRT node support values (if made).
| |--IQTree_ASR/................................The directory for IQTree files from ASR.
| | |--ASR.* All IQTree files.
| | |--Ancestral_Sequence_Confidences.* Data about the confidence of ASR prediction - .csv holds all data, .txt is summary, .png is a histogram (if made).
| |--IQTree_Binary/.............................The directory for IQTree files from gap analysis.
| | |--Binary_Alignment.fasta A binary alignment of modern sequences - gap or not.
| | |--Binary.* All IQTree files.
| |--DNA_Libraries/.............................The directory for the generated combinatorial DNA libraries.
| | |--Library_Size_Information.csv Information on the size of generated libraries.
| | |--Library_**%_Cutoff.fasta Fasta of DNA for all high-confidence ancestors, with degenerate codons coding for all AAs with more than a certain likelihood.
| |--Sequence_Supplement/.......................The directory for the files created by adding additional sequence data to poorly supported regions of the tree.
| | |--*__supplement_HitsInfo.csv Same as HitsInfo.csv, but for each supplement.
| | |--*_BlastP_Results.fasta Each top 50 hits from sequences that are supplemented.
| |--Confidence_Heatmaps/.......................The directory for confidence heatmaps. Only generated after running the script again with the MakeFigures option.
| | |--*_Confidences.pdf A heatmap of the confidence values for each ancestral position.
'''
Help_String = '''This software automates the process of generating combinatorial protein libraries from ancestral sequence reconstruction.
This software can run in one of four modes.
Assistance mode will show this message, software prerequisites, and an example directory structure.
ASR mode will take in a user sequence and conduct ASR, then generate combinatorial protein libraries of ancestral proteins. Command-line parameters should be used to specify the directory where output is stored and the executables for CD-Hit, MAFFT, and IQ-Tree. The desired final dataset size can be specified, and sequence supplementation can be turned on here too.
option -i specifies the name of the input Fasta file, or can take a raw protein sequence. This is the only mandatory input.
option -n specifies the name of the output directory. The default is "ASR".
option -s specifies the desired maximum size of the final dataset of modern homologs. This allows users some control over the time AP-LASR will take to run, and the detail/quality of the ASR. The default is 500.
option -supplement will turn on supplementation at the provided similarity cutoff (entered as a decimal). There is no default value; if you turn on supplementation, we recommend specifying a cutoff between 0.7 and 0.85.
option -iqtree allows users to specify an executable for IQTree other than the default ("iqtree").
option -cdhit allows users to specify an executable for CD-Hit other than the default ("cd-hit").
option -mafft allows users to specify an executable for MAFFT other than the default ("mafft").
option -MSI allows users to specify the number of times the function "Post_MAFFT_Processing" iterates over the raw alignment to generate the final dataset. Each iteration will remove a large number of sequences, more if the input sequences are dissimilar. Default is 2.
MakeFigures mode will generate figures from ASR data of a previous ASR run - specify the directory where the results of interest are stored.
option -n specifies the name of the directory where the ASR results are stored. The default is "ASR".
RemakeLibraries mode will generate new combinatorial libraries from ancestral proteins of a previous ASR run with a specified threshold confidence. This allows generation of a library with a different size than the one produced by the default threshold confidences. Specify the directory where the ASR results of interest are stored.
option -n specifies the name of the directory where the ASR results are stored. The default is "ASR".
option -t specifies the new threshold, expressed as a decimal.
Entering just the option "--help" or "-h" will give a less verbose version of this description. '''
Software_Prerequistes = '''This AP-LASR script makes use of external software, which must be set up before it can be used. See the GitHub page for this project (github.com/jjvanantwerp/Automated-ASR) for download links.
This software requires IQTree, CD-Hit, and MAFFT to be downloaded and set up with their standard executable names added to the path.
It also requires the Python module Biopython (Bio) to be installed; the xml module used here ships with the Python standard library. Optionally, the Python module matplotlib can be installed to produce figures from the data.
To make the pdf confidence heatmaps, the python modules pandas and seaborn are also needed.'''
# This dictionary provides the amino acid encoded by every codon
Codon_to_AA = {
'ata': 'I', 'atc': 'I', 'att': 'I', 'atg': 'M',
'aca': 'T', 'acc': 'T', 'acg': 'T', 'act': 'T',
'aac': 'N', 'aat': 'N', 'aaa': 'K', 'aag': 'K',
'agc': 'S', 'agt': 'S', 'aga': 'R', 'agg': 'R',
'cta': 'L', 'ctc': 'L', 'ctg': 'L', 'ctt': 'L',
'cca': 'P', 'ccc': 'P', 'ccg': 'P', 'cct': 'P',
'cac': 'H', 'cat': 'H', 'caa': 'Q', 'cag': 'Q',
'cga': 'R', 'cgc': 'R', 'cgg': 'R', 'cgt': 'R',
'gta': 'V', 'gtc': 'V', 'gtg': 'V', 'gtt': 'V',
'gca': 'A', 'gcc': 'A', 'gcg': 'A', 'gct': 'A',
'gac': 'D', 'gat': 'D', 'gaa': 'E', 'gag': 'E',
'gga': 'G', 'ggc': 'G', 'ggg': 'G', 'ggt': 'G',
'tca': 'S', 'tcc': 'S', 'tcg': 'S', 'tct': 'S',
'ttc': 'F', 'ttt': 'F', 'tta': 'L', 'ttg': 'L',
'tac': 'Y', 'tat': 'Y', 'tgc': 'C', 'tgt': 'C',
'taa': ' stop ', 'tag': ' stop ', 'tga': ' stop ', 'tgg': 'W'
}
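# Illustrative sketch (not part of the workflow; the helper name is
# hypothetical): translating a lowercase DNA string in reading frame 1 with
# the Codon_to_AA table above.
def _example_translate_dna(dna):
    # Walk the sequence three bases at a time and look up each codon.
    return ''.join(Codon_to_AA[dna[i:i + 3]] for i in range(0, len(dna) - 2, 3))
# For example, _example_translate_dna('atgtgg') returns 'MW'.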
# This dictionary provides the best codon for every amino acid in E. coli
AA_to_Codon_Ecoli = {
'A': 'gcc', 'R': 'cgt', 'N': 'aac', 'D': 'gat', 'B': 'aac',
'C': 'tgc', 'E': 'gaa', 'Q': 'cag', 'Z': 'cag', 'G': 'ggc',
'H': 'cat', 'I': 'att', 'L': 'ctg', 'K': 'aaa', 'M': 'atg',
'F': 'ttt', 'P': 'ccg', 'S': 'agc', 'T': 'acc', 'W': 'tgg',
'Y': 'tat', 'V': 'gtg', ' stop ': 'taa'
}
# This dictionary provides the best codon for every amino acid in humans
AA_to_Codon_Human = {
'A': 'gcc', 'R': 'cgg', 'N': 'aac', 'D': 'gac', 'B': 'aac',
'C': 'tgc', 'E': 'gag', 'Q': 'cag', 'Z': 'cag', 'G': 'ggc',
'H': 'cac', 'I': 'atc', 'L': 'ctg', 'K': 'aag', 'M': 'atg',
'F': 'ttc', 'P': 'ccc', 'S': 'agc', 'T': 'acc', 'W': 'tgg',
'Y': 'tac', 'V': 'gtc', ' stop ': 'tga'
}
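# Illustrative sketch (not part of the workflow; the helper name is
# hypothetical): reverse-translating a protein string into DNA with one of
# the codon tables above, as is done when building DNA libraries for a host.
def _example_reverse_translate(protein, codon_table=AA_to_Codon_Ecoli):
    # Concatenate the preferred codon for each residue.
    return ''.join(codon_table[residue] for residue in protein)
# For example, _example_reverse_translate('MW') returns 'atgtgg'.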
# This dictionary provides the mixed-base code for every combination of 2-4 nucleic acids
Mixed_Bases_lookup = {
'a':'a','c':'c','g':'g','t':'t',
'ag':'r', 'ga':'r',
'ct':'y', 'tc':'y',
'ac':'m', 'ca':'m',
'gt':'k', 'tg':'k',
'gc':'s', 'cg':'s',
'at':'w', 'ta':'w',
'act':'h', 'atc':'h', 'cat':'h', 'cta':'h', 'tac':'h', 'tca':'h',
'gct':'b', 'gtc':'b', 'ctg':'b', 'cgt':'b', 'tgc':'b', 'tcg':'b',
'acg':'v', 'agc':'v', 'cag':'v', 'cga':'v', 'gca':'v', 'gac':'v',
'agt':'d', 'atg':'d', 'gat':'d', 'gta':'d', 'tga':'d', 'tag':'d',
'acgt':'n','actg':'n','agct':'n','agtc':'n','atcg':'n','atgc':'n',
'cagt':'n','catg':'n','gact':'n','gatc':'n','tacg':'n','tagc':'n',
'cgat':'n','ctag':'n','gcat':'n','gtac':'n','tcag':'n','tgac':'n',
'cgta':'n','ctga':'n','gcta':'n','gtca':'n','tcga':'n','tgca':'n'
}
# This dictionary provides the best degenerate codon for every combination of two amino acids for Humans
AA_Pair_lookup_Human = {
'AC':'ksc', 'AD':'gmc', 'AE':'gma', 'AF':'kyc', 'AG':'gsc', 'AH':'smc', 'AI':'ryc', 'AK':'rma', 'AL':'syc', 'AM':'ryg', 'AN':'rmc', 'AP':'scc', 'AQ':'sma', 'AR':'rsa', 'AS':'kcc', 'AT':'rcc', 'AV':'gyc', 'AW':'ksg', 'AY':'kmc',
'CA':'ksc', 'CD':'krc', 'CE':'krs', 'CF':'tkc', 'CG':'kgc', 'CH':'yrc', 'CI':'wkc', 'CK':'wrs', 'CL':'ykc', 'CM':'wks', 'CN':'wrc', 'CP':'ysc', 'CQ':'yrs', 'CR':'ygc', 'CS':'tsc', 'CT':'wsc', 'CV':'kkc', 'CW':'tgs', 'CY':'trc',
'DA':'gmc', 'DC':'krc', 'DE':'gas', 'DF':'kwc', 'DG':'grc', 'DH':'sac', 'DI':'rwc', 'DK':'ras', 'DL':'swc', 'DM':'rws', 'DN':'rac', 'DP':'smc', 'DQ':'sas', 'DR':'src', 'DS':'rrc', 'DT':'rmc', 'DV':'gwc', 'DW':'krs', 'DY':'kac',
'EA':'gma', 'EC':'krs', 'ED':'gas', 'EF':'kws', 'EG':'grg', 'EH':'sas', 'EI':'rwa', 'EK':'rag', 'EL':'swg', 'EM':'rwg', 'EN':'ras', 'EP':'smg', 'EQ':'sag', 'ER':'rrg', 'ES':'kmg', 'ET':'rmg', 'EV':'gwg', 'EW':'rrg', 'EY':'kas',
'FA':'kyc', 'FC':'tkc', 'FD':'kwc', 'FE':'kws', 'FG':'kkc', 'FH':'ywc', 'FI':'wtc', 'FK':'wwm', 'FL':'ytc', 'FM':'wts', 'FN':'wwc', 'FP':'yyc', 'FQ':'yws', 'FR':'ykc', 'FS':'tyc', 'FT':'wyc', 'FV':'ktc', 'FW':'tks', 'FY':'twc',
'GA':'gsc', 'GC':'kgc', 'GD':'grc', 'GE':'grg', 'GF':'kkc', 'GH':'src', 'GI':'rkc', 'GK':'rra', 'GL':'skc', 'GM':'rrg', 'GN':'rrc', 'GP':'ssc', 'GQ':'grg', 'GR':'sgg', 'GS':'rgc', 'GT':'rsc', 'GV':'gkc', 'GW':'kgg', 'GY':'krc',
'HA':'smc', 'HC':'yrc', 'HD':'sac', 'HE':'sas', 'HF':'ywc', 'HG':'src', 'HI':'mwc', 'HK':'mas', 'HL':'cwc', 'HM':'mws', 'HN':'mac', 'HP':'cmc', 'HQ':'cas', 'HR':'crc', 'HS':'mrc', 'HT':'mmc', 'HV':'swc', 'HW':'yrs', 'HY':'yac',
'IA':'ryc', 'IC':'wkc', 'ID':'rwc', 'IE':'rwa', 'IF':'wtc', 'IG':'rkc', 'IH':'mwc', 'IK':'awa', 'IL':'mtc', 'IM':'ats', 'IN':'awc', 'IP':'myc', 'IQ':'mya', 'IR':'aka', 'IS':'akc', 'IT':'ayc', 'IV':'rtc', 'IW':'wks', 'IY':'wwc',
'KA':'rma', 'KC':'wrs', 'KD':'ras', 'KE':'rag', 'KF':'wwm', 'KG':'rra', 'KH':'mas', 'KI':'awa', 'KL':'mwa', 'KM':'awg', 'KN':'aas', 'KP':'mma', 'KQ':'maa', 'KR':'arg', 'KS':'ars', 'KT':'ama', 'KV':'rwa', 'KW':'wrg', 'KY':'was',
'LA':'syc', 'LC':'ykc', 'LD':'swc', 'LE':'swg', 'LF':'ytc', 'LG':'skc', 'LH':'csc', 'LI':'mtc', 'LK':'mwa', 'LM':'mtg', 'LN':'mwc', 'LP':'cyc', 'LQ':'cyg', 'LR':'ckg', 'LS':'tyr', 'LT':'myc', 'LV':'stg', 'LW':'tkg', 'LY':'ywc',
'MA':'ryg', 'MC':'wks', 'MD':'rws', 'ME':'rwg', 'MF':'wts', 'MG':'rrg', 'MH':'mws', 'MI':'ats', 'MK':'awg', 'ML':'mtg', 'MN':'aws', 'MP':'myg', 'MQ':'mwg', 'MR':'akg', 'MS':'aks', 'MT':'ayg', 'MV':'rtg', 'MW':'wkg', 'MY':'wws',
'NA':'rmc', 'NC':'wrc', 'ND':'rac', 'NE':'ras', 'NF':'wwc', 'NG':'rrc', 'NH':'mac', 'NI':'awc', 'NK':'aas', 'NL':'mwc', 'NM':'aws', 'NP':'mmc', 'NQ':'mas', 'NR':'ars', 'NS':'arc', 'NT':'amc', 'NV':'rwc', 'NW':'wrs', 'NY':'wac',
'PA':'scc', 'PC':'ysc', 'PD':'smc', 'PE':'smg', 'PF':'yyc', 'PG':'ssc', 'PH':'cmc', 'PI':'myc', 'PK':'mma', 'PL':'cyc', 'PM':'myg', 'PN':'mmc', 'PQ':'cmr', 'PR':'csc', 'PS':'ycc', 'PT':'mcc', 'PV':'syc', 'PW':'ysg', 'PY':'ymc',
'QA':'sma', 'QC':'yrs', 'QD':'sas', 'QE':'sag', 'QF':'yws', 'QG':'grg', 'QH':'cas', 'QI':'mya', 'QK':'maa', 'QL':'cyg', 'QM':'mwg', 'QN':'mas', 'QP':'cmr', 'QR':'crg', 'QS':'yma', 'QT':'mma', 'QV':'swg', 'QW':'yrg', 'QY':'yas',
'RA':'rsa', 'RC':'ygc', 'RD':'src', 'RE':'rrg', 'RF':'ykc', 'RG':'sgg', 'RH':'crc', 'RI':'aka', 'RK':'arg', 'RL':'ckg', 'RM':'akg', 'RN':'ars', 'RP':'csc', 'RQ':'crg', 'RS':'ags', 'RT':'asc', 'RV':'rkc', 'RW':'ygg', 'RY':'yrc',
'SA':'kcc', 'SC':'tsc', 'SD':'rrc', 'SE':'kmg', 'SF':'tyc', 'SG':'rgc', 'SH':'mrc', 'SI':'akc', 'SK':'ars', 'SL':'tyr', 'SM':'aks', 'SN':'arc', 'SP':'ycc', 'SQ':'yma', 'SR':'ags', 'ST':'asc', 'SV':'kyc', 'SW':'tsg', 'SY':'tmc',
'TA':'rcc', 'TC':'wsc', 'TD':'rmc', 'TE':'rmg', 'TF':'wyc', 'TG':'rsc', 'TH':'mmc', 'TI':'ayc', 'TK':'ama', 'TL':'myc', 'TM':'ayg', 'TN':'amc', 'TP':'mcc', 'TQ':'mma', 'TR':'asc', 'TS':'asc', 'TV':'ryc', 'TW':'wsg', 'TY':'wmc',
'VA':'gyc', 'VC':'kkc', 'VD':'gwc', 'VE':'gwg', 'VF':'ktc', 'VG':'gkc', 'VH':'swc', 'VI':'rtc', 'VK':'rwa', 'VL':'stg', 'VM':'rtg', 'VN':'rwc', 'VP':'syc', 'VQ':'swg', 'VR':'rkc', 'VS':'kyc', 'VT':'ryc', 'VW':'kkg', 'VY':'kwc',
'WA':'ksg', 'WC':'tgs', 'WD':'krs', 'WE':'rrg', 'WF':'tks', 'WG':'kgg', 'WH':'yrs', 'WI':'wks', 'WK':'wrg', 'WL':'tkg', 'WM':'wkg', 'WN':'wrs', 'WP':'ysg', 'WQ':'yrg', 'WR':'ygg', 'WS':'tsg', 'WT':'wsg', 'WV':'kkg', 'WY':'trs',
'YA':'kmc', 'YC':'trc', 'YD':'kac', 'YE':'kas', 'YF':'twc', 'YG':'krc', 'YH':'yac', 'YI':'wwc', 'YK':'was', 'YL':'ywc', 'YM':'wws', 'YN':'wac', 'YP':'ymc', 'YQ':'yas', 'YR':'yrc', 'YS':'tmc', 'YT':'wmc', 'YV':'kwc', 'YW':'trs',
}
# This dictionary provides the best degenerate codon for every combination of two amino acids for E. coli
AA_Pair_lookup_EColi = {
'AC':'ksc', 'AD':'gmc', 'AE':'gma', 'AF':'kyc', 'AG':'gsc', 'AH':'smc', 'AI':'ryc', 'AK':'rma', 'AL':'syc', 'AM':'ryg', 'AN':'rmc', 'AP':'scc', 'AQ':'sma', 'AR':'rsa', 'AS':'kcc', 'AT':'rcc', 'AV':'gyc', 'AW':'ksg', 'AY':'kmc',
'CA':'ksc', 'CD':'krc', 'CE':'krs', 'CF':'tkc', 'CG':'kgc', 'CH':'yrc', 'CI':'wkc', 'CK':'wrs', 'CL':'ykc', 'CM':'wks', 'CN':'wrc', 'CP':'ysc', 'CQ':'yrs', 'CR':'ygc', 'CS':'tsc', 'CT':'wsc', 'CV':'kkc', 'CW':'tgs', 'CY':'trc',
'DA':'gmc', 'DC':'krc', 'DE':'gas', 'DF':'kwc', 'DG':'grc', 'DH':'sac', 'DI':'rwc', 'DK':'ras', 'DL':'swc', 'DM':'rws', 'DN':'rac', 'DP':'smc', 'DQ':'sas', 'DR':'src', 'DS':'rrc', 'DT':'rmc', 'DV':'gwc', 'DW':'krs', 'DY':'kac',
'EA':'gma', 'EC':'krs', 'ED':'gas', 'EF':'kws', 'EG':'grg', 'EH':'sas', 'EI':'rwa', 'EK':'rag', 'EL':'swg', 'EM':'rwg', 'EN':'ras', 'EP':'smg', 'EQ':'sag', 'ER':'rrg', 'ES':'kmg', 'ET':'rmg', 'EV':'gwg', 'EW':'rrg', 'EY':'kas',
'FA':'kyc', 'FC':'tkc', 'FD':'kwc', 'FE':'kws', 'FG':'kkc', 'FH':'ywc', 'FI':'wtc', 'FK':'wwm', 'FL':'ytc', 'FM':'wts', 'FN':'wwc', 'FP':'yyc', 'FQ':'yws', 'FR':'ykc', 'FS':'tyc', 'FT':'wyc', 'FV':'ktc', 'FW':'tks', 'FY':'twc',
'GA':'gsc', 'GC':'kgc', 'GD':'grc', 'GE':'grg', 'GF':'kkc', 'GH':'src', 'GI':'rkc', 'GK':'rra', 'GL':'skc', 'GM':'rrg', 'GN':'rrc', 'GP':'ssc', 'GQ':'grg', 'GR':'sgg', 'GS':'rgc', 'GT':'rsc', 'GV':'gkc', 'GW':'kgg', 'GY':'krc',
'HA':'smc', 'HC':'yrc', 'HD':'sac', 'HE':'sas', 'HF':'ywc', 'HG':'src', 'HI':'mwc', 'HK':'mas', 'HL':'cwc', 'HM':'mws', 'HN':'mac', 'HP':'cmc', 'HQ':'cas', 'HR':'crc', 'HS':'mrc', 'HT':'mmc', 'HV':'swc', 'HW':'yrs', 'HY':'yac',
'IA':'ryc', 'IC':'wkc', 'ID':'rwc', 'IE':'rwa', 'IF':'wtc', 'IG':'rkc', 'IH':'mwc', 'IK':'awa', 'IL':'mtc', 'IM':'ats', 'IN':'awc', 'IP':'myc', 'IQ':'mya', 'IR':'aka', 'IS':'akc', 'IT':'ayc', 'IV':'rtc', 'IW':'wks', 'IY':'wwc',
'KA':'rma', 'KC':'wrs', 'KD':'ras', 'KE':'rag', 'KF':'wwm', 'KG':'rra', 'KH':'mas', 'KI':'awa', 'KL':'mwa', 'KM':'awg', 'KN':'aas', 'KP':'mma', 'KQ':'maa', 'KR':'arg', 'KS':'ars', 'KT':'ama', 'KV':'rwa', 'KW':'wrg', 'KY':'was',
'LA':'syc', 'LC':'ykc', 'LD':'swc', 'LE':'swg', 'LF':'ytc', 'LG':'skc', 'LH':'csc', 'LI':'mtc', 'LK':'mwa', 'LM':'mtg', 'LN':'mwc', 'LP':'cyc', 'LQ':'cyg', 'LR':'ckg', 'LS':'tyr', 'LT':'myc', 'LV':'stg', 'LW':'tkg', 'LY':'ywc',
'MA':'ryg', 'MC':'wks', 'MD':'rws', 'ME':'rwg', 'MF':'wts', 'MG':'rrg', 'MH':'mws', 'MI':'ats', 'MK':'awg', 'ML':'mtg', 'MN':'aws', 'MP':'myg', 'MQ':'mwg', 'MR':'akg', 'MS':'aks', 'MT':'ayg', 'MV':'rtg', 'MW':'wkg', 'MY':'wws',
'NA':'rmc', 'NC':'wrc', 'ND':'rac', 'NE':'ras', 'NF':'wwc', 'NG':'rrc', 'NH':'mac', 'NI':'awc', 'NK':'aas', 'NL':'mwc', 'NM':'aws', 'NP':'mmc', 'NQ':'mas', 'NR':'ars', 'NS':'arc', 'NT':'amc', 'NV':'rwc', 'NW':'wrs', 'NY':'wac',
'PA':'scc', 'PC':'ysc', 'PD':'smc', 'PE':'smg', 'PF':'yyc', 'PG':'ssc', 'PH':'cmc', 'PI':'myc', 'PK':'mma', 'PL':'cyc', 'PM':'myg', 'PN':'mmc', 'PQ':'cmr', 'PR':'csc', 'PS':'yct', 'PT':'mcc', 'PV':'syc', 'PW':'ysg', 'PY':'ymc',
'QA':'sma', 'QC':'yrs', 'QD':'sas', 'QE':'sag', 'QF':'yws', 'QG':'grg', 'QH':'cas', 'QI':'mya', 'QK':'maa', 'QL':'cyg', 'QM':'mwg', 'QN':'mas', 'QP':'cmr', 'QR':'crg', 'QS':'yma', 'QT':'mma', 'QV':'swg', 'QW':'yrg', 'QY':'yas',
'RA':'rsa', 'RC':'ygc', 'RD':'src', 'RE':'rrg', 'RF':'ykc', 'RG':'sgg', 'RH':'crc', 'RI':'aka', 'RK':'arg', 'RL':'ckg', 'RM':'akg', 'RN':'ars', 'RP':'csc', 'RQ':'crg', 'RS':'ags', 'RT':'asc', 'RV':'rkc', 'RW':'ygg', 'RY':'yrc',
'SA':'kcc', 'SC':'tsc', 'SD':'rrc', 'SE':'kmg', 'SF':'tyc', 'SG':'rgc', 'SH':'mrc', 'SI':'akc', 'SK':'ars', 'SL':'tyr', 'SM':'aks', 'SN':'arc', 'SP':'yct', 'SQ':'yma', 'SR':'ags', 'ST':'asc', 'SV':'kyc', 'SW':'tsg', 'SY':'tmc',
'TA':'rcc', 'TC':'wsc', 'TD':'rmc', 'TE':'rmg', 'TF':'wyc', 'TG':'rsc', 'TH':'mmc', 'TI':'ayc', 'TK':'ama', 'TL':'myc', 'TM':'ayg', 'TN':'amc', 'TP':'mcc', 'TQ':'mma', 'TR':'asc', 'TS':'asc', 'TV':'ryc', 'TW':'wsg', 'TY':'wmc',
'VA':'gyc', 'VC':'kkc', 'VD':'gwc', 'VE':'gwg', 'VF':'ktc', 'VG':'gkc', 'VH':'swc', 'VI':'rtc', 'VK':'rwa', 'VL':'stg', 'VM':'rtg', 'VN':'rwc', 'VP':'syc', 'VQ':'swg', 'VR':'rkc', 'VS':'kyc', 'VT':'ryc', 'VW':'kkg', 'VY':'kwc',
'WA':'ksg', 'WC':'tgs', 'WD':'krs', 'WE':'rrg', 'WF':'tks', 'WG':'kgg', 'WH':'yrs', 'WI':'wks', 'WK':'wrg', 'WL':'tkg', 'WM':'wkg', 'WN':'wrs', 'WP':'ysg', 'WQ':'yrg', 'WR':'ygg', 'WS':'tsg', 'WT':'wsg', 'WV':'kkg', 'WY':'trs',
'YA':'kmc', 'YC':'trc', 'YD':'kac', 'YE':'kas', 'YF':'twc', 'YG':'krc', 'YH':'yac', 'YI':'wwc', 'YK':'was', 'YL':'ywc', 'YM':'wws', 'YN':'wac', 'YP':'ymc', 'YQ':'yas', 'YR':'yrc', 'YS':'tmc', 'YT':'wmc', 'YV':'kwc', 'YW':'trs',
}
# A reverse lookup for degenerate bases
Degenerate_Base_lookup = {
'a': 'a', 'c': 'c','g': 'g','t': 't',
'r': 'ag','y': 'ct','m': 'ac','k': 'gt','s': 'cg','w': 'at',
'h': 'act','b': 'cgt','v': 'acg','d': 'agt',
'n': 'acgt'
}
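# Illustrative sketch (not part of the workflow; the helper name is
# hypothetical): expanding a degenerate codon into the set of amino acids it
# encodes, using Degenerate_Base_lookup and Codon_to_AA. Note that the best
# degenerate codon for an amino-acid pair can encode extra residues beyond
# that pair; for example, 'ksc' (the 'AC' entries above) expands to
# {'A', 'C', 'G', 'S'}.
def _example_expand_degenerate_codon(codon):
    from itertools import product
    return {Codon_to_AA[''.join(bases)]
            for bases in product(*(Degenerate_Base_lookup[base] for base in codon))}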
# An amino-acid key for IQ-Tree *.state files.
AA_key = [
'A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V'
]
# Reads in a fasta file and renames certain sequences based on characters
# forbidden in IQ-Tree, as needed.
def fasta2dict(fasta_path, return_dict=None):
    # Avoid a mutable default argument; a shared default dict would leak
    # entries between calls.
    if return_dict is None:
        return_dict = {}
# Read in the file and prepare some variables
with open(fasta_path, 'r') as infile:
fastafile = infile.readlines()
working_sequence = ''
key = None
for line in fastafile: # For every line in the input fasta
if line[0] == '>': # Check if it's a name
            if working_sequence != '':  # If we already have a working sequence, another name indicates the previous record is done; save it.
                return_dict[key] = working_sequence
            # Record the new name, replacing characters IQ-Tree forbids.
            key = line[1:].rstrip().replace(':', '_').replace('|', '_')
            working_sequence = ''  # clear the working sequence
        else:  # If the line isn't a name, it's part of the sequence.
            if not all((char.isalpha() or char == '-') for char in line.rstrip()):
                raise ValueError(
                    f"The provided file ({fasta_path}) was not a fasta file.")
            working_sequence = working_sequence + line.rstrip()
# Finally, clean up anything left in the working sequence before returning
# the dictionary.
    if working_sequence != '':
        if not all((char.isalpha() or char == '-') for char in working_sequence):
            raise ValueError(
                f"The provided file ({fasta_path}) was not a fasta file.")
        return_dict[key] = working_sequence
    if len(return_dict) == 0:
        raise ValueError(
            f"The provided file ({fasta_path}) was not a fasta file.")
return return_dict
# Saves dictionary to fasta where the dictionary key is the protein name
# and the value is the sequence
def dict2fasta(fasta_d, fpath):
with open(fpath, 'w+') as out_fasta:
for key, seq in fasta_d.items():
out_fasta.write(f'>{key}\n{seq}\n')
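# Usage sketch for the two helpers above (illustrative only; "example.fasta"
# is a hypothetical path): round-trip a fasta file through a dictionary.
#   seqs = fasta2dict("example.fasta")
#   dict2fasta(seqs, "copy.fasta")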
def Is_Valid_AA(AA):  # Is the argument a valid amino acid, or a list (or list of lists) of amino acids?
    if isinstance(AA, str):  # This block lets us evaluate strings of amino acids
        return all((residue in AA_key) or (residue == '-') for residue in AA)
    if isinstance(AA, list) and isinstance(AA[0], str):  # This block lets us evaluate lists of amino acids
        return all((i in AA_key) or (i == '-') for i in AA)
    if isinstance(AA, list) and isinstance(AA[0], list):  # This block lets us evaluate lists of lists of amino acids
        return all(all(((i in AA_key) or (i == '-'))
                       for i in lst) for lst in AA)
    raise ValueError("A bad type was evaluated as an amino acid list....")
def Is_Valid_Codon(codon):  # Is the argument a valid codon or list of codons?
    dna_letters = ['a', 'c', 't', 'g', 'r', 'y', 'm', 'k', 's', 'w', 'h', 'b', 'v', 'd', 'n']
    if isinstance(codon, str):
        return (((len(codon) % 3) == 0) and all(
            i in dna_letters for i in codon))
    if isinstance(codon, list):
        return all(((len(c) == 3) and all(i in dna_letters for i in c))
                   for c in codon)
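# Illustrative examples for the validators above (not executed):
# Is_Valid_AA('AC-T') is True, while Is_Valid_AA('ACXT') is False because 'X'
# is not in AA_key; Is_Valid_Codon('rma') is True because each character is a
# standard or mixed DNA base.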
# Interact with BlastP, and record the XML
def NCBI_to_XML(dirname, sequence, hits=2000, expect_value=0.30, seq_name=''):
# For the given sequence, we will run a BlastP search and parse the XML
# return to make a multi-fasta file
blast_result_handle = NCBIWWW.qblast(
"blastp",
"nr",
sequence,
hitlist_size=hits,
expect=expect_value)
XML_Name = f"BlastP_XML_{seq_name}"
with open(f"./{dirname}/{XML_Name}", "w+") as fout:
fout.write(blast_result_handle.read())
blastp_xml = ET.parse(f"./{dirname}/{XML_Name}")
return (blastp_xml) # Returns the ElementTree type from the xml library
# Parse the BlastP XML - record Fasta
def Parse_BlastP_XML(dirname, blastp_xml, sequence, sequence_name=None):
Fasta_Dict = {}
return_name = "BlastP_Results.fasta"
# If no sequence_name is identified (Meaning this is the search with the
# user sequence)
if sequence_name is None:
if isinstance(sequence, str):
with open(f"./{dirname}/HitsInfo.csv", "a+") as fout:
fout.write(f"Hit ID,Hit Description,Hit Sequence\n")
# Parsing the XML object, looking for hits
for hit in blastp_xml.findall(
'./BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
                    # I've tried to also add the Hit_accession, but I can't
                    # access that from the XML for some reason
hitid = (hit.find('Hit_id')).text
hitdef = (hit.find('Hit_def')).text
hitaccession = (
hit.find('Hit_accession')).text.replace(
".", "_")
seq = (hit.find('Hit_hsps/Hsp/Hsp_hseq')).text
                    # If the sequence doesn't have unknown amino acids (annoying), then record it.
                    # The optional second method also removes exceptionally short or long sequences - be sure to sync with the code ~13 lines below
# if (("X" not in seq) and
# (len(seq)<((1+length_cutoff)*User_Sequence_Length)) and
# (len(seq)>((1-length_cutoff)*User_Sequence_Length))):
if ("X" not in seq):
fout.write(f"{hitid},{hitdef},{seq}\n")
Fasta_Dict[hitaccession] = seq
with open(f"{dirname}/{return_name}", "a+") as blastp_file:
for key, F_D_Sequence in Fasta_Dict.items():
# if
# (len(Sequence)<((1+length_cutoff)*User_Sequence_Length))
# and
# (len(Sequence)>((1-length_cutoff)*User_Sequence_Length)):
if len(F_D_Sequence) > 0.5 * len(sequence) and len(F_D_Sequence) < 1.5 * \
len(sequence) and Is_Valid_AA(F_D_Sequence):
# We remove all gaps, because CD-Hit cannot handle
# gaps.
blastp_file.write(
f">{key}\n{F_D_Sequence.replace('-','')}\n")
# Modify the BlastP return to remove duplicate sequences.
Remove_Duplicate_Sequences_FASTA(dirname, f"{return_name}")
with open(f"{dirname}/{return_name}", "a+") as blastp_file:
blastp_file.write(f">User_Sequence\n{sequence}\n")
# If we've been given multiple sequences, we have a dict for sequence
# and a list of xmls for blastp_xml
elif isinstance(sequence, dict):
for xml in blastp_xml:
with open(f"./{dirname}/HitsInfo.csv", "a+") as fout:
# .csv header
fout.write(f"Hit ID,Hit Description,Hit Sequence\n")
# Parsing the XML objects, looking for hits
for hit in xml.findall(
'./BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
                        # I've tried to also add the Hit_accession, but I can't
                        # access that from the XML for some reason
hitid = (hit.find('Hit_id')).text
hitdef = (hit.find('Hit_def')).text
hitaccession = (
hit.find('Hit_accession')).text.replace(
".", "_")
seq = (hit.find('Hit_hsps/Hsp/Hsp_hseq')).text
                        # If the sequence doesn't have unknown amino acids (annoying), then record it.
                        # The optional second method also removes exceptionally short or long sequences - be sure to sync with the code ~13 lines below
# if (("X" not in seq) and
# (len(seq)<((1+length_cutoff)*User_Sequence_Length))
# and
# (len(seq)>((1-length_cutoff)*User_Sequence_Length))):
if ("X" not in seq):
fout.write(f"{hitid},{hitdef},{seq}\n")
Fasta_Dict[hitaccession] = seq
            for key, seq in sequence.items():  # Ensure all user sequences are in the final fasta file
if key not in Fasta_Dict:
Fasta_Dict.update({key: seq})
with open(f"{dirname}/{return_name}", "a+") as blastp_file:
for key, F_D_Sequence in Fasta_Dict.items():
# if
# (len(Sequence)<((1+length_cutoff)*User_Sequence_Length))
# and
# (len(Sequence)>((1-length_cutoff)*User_Sequence_Length)):
if len(F_D_Sequence) < 10000 and len(
F_D_Sequence) > 10 and Is_Valid_AA(F_D_Sequence):
# We remove all gaps, because CD-Hit cannot handle
# gaps.
blastp_file.write(
f"\n>{key}\n{F_D_Sequence.replace('-','')}")
# Modify the BlastP return to remove duplicate sequences.
Remove_Duplicate_Sequences_FASTA(dirname, return_name)
return (return_name)
# If a sequence_name has been provided, this means we're doing Supplement
# searches so our output directory structure should be different.
elif isinstance(sequence_name, str):
with open(f"{dirname}/{sequence_name}_supplement.csv", "a+") as fout: # DIFFERENT FROM ABOVE
fout.write(f"Hit ID,Hit Description,Hit Sequence\n")
# Parsing the XML object, looking for hits
for hit in blastp_xml.findall(
'./BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
                # I've tried to also add the Hit_accession, but I can't access
                # that from the XML for some reason
hitid = (hit.find('Hit_id')).text
hitdef = (hit.find('Hit_def')).text
hitaccession = (
hit.find('Hit_accession')).text.replace(
".", "_")
seq = (hit.find('Hit_hsps/Hsp/Hsp_hseq')).text
                # If the sequence doesn't have unknown amino acids (annoying), then record it.
                # The optional second method also removes exceptionally short or long sequences - be sure to sync with the code ~13 lines below
# if (("X" not in seq) and
# (len(seq)<((1+length_cutoff)*User_Sequence_Length)) and
# (len(seq)>((1-length_cutoff)*User_Sequence_Length))):
if ("X" not in seq):
fout.write(f"{hitid},{hitdef},{seq}\n")
Fasta_Dict[hitaccession] = seq
# DIFFERENT FROM ABOVE
with open(f"{dirname}/{sequence_name}_{return_name}", "a+") as blastp_file:
# DIFFERENT FROM ABOVE
blastp_file.write(f">{sequence_name}\n{sequence}\n")
for key, F_D_Sequence in Fasta_Dict.items():
if Is_Valid_AA(F_D_Sequence):
# if
# (len(Sequence)<((1+length_cutoff)*User_Sequence_Length))
# and
# (len(Sequence)>((1-length_cutoff)*User_Sequence_Length)):
# We remove all gaps, because CD-Hit cannot handle gaps.
blastp_file.write(
f">{key}\n{F_D_Sequence.replace('-','')}\n")
# Modify the BlastP return to remove duplicate sequences. #DIFFERENT
# FROM ABOVE
Remove_Duplicate_Sequences_FASTA(
dirname, f"{sequence_name}_{return_name}", True)
return (f"{sequence_name}_{return_name}")
else:
print("sequence_name was not a string type")
raise ValueError("sequence_name was not a string type")
# This function takes an amino acid sequence, submits a BlastP search,
# and records the result in a fasta file.
def BlastP(dirname, sequence, hits=2000, expect_value=0.2, sequence_name=None):
if not (os.path.isdir(dirname)):
os.mkdir(dirname)
if isinstance(sequence, str):
sequence = sequence.replace('-', '')
        if bool([char for char in sequence if (char not in AA_key)]):
            # A non-empty list here means some characters are not in AA_key.
            print(
                f"Invalid sequence submitted for BlastP search. Contains the invalid character(s) {[char for char in sequence if (char not in AA_key)]}")
raise ValueError("Invalid sequence submitted for BlastP search")
elif isinstance(sequence, dict):
for key, item in sequence.items():
item = item.replace('-', '')
sequence.update({key: item})
            if bool([char for char in item if (char not in AA_key)]):
                print(
                    f"Invalid sequence submitted for BlastP search ({key}). Contains the invalid character(s) {[char for char in item if (char not in AA_key)]}")
raise ValueError(
f"Invalid sequence submitted for BlastP search ({key})")
try:
print("Acessing the NCBI database....")
if isinstance(sequence, str):
# No need to wait for an API call if this is just a re-run and we
# already have the xmls available
if not os.path.exists(f"{dirname}/BlastP_XML"):
blastp_xml = NCBI_to_XML(dirname, sequence, hits, expect_value)
else:
blastp_xml = ET.parse(f"./{dirname}/BlastP_XML")
elif isinstance(sequence, dict):
xmls = [] # List of ElementTree objects
for key, item in sequence.items():
if not os.path.exists(
f"{dirname}/BlastP_XML_{key.replace('.','_')}"):
xmls.append(
NCBI_to_XML(
dirname,
item,
hits,
expect_value,
key.replace(
'.',
'_')))
else:
xmls.append(
ET.parse(f"./{dirname}/BlastP_XML_{key.replace('.','_')}"))
except BaseException:
print("There was an error fetching the BlastP results")
raise RuntimeError("There was an error fetching the BlastP results.")
try:
# Now, we parse the XML object and make a multi-fasta file. We also write the fasta file which is the result of our BlastP search.
# The output behavior must be different if this the user sequence or a
# Supplement search though.
if sequence_name is None and isinstance(
sequence, str): # A primary search with one user sequence
if not os.path.exists(f"{dirname}/BlastP_Results.fasta"):
return_string = Parse_BlastP_XML(dirname, blastp_xml, sequence)
return (return_string)
else:
return ("BlastP_Results.fasta")
# A primary search with multiple user sequences
elif sequence_name is None and isinstance(sequence, dict):
if not os.path.exists(f"{dirname}/BlastP_Results.fasta"):
return_string = Parse_BlastP_XML(dirname, xmls, sequence)
return (return_string)
else:
return ("BlastP_Results.fasta")
elif isinstance(sequence_name, str): # A supplement search
if not os.path.exists(
f"{dirname}/{sequence_name}_BlastP_Results.fasta"):
try:
return_string = Parse_BlastP_XML(
dirname, blastp_xml, sequence, sequence_name)
return (return_string)
except BaseException:
return (False)
else:
return (f"{sequence_name}_BlastP_Results.fasta")
else:
print("Variable sequence_name was not a string type")
raise ValueError("Variable sequence_name was not a string type")
except BaseException:
print("There was an error recording the BlastP Results")
raise RuntimeError("There was an error recording the BlastP Results")
def Sequence_Processing(dirname, finname, sequence):
Fasta_Dict = {}
if isinstance(sequence, dict):
with open(f"{dirname}/{finname}", 'a') as fin:
fin.write("\n")
for key, item in sequence.items():
fin.write(f">{key}\n{item}\n")
else:
try:
# Alignment is needed for the Hamming Distance
os.system(
f"{MAFFT_Executable} {dirname}/{finname} > {dirname}/Early_Alignment_temp.fasta")
except BaseException:
raise RuntimeError("There was an error running MAFFT.")
Fasta_Dict = fasta2dict(f"{dirname}/Early_Alignment_temp.fasta", {})
os.remove(f"{dirname}/Early_Alignment_temp.fasta")
    # Compute the Hamming distance of every sequence to the user sequence.
Hamming_Dict = Fasta_Dict_Hamming(
Fasta_Dict, Fasta_Dict["User_Sequence"])
for key, item in Hamming_Dict.items():
# If a given sequence has less than 50% similarity with the user
# sequence, remove it.
if (item / len(Fasta_Dict["User_Sequence"])
) > 0.5 and key != "User_Sequence":
Fasta_Dict.pop(key)
for key, item in Fasta_Dict.items():
# We now need to remove all the gaps in all the sequences to use CD-Hit
Fasta_Dict.update({key: item.replace("-", "")})
if supplement_Cutoff != 0: # If we are doing supplementation
# This one line supplements poorly supported areas of the tree, and
# updates the dictionary with those additional sequences.
Fasta_Dict.update(
fasta2dict(f"{dirname}/{Supplement_Sequences(dirname,Fasta_Dict)}"))
# Save the supplemented Fasta_Dict
dict2fasta(Fasta_Dict, f"{dirname}/pre-Alignment.fasta")
try:
# Align the supplemented sequences - called the Raw alignment
os.system(
f"{MAFFT_Executable} {dirname}/pre-Alignment.fasta > {dirname}/Raw_Alignment.fasta")
except BaseException:
raise RuntimeError("There was an error running MAFFT.")
# clean up our clutter, a little
os.remove(f"{dirname}/pre-Alignment.fasta")
Fasta_Dict_Aligned = fasta2dict(f"{dirname}/Raw_Alignment.fasta")
if isinstance(sequence, dict):
# The Raw alignment can then be processed into an alignment that's
# ready for IQTree
user_seq_name = [name for name in sequence.keys(
) if name in Fasta_Dict_Aligned.keys()][0]
return_name = Post_MAFFT_processing(
dirname, Fasta_Dict_Aligned, user_seq_name)
    else:  # If only one user sequence was given
return_name = Post_MAFFT_processing(dirname, Fasta_Dict_Aligned, False)
return return_name
def CDHit_Cutoff(identity):
if (identity <= 0.4 or identity > 1):
raise ValueError("The CD-Hit identity is invalid")
elif identity > 0.7:
return (5)
elif identity > 0.6:
return (4)
elif identity > 0.5:
return (3)
elif identity > 0.4:
return (2)
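# Note: the value returned above is passed to CD-Hit's -n flag (word size).
# The ranges follow the CD-Hit user guide's recommendations: word size 5 for
# identities 0.7-1.0, 4 for 0.6-0.7, 3 for 0.5-0.6, and 2 for 0.4-0.5.
# For example, CDHit_Cutoff(0.99) returns 5.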
# This function will run CD-Hit and also ensure that the user sequence(s)
# are not removed
def CDHit(dirname, fin, fout, cutoff, supplement=False):
try:
# Run CD-Hit with user parameters
os.system(
f'{CDHIT_Executable} -i {dirname}/{fin} -o {dirname}/{fout} -c {cutoff} -n {CDHit_Cutoff(cutoff)}')
except BaseException:
print("There was an error running CD-Hit.")
raise RuntimeError("There was an error running CD-Hit.")
fout_dict = fasta2dict(f"{dirname}/{fout}")
realign = False
if not supplement:
if isinstance(User_Input_Sequence, dict):
            for key, seq in User_Input_Sequence.items():  # Ensure all user sequences are in the final fasta file
if key not in fout_dict:
realign = True
fout_dict.update({key: seq.replace("-", '')})
else:
if 'User_Sequence' not in fout_dict.keys():
realign = True
fout_dict.update(
{'User_Sequence': User_Input_Sequence.replace("-", '')})
if realign:
dict2fasta(fout_dict, "Temp.fasta")
try:
            # Re-align after re-adding the user sequence(s).
os.system(f"{MAFFT_Executable} Temp.fasta > {dirname}/{fout}")
os.remove("Temp.fasta")
except BaseException:
raise RuntimeError("There was an error running MAFFT.")
# This function MODIFIES a fasta file to remove highly-duplicate sequences.
def Remove_Duplicate_Sequences_FASTA(dirname, fpath, supplement=False):
CDHit(dirname, fpath, f"{fpath[:-6]}_temp.fasta", 0.99, supplement)
os.remove(f"{dirname}/{fpath}")
os.remove(f"{dirname}/{fpath[:-6]}_temp.fasta.clstr")
# Replace the original file with one that has no duplicate sequences.
os.system(f"mv {dirname}/{fpath[:-6]}_temp.fasta {dirname}/{fpath}")
# returns the Hamming distance for every sequence in an aligned fasta
# dictionary.
def Fasta_Dict_Hamming(Fasta_Dict, sequence):
Hamming_dict = {}
for key, value in Fasta_Dict.items():
Hamming_Diff = 0
if len(value) != len(sequence):
print("Hamming distance should be computed with properly aligned sequences.")
raise ValueError(
"Hamming distance should be computed with properly aligned sequences.")
for i, char in enumerate(value):
if sequence[i] != char:
Hamming_Diff += 1
Hamming_dict[key] = Hamming_Diff
return Hamming_dict
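# Worked example (illustrative only): Fasta_Dict_Hamming({'seq1': 'AC-T'}, 'ACGT')
# returns {'seq1': 1}, since the aligned strings differ only at position 2.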
# Find areas of poor coverage on the current tree and get additional
# BlastP search results.
def Supplement_Sequences(dirname, fasta_dict):
if not (os.path.isdir(f"{dirname}/Sequence_Supplement")
): # Make a directory for this sequence Supplementing process
os.mkdir(f"{dirname}/Sequence_Supplement")
# Now we can begin to identify sequences which are similar enough to the
# rest of the dataset to be retained, but dissimilar enough that they
# would benefit from supplementing.
# Be sure there are no gaps in this dictionary, as CD-Hit will reject
# those.
dict2fasta(
fasta_dict,
f"{dirname}/Sequence_Supplement/Sequences_to_be_supplemented.fasta")
sequences_list_for_search = []
CDHit(
dirname,
"Sequence_Supplement/Sequences_to_be_supplemented.fasta",
"Sequence_Supplement/SingleClusters_Supplemented.fasta",
supplement_Cutoff,
True) # Run CD-Hit with supplementation cutoff to identify 'loner' sequences
# Looking at the CD-Hit clstr file (which holds cluster information)
with open(f"{dirname}/Sequence_Supplement/SingleClusters_Supplemented.fasta.clstr", 'r') as fin:
clusters = (fin.read()).split('>Cluster ')
for cluster in clusters: # For each cluster
# Split off the header from each sequence name
seqs = cluster.split('>')
if len(seqs) == 2: # If there is only one sequence in the cluster
            # Separate and record the name.
sequences_list_for_search.append((seqs[1])[:-6])
# Now we can submit a BlastP search for all of the sequences that need to
# be supplemented, and write all sequences together as one file.
files_list = [
f"{dirname}/Sequence_Supplement/Sequences_to_be_supplemented.fasta"]
    print(
        f"Supplementing {len(sequences_list_for_search)} sequences with poor neighborhood coverage.")
for sequence_name in sequences_list_for_search:
if not os.path.exists(
f"{dirname}/Sequence_Supplement/{sequence_name}_BlastP_Results.fasta"):
# Now we submit a BlastP search, but override the default expect
# and hits to get a more narrow set of sequences.
file = BlastP(
f"{dirname}/Sequence_Supplement",
fasta_dict[sequence_name],
100,
0.01,
sequence_name)
if file:
                # We also record a list of all the output fasta files to
                # concatenate them together later.
files_list.append(f"{dirname}/Sequence_Supplement/{file}")
# Now we need to write all of our Supplemented sequence searches together
# as one fasta file. Just smoosh 'em all together
with open(f"{dirname}/Sequence_Supplement/Supplemented_BlastP_Sequences.fasta", "a+") as fout:
for fname in files_list:
with open(fname, 'r') as supfile:
fout.write(supfile.read())
Remove_Duplicate_Sequences_FASTA(
dirname, "Sequence_Supplement/Supplemented_BlastP_Sequences.fasta", True)
return (f"Sequence_Supplement/Supplemented_BlastP_Sequences.fasta")
# Remove sequences that cause insertions in the alignment
def Remove_Insertions(
fasta_dict,
User_Sequence,
deletion_percentage=0.02,
termini_length=0.05):
num_pos = len(User_Sequence)
acceptable_num_gaps = round(
len(User_Sequence.replace('-', '')) * deletion_percentage)
if acceptable_num_gaps < 5:
acceptable_num_gaps = 5
user_gap_pos = [i for i in range(num_pos) if User_Sequence.startswith(
'-', i)] # record all positions with a gap in the user sequence
    # The object 'key_sequence_gap_list' contains a list of tuples, each with
    # the name of a sequence and a list of all positions holding an amino
    # acid, in that order.
    # Sorry for the mess of a line, but it prevents unnecessary looping.
key_sequence_gap_list = [
(key, [
i for i in range(num_pos) if not seq.startswith(
'-', i)]) for key, seq in fasta_dict.items()]
keys_to_pop = []
# The ax is already at the root of the trees, and every tree that does not
# produce good fruit will be cut down and thrown into the fire
for key, no_gap_list in key_sequence_gap_list: # Now, for all sequences
count = 0
for pos in no_gap_list: # Evaluate the positions where it has amino acids
if (pos in user_gap_pos) and (pos > num_pos * termini_length) and (pos < num_pos * (1 - termini_length)
): # If it has an amino acid where the User_Sequence has a gap, and isn't in the N- or C- terminus
count += 1 # Tally a strike
else:
# If we come to a place of agreement, reset the count.
count = 0
# If the number of insertions in a row (count) is higher than what
# is acceptable
if count > acceptable_num_gaps:
if key != 'User_Sequence':
keys_to_pop.append(key) # Record this key as one to remove
# What further testimony do we need? We have heard it ourselves
# from his own lips.
break
else:
continue
for key in keys_to_pop:
fasta_dict.pop(key)
# Remove sequences that cause deletions in the alignment
def Remove_Deletions(
fasta_dict,
User_Sequence,
deletion_percentage=0.02,
termini_length=0.05):
num_pos = len(User_Sequence)
acceptable_num_gaps = round(
len(User_Sequence.replace('-', '')) * deletion_percentage)
if acceptable_num_gaps < 2:
acceptable_num_gaps = 2
user_gap_pos = [
i for i in range(num_pos) if User_Sequence.startswith(
'-', i)] # record all positions with a gap
    # The object 'key_sequence_gap_list' contains a list of tuples, each with
    # the name of a sequence and a list of all positions holding a gap, in
    # that order.
    # Sorry for the mess of a line, but it prevents unnecessary looping.
key_sequence_gap_list = [
(key, [
i for i in range(num_pos) if seq.startswith(
'-', i)]) for key, seq in fasta_dict.items()]
keys_to_pop = []
# The ax is already at the root of the trees, and every tree that does not
# produce good fruit will be cut down and thrown into the fire
for key, gap_list in key_sequence_gap_list: # Now, for all sequences
count = 0
for pos in gap_list: # Evaluate the positions where it has gaps
if (pos not in user_gap_pos) and (pos > num_pos * termini_length) and (pos < num_pos * (1 - termini_length)
): # If it has gaps where the User_Sequence does not, and isn't in the N- or C- terminus
count += 1 # Tally a strike
else:
# If we come to a place of agreement, reset the count.
count = 0
# If the number of gaps in a row (count) is higher than what is
# acceptable
if count > acceptable_num_gaps:
if key != 'User_Sequence':
keys_to_pop.append(key) # Record this key as one to remove
# What further testimony do we need? We have heard it ourselves
# from his own lips.
break
else:
continue
for key in keys_to_pop:
fasta_dict.pop(key)
def Clean_all_gaps(fasta_dict):
    # For each position, go through all sequences. If any sequence has a
# residue, leave the position in at the end.
sequences_list = [i for i in fasta_dict.values()]
pos_to_leave = []
for pos in range(len(sequences_list[0])):
for seq in sequences_list:
if (seq[pos] != '-'):
pos_to_leave.append(pos)
break
else:
continue
# Remove all positions of all gaps from all sequences in the alignment
for key, sequence in fasta_dict.items(): # Remove all the gaps
fasta_dict.update(
{key: ''.join([char for i, char in enumerate(sequence) if i in pos_to_leave])})
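# Worked example (illustrative only): for {'a': 'A-C-', 'b': '-GC-'}, column 3
# is a gap in every sequence, so Clean_all_gaps trims the dict in place to
# {'a': 'A-C', 'b': '-GC'}.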
# Modifications after the alignment, mostly having to do with gaps.
def Post_MAFFT_processing(
dirname,
fasta_dict,
multisequence,
dynamic_sequence_reduction=True):
# These functions **MODIFY** the fasta_dict by doing what their names say
# they do.
old_user_length = 0
if multisequence: # Just get one of them
User_Sequence = fasta_dict.get(multisequence)
        # Each of these functions mutates the alignment, so it's important to
        # repeat until we've finished finding all misalignments.
        # HOWEVER - with a multi-sequence input, this tends to be overzealous,
        # so we only loop a fixed number of times (default 2).
for i in range(multisequence_iterations):
Remove_Insertions(fasta_dict, User_Sequence) # Insertions
Clean_all_gaps(fasta_dict) # Clean
User_Sequence = fasta_dict.get(multisequence) # Update
Remove_Deletions(fasta_dict, User_Sequence) # Deletions
            for key, seq in fasta_dict.items():  # Now we realign the sequences rather than just removing all the gaps - more reliable
fasta_dict.update({key: seq.replace("-", '')})
dict2fasta(fasta_dict, f"{dirname}/Post_Mafft_Working1.fasta")
try:
                # Re-align the cleaned sequences.
os.system(
f"{MAFFT_Executable} {dirname}/Post_Mafft_Working1.fasta > {dirname}/Post_Mafft_Working2.fasta")
fasta_dict = {}
except BaseException:
raise RuntimeError("There was an error running MAFFT.")
fasta2dict(f"{dirname}/Post_Mafft_Working2.fasta", fasta_dict)
os.system(
f"rm {dirname}/Post_Mafft_Working1.fasta {dirname}/Post_Mafft_Working2.fasta")
User_Sequence = fasta_dict.get(multisequence) # Update
else:
User_Sequence = fasta_dict.get("User_Sequence")
        # Each of these functions mutates the alignment, so it's important to
        # repeat until we've finished finding all misalignments.
while len(User_Sequence) != old_user_length:
old_user_length = len(User_Sequence) # Store length
Remove_Insertions(fasta_dict, User_Sequence) # Insertions
Clean_all_gaps(fasta_dict) # Clean
User_Sequence = fasta_dict.get("User_Sequence") # Update
Remove_Deletions(fasta_dict, User_Sequence) # Deletions
            for key, seq in fasta_dict.items():  # Now we realign the sequences rather than just removing all the gaps - more reliable
fasta_dict.update({key: seq.replace("-", '')})
dict2fasta(fasta_dict, f"{dirname}/Post_Mafft_Working1.fasta")
try:
                # Re-align the cleaned sequences.
os.system(
f"{MAFFT_Executable} {dirname}/Post_Mafft_Working1.fasta > {dirname}/Post_Mafft_Working2.fasta")
fasta_dict = {}
except BaseException:
raise RuntimeError("There was an error running MAFFT.")
fasta2dict(f"{dirname}/Post_Mafft_Working2.fasta", fasta_dict)
os.system(
f"rm {dirname}/Post_Mafft_Working1.fasta {dirname}/Post_Mafft_Working2.fasta")
User_Sequence = fasta_dict.get("User_Sequence") # Update
dict2fasta(fasta_dict, f'{dirname}/Post_MAFFT_Cleaned_Penultimate.fasta')
    # If on, this code will reduce the sequence alignment down to fewer than a
    # set number of sequences. It's on by default.
if dynamic_sequence_reduction and len(fasta_dict) > Final_Dataset_Size:
keep_going = True
identity = 0.98
for key, item in fasta_dict.items():
fasta_dict.update({key: item.replace("-", "")})
dict2fasta(fasta_dict, f'{dirname}/Post_MAFFT_No_Gaps.fasta')
# Otherwise, we'll keep lowering the CD-Hit cutoff until we get below
# the final library size.
while keep_going:
try:
os.system(f"rm {dirname}/Penultimate_Sequences.fasta")
except BaseException:
continue
CDHit(
dirname,
"Post_MAFFT_No_Gaps.fasta",
"Penultimate_Sequences.fasta",
round(
identity,
3)) # Run CD-Hit
identity -= 0.01
with open(f"{dirname}/Penultimate_Sequences.fasta", "r") as fin:
lines = fin.readlines()
if len(lines) <= (Final_Dataset_Size * 2) or identity < 0.80:
print(f"Number Reached. Lines={len(lines)}")
keep_going = False
if len(lines) <= 100:
keep_going = False
try:
os.system(f"rm {dirname}/Penultimate_Sequences.fasta")
except BaseException:
continue
CDHit(
dirname,
"Post_MAFFT_No_Gaps.fasta",
"Penultimate_Sequences.fasta",
round(
(identity + 0.01),
3)) # Run CD-Hit
try:
os.system(f"rm {dirname}/Post_MAFFT_No_Gaps.fasta")
# align the sequences passed into the function
os.system(
f"{MAFFT_Executable} {dirname}/Penultimate_Sequences.fasta > {dirname}/Final_Sequences.fasta")
except BaseException:
print("There was an error creating the sequence alignemnt.")
raise RuntimeError(
"There was an error creating the sequence alignemnt.")
return ("Final_Sequences.fasta")
else:
os.system(
f'cp {dirname}/Post_MAFFT_Cleaned_Penultimate.fasta {dirname}/Final_Sequences.fasta')
os.system(f'rm {dirname}/Post_MAFFT_Cleaned_Penultimate.fasta')
return ("Final_Sequences.fasta")
# Construct a phylogenetic tree, and determine a good model.
def IQTree_Phylo(dirname, finname):
if not os.path.isdir(f"{dirname}/IQTree_Phylo"): # Make the directory
os.mkdir(f"{dirname}/IQTree_Phylo")
try:
# IQTree section
os.system(
f'{IQTREE_Executable} -s {dirname}/{finname} -pre {dirname}/IQTree_Phylo/Phylo -alrt 20000 -bb 20000 -nt AUTO')
except BaseException:
print("There was an error building the phylogeny in IQTree.")
raise RuntimeError(
"There was an error building the phylogeny in IQTree.")
def IQTree_ASR(dirname, finname):  # Using the tree/model from the phylogeny, do the ASR
model = ''
    try:  # This can fail because, for some reason, the HPCC tries to run it before IQTree_Phylo has finished running.
        # We need to go to the .iqtree file from the phylogeny and find the
        # best model for this data set. This will save a ton of time.
with open(f"{dirname}/IQTree_Phylo/Phylo.iqtree", "r") as fin:
iqtree_file_lines = fin.readlines()
if "Best-fit model" in iqtree_file_lines[42]:
model = iqtree_file_lines[42].split(':')[1].strip()
# If we can't find the Best-fit model where we expect it to be, then
# check the whole file.
else:
for line in iqtree_file_lines:
if "Best-fit model" in line:
model = line.split(':')[1].strip()
except BaseException:
pass
if not os.path.isdir(f"{dirname}/IQTree_ASR"): # Make the directory
os.mkdir(f"{dirname}/IQTree_ASR")
# Preventing errors; if we can't find the model, then we'll just use the
# default behavior.
if model == '':
model = 'MFP'
try:
# IQTree section
# Ask what model is best for ASR, use that one here.
os.system(f'{IQTREE_Executable} -s {dirname}/{finname} -te {dirname}/IQTree_Phylo/Phylo.treefile -pre {dirname}/IQTree_ASR/ASR -asr -m {model} -redo -nt AUTO')
except BaseException:
print("There was an error conducting ASR in IQTree.")
raise RuntimeError("There was an error conducting ASR in IQTree.")
with open(f"{dirname}/IQTree_Phylo/Phylo.treefile", "r") as fin:
Phylo_Tree = fin.read()
with open(f"{dirname}/IQTree_ASR/ASR.treefile", "r") as fin:
ASR_Tree = fin.read()
with open(f"{dirname}/Final_Tree.treefile", "w+") as fout:
fout.write(f"{Phylo_Tree}{ASR_Tree}")
# Returns a dictionary out of *.state file
ASR_Statefile_Dict = Statefile_to_Dict(dirname, "IQTree_ASR/ASR.state")
return (ASR_Statefile_Dict)
# Select ancestral nodes which have a high enough confidence to resurrect
# ancestors from them.
def Select_Ancestor_Nodes(dirname):
Supports = {}
UFB_Supports = []
SHALRT_Supports = []
    # This will evaluate the tree, which is in Newick format, and select nodes of high enough confidence to reconstruct an ancestral library.
    # The Phylo.treefile has confidence values as (SH-aLRT/UFB), and the
    # ASR.treefile has node names.
with open(f'{dirname}/IQTree_ASR/ASR.treefile', 'r') as ASRtreefin:
ASRtreefile = ASRtreefin.read()
with open(f'{dirname}/IQTree_Phylo/Phylo.treefile', 'r') as Phylotreefin:
Phylotreefile = Phylotreefin.read()
    confident_nodes = []  # these will be the nodes with good values. This isn't quite the same value between *.contree and *.treefile, but close
# Let's split off just the information for each node, which is stored