forked from smirarab/sepp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_script.py
66 lines (58 loc) · 2.73 KB
/
merge_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
Created on Oct 10, 2012
@author: smirarab
'''
import sys,random,argparse
from argparse import ArgumentParser, Namespace
from sepp import get_logger
from sepp.alignment import MutableAlignment, ExtendedAlignment,_write_fasta
from sepp.exhaustive import JoinAlignJobs, ExhaustiveAlgorithm
from sepp.jobs import PplacerJob,MafftAlignJob,FastTreeJob,PastaAlignJob
from sepp.filemgr import get_temp_file
from sepp.config import options
import sepp.config
from sepp.math_utils import lcm
from sepp.problem import SeppProblem
from sepp.scheduler import JobPool
from multiprocessing import Pool, Manager
from sepp.alignment import ExtendedAlignment
import glob
job_joiner = JoinAlignJobs
original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta'
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(original_backbone_file)
original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100'
original_frag = MutableAlignment()
done = original_frag.read_filepath(original_frag_file)
#First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())
dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/')
dirs.reverse()
for dir in dirs:
print("Working on %s\n" % dir)
aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir)
base_alignment_file = glob.glob('%s/*.fasta' % dir)
base_alignment = MutableAlignment()
done = base_alignment.read_filepath(base_alignment_file[0])
subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names())
frags = MutableAlignment()
sequence_names = []
for file in sequence_files:
seq = MutableAlignment()
done = seq.read_filepath(file)
done = sequence_names.extend(seq.get_sequence_names())
for name, seq in seq.items():
frags[name] = seq.upper()
problem = SeppProblem(sequence_names)
problem.set_subalignment(subbackbone)
mut_subalg = problem.subalignment.get_mutable_alignment()
remaining_cols = mut_subalg.delete_all_gap()
problem.annotations["ref.alignment.columns"] = remaining_cols
problem.fragments = frags
ap_alg = problem.read_extendend_alignment_and_relabel_columns\
(base_alignment_file, aligned_files)
extendedAlignment.merge_in(ap_alg,convert_to_string=False)
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.unmasked.fasta")
extendedAlignment.remove_insertion_columns()
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.masked.fasta")