-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStr1_1.py
117 lines (102 loc) · 5.88 KB
/
Str1_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Question: When considering beta sheets, are consecutive residues of a
# strand closer than neighbouring residues on different strand? We will write
# program that calculates the distance between Cbeta atoms of consecutive
# residues in a beta strand and the Cbeta atoms of the neighboring residues
import numpy as np
import pandas as pd
import math
def dist(x,y):
return( math.sqrt( ((x[0]-y[0])**2)+ ((x[1]-y[1])**2)+ ((x[2]-y[2])**2)) )
# Using the protein 1bjt from Protein Data Bank, parsing the file
with open('1bjt.pdb') as pdbfile:
lines= list()
sheet = list()
for line in pdbfile:
if line[:4] == 'ATOM' :
splitted_line = [line[:7], line[7:11], line[11:16],line[16:20],
line[20:22], line[22:26], line[26:38],line[38:46],
line[46:54], line[54:60], line[60:66],line[66:]]
splitted_line =[ x.replace(' ', '') for x in splitted_line ]
lines.append(splitted_line)
if line[:5] == 'SHEET' :
splitted_line2 = [line[:6], line[7:10], line[11:14],line[14:16],
line[17:20], line[21], line[22:26],line[26],
line[28:31], line[32], line[33:37],line[37],
line[38:40],line[41:45], line[45:48], line[49],
line[50:54], line[54], line[56:60],line[60:63],
line[64], line[65:69],line[69]]
splitted_line2 =[ y.replace(' ', '') for y in splitted_line2 ]
sheet.append(splitted_line2)
# Protein data
data = pd.DataFrame(lines, columns=["atom","atom_num","atom_name","res_name",
"chain","res_seq_num","x_coor","y_coor",
"z_coor","occup","temp","elt_sym"])
data[['x_coor', 'y_coor','z_coor','occup', 'temp']] = data[['x_coor', 'y_coor',
'z_coor','occup', 'temp']].astype(float)
data[['atom_num', 'res_seq_num']] = data[['atom_num', 'res_seq_num']].astype(int)
# Sheet data
sheet_data = pd.DataFrame(sheet, columns=["sheet","str_num","sheet_id",
"num_str","ini_res_name",
"init_chain_id","init_seq_num",
"init_inst","ter_res_name",
"ter_chain_id","ter_seq_num",
"ter_ins","strand_sense","curr_atom",
"curr_res","curr_chain","curr_res_seq",
"curr_ins","prev_atom","prev_res",
"pre_chain","prev_res_seq","prev_ins"])
sheet_data[['str_num','num_str','init_seq_num',
'ter_seq_num','strand_sense']] = sheet_data[['str_num','num_str',
'init_seq_num','ter_seq_num',
'strand_sense']].astype('int64' )
sheet_data['prev_res_seq'] = pd.to_numeric(sheet_data['prev_res_seq'], errors='coerce')
sheet_data['curr_res_seq'] = pd.to_numeric(sheet_data['curr_res_seq'], errors='coerce')
# Slicing information we need, since first residue in the strand doesnt have previous
#residue information, we discard them
u = sheet_data.loc[sheet_data["str_num"]>1, ["curr_res_seq","prev_res_seq",
"sheet_id","strand_sense"] ]
u= u.reset_index()
#Distances with consecutive residues
cons_dist = np.empty(u.shape[0])
for x in range(0, u.shape[0]):
# Slicing x,y,z coordinates of the current residues
coord = (data.loc[ ( (data["res_seq_num"]==u["curr_res_seq"][x]) & (data["atom_name"]=="CB")),
["res_seq_num","x_coor","y_coor","z_coor"] ] )
# Slicing x,y,z coordinates of the consecutive residues
coord2 = (data.loc[ ( (data["res_seq_num"]==u["curr_res_seq"][x]+1) & (data["atom_name"]=="CB")),
["res_seq_num","x_coor","y_coor","z_coor"] ] )
#If Cbeta atoms exists for both residues
if(coord.shape == (1,4) and coord2.shape == (1,4) ) :
point1 = coord.iloc[0][1:4]
point2 = coord2.iloc[0][1:4]
cons_dist[x] = dist(point1,point2)
else:
cons_dist[x] = None
#Distances with neighboring residues
neighbor_dist = np.empty(u.shape[0])
for x in range(0, u.shape[0]):
coord = (data.loc[ ( (data["res_seq_num"]==u["curr_res_seq"][x]) & (data["atom_name"]=="CB")),
["res_seq_num","x_coor","y_coor","z_coor"] ] )
# Slicing x,y,z coordinates of the neighbouring residues on a different strand
# corresponding neighbor information is being kept on prev_res_seq vrb. in sheetdata
coord2 = (data.loc[ ( (data["res_seq_num"]==u["prev_res_seq"][x]) & (data["atom_name"]=="CB")),
["res_seq_num","x_coor","y_coor","z_coor"] ] )
if(coord.shape == (1,4) and coord2.shape == (1,4) ) :
point1 = coord.iloc[0][1:4]
point2 = coord2.iloc[0][1:4]
neighbor_dist[x] = dist(point1,point2)
else:
neighbor_dist[x] = None
u = u.join(pd.DataFrame(cons_dist,columns=['cons_dist']))
u = u.join(pd.DataFrame(np.array(u.loc[:,"curr_res_seq"]+1),columns=['consequtive_res']))
result = u.join(pd.DataFrame(neighbor_dist,columns=['neighbor_dist']))
result = result[['curr_res_seq','consequtive_res','prev_res_seq','sheet_id','strand_sense',
'cons_dist','neighbor_dist']]
# There are NA values for the distances if there is no Cbeta atoms for the
# corresponding residue. As can be seen in the last three columns of the "result"
# data frame, for parallel beta sheets, consecutive residues of a
# strand closer than neighbouring residues on different strand. However, for
# antiparallel beta sheets, it is the opposite.
# Note: Keep in mind that these statistics are only for one PDB entry. One has to
# be careful while generalizing.