# sims_ecg_analysis.py
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
# Function to plot histograms
def plot_histogram(data, title, xlabel, ylabel, rotation=0, colors=None):
    plt.figure(figsize=(6, 4))
    if colors is None:
        data.plot(kind='bar')
    else:
        data.plot(kind='bar', color=colors)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=rotation)
    plt.tight_layout()
    plt.show()
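# Hypothetical usage sketch (an addition for illustration only): plot_histogram expects
# data with a pandas .plot(kind='bar') method, e.g. a Series of pre-computed counts.
# The example values below are made up and are not part of the analysis.
# example_counts = pd.Series({'P': 12, 'Q': 7, 'R': 15, 'S': 9, 'T': 11})
# plot_histogram(example_counts, 'Example Peak Counts', 'Peak', 'Count', colors='skyblue')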
# ##############################################
# #Read in files
# ##############################################
result_dir = '/Users/morganfitzgerald/Projects/ecg_param/sims/docs/timedomain_results/'
# ##############################################
# #R squared to assess model fit
# ##############################################
# Get a list of result files ending with '_ecg_output.csv'
result_files = [f for f in os.listdir(result_dir) if f.endswith('_ecg_output.csv')]
# Initialize a dictionary to store R-squared values
r_squared_values = {}
# Iterate over each result file
for file_name in result_files:
    # Extract the noise level from the file name
    noise_level = file_name.split('_')[0].zfill(4)
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(result_dir, file_name))
    # Drop NaN values and filter out R-squared values less than or equal to 0
    r_squared_vals = df['r_squared'].dropna()
    r_squared_vals = r_squared_vals[r_squared_vals > 0]
    # Store the filtered R-squared values in the dictionary
    r_squared_values[noise_level] = r_squared_vals
# Plot the distribution of R^2 values for each noise level
plt.figure(figsize=(8, 6))
for noise_level, r_squared_vals in r_squared_values.items():
    plt.hist(r_squared_vals, bins=20, alpha=0.5, label=f'Noise Level: {noise_level}')
plt.title('Distribution of $R^2$ Values')
plt.xlabel('$R^2$ Value')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()
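# Summary sketch (an addition, not part of the original script): print simple
# descriptive statistics for the R^2 values gathered above, per noise level.
for noise_level, r_squared_vals in sorted(r_squared_values.items()):
    print(
        f"Noise level {noise_level}: n={len(r_squared_vals)}, "
        f"mean R^2 = {r_squared_vals.mean():.4f}, median R^2 = {r_squared_vals.median():.4f}"
    )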
# # Process R^2 values per subject and merge with subject info
# r_squared_values = {}
# result_files = [f for f in os.listdir(result_dir) if f.endswith('_ecg_output.csv')]
# for file_name in result_files:
#     subject_num = file_name.split('_')[0].zfill(4)
#     df = pd.read_csv(os.path.join(result_dir, file_name))
#     r_squared_non_zero = df['r_squared'].dropna()[df['r_squared'] > 0]
#     r_squared_values[subject_num] = r_squared_non_zero.mean()
# average_r_squared_df = pd.DataFrame(list(r_squared_values.items()), columns=['ID', 'Average_R_squared'])
# full_df = pd.merge(subject_info_df, average_r_squared_df, on='ID')
# # Calculate and print the overall average R^2 value
# overall_average_r_squared = full_df['Average_R_squared'].mean()
# print(f"The overall average R^2 value across all participants is: {overall_average_r_squared:.4f}")
# # Plot R^2 values by Age Range and Sex
# def plot_grouped_bar(data, group, title, xlabel, ylabel):
#     plt.figure(figsize=(4, 4))
#     grouped_data = data.groupby(group)['Average_R_squared'].mean()
#     grouped_data.plot(kind='bar', color='skyblue')
#     plt.title(title)
#     plt.xlabel(xlabel)
#     plt.ylabel(ylabel)
#     plt.xticks(rotation=45 if group == 'Age_range' else 0)
#     plt.tight_layout()
#     plt.show()
# plot_grouped_bar(full_df, 'Age_range', 'Average $R^2$ Values by Age Range', 'Age Range', 'Average $R^2$ Value')
# plot_grouped_bar(full_df, 'Sex_class', 'Average $R^2$ Values by Sex', 'Sex', 'Average $R^2$ Value')
# # ##############################################
# # #Sharpness deriv
# # ##############################################
# # Define the result directory
# timed_result_dir = '../docs/saved_files/timedomain_results/'
# # List of peaks and their corresponding sharpness deriv columns
# peaks = ['p', 'q', 'r', 's', 't']
# sharpness_deriv_cols = {peak: f'sharpness_deriv_{peak}' for peak in peaks}
# # Initialize a dictionary to store sharpness deriv values for each peak
# sharpness_deriv_values = {peak: [] for peak in peaks}
# # Get the list of result files
# result_files = [f for f in os.listdir(timed_result_dir) if f.endswith('_ecg_output.csv')]
# # Loop through each result file and extract sharpness deriv values
# for file_name in result_files:
#     df = pd.read_csv(os.path.join(timed_result_dir, file_name))
#     for peak, col in sharpness_deriv_cols.items():
#         # Extract sharpness deriv values and add them to the corresponding list
#         sharpness_deriv_values[peak].extend(df[col].dropna())
# # Function to plot histograms with axis limits based on the data range
# def plot_histogram(data, title, xlabel, ylabel, bins=20, color='skyblue', extra_padding=0.1):
#     plt.figure(figsize=(6, 4))
#     plt.hist(data, bins=bins, color=color, edgecolor='black')
#     plt.title(title)
#     plt.xlabel(xlabel)
#     plt.ylabel(ylabel)
#     # Calculate axis limits from the min and max data values, with extra padding
#     # so the histogram doesn't touch the plot edges
#     min_val = min(data)
#     max_val = max(data)
#     xlim = (min_val - extra_padding * abs(min_val), max_val + extra_padding * abs(max_val))
#     plt.xlim(xlim)
#     plt.tight_layout()
#     plt.show()
# # Plot histograms for each peak, using axis limits based on the data range
# for peak in peaks:
#     title = f'Histogram of Sharpness deriv ({peak.upper()})'
#     xlabel = f'Sharpness deriv ({peak.upper()})'
#     ylabel = 'Frequency'
#     plot_histogram(sharpness_deriv_values[peak], title, xlabel, ylabel)
# # Function to plot box plots for given data
# def plot_box_plot(data, title, xlabel, ylabel):
#     plt.figure(figsize=(8, 6))
#     plt.boxplot(data, patch_artist=True, showfliers=True)  # Show outliers as dots
#     plt.title(title)
#     plt.xlabel(xlabel)
#     plt.ylabel(ylabel)
#     plt.tight_layout()
#     plt.show()
# # Create a list of sharpness deriv values for all peaks
# deriv_peaks_data = [sharpness_deriv_values[peak] for peak in peaks]
# # Plot box plots for sharpness deriv across all peaks
# plot_box_plot(deriv_peaks_data, 'Box Plot of Sharpness Deriv for Each Peak', 'Peaks', 'Sharpness Deriv')
# # Customize the x-axis labels to reflect the peaks
# plt.xticks(ticks=range(1, len(peaks) + 1), labels=[peak.upper() for peak in peaks], rotation=45)
# ##############################################
# #Average Exponent and Offset Values By Noise
# ##############################################
# # Load and merge datasets
# subject_details = pd.read_csv('../docs/saved_files/spectral_results/subject_details.csv')
# sp_results = pd.read_csv('../docs/saved_files/spectral_results/sp_results.csv')
# merged_data = pd.merge(subject_details, sp_results, on='Subject')
# merged_data = merged_data[merged_data['Age_range'] != 'Unknown']
# # Calculate mean values for each parameter by Age_range
# mean_values = merged_data.groupby('Age_range')[['Offset_sp', 'Exponent_sp']].mean().reset_index()
# def plot_avg_values_histogram_direct(data, parameter, title):
#     # Sort data by age range
#     sorted_data = data.sort_values(by='Age_range')
#     # Create figure and axis
#     fig, ax = plt.subplots(figsize=(12, 6))
#     # Plot directly using Matplotlib
#     ax.bar(sorted_data['Age_range'], sorted_data[parameter], color='skyblue')
#     # Set title and labels
#     ax.set_title(title)
#     ax.set_xlabel('Age Range')
#     ax.set_ylabel('Average Value')
#     # Set x-ticks
#     ax.set_xticks(range(len(sorted_data['Age_range'])))
#     ax.set_xticklabels(sorted_data['Age_range'], rotation=45)
#     # Apply tight layout and show plot
#     plt.tight_layout()
#     plt.show()
# ## Try weighting to adjust for sample size differences across age groups
# count_data = merged_data.groupby('Age_range')['Subject'].count().reset_index().rename(columns={'Subject': 'Count'})
# mean_values = mean_values.merge(count_data, on='Age_range')
# # Calculate weighted averages
# mean_values['Weighted_Offset_sp'] = mean_values['Offset_sp'] * mean_values['Count'] / mean_values['Count'].sum()
# mean_values['Weighted_Exponent_sp'] = mean_values['Exponent_sp'] * mean_values['Count'] / mean_values['Count'].sum()
# # # Plot histograms for weighted average Offset and Exponent values
# # plot_avg_values_histogram_direct(mean_values, 'Weighted_Offset_sp', 'Weighted Average Offset Values by Age Group')
# # plot_avg_values_histogram_direct(mean_values, 'Weighted_Exponent_sp', 'Weighted Average Exponent Values by Age Group')
# ##Weighted average significance testing: Use Welch's t-test due to uneven groups!
# from scipy.stats import ttest_ind
# # Age groups definition based on 'Age_range'
# age_20_to_30_ranges = ['20-24 years', '25-29 years']
# age_over_50_ranges = ['50-54 years', '55-59 years', '60-64 years', '65-69 years', '70-74 years', '75-79 years', '80-84 years', '85-92 years']
# # Separating the merged_data into two new groups
# age_20_to_30 = merged_data[merged_data['Age_range'].isin(age_20_to_30_ranges)]
# age_over_50 = merged_data[merged_data['Age_range'].isin(age_over_50_ranges)]
# # Perform Independent Samples t-test for Exponent_sp between the two age groups
# t_stat_exp_20_30_vs_50, p_value_exp_20_30_vs_50 = ttest_ind(age_20_to_30['Exponent_sp'], age_over_50['Exponent_sp'], equal_var=False) # Assuming unequal variances
# # Perform Independent Samples t-test for Offset_sp between the two age groups
# t_stat_offset_20_30_vs_50, p_value_offset_20_30_vs_50 = ttest_ind(age_20_to_30['Offset_sp'], age_over_50['Offset_sp'], equal_var=False) # Assuming unequal variances
# # Output the results
# print(f'Exponent_sp t-test (20-30 vs. >50): t={t_stat_exp_20_30_vs_50}, p={p_value_exp_20_30_vs_50}')
# print(f'Offset_sp t-test (20-30 vs. >50): t={t_stat_offset_20_30_vs_50}, p={p_value_offset_20_30_vs_50}')
# # Checking 20-30 and 55+
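# # A hedged sketch (an addition, not original code) of the 20-30 vs. 55+ comparison
# # suggested above, following the same Welch's t-test pattern as the block before it.
# # The 55+ ranges are assumed from the Age_range categories listed earlier in this file.
# age_over_55_ranges = ['55-59 years', '60-64 years', '65-69 years', '70-74 years', '75-79 years', '80-84 years', '85-92 years']
# age_over_55 = merged_data[merged_data['Age_range'].isin(age_over_55_ranges)]
# t_stat_exp_20_30_vs_55, p_value_exp_20_30_vs_55 = ttest_ind(age_20_to_30['Exponent_sp'], age_over_55['Exponent_sp'], equal_var=False)
# t_stat_offset_20_30_vs_55, p_value_offset_20_30_vs_55 = ttest_ind(age_20_to_30['Offset_sp'], age_over_55['Offset_sp'], equal_var=False)
# print(f'Exponent_sp t-test (20-30 vs. 55+): t={t_stat_exp_20_30_vs_55}, p={p_value_exp_20_30_vs_55}')
# print(f'Offset_sp t-test (20-30 vs. 55+): t={t_stat_offset_20_30_vs_55}, p={p_value_offset_20_30_vs_55}')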