# PartB_code
# NOTE: GitHub page chrome and the line-number gutter that were captured along
# with this file have been removed; the page listed the script as
# 98 lines (78 loc), 3.24 KB.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.manifold import TSNE
# Read the labelled training set from disk.
# The 'marker' column is assumed to hold the class labels; every other column
# is treated as a feature.
TRAINING_CSV = 'C:/Users/aaron/Downloads/datasets_with_headers/indexed-TrainingDataMulti.csv'
data = pd.read_csv(TRAINING_CSV)
y = data['marker']
X = data.drop(columns='marker')
# Histogram of the label column, to see how the samples are spread across
# label values.
fig, ax = plt.subplots()
ax.hist(y, bins=50)
ax.set_title('Distribution of Label Values')
ax.set_xlabel('Label Value')
ax.set_ylabel('Count')
plt.show()
# Project the feature matrix to 2-D with t-SNE and plot it coloured by label.
# NOTE(review): the features are passed to t-SNE unscaled here; confirm that
# is intended, since the standardisation step only happens later in the script.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
tsne_df = pd.DataFrame(X_tsne, columns=['Dimension 1', 'Dimension 2'])
# Attach the labels by position rather than by index alignment: tsne_df has a
# fresh 0..n-1 index, so assigning the Series directly would only line up if
# `y` happened to carry that same index. Storing them as a categorical makes
# seaborn colour the points with a discrete palette instead of the continuous
# ramp it applies to a numeric hue.
tsne_df['label'] = pd.Categorical(y.to_numpy())
# Visualize t-SNE results
sns.scatterplot(data=tsne_df, x='Dimension 1', y='Dimension 2', hue='label')
plt.title('t-SNE Visualization')
plt.show()
# Train and evaluate a Random Forest on an 80/20 hold-out split.
#
# One fixed seed is shared by the split and the forest so that the scores
# printed below are reproducible from run to run (42 matches the seed already
# used for the t-SNE step).
RANDOM_STATE = 42

# Split the data into training and testing sets
# NOTE(review): the split is not stratified; consider `stratify=y` if the
# classes are imbalanced and every class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Standardize the data: fit the scaler on the training split only, then apply
# the same transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the Random Forest classifier
rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=15, random_state=RANDOM_STATE
)

# Perform 5-fold cross-validation on the training split
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
print("Cross-validation Scores:", cv_scores)
print("Average Cross-validation Score:", cv_scores.mean())

# Train the Random Forest classifier on the full training split
rf_model.fit(X_train, y_train)

# Training accuracy (predictions on the data the model was fitted on)
train_predictions = rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print("Training Accuracy:", train_accuracy)

# Testing accuracy (predictions on the held-out split)
test_predictions = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Testing Accuracy:", test_accuracy)
# Report hold-out performance: per-class report, confusion matrix, weighted F1.
print('\n\nClassification Report:')
print(classification_report(y_test, test_predictions))

# Every label that occurs in the true or predicted values, in sorted order.
# Passing the list explicitly fixes the row/column order of the matrix so the
# same list can be used to label the heatmap axes with the real class values
# rather than positional indices.
class_labels = sorted(set(y_test) | set(test_predictions))
confusion_mat = confusion_matrix(y_test, test_predictions, labels=class_labels)

# Visualize confusion matrix (rows: true labels, columns: predicted labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Print F1 score, averaged over classes weighted by their support
f1 = f1_score(y_test, test_predictions, average='weighted')
print("F1 Score:", f1)
# Predict labels for the separate test file and save them to CSV.

# Load the test data
df_test = pd.read_csv('C:/Users/aaron/Downloads/datasets_with_headers/indexed-TestingDataMulti.csv')
# A bare `df_test.head(2)` only displays something in a notebook; print it so
# the preview is also shown when this runs as a script.
print(df_test.head(2))

# Standardize the test data with the scaler fitted on the training split
test_data_std = scaler.transform(df_test)
print(test_data_std.shape)

# Predict labels for the test data
predicted_labels = rf_model.predict(test_data_std)
test_data_predictions = pd.DataFrame(predicted_labels, columns=['Predictions'])
# Printed for the same reason as the preview above.
print(test_data_predictions)

# Save the predicted labels to a CSV file (single 'Predictions' column, no index)
test_data_predictions.to_csv('C:/Users/aaron/Downloads/predictrf1.csv', index=False)