-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathml_option_pricing.py
551 lines (444 loc) · 18.8 KB
/
ml_option_pricing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
# -*- coding: utf-8 -*-
"""ml_option_pricing.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/11kUzwzNSc8-KZJm3tEruf8ojqemp-Nxz
1.0 Import Libraries
"""
import yfinance as yf
import pandas as pd
import numpy as np
import time
import re
from datetime import datetime
from google.colab import files
from scipy.stats import norm
from scipy.interpolate import interp1d
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
"""1.1 Data Collection"""
# Ticker universe whose option chains are downloaded (five symbols per row).
stocks = [
    "AAPL", "MSFT", "AMZN", "GOOG", "TSLA",
    "NFLX", "NVDA", "META", "ORCL", "IBM",
    "INTC", "AMD", "QCOM", "ADBE", "CSCO",
    "CRM", "PYPL", "SAP", "UBER", "LYFT",
    "PEP", "COST", "AVGO", "INTU", "TXN",
    "TMUS", "GILD", "CHTR", "AMGN", "ISRG",
    "BKNG", "ADP", "MRVL", "WDAY", "SNPS",
    "PANW", "LRCX", "REGN", "MU", "KLAC",
    "ASML", "ZS", "TEAM", "ZM", "DOCU",
    "OKTA", "CRWD", "SHOP", "DDOG", "MDB",
]
# Collects the tickers for which no usable data could be fetched.
excluded_stocks = []
# Fetch options data
def fetch_options_data(stock):
    """Download the call and put chains of every listed expiration for one ticker.

    Each chain is tagged with three extra columns: 'option_type'
    ('call' / 'put'), 'expiration_date' and 'stock'. Expirations whose
    download fails are reported and skipped.

    Returns:
        A single DataFrame stacking all retrieved chains, or an empty
        DataFrame when the expiration list or every chain is unavailable.
    """
    print(f"\nFetching options data for {stock}...")
    ticker = yf.Ticker(stock)

    try:
        expiration_dates = ticker.options
    except Exception as e:
        print(f"Failed to fetch expiration dates for {stock}: {e}")
        return pd.DataFrame()
    if not expiration_dates:
        print(f"No expiration dates available for {stock}. Skipping.")
        return pd.DataFrame()

    frames = []
    for exp_date in expiration_dates:
        try:
            chain = ticker.option_chain(exp_date)
            tagged = []
            # Calls first, then puts, so the stacked row order is calls-before-puts.
            for label in ('call', 'put'):
                side = getattr(chain, label + 's')
                side['option_type'] = label
                side['expiration_date'] = exp_date
                side['stock'] = stock
                tagged.append(side)
            frames.append(pd.concat(tagged, ignore_index=True))
            print(f" - Fetched data for expiration date: {exp_date}")
        except Exception as e:
            print(f" - Failed to fetch data for expiration date {exp_date}: {e}")

    if not frames:
        print(f"No data found for {stock}.")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
# Fetch stock price
def fetch_stock_price(stock):
    """Return the most recent 'Close' from a one-day price history of ``stock``.

    Any failure (including an empty history) is reported and mapped to
    ``np.nan`` instead of being raised.
    """
    print(f" - Fetching stock price for {stock}...")
    ticker = yf.Ticker(stock)
    try:
        closes = ticker.history(period="1d")['Close']
        stock_price = closes.iloc[-1]
        print(f" Stock Price: {stock_price:.2f}")
    except Exception as e:
        print(f" Failed to fetch stock price for {stock}: {e}")
        return np.nan
    return stock_price
# Retry mechanism
def fetch_with_retries(stock, retries=3, delay=1.0):
    """Fetch option chains plus the latest close for ``stock``, retrying on failure.

    Args:
        stock: Ticker symbol handed to fetch_options_data / fetch_stock_price.
        retries: Maximum number of attempts.
        delay: Base pause in seconds between attempts; the pause grows
            linearly with the attempt number. Pass 0 to retry immediately.

    Returns:
        The options DataFrame with an added 'stock_price' column, or an
        empty DataFrame when no attempt produced both chains and a price.
    """
    print(f"\n=== Fetching Data for {stock} with Retry Mechanism ===")
    data = pd.DataFrame()
    for attempt in range(retries):
        try:
            print(f" Attempt {attempt + 1} of {retries}...")
            # Reuse chains obtained on an earlier attempt: when only the price
            # lookup failed there is no reason to download every chain again.
            if data.empty:
                data = fetch_options_data(stock)
            if not data.empty:
                stock_price = fetch_stock_price(stock)
                if not np.isnan(stock_price):
                    data['stock_price'] = stock_price
                    print(f" Successfully fetched data for {stock}.")
                    return data
                else:
                    print(f" Skipping {stock} due to missing stock price.")
        except Exception as e:
            print(f" Error during attempt {attempt + 1}: {e}")
        # Back off before the next attempt; retrying instantly tends to hit
        # the same transient failure again. No pause after the final attempt.
        if delay > 0 and attempt < retries - 1:
            time.sleep(delay * (attempt + 1))
    print(f"Failed to fetch data for {stock} after {retries} attempts.")
    return pd.DataFrame()
# Download every ticker; tickers that yield nothing are recorded as excluded.
print("\n=== Starting Data Fetch for All Stocks ===")
all_options_data = []
for stock in stocks:
    data = fetch_with_retries(stock)
    if data.empty:
        excluded_stocks.append(stock)
        continue
    all_options_data.append(data)

# Stack the per-ticker frames into one table and report what was collected.
if not all_options_data:
    print("\nNo data fetched!")
else:
    options_data = pd.concat(all_options_data, ignore_index=True)
    print(f"\n=== Data Fetch Complete ===")
    print(f" Total Data Points Collected: {options_data.shape[0]}")
    print(f" Stocks Excluded: {len(excluded_stocks)}")
    if excluded_stocks:
        print(f" Excluded Stocks: {', '.join(excluded_stocks)}")
"""1.2.1 Exploratory Data Analysis (EDA): Summary Statistics for Parameters"""
# Column inventory: name and dtype of every field in the raw table.
print("All Columns and Their Data Types:", options_data.dtypes, sep="\n")
# include='all' makes describe() cover non-numeric columns as well.
print("\nSummary of All Columns:", options_data.describe(include='all'), sep="\n")
"""1.2.1 Exploratory Data Analysis (EDA): Numerical Parameter Visualization using Histogram"""
# Only float64 / int64 columns are plotted.
columns_to_visualize = options_data.select_dtypes(include=['float64', 'int64']).columns

# One figure per numeric column.
for column in columns_to_visualize:
    observed = options_data[column].dropna()  # missing values are left out of the histogram
    plt.figure(figsize=(8, 6))
    observed.hist(bins=50, edgecolor='black')
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
"""1.3.1 Data Inspection"""
# Check the overall structure of the dataset
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
options_data.info()

# Display the first few rows of the dataset
print("\nFirst Five Rows of the Dataset:")
print(options_data.head())

# Count missing values per column
print("\nMissing Values in Each Column:")
missing_values = options_data.isnull().sum()
print(missing_values)

# Count rows in which every column is missing
empty_rows = options_data.isnull().all(axis=1).sum()
print(f"\nNumber of Completely Empty Rows: {empty_rows}")

# Show a sample of incomplete rows when at least one column has gaps
if missing_values.any():
    print("\nSample Rows with Missing Values:")
    print(options_data[options_data.isnull().any(axis=1)].head())
else:
    print("\nNo Missing Values in the Dataset.")
"""1.3.2 Data Cleaning"""
# Discard every row that has at least one missing field.
options_data = options_data.dropna()

# Report the size of what is left.
n_rows, n_cols = options_data.shape
print("Dataset after removing rows with missing values:")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

# Per-column count of remaining missing values.
print("\nMissing Values in Each Column After Cleaning:", options_data.isnull().sum(), sep="\n")

# Preview of the cleaned table.
print("\nFirst Five Rows of the Cleaned Dataset:", options_data.head(), sep="\n")
"""1.4 Feature Engineering"""
# Add derived features
if 'options_data' in locals():
    options_data['expiration_date'] = pd.to_datetime(options_data['expiration_date'])
    # Measure maturity from today's midnight, not from the current clock time.
    # The parsed expiration dates sit at 00:00, so subtracting a timestamp that
    # carries a time of day makes .dt.days floor one day low (an option that
    # expires today would get -1 day).
    valuation_date = pd.Timestamp.now().normalize()
    # Time to maturity in years (calendar days / 365).
    options_data['time_to_maturity'] = (options_data['expiration_date'] - valuation_date).dt.days / 365
    # Relative distance of the spot price from the strike.
    options_data['moneyness'] = (options_data['stock_price'] - options_data['strike']) / options_data['stock_price']
    # Quoted spread between ask and bid.
    options_data['bid_ask_spread'] = options_data['ask'] - options_data['bid']
    print(f"Cleaned dataset shape: {options_data.shape}")
else:
    print("No options data available.")
"""1.5.1 Final Feature Inspection"""
# Column inventory after feature engineering: name and dtype of every field.
print("All Columns and Their Data Types:", options_data.dtypes, sep="\n")
# include='all' makes describe() cover non-numeric columns as well.
print("\nSummary of All Columns:", options_data.describe(include='all'), sep="\n")
"""1.6.1 Feature Selection via Correlation Matrix"""
# Names of the float64 / int64 columns.
numeric_columns = list(options_data.select_dtypes(include=['float64', 'int64']).columns)

# Pairwise correlations between those columns.
correlation_matrix = options_data[numeric_columns].corr()

# Annotated heatmap of the matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    square=True,
)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

# Same matrix as text.
print("Correlation Matrix", correlation_matrix, sep="\n")
"""1.6.2 Feature Selection via Variance Inflation Factor (VIF) Analysis"""
# Candidate predictors examined for multicollinearity.
X = options_data[['strike', 'bid', 'stock_price', 'impliedVolatility',
                  'time_to_maturity', 'moneyness', 'bid_ask_spread']]

# One VIF per candidate column, computed on the raw value matrix.
design_matrix = X.values
vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [variance_inflation_factor(design_matrix, position)
            for position in range(X.shape[1])],
})
print(vif_data)
"""2.0 Data Splitting"""
# Predictors used for modelling ('strike' from the VIF candidate list is not
# included here); the target is the last traded price.
feature_columns = ['bid', 'stock_price', 'impliedVolatility',
                   'time_to_maturity', 'moneyness', 'bid_ask_spread']
X = options_data[feature_columns]
y = options_data['lastPrice']

# First cut: 20% held out as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)
# Row labels of the test rows within options_data.
test_indices = X_test.index

# Second cut: 25% of the remaining 80% becomes validation (60/20/20 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, shuffle=True
)

# Standardise with statistics learned from the training split only, then
# apply the same transform to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))
# Compute Metrics on Test Set
def compute_metrics(actual, predicted, model_name):
    """Compute regression error metrics for one model.

    Args:
        actual: Observed target values (array-like).
        predicted: Predicted values, in the same order as ``actual``.
        model_name: Label stored under the "Model" key of the result.

    Returns:
        dict with the model label plus MSE, RMSE, MAE, MAPE (in percent),
        R-squared and explained variance. MAPE is averaged over the
        observations whose actual value is non-zero, and is NaN when there
        are none.
    """
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)

    # Work on plain arrays so the element-wise error is positional and does
    # not depend on pandas index alignment between the two inputs.
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)
    # A zero actual makes the relative error undefined (division by zero
    # gives inf/nan for the whole mean), so those observations are left out.
    nonzero = actual_values != 0
    if nonzero.any():
        relative_errors = np.abs(
            (actual_values[nonzero] - predicted_values[nonzero]) / actual_values[nonzero]
        )
        mape = np.mean(relative_errors) * 100
    else:
        mape = np.nan

    r2 = r2_score(actual, predicted)
    explained_variance = explained_variance_score(actual, predicted)
    return {
        "Model": model_name,
        "Mean Squared Error (MSE)": mse,
        "Root Mean Squared Error (RMSE)": rmse,
        "Mean Absolute Error (MAE)": mae,
        "Mean Absolute Percentage Error (MAPE)": mape,
        "R-squared (R²)": r2,
        "Explained Variance Score": explained_variance
    }
# Print Metrics
def print_metrics(metrics_dict):
    """Print every entry of a metrics dictionary, one per line.

    Args:
        metrics_dict: Mapping that contains a "Model" key (used in the
            header line) plus any number of metric name/value pairs.

    Infinite values are shown as "Infinity", numbers with magnitude above
    1e6 in scientific notation, everything else as-is.
    """
    print(f"\nMetrics for {metrics_dict['Model']} Model:")
    for key, value in metrics_dict.items():
        # The infinity test has to come first: abs(inf) > 1e6 is also true,
        # so testing magnitude first would never reach this branch.
        if value == np.inf or value == -np.inf:
            print(f" {key}: Infinity")
        elif isinstance(value, (float, int)) and abs(value) > 1e6:
            print(f" {key}: {value:.4e}")  # Use scientific notation for large numbers
        else:
            print(f" {key}: {value}")
"""3.2 Machine Learning Models Training and Evaluation"""
# Function to evaluate models
def evaluate_models(X_train, X_val, y_train, y_val):
    """Fit a fixed set of baseline regressors and score each on the validation data.

    Args:
        X_train, y_train: Data every model is fitted on.
        X_val, y_val: Data the fitted models are scored on.

    Returns:
        DataFrame with one row per model: the compute_metrics output, or
        a row holding "Model" and "Error" when fitting or scoring raised.
    """
    candidates = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(alpha=1.0),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
        "Support Vector Regression (SVR)": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    }

    rows = []
    for label, estimator in candidates.items():
        try:
            print(f"Training and evaluating: {label}")
            estimator.fit(X_train, y_train)
            # Score on the validation data, never on the data used for fitting.
            row = compute_metrics(y_val, estimator.predict(X_val), label)
        except Exception as e:
            print(f"Error evaluating {label}: {e}")
            row = {"Model": label, "Error": str(e)}
        rows.append(row)

    return pd.DataFrame(rows)
# Compare the baseline models on the standardised splits. Ridge and the
# RBF-kernel SVR are sensitive to feature scale, so feeding them the raw
# columns would handicap them against the tree-based models in the
# comparison; the tree models are unaffected by the scaling.
results_val = evaluate_models(X_train_scaled, X_val_scaled, y_train, y_val)
# Display results
print("Validation Set Results:")
print(results_val)
"""3.3 Hyperparameter Tuning for the Best Performed Machine Learning Model"""
# Candidate values sampled during the randomized Random Forest search.
param_distributions = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Random Forest Tuning using RandomizedSearchCV
def tune_random_forest(X_train, y_train, param_distributions, n_iter=20, n_jobs=-1):
    """Run a randomized hyperparameter search for a RandomForestRegressor.

    Args:
        X_train, y_train: Data the 3-fold cross-validated search is run on.
        param_distributions: Search space passed to RandomizedSearchCV.
        n_iter: Number of sampled parameter settings.
        n_jobs: Parallelism passed to RandomizedSearchCV.

    Returns:
        Tuple ``(best_params, best_mse)``: the best parameter setting and
        its cross-validated MSE (the negated best score).
    """
    search = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=n_jobs,
        random_state=42,
    )
    search.fit(X_train, y_train)
    # The scorer is negated MSE, so flip the sign to report a plain MSE.
    best_mse = -search.best_score_
    return search.best_params_, best_mse
# Search for the best Random Forest settings on the training split.
print("Starting Random Forest Hyperparameter Tuning...")
best_params, best_val_mse = tune_random_forest(
    X_train, y_train, param_distributions
)
print(
    f"Best Parameters: {best_params}",
    f"Best Validation MSE: {best_val_mse:.4f}",
    sep="\n",
)
"""3.4 Machine Learning Model Retraining"""
# Train Random Forest with Best Parameters
def train_final_rf(X_train, y_train, best_params):
    """Fit and return a RandomForestRegressor built from ``best_params``.

    ``random_state`` is fixed to 42 in addition to the supplied parameters.
    """
    forest = RandomForestRegressor(**best_params, random_state=42)
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return forest.fit(X_train, y_train)
# Refit the Random Forest with the tuned hyperparameters on the training
# split (X_train / y_train; the validation split is not part of it).
print("\nRetraining Final Random Forest Model...")
final_rf_model = train_final_rf(X_train=X_train, y_train=y_train, best_params=best_params)
print("Final model retrained using the best hyperparameters.")
"""3.5 Final Machine Learning Model Evaluation"""
# Evaluate the model using compute_metrics function
def evaluate_final_model(model, X_test, y_test, model_name="Random Forest"):
    """Score a fitted model on the test data, print the metrics and return them.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test, y_test: Held-out features and targets.
        model_name: Label used in the printed header and the result dict.

    Returns:
        The dictionary produced by compute_metrics.
    """
    metrics = compute_metrics(y_test, model.predict(X_test), model_name)

    print(f"\nEvaluation Metrics for {metrics['Model']} Model on Test Set:")
    for key, value in metrics.items():
        if key == "Model":
            continue  # the label is already part of the header line
        if isinstance(value, float):
            print(f" {key}: {value:.4f}")
        else:
            print(f" {key}: {value}")
    return metrics
# Score the tuned Random Forest on the held-out test split.
print("\nEvaluating Final Model on Test Set")
test_metrics = evaluate_final_model(
    model=final_rf_model,
    X_test=X_test,
    y_test=y_test,
)
"""3.6.1 Pricing Result from Random Forest vs Actual Prices"""
# Test-set predictions and their residuals (actual minus predicted).
rf_predictions = final_rf_model.predict(X_test)
prediction_errors = y_test - rf_predictions

# One row per test observation: market price, model price, residual.
pricing_results_complete = pd.DataFrame({
    "Actual Prices": y_test.values,
    "Predicted Prices": rf_predictions,
    "Residuals (Actual - Predicted)": prediction_errors
})

# Show the table.
print("Complete Pricing Results:", pricing_results_complete, sep="\n")

# Write the table to CSV (index column omitted), then hand the file to the
# Colab download helper.
results_csv = "complete_pricing_results_rf.csv"
pricing_results_complete.to_csv(results_csv, index=False)
files.download(results_csv)
print(f"Complete pricing results saved to '{results_csv}'.")
"""3.6.2 Scatter Plot for Random Forest based on Actual Prices"""
y_test_pred = final_rf_model.predict(X_test)

# Actual vs. predicted prices, with the diagonal marking perfect predictions.
price_bounds = [y_test.min(), y_test.max()]
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.6, label="Predictions")
plt.plot(price_bounds, price_bounds, 'r--', label="Ideal Fit")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Random Forest: Actual vs Predicted Prices on Test Set")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
"""3.6.3 Histogram of Prediction Errors from Random Forest"""
# Residuals of the test-set predictions (actual minus predicted).
prediction_errors = y_test - y_test_pred

# Distribution of those residuals in 50 bins.
plt.figure(figsize=(8, 6))
plt.hist(
    prediction_errors,
    bins=50,
    edgecolor='black',
    alpha=0.7,
)
plt.title("Histogram of Prediction Errors (Actual - Predicted)")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()
"""3.6.4 Histogram of Actual vs Predicted Prices from Random Forest"""
# Draw a random sample of test observations for a side-by-side comparison.
# The sample size is capped at the test-set size: sampling without
# replacement raises ValueError when more items are requested than exist.
sample_size = min(20, len(y_test))
sample_indices = np.random.choice(len(y_test), size=sample_size, replace=False)
sample_actual = y_test.iloc[sample_indices]
sample_predicted = y_test_pred[sample_indices]

# Grouped bar chart: actual and predicted price next to each other per sample.
bar_width = 0.35
indices = np.arange(len(sample_indices))
plt.figure(figsize=(12, 6))
plt.bar(indices, sample_actual, bar_width, label="Actual Prices")
plt.bar(indices + bar_width, sample_predicted, bar_width, label="Predicted Prices")
plt.xlabel("Sample Index")
plt.ylabel("Option Prices")
plt.title("Comparison of Actual vs Predicted Option Prices (Random Forest)")
# Centre each tick between the two bars of its pair.
plt.xticks(indices + bar_width / 2, indices)
plt.legend()
plt.show()
"""3.6.5 Residual Plot of Predicted Prices vs Residuals"""
# Residuals against predicted price; the dashed red line marks zero error.
plt.figure(figsize=(8, 6))
plt.scatter(
    y_test_pred,
    prediction_errors,
    alpha=0.6,
    edgecolor='k',
)
plt.axhline(0, color='red', linestyle='--', linewidth=1)
plt.title("Residuals vs Predicted Prices")
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals (Actual - Predicted)")
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
"""3.6.6 Error Summary"""
# Test-set residuals of the final Random Forest (actual minus predicted).
rf_predictions = final_rf_model.predict(X_test)
rf_errors = y_test - rf_predictions

# Single-row table with the mean and standard deviation of the residuals.
error_summary = pd.DataFrame([{
    "Model": "Random Forest",
    "Mean Error": rf_errors.mean(),
    "Std Deviation of Error": rf_errors.std(),
}])
print("\nError Summary:", error_summary, sep="\n")
"""3.7 Feature Importance of Random Forest for Model Intepretability"""
# Importance scores of the fitted forest and the matching column names of X.
feature_importance = final_rf_model.feature_importances_
features = X.columns

# Ascending order, so the most important feature ends up as the top bar.
sorted_idx = np.argsort(feature_importance)
ranked_names = features[sorted_idx]
ranked_scores = feature_importance[sorted_idx]

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 6))
plt.barh(ranked_names, ranked_scores, color='skyblue')
plt.xlabel("Feature Importance")
plt.title("Feature Importance of Random Forest Model")
plt.show()