-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 27c367f
Showing
7 changed files
with
175 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
--- | ||
title: "Untitled" | ||
format: html | ||
--- | ||
|
||
```{python} | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
from scipy.stats import skew, kurtosis | ||
# Load the real-estate sales data set from a path relative to this document.
df = pd.read_csv('Real_Estate_Sales_2001-2020_GL.csv')

# Keep only the rows that carry a Non Use Code. An explicit copy is taken
# because later cells assign new columns on `df`; doing that on a filtered
# slice raises SettingWithCopyWarning and may not behave as intended.
df = df[df['Non Use Code'].notnull()].copy()

# Number of remaining sales per List Year, ordered by year.
freq = df['List Year'].value_counts().sort_index()
freq.plot(kind='bar')
plt.title('Amount of Sales per List Year (Filtered by Non Use Code)')
plt.xlabel('List Year')
plt.ylabel('Frequency')

# Skewness and kurtosis of the per-year counts (not of the individual sales).
skewness = skew(freq)
kurt = kurtosis(freq)
print(f"Skewness: {skewness:.2f}")
print(f"Kurtosis: {kurt:.2f}")
``` | ||
```{python} | ||
import pandas as pd | ||
from scipy.stats import kruskal | ||
# Median Sales Ratio per List Year, kept as a descriptive summary table.
grouped = df.groupby('List Year')['Sales Ratio'].median()
data = pd.DataFrame({'List Year': grouped.index, 'Sales Ratio': grouped.values}).dropna()

# Kruskal-Wallis ranks the individual observations of every group. Feeding it
# one median per year gives each group a single value, so the statistic no
# longer depends on the data. Build one sample of raw Sales Ratios per year.
samples = [
    ratios.dropna()
    for _, ratios in df.groupby('List Year')['Sales Ratio']
]
# Years without any usable observation cannot take part in the test.
samples = [sample for sample in samples if not sample.empty]

# Run Kruskal-Wallis test
stat, p = kruskal(*samples)

# Print results
print('Kruskal-Wallis test:')
print('H-statistic = {:.3f}'.format(stat))
print('p-value = {:.3f}'.format(p))
if p < 0.05:
    print('Medians of List Years are statistically different')
else:
    print('Not enough evidence to conclude that the medians of List Years are statistically different')
``` | ||
|
||
```{python} | ||
# Years to analyze: 2001 through 2020 inclusive
years = list(range(2001, 2021))

# Convert the "List Year" column to integer so it compares equal to `years`
df["List Year"] = df["List Year"].astype(int)

# Difference between the sale amount and the assessed value of each sale
df['Profit'] = df['Sale Amount'] - df['Assessed Value']

# Collect one row of descriptive statistics for Profit per year.
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every iteration.
rows = []
for year in years:
    year_df = df[df["List Year"] == year]
    profit_stats = year_df["Profit"].describe()
    rows.append({
        "List Year": year,
        "Count": int(profit_stats["count"]),
        "Mean": profit_stats["mean"],
        "Std Dev": profit_stats["std"],
        "Median": profit_stats["50%"],
    })

results_df = pd.DataFrame(
    rows,
    columns=["List Year", "Count", "Mean", "Std Dev", "Median"],
)

# Print the number of sales found for each year
print(results_df['Count'])
#print(results_df.to_latex(index=False))
``` | ||
|
||
```{python} | ||
# Preview the first five rows of the working DataFrame
df.head(n=5)
``` | ||
|
||
```{python} | ||
from scipy.stats import shapiro | ||
from scipy.stats import kstest | ||
# Extract the column to test. Missing values are dropped first: a NaN in the
# input would propagate to the result, and a NaN p-value fails `p > 0.05`,
# which would silently report the data as non-normal.
data = df['Sales Ratio'].dropna()

# Perform Shapiro-Wilk test.
# NOTE: SciPy documents that for N > 5000 the W statistic is accurate but the
# p-value may not be, so read the p-value with care on a large sample.
stat, p = shapiro(data)

# Print the results
print('Shapiro-Wilk test statistic:', stat)
print('p-value:', p)
if p > 0.05:
    print('Data is likely normally distributed')
else:
    print('Data is not likely normally distributed')
``` | ||
|
||
```{python} | ||
import pandas as pd | ||
from scipy.stats import kruskal | ||
# Kruskal-Wallis H-test on two samples taken from the summary table.
# NOTE(review): the two samples are the year labels and the yearly counts,
# i.e. the test compares years against counts rather than groups of
# observations - confirm this is the intended comparison.
year_values = results_df['List Year']
count_values = results_df['Count']
stat, p = kruskal(year_values, count_values)

# Report the statistic, the p-value and a verdict at the 5% level
print('Kruskal-Wallis test:')
print('H-statistic = {:.3f}'.format(stat))
print('p-value = {:.3f}'.format(p))
verdict = (
    'Data is likely statistically significiant'
    if p < 0.05
    else 'Data is not likely statistically significant'
)
print(verdict)
``` | ||
```{python} | ||
import pandas as pd | ||
from plotnine import * | ||
# Q-Q plot of the Sales Ratio column, built with plotnine's ggplot API.
# The expression is parenthesized so it can span lines without backslashes.
(
    ggplot(df, aes(sample='Sales Ratio'))
    + stat_qq()
    + ggtitle("Q-Q Plot for Sales Ratio")
)
``` | ||
|
||
```{python} | ||
import seaborn as sns | ||
# Boxplot of Sales Ratio for each Residential Type.
# The title now describes the plotted columns; the previous title referred to
# durations, boroughs and weekdays, none of which appear in this plot.
YearDayPlot = sns.boxplot(x='Residential Type', y="Sales Ratio", data=df)
YearDayPlot.set(title="Boxplot of Sales Ratio by Residential Type", ylabel="Sales Ratio")
``` | ||
|
||
```{python} | ||
import pandas as pd | ||
from scikit_posthocs import posthoc_dunn | ||
# Perform post hoc Dunn's test on the yearly counts.
# NOTE(review): results_df appears to hold a single Count per List Year, so
# each group has one observation - confirm the test is meaningful here.
dunn_results = posthoc_dunn(results_df, val_col='Count', group_col='List Year')

# Hide the non-significant pairs (p > 0.05). Overwriting them with 0 would
# print them as "0.00", indistinguishable from the most significant pairs;
# NaN cells are left blank by the heatmap instead.
dunn_results = dunn_results.where(dunn_results <= 0.05)

# Create heatmap of the remaining (significant) p-values
sns.heatmap(dunn_results, cmap="coolwarm", annot=True, fmt=".2f", annot_kws={'fontsize': 5})
``` |
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.