-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
194 lines (156 loc) · 6.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Import necessary libraries
import streamlit as st
import pandas as pd
import plotly.express as px
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import plotly.graph_objs as go # Import graph objects
# Load the housing dataset from a CSV file
housing = pd.read_csv('housing.csv')
# Drop NaN values and duplicates
housing.dropna(inplace=True)
housing.drop_duplicates(inplace=True)
# Define ocean proximity options
ocean_proximity_options = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
# Streamlit UI setup
st.set_page_config(page_title="California Housing Dashboard", page_icon="🏠", layout="wide", initial_sidebar_state="expanded")
st.title("Housing Price Prediction 🏠")
st.write('Top left are the sliders to make the housing prediction')
st.write('---')
# Sidebar for user inputs
with st.sidebar:
st.title("Input Features")
# Collect user input
user_input = {
'total_bedrooms': st.slider('Total Bedrooms', housing['total_bedrooms'].min(), housing['total_bedrooms'].max(), housing['total_bedrooms'].median()),
'median_income': st.slider('Median Income', housing['median_income'].min(), housing['median_income'].max(), housing['median_income'].median()),
'housing_median_age': st.slider('Housing Median Age', housing['housing_median_age'].min(), housing['housing_median_age'].max(), housing['housing_median_age'].median()),
}
# Convert user input to DataFrame
user_input_df = pd.DataFrame([user_input])
# Train the model with the entire dataset
X = housing[['total_bedrooms', 'median_income', 'housing_median_age']]
Y = housing['median_house_value']
model = DecisionTreeRegressor()
model.fit(X, Y)
# Define columns layout for the main panel
col1, col2, col3 = st.columns((2, 3, 2), gap="medium")
# Column 1: Input parameters and prediction
with col1:
st.markdown('### Prediction')
# Predict the price
prediction = model.predict(user_input_df)[0]
st.metric(label="Predicted Median House Value", value=f"${prediction * 1000:,.0f}")
# Column 2: Geographical distribution of predicted house
with col2:
st.markdown('### Geographical Distribution of Predicted House')
# Predict the price for the user input
prediction = model.predict(user_input_df)[0]
# Find the nearest record in the dataset to the predicted price
nearest_record = housing.iloc[(housing['median_house_value'] - prediction).abs().argsort()[:1]]
# Extract latitude and longitude from the nearest record
latitude = nearest_record['latitude'].values[0]
longitude = nearest_record['longitude'].values[0]
# Create a DataFrame with predicted house values using the latitude and longitude from the nearest record
predicted_house = pd.DataFrame({
'latitude': [latitude],
'longitude': [longitude],
'PredictedValue': [prediction]
})
# Create a scatter plot for the predicted house
fig = px.scatter_mapbox(
predicted_house,
lat="latitude",
lon="longitude",
color="PredictedValue",
size="PredictedValue",
color_continuous_scale='viridis',
size_max=15,
zoom=3
)
fig.update_layout(mapbox_style="carto-positron")
st.plotly_chart(fig, use_container_width=True)
# Column 3: Information about the data and top districts
with col3:
st.markdown('### Dataset Citation and Reference')
st.write("""
The data was originally featured in a paper titled "Sparse spatial autoregressions" by
R. Kelley Pace and Ronald Barry. This dataset is a modified version of the California Housing dataset,
which is available from Luís Torgo's page at the University of Porto.
The information was encountered in "Hands-On Machine learning with Scikit-Learn and TensorFlow" by
[Aurélien Géron](https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html).
""")
# Additional analysis and visualizations can be added below
# Additional analysis and visualizations can be added below
st.markdown('### Geographical Distribution of Median House Value')
# Create a scatter plot
fig = px.scatter_mapbox(
housing,
lat="latitude",
lon="longitude",
color="median_house_value",
size="median_house_value",
color_continuous_scale='viridis',
size_max=15,
zoom=3
)
fig.update_layout(mapbox_style="carto-positron")
# Add the predicted house location with a distinct color and legend
fig.add_trace(px.scatter_mapbox(
predicted_house,
lat="latitude",
lon="longitude",
color="PredictedValue",
size="PredictedValue",
color_continuous_scale=[[0, 'red'], [1, 'red']],
size_max=15
).data[0])
st.plotly_chart(fig, use_container_width=True)
st.write("---")
# Select relevant features for clustering
features_for_clustering = housing[['longitude', 'latitude']]
# Perform k-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
housing['district'] = kmeans.fit_predict(features_for_clustering)
# Group by the fictitious 'district' column and calculate the median house value
district_median_values = housing.groupby('district')['median_house_value'].median()
# Identify the top districts by median house value
top_districts = district_median_values.nlargest(5)
# Assign colors to each district for the map
color_discrete_map = {0: 'blue', 1: 'green', 2: 'red', 3: 'purple', 4: 'orange'}
# Create a scatter mapbox for the top districts
fig_districts = go.Figure()
for district in top_districts.index:
district_data = housing[housing['district'] == district]
fig_districts.add_trace(
go.Scattermapbox(
lat=district_data['latitude'],
lon=district_data['longitude'],
mode='markers',
marker=dict(size=9, color=color_discrete_map[district]),
name=f"District {district}"
)
)
# Update layout to set mapbox properties
fig_districts.update_layout(
title='Map of Top Districts',
mapbox=dict(
style="carto-positron",
zoom=4,
center=dict(lat=district_data['latitude'].mean(), lon=district_data['longitude'].mean())
),
margin={"r": 0, "t": 0, "l": 0, "b": 0},
legend_title_text='District'
)
# Geographical Distribution of Top Districts
st.markdown('### Geographical Distribution of Top Districts')
# Create two columns for the map and the table
col1, col2 = st.columns([3, 1])
with col1:
st.plotly_chart(fig_districts, use_container_width=True)
with col2:
# Create and display a table of top districts
top_districts_table = top_districts.reset_index()
top_districts_table.columns = ['District', 'Median House Value']
st.write(top_districts)