-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsql-equivalents.py
173 lines (100 loc) · 3.3 KB
/
sql-equivalents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# coding: utf-8
# In[1]:
import pandas as pd
# In[2]:
df = pd.read_csv("./buildings.csv")
df.head()
# In[3]:
df.describe()
# We want to get the first 10 results
# `SELECT * FROM buildings LIMIT 10`
#
# In[4]:
df[:10]
# Get only the City and Building name
# `SELECT city, building FROM buildings LIMIT 3`
# In[5]:
df[["City", "Building"]][:3]
# What are the three oldest buildings in our data
# `SELECT city, building, built FROM buildings ORDER BY built LIMIT 3`
#
# In[ ]:
# In[6]:
df.sort("Built")[["City", "Building", "Built"]][:3]
# What the the three highest buildings?
# `SELECT city, building, height FROM buildings ORDER BY height desc LIMIT 3`
# In[7]:
df.sort("Height", ascending=False)[["City", "Building", "Height"]][:3]
# What are the three tallest buildings in Dubai?
#
# SELECT city, building, height FROM buildings
# WHERE city='Dubai' ORDER BY height LIMIT 3;
#
# In[9]:
df[df["City"] == 'Dubai'].sort("Height", ascending=False)[:3]
# What are the five tallest buildings in Dubai or Chicago?
#
# SELECT * FROM buildings WHERE city='Dubai'
# or city=='Chicago' ORDER BY height desc LIMIT 5;
# In[10]:
df[(df["City"] == 'Dubai') | (df["City"] == "Chicago")].sort("Height", ascending=False)[:5]
# Among the buildings built in 20th century, which has the most floors?
#
# SELECT * FROM buildings WHERE built < 2000
# ORDER BY floors DESC LIMIT 1;
# In[11]:
df[df["Built"] < 2000].sort("Floors", ascending=False)[:1]
# Among the buildings built in 20th century, which is the tallest?
#
# SELECT * FROM buildings WHERE built < 2000
# ORDER BY height DESC LIMIT 1;
# In[12]:
df[df["Built"] < 2000].sort("Height", ascending=False)[:1]
# How many different countries and cities do we have in our data?
#
# SELECT COUNT(DISTINCT(country)) FROM buildings;
# SELECT COUNT(DISTINCT(city)) FROM buildings;
# In[13]:
len(df["Country"].unique())
len(df["City"].unique())
# Which cities make an apperance in top 20 tallest building list?
#
# SELECT DISTINCT(city) from
# (SELECT * FROM buildings ORDER BY height DESC LIMIT 20);
#
# In[15]:
df[:20].sort("Height", ascending=False)["City"].unique()
# How many appearances dos each conutry make in a top 100 list?
#
# SELECT country, COUNT(country) AS building_count
# FROM
# (select * from buildings order by height limit 100)
# GROUP BY country
# ORDER BY building_count DESC;
#
#
# In[16]:
df[:100]["Country"].value_counts()
# What is the average floors and average height, per country.
#
# SELECT country, AVG(floors), AVG(height) from
# (SELECT * FROM buildings ORDER BY height DESC LIMIT 100)
# GROUP BY country;
# In[17]:
df.sort("Height")[:100].groupby("Country").mean()[["Floors", "Height"]]
# What is the average floors and average height, per country, sorted by height
#
# SELECT country, AVG(floors) as avg_floor, AVG(height) as avg_height from
# (SELECT * FROM buildings ORDER BY height DESC LIMIT 100)
# GROUP BY country
# ORDER by avg_height DESC;
# In[ ]:
# In[18]:
df[:100].groupby("Country").mean()[["Floors", "Height"]].sort("Height", ascending=False)
# What years were the buildings completed?
#
# SELECT built, count(built) FROM buildings GROUP BY built;
# In[19]:
df.groupby("Built").count()["Building"]
# In[ ]:
# In[ ]: