-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtop-cities.py
57 lines (44 loc) · 1.31 KB
/
top-cities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import geoip2.database
# Connect to the SQLite database
conn = sqlite3.connect('mydatabase.db')
YEAR = 2022
# Read the data
df = pd.read_sql_query(f"""
SELECT ip_addr_decrypted, COUNT(*) AS count
FROM mytable
-- WHERE strftime('%Y', ts) AS year is "{YEAR}"
GROUP BY ip_addr_decrypted
ORDER BY count DESC
""", conn)
# Initialize a reader for the GeoIP database
reader = geoip2.database.Reader('city.mmdb')
# Initialize lists to store the latitude and longitude values
city = []
country = []
# Loop through IPs and get lat/long
for ip in df['ip_addr_decrypted']:
try:
response = reader.city(ip)
city.append(response.city.name)
country.append(response.country.name)
except:
city.append("")
country.append("")
df['city'] = city
df['country'] = country
# Remove countries with missing location information
df = df.dropna(subset=['city'])
df = df.dropna(subset=['country'])
# drop ip address
df = df.drop(columns=['ip_addr_decrypted'])
# group identical lat long
df = df.groupby(['city', 'country']).sum().reset_index()
# sort by count
df = df.sort_values(by=['count'], ascending=False)
# limit to top 20
df = df.head(20)
print("df", df.to_string())