# crawler.py
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

# Accumulators for the crawl results.
area_list = []
name_list = []
url_list = []

# og is the input DataFrame of MusicBrainz query URLs; its source path was left elided in the original.
# og = pd.read_csv..
urls = og.values.flatten()
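
# A sketch only (an assumption, not from this repo): the query URLs could be built
# from a list of artist names using the MusicBrainz search endpoint referenced further
# below; 'artists' and its contents are hypothetical placeholders.
# artists = ['hall-of-fame']
# urls = ['https://musicbrainz.org/ws/2/artist/?query=artist:' + a for a in artists]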
for url in urls:
    time.sleep(1)  # pause between requests; the MusicBrainz web service rate-limits to about one request per second
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    # lxml's HTML parser lowercases tag names and treats <area> as a void element,
    # so the <name> nested inside <area> ends up as the sibling right after it;
    # 'area + name' therefore selects the area name, while 'name' collects every name tag.
    name = [x.text for x in soup.select('name')]
    area = [x.text for x in soup.select('area + name')]
    url_list.append(url)
    if len(name) > 0:
        name_list.append(name[0])
    else:
        name_list.append(" ")
    if len(area) > 0:
        area_list.append(area[0])
    else:
        area_list.append(" ")
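
# A hedged sketch (not in the original): for the requests.get call in the loop above,
# MusicBrainz asks clients to send an identifying User-Agent, and failed requests are
# worth catching; the header value here is a hypothetical placeholder.
# headers = {'User-Agent': 'crawler.py/0.1 (contact@example.com)'}
# r = requests.get(url, headers=headers, timeout=10)
# r.raise_for_status()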
# experimenting with one url:
# url = "https://musicbrainz.org/ws/2/artist/?query=artist:hall-of-fame"
# r = requests.get(url)
# soup = BeautifulSoup(r.text, 'lxml')
# name = soup.find('name') #[x.text for x in soup.select('name')]
# area = soup.select('area + name')[0]
# beginarea = soup.findChild(name)
# print(name, area.text, beginarea)
# Assemble the results into a DataFrame and write them to CSV.
df = pd.DataFrame(list(zip(url_list, name_list, area_list)),
                  columns=['url', 'name', 'area'])
df.to_csv('CRAWLERRESULTS.csv', index=False, errors='ignore')