scraper_tools.py

from datetime import datetime
from statistics import mode

import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch_html(url):
    """Fetch the HTML content of a given URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0',
        'Referer': url,  # some sites reject requests that arrive without a referer
    }
    return requests.get(url, headers=headers, timeout=30).text  # timeout so a dead server cannot hang the scraper
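

# Minimal usage sketch for fetch_html (the URL below is a placeholder, not one
# of the scraped sites):
#   html = fetch_html("https://example.com/page")
#   print(html[:200])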


def parse_sitemap(sitemap_url, base_url):
    """Parse the sitemap and return a sorted list of stock URLs."""
    sitemap_content = BeautifulSoup(fetch_html(sitemap_url), "xml").text.split("\n")
    # Keep only lines that extend base_url by 1 to 8 characters, i.e. base_url plus a short ticker.
    pages_toscrape = sorted([line for line in sitemap_content
                             if base_url in line and len(base_url) < len(line) < len(base_url) + 9])
    print(f"Number of urls to scrape: {len(pages_toscrape)}\n")
    return pages_toscrape
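

# Usage sketch (the sitemap path and base URL below are assumptions, modeled
# on the 'fundamentus' branch of parse_stock_data):
#   urls = parse_sitemap("https://www.fundamentus.com.br/sitemap.xml",
#                        "https://www.fundamentus.com.br/detalhes.php?papel=")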


def get_tickers_urls(base_url):
    """Build stock URLs by appending each ticker linked from the page's table cells to base_url."""
    tickers_url = []
    for row in BeautifulSoup(fetch_html(base_url), 'html.parser').find_all('td'):
        for item in row.find_all('a'):
            tickers_url.append(base_url + item.get_text())
    return tickers_url
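

# Usage sketch (assumes base_url serves a listing page whose table cells link
# to tickers; the exact URL is an assumption):
#   tickers_urls = get_tickers_urls("https://www.fundamentus.com.br/detalhes.php?papel=")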


def parse_stock_data(soup, site):
    """Parse stock data from the BeautifulSoup object and return a dictionary."""
    stock_data = {}
    if 'investsite' in site:
        # Labels sit in 'esquerda' (left) cells, values in 'direita' (right) cells.
        for title, value in zip(soup.find_all('td', class_='esquerda'), soup.find_all('td', class_='direita')):
            stock_data[title.get_text().strip()] = value.get_text().strip()
    elif 'fundamentus' in site:
        for label, data in zip(soup.find_all('td', class_='label'), soup.find_all('td', class_='data')):
            title = label.find('span', class_='txt')
            value = data.find('span', class_=['txt', 'oscil'])
            if title and value:
                stock_data[title.get_text().strip()] = value.get_text().strip()
    elif 'status' in site:
        for info, indicator in zip(soup.find_all('div', class_='info'), soup.find_all('div', class_='item')):
            title = info.find(['h3', 'span'], class_=['title', 'd-none', 'd-inline-block', 'sub-value']) or indicator.find('h3', class_='title')
            value = info.find('strong', class_='value') or indicator.find('strong', class_='value')
            # Skip multi-line titles and empty (non-breaking space) values.
            if title and value and "\n" not in title.get_text() and value.get_text() != "\xa0":
                stock_data[title.get_text().strip()] = value.get_text().strip()
    elif 'investidor10' in site:
        for title, value in zip(soup.find_all('div', class_='_card-header'), soup.find_all('div', class_='_card-body')):
            stock_data[title.get_text().strip()] = value.get_text().strip()
        for cell in soup.find_all('div', class_='cell'):
            title = cell.find('span', class_=['d-flex', 'title'])
            value = cell.find(['div', 'span'], class_='value')
            if title and value:
                if 'DIVIDEND YIELD - ' in title.get_text().strip():
                    # Drop the ticker suffix from 'DIVIDEND YIELD - <TICKER>' so the key stays uniform.
                    stock_data[title.get_text().strip().split('-')[0]] = value.get_text().strip().split('\n')[-1]
                else:
                    stock_data[title.get_text().strip()] = value.get_text().strip().split('\n')[-1]
        # Extract the price date from the quotation card.
        for item in soup.find_all('div', class_='_card cotacao'):
            stock_data['Data da Cotação'] = item.find('i', class_='far')['data-content'].split()[-1]
    else:
        raise ValueError(f'Unknown site: {site}')
    return stock_data
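

# Usage sketch (the ticker page URL is an assumption; any URL containing one
# of the four site keywords above would select the matching parser branch):
#   url = "https://www.fundamentus.com.br/detalhes.php?papel=PETR4"
#   soup = BeautifulSoup(fetch_html(url), 'html.parser')
#   data = parse_stock_data(soup, url)
#   print(len(data), "fields scraped")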


def scrape_stock_data(tickers_urls, split_parameter):
    """Scrape stock data from a list of URLs."""
    all_stocks_data = []
    for n, url in enumerate(tickers_urls):  # Limit to n stocks for testing: tickers_urls[:n]
        print(f"Scraping {url.split(split_parameter)[-1]} data ({n+1} of {len(tickers_urls)})\n page: {url}")
        soup = BeautifulSoup(fetch_html(url), 'html.parser')
        stock_data = {"Ticker": url.split(split_parameter)[-1]}  # Add ticker symbol to the stock data
        stock_data.update(parse_stock_data(soup, url))  # Merge parsed data
        all_stocks_data.append(stock_data)  # Append to the list
    return all_stocks_data
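

# Usage sketch: split_parameter is whatever precedes the ticker in each URL,
# so url.split(split_parameter)[-1] yields the bare ticker ('papel=' here is
# an assumption matching the fundamentus URL pattern used above):
#   rows = scrape_stock_data(tickers_urls, "papel=")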


def save_to_csv(data_list, database_name):
    """Save scraped data to a CSV file named after the most recent price date."""
    df = pd.DataFrame(data_list)
    if 'Data da Cotação' in df.columns:  # Write the price date in the filename if the df has a price date column
        price_date = mode(df['Data da Cotação'])  # take the most common date as the most recent price date
        YMD_date = datetime.strptime(price_date, '%d/%m/%Y').strftime('%Y-%m-%d')  # reformat the date as Y-m-d
        file_name = YMD_date + '_' + database_name + '.csv'
    elif 'Data últ cot' in df.columns:
        price_date = mode(df['Data últ cot'])  # take the most common date as the most recent price date
        YMD_date = datetime.strptime(price_date, '%d/%m/%Y').strftime('%Y-%m-%d')  # reformat the date as Y-m-d
        file_name = YMD_date + '_' + database_name + '.csv'
    else:
        today = str(datetime.now().date())  # fall back to today's date when no price date column is present
        file_name = today + '_' + database_name + '.csv'
    df.to_csv(file_name, index=False)
    print(f"\nData saved to ---> {file_name}\n")