-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscript.py
66 lines (57 loc) · 3.13 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Registry of chat titles that have already been opened by the script, so each
# chat is processed only once. Only key membership is ever checked.
already_scraped = dict()

# Start Chrome and open WhatsApp Web. This runs at import time; the resulting
# module-level `driver` is shared by the scraping functions below.
options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")
# Uncomment the next line to run in an incognito window:
# options.add_argument('--incognito')
driver = webdriver.Chrome(
    "--driver_path",  # placeholder: put the path of your chrome driver here
    options=options,
)
driver.get("https://web.whatsapp.com")
def _xpath_literal(text):
    """Return *text* quoted as an XPath 1.0 string literal."""
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape character, so a string that contains both quote
    # kinds has to be assembled from pieces with concat().
    pieces = ", \"'\", ".join(f"'{part}'" for part in text.split("'"))
    return f"concat({pieces})"


async def start_scrape(scroll=0, step=350):
    """Open every chat listed in the left-hand pane and dump its messages.

    The chat list (``#pane-side``) is scrolled in increments of ``step``
    pixels, starting at ``scroll``. After each scroll the chats currently
    present in the page source are opened one by one; chats whose title is
    already recorded in the module-level ``already_scraped`` dict are skipped.
    For each newly opened chat, ``print_to_console`` is called repeatedly
    until it returns ``False``.

    Scrolling stops once the pane can no longer be scrolled to the requested
    offset, i.e. the bottom of the chat list has been reached.

    Args:
        scroll: Initial ``scrollTop`` offset (pixels) of the chat list.
        step: Number of pixels to advance the chat list after each pass.

    Raises:
        ValueError: If ``step`` is not positive (the loop could never end).
    """
    if step <= 0:
        raise ValueError("step must be a positive number of pixels")

    while True:
        # Looked up again on every pass so a re-rendered pane is never stale.
        pane = driver.find_element(By.ID, 'pane-side')
        driver.execute_script(
            "arguments[0].scrollTop = arguments[1]", pane, scroll
        )
        # The browser clamps scrollTop to the largest scrollable offset, so
        # reading it back tells us whether the requested offset was reached.
        reached = driver.execute_script("return arguments[0].scrollTop", pane)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        left_panel = soup.find_all("div", {"id": "pane-side"})[0]
        # Re-parsed on its own so that indexing of the matches below is the
        # same as it has always been for this script.
        left_panel_soup = BeautifulSoup(str(left_panel), 'html.parser')
        # The first match is skipped: it is expected to be the parent div
        # that wraps all the individual chat divs.
        chat_div_list = left_panel_soup.find_all('div', {'tabindex': '-1'})[1:]

        for chat_div in chat_div_list:
            title_span = chat_div.find('span', {'title': True})
            if title_span is None:
                # Not a chat row (no titled span); nothing to open.
                continue
            chat_name = title_span['title']
            if chat_name in already_scraped:
                continue

            # Match on @title (where the name was read from) and quote the
            # name safely so apostrophes/quotes cannot break the XPath.
            chat_xpath = (
                "//div[@id='pane-side']"
                f"//span[@title={_xpath_literal(chat_name)}]"
            )
            driver.find_element(By.XPATH, chat_xpath).click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "main"))
            )
            already_scraped[chat_name] = True

            reloaded_soup = await reload_soup(driver)
            while await print_to_console(driver, reloaded_soup):
                print('processing...')
                reloaded_soup = await reload_soup(driver)

        # 1px tolerance for fractional scroll offsets on scaled displays.
        if reached is None or reached + 1 < scroll:
            break
        scroll += step
async def print_to_console(driver, c):
    """Try to load earlier messages; print the chat once none are left.

    Waits up to 15 seconds for the "load earlier messages…" control of the
    open chat to become visible and scrolls it into view.

    Args:
        driver: The Selenium WebDriver showing the open chat.
        c: Unused. Kept so existing callers keep working; the messages are
            always re-read from the live page via ``reload_soup`` instead.

    Returns:
        ``True`` if the control was found and scrolled into view (the caller
        should call again), ``False`` if Selenium reported an error while
        waiting for it (typically a timeout), in which case the error and the
        text of every message div currently in the page are printed.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from selenium.common.exceptions import WebDriverException

    load_earlier_xpath = "//div[contains(@title,'load earlier messages…')]"
    try:
        load_earlier = WebDriverWait(driver, 15).until(
            EC.visibility_of_element_located((By.XPATH, load_earlier_xpath))
        )
        # Reading this property makes the browser scroll the element into
        # view; the returned coordinates themselves are not needed.
        load_earlier.location_once_scrolled_into_view
    except WebDriverException as e:
        # Only Selenium failures mean "nothing more to load"; other
        # exceptions are programming errors and are allowed to propagate.
        reloaded_soup = await reload_soup(driver)
        print(e)
        for text_div in reloaded_soup:
            print(text_div.text)
        print('\n \n \n Scrapping next chat.......')
        return False
    return True
async def reload_soup(driver):
    """Re-read the page and return the message divs of the open chat.

    The page source is parsed and narrowed down step by step:
    ``div#main`` -> first ``div.copyable-area`` -> its third child ->
    all ``div.copyable-text`` elements.

    Args:
        driver: The Selenium WebDriver whose ``page_source`` is parsed.

    Returns:
        The matching ``div.copyable-text`` elements, or an empty list when
        the expected structure is not present in the page (for example the
        chat has not finished rendering).
    """
    page = BeautifulSoup(driver.page_source, 'html.parser')

    main_div = page.find('div', {"id": "main"})
    if main_div is None:
        return []

    # Each level is re-parsed on its own so that element selection is the
    # same as it has always been for this script.
    main_soup = BeautifulSoup(str(main_div), 'html.parser')
    copyable_area = main_soup.find('div', {"class": "copyable-area"})
    if copyable_area is None:
        return []

    children = copyable_area.contents
    if len(children) < 3:
        return []

    # NOTE(review): the third child is presumably the message list container;
    # verify against the current WhatsApp Web markup.
    messages_soup = BeautifulSoup(str(children[2]), 'html.parser')
    # One div per message (whether it is a reply or a normal message).
    return messages_soup.find_all('div', {"class": "copyable-text"})
if __name__ == '__main__':
    import asyncio

    # The browser is already started at module level (`driver`), so the entry
    # point only has to wait for the chat list and run the scraping coroutine.
    # NOTE(review): WhatsApp Web normally needs a login (e.g. QR code scan)
    # before `pane-side` is rendered; the timeout is meant to leave time for
    # that -- adjust if needed.
    WebDriverWait(driver, 120).until(
        EC.presence_of_element_located((By.ID, "pane-side"))
    )
    asyncio.run(start_scrape())