
Commit

Initial commit.
weirdindiankid committed Oct 13, 2020
0 parents, commit bbd393f
Showing 13 changed files with 358 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.DS_Store
__pycache__/*
15 changes: 15 additions & 0 deletions README.md
@@ -0,0 +1,15 @@
## Scraping Resources

This is an evolving, non-exhaustive list of resources presented to students in
Boston University's CS 506 class on Wednesday, 10/7/2020.

Scrapers are in the `scrapers` directory. Note that I built these scrapers as a
student for my own use, so they will likely require some modifications before you
can use them in your own project.

Lecture slides are in the `references` directory.

## Questions?

Please email me ([email protected]) with any questions, and CC [email protected].

Binary file added references/Web Scraping.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions scrapers/IMDBReviews/.gitignore
@@ -0,0 +1,4 @@
.DS_Store
__pycache__/*
*.htm
*.html
35 changes: 35 additions & 0 deletions scrapers/IMDBReviews/eval.py
@@ -0,0 +1,35 @@
import pandas as pd
from bs4 import BeautifulSoup

# Parse a locally saved copy of an IMDb reviews page (e.g. one produced by scrape.py).
with open("dreviews.htm", "r") as f:
    page = f.read()

phtml = BeautifulSoup(page, features='lxml')

# All review cards live inside the 'lister-list' container.
m = phtml.body.find('div', attrs={'class': 'lister-list'})

# Review bodies
raw_reviews = m.findAll('div', attrs={'class': 'text'})
raw_reviews = [i.text.strip() for i in raw_reviews]

# User ratings, out of 10 (e.g. "8/10" -> "8")
raw_ratings = phtml.body.findAll('span', attrs={'class': 'rating-other-user-rating'})
raw_ratings = [' '.join(i.text.split()) for i in raw_ratings]
raw_ratings = [i.split('/')[0] for i in raw_ratings]

# Review titles
raw_titles = phtml.body.findAll('a', attrs={'class': 'title'})
raw_titles = [' '.join(i.text.split()) for i in raw_titles]

# Ratings are parsed above but not written out: reviews without a rating would
# leave raw_ratings shorter than the other columns.
df = pd.DataFrame()
df['title'] = raw_titles
df['review'] = raw_reviews

df.to_csv("savedreviews.csv")
92 changes: 92 additions & 0 deletions scrapers/IMDBReviews/imdbscraper.py
@@ -0,0 +1,92 @@
from bs4 import BeautifulSoup
import urllib.request as req
from tabulate import tabulate


def getResponse(url):
    # Fetch a page and return it as a parsed BeautifulSoup tree.
    response = req.urlopen(url)
    data = response.read()
    soup = BeautifulSoup(data, "lxml")
    #print(soup.prettify("utf-8"))
    return soup


def selectChoice():
    # Maps a menu number to (display name, IMDb chart slug).
    options_map = {
        1: ('Top Movies', 'top'),
        2: ('Most Popular Movies', 'moviemeter'),
        3: ('Top English Movies', 'top-english-movies'),
        4: ('Top TV Shows', 'toptv'),
        5: ('Most Popular TV Shows', 'tvmeter'),
        6: ('Low Rated Movies', 'bottom'),
        7: ('Top Box Office Collection', 'boxoffice')
    }

    for number, (label, _slug) in options_map.items():
        print("{}) {}".format(number, label))

    choice = int(input('\nChoice please..\n'))
    while choice < 1 or choice > len(options_map):
        print('Wrong choice, enter again..')
        choice = int(input('\nChoice please..\n'))
    return options_map[choice][1]


def getData(base_url, option):
    complete_url = base_url + option
    soup = getResponse(complete_url)
    card_list = soup.find_all('span', {'class': 'media-body media-vertical-align'})  # material card list
    result = []
    count = 1
    for card in card_list:
        try:
            # Strip newlines and the leading rank prefix ("1.", "2.", ...) from the title.
            name = card.find('h4').text.replace("\n", " ").lstrip("0123456789.- ")
            rating = card.find('p').text.strip()
        except AttributeError:
            # Skip cards that are missing a title or a rating.
            continue
        result.append([count, name, rating])
        count += 1
    print(tabulate(result, headers=["Index", "Name", "Ratings"], tablefmt="grid"))


def main():
    base_url = "http://m.imdb.com/chart/"
    choice = selectChoice()
    getData(base_url, choice)


if __name__ == '__main__':
    main()


'''
Other table formats supported by tabulate:
"plain", "simple", "grid", "fancy_grid", "pipe", "orgtbl", "jira", "presto",
"psql", "rst", "mediawiki", "moinmoin", "youtrack", "html", "latex",
"latex_raw", "latex_booktabs", "textile"
'''
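# For example, switching tabulate's output style only requires changing the
# tablefmt argument (a hypothetical snippet, not called by this script):
#
#   print(tabulate([[1, "Example Movie", "9.2"]],
#                  headers=["Index", "Name", "Ratings"], tablefmt="psql"))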
35 changes: 35 additions & 0 deletions scrapers/IMDBReviews/scrape.py
@@ -0,0 +1,35 @@
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get("https://www.imdb.com/title/tt0451279/reviews?ref_=tt_ql_3")

# The "Load More" button at the bottom of the review list.
elem = driver.find_element_by_id('load-more-trigger')

# Keep scrolling and clicking "Load More" until the button is gone; clicking
# then raises, which is how we know every review has been loaded.
while True:
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        elem.click()
    except Exception:
        print("Done")
        break

# Expand any collapsed reviews so their full text is present in the DOM.
nelem = driver.find_elements_by_class_name('expander-icon-wrapper')
for i in nelem:
    try:
        i.click()
        time.sleep(0.2)
    except Exception:
        print("Arrow not needed")
        continue

print("Scraping complete...now attempting a save")

# Save the fully expanded page for offline parsing.
page = driver.page_source
with open('npage.html', 'w', encoding='utf-8') as file_:
    file_.write(page)
5 changes: 5 additions & 0 deletions scrapers/reddit-scraper/.gitignore
@@ -0,0 +1,5 @@
config.py
*.pyc
data/
checkpoint.txt
checkpoint_cs.txt
30 changes: 30 additions & 0 deletions scrapers/reddit-scraper/README.md
@@ -0,0 +1,30 @@
## Quickstart

A fast subreddit image scraper with no hard cap on how far back it can go: it
pages through posts one timestamp window at a time (see `cloudsearch.py`).

Create a `config.py` next to the scripts with the following contents:

```python
SUBREDDIT = ''

USERNAME = ''
PASSWORD = ''
CLIENT_SECRET = ''
CLIENT_ID = ''
PLATFORM = 'python'
APP_ID = ''
VERSION = 'v0.0.1'
LIMIT = 200
DATA_DIR = './data'

USER_AGENT = '{platform}:{app_id}:{version} (by /u/{username})'.format(platform=PLATFORM,
                                                                       app_id=APP_ID,
                                                                       version=VERSION,
                                                                       username=USERNAME)
```

Run `python3 cloudsearch.py` to start scraping by timestamp window, or `python3 default.py` to scrape the past week's top posts.

## Dependencies

Depends on `praw` and `aiohttp`.
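
For reference, here is a minimal sketch of how the two helper modules compose. This is a hypothetical one-off run, not part of the repo: it assumes `config.py` exists as above, and the URLs and ids below are placeholders.

```python
from image_resolver import parse_url         # decides whether a URL is a downloadable image
from task_downloader import download_images  # downloads a batch of (url, id) pairs with aiohttp

# Placeholder (url, submission fullname) pairs; real runs get these from PRAW.
submissions = [
    ("https://i.redd.it/example.jpg", "t3_abc123"),
    ("https://imgur.com/abcdef", "t3_def456"),
]

# Keep only links that resolve to a direct image, then download them.
batch = [(parse_url(url), id_) for url, id_ in submissions]
batch = [(link, id_) for link, id_ in batch if link is not None]
download_images(batch)  # saves each image as <DATA_DIR>/<id>.jpg
```

`cloudsearch.py` and `default.py` follow this pattern, feeding the pipeline from a PRAW submission generator instead of a hard-coded list.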
53 changes: 53 additions & 0 deletions scrapers/reddit-scraper/cloudsearch.py
@@ -0,0 +1,53 @@
import os
import sys
import praw
from datetime import datetime
from config import *
from image_resolver import *
from task_downloader import *

if __name__ == '__main__':
    reddit = praw.Reddit(client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         password=PASSWORD,
                         user_agent=USER_AGENT,
                         username=USERNAME)

    # Start from "now" (seconds since the Unix epoch) and walk backwards in time.
    last_upper = int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds())
    download_data = []

    # Resume from the last saved upper bound, if a checkpoint exists.
    if os.path.isfile('checkpoint_cs.txt'):
        with open('checkpoint_cs.txt', 'r') as file:
            last_upper = int(file.read())
        print("Loaded Checkpoint,", last_upper)

    while True:
        print("Collecting")
        download_data = []

        # Query one day (86400 seconds) at a time using cloudsearch timestamp syntax.
        upper = last_upper
        lower = upper - 86400
        query = 'timestamp:%d..%d' % (lower, upper)

        generator = reddit.subreddit(SUBREDDIT).search(query, sort='new', limit=100, syntax='cloudsearch')

        for submission in generator:
            link = parse_url(submission.url)
            id_ = submission.fullname
            if link is not None:
                download_data.append((link, id_))

        print("Downloading", len(download_data))
        download_images(download_data)

        print('Done')

        # Checkpoint the upper bound of the window just finished, then step one day further back.
        with open('checkpoint_cs.txt', 'w') as file:
            file.write(str(last_upper))

        print('Checkpointing')
        print('')

        last_upper = lower

49 changes: 49 additions & 0 deletions scrapers/reddit-scraper/default.py
@@ -0,0 +1,49 @@
import os
import sys
import praw
from config import *
from image_resolver import *
from task_downloader import *

if __name__ == '__main__':
    reddit = praw.Reddit(client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         password=PASSWORD,
                         user_agent=USER_AGENT,
                         username=USERNAME)

    last_id = ''
    download_data = []

    # Resume from the last seen submission, if a checkpoint exists.
    if os.path.isfile('checkpoint.txt'):
        with open('checkpoint.txt', 'r') as file:
            last_id = file.read()
        print("Loaded Checkpoint,", last_id)

    while True:
        print("Collecting")
        download_data = []

        # Page through the past week's top posts, LIMIT at a time,
        # continuing after the last fullname we saw.
        if last_id == '':
            generator = reddit.subreddit(SUBREDDIT).top('week', limit=LIMIT)
        else:
            generator = reddit.subreddit(SUBREDDIT).top('week', limit=LIMIT, params={'after': last_id})

        for submission in generator:
            link = parse_url(submission.url)
            last_id = submission.fullname
            if link is not None:
                download_data.append((link, last_id))

        print("Downloading", len(download_data))
        download_images(download_data)

        print('Done')

        with open('checkpoint.txt', 'w') as file:
            file.write(last_id)

        print('Checkpointing')
        print('')

15 changes: 15 additions & 0 deletions scrapers/reddit-scraper/image_resolver.py
@@ -0,0 +1,15 @@
'''
Contains the rules that decide whether a submission URL points to a downloadable
image (and, for single imgur pages, rewrites it to a direct .jpg URL).
'''

def parse_url(url):
    parsed = None
    if url.endswith('.jpg'):
        parsed = url
    elif 'imgur.com' in url:
        # Skip albums and galleries; rewrite single-image pages to the direct i.imgur.com URL.
        if '/a/' not in url and '/gallery/' not in url:
            parsed = 'http://i.imgur.com/{id}.jpg'.format(id=url.split('/')[-1])
    elif 'reddituploads.com' in url:
        parsed = url

    return parsed
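
# A few hypothetical examples of what parse_url returns:
#   parse_url('https://i.redd.it/abc.jpg')      -> 'https://i.redd.it/abc.jpg'
#   parse_url('https://imgur.com/xYz123')       -> 'http://i.imgur.com/xYz123.jpg'
#   parse_url('https://imgur.com/a/xYz123')     -> None  (albums and galleries are skipped)
#   parse_url('https://example.com/page.html')  -> None  (not recognised as an image)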
23 changes: 23 additions & 0 deletions scrapers/reddit-scraper/task_downloader.py
@@ -0,0 +1,23 @@
import asyncio
import os
from aiohttp import ClientSession
from config import *

async def download_image(url, id):
    # Skip images we have already downloaded.
    if os.path.isfile(os.path.join(DATA_DIR, id + '.jpg')):
        return
    async with ClientSession() as session:
        async with session.get(url) as response:
            body = await response.read()
            with open(os.path.join(DATA_DIR, id + '.jpg'), 'wb') as f:
                f.write(body)

def download_images(download_data):
    # download_data is a list of (url, submission fullname) pairs.
    if not download_data:
        return  # asyncio.wait() raises on an empty task list

    os.makedirs(DATA_DIR, exist_ok=True)

    loop = asyncio.get_event_loop()

    tasks = []
    for dl in download_data:
        task = asyncio.ensure_future(download_image(dl[0], dl[1]))
        tasks.append(task)

    loop.run_until_complete(asyncio.wait(tasks))
