Commit bbd393f (0 parents)

Showing 13 changed files with 358 additions and 0 deletions.
@@ -0,0 +1,2 @@
.DS_Store
__pycache__/*
@@ -0,0 +1,15 @@
## Scraping Resources

This is an evolving, non-exhaustive list of resources presented to students in
Boston University's CS 506 class on Wednesday, 10/7/2020.

Scrapers are in the `scrapers` directory. Note that I built these scrapers as a
student for my own use, so they will likely require some modifications before
you can use them in your own project.

Lecture slides are in the `references` directory.

## Questions?

Please email me ([email protected]) with questions and CC [email protected].
Binary file not shown.
@@ -0,0 +1,4 @@
.DS_Store
__pycache__/*
*.htm
*.html
@@ -0,0 +1,35 @@ | ||
import pandas as pd | ||
from bs4 import BeautifulSoup | ||
|
||
file = open("dreviews.htm", "r") | ||
file = file.read() | ||
|
||
phtml = BeautifulSoup(file, features='lxml') | ||
|
||
m = phtml.body.find('div', attrs={'class': 'lister-list'}) #.text | ||
|
||
all_reviews = m.findAll('div') | ||
|
||
fin_ratings = [] | ||
fin_reviews = [] | ||
fin_titles = [] | ||
|
||
raw_reviews = m.findAll('div', attrs={'class': 'text'}) | ||
raw_reviews = [i.text.split('show-more__control">')[0] for i in raw_reviews] | ||
|
||
# Out of 10 | ||
raw_ratings = phtml.body.findAll('span', attrs={'class': 'rating-other-user-rating'}) | ||
#raw_ratings = raw_ratings.body.findAll('span') | ||
raw_ratings = [i.text.split('>')[0] for i in raw_ratings] | ||
raw_ratings = [' '.join(i.split()) for i in raw_ratings] | ||
raw_ratings = [i.split('/')[0] for i in raw_ratings] | ||
|
||
raw_titles = phtml.body.findAll('a', attrs={'class': 'title'}) | ||
raw_titles = [i.text.split('t_urv">')[0] for i in raw_titles] | ||
raw_titles = [' '.join(i.split()) for i in raw_titles] | ||
|
||
df = pd.DataFrame() | ||
df['title'] = raw_titles | ||
df['review'] = raw_reviews | ||
|
||
df.to_csv("savedreviews.csv") |
@@ -0,0 +1,92 @@ | ||
from bs4 import BeautifulSoup | ||
import urllib.request as req | ||
from tabulate import tabulate | ||
|
||
def getResponse(url): | ||
response = req.urlopen(url) | ||
data = response.read() | ||
soup = BeautifulSoup(data, "lxml") | ||
#print(soup.prettify("utf-8")) | ||
return soup | ||
|
||
def selectChoice(): | ||
|
||
'''options = { | ||
1: ('top'), | ||
2: ('moviemeter'), | ||
3: ('top-english-movies'), | ||
4: ('toptv'), | ||
5: ('tvmeter'), | ||
6: ('bottom'), | ||
7: ('boxoffice') | ||
} | ||
''' | ||
options_map = { | ||
1: ('Top movies' , 'top'), | ||
2: ('Most Popular Movies' , 'moviemeter'), | ||
3: ('Top English Movies' , 'top-english-movies'), | ||
4: ('Top TV Shows' , 'toptv'), | ||
5: ('Most Popular TV Shows' , 'tvmeter'), | ||
6: ('Low Rated Movies', 'bottom'), | ||
7: ('Top Box Office collection', 'boxoffice') | ||
} | ||
|
||
for i,option in enumerate(options_map,1): | ||
print("{}) {}".format(i,options_map[option][0])) | ||
|
||
choice = int(input('\nChoice please..\n')) | ||
while(choice<1 or choice>len(options_map)): | ||
print('Wrong choice, enter again..') | ||
choice = int(input('\nChoice please..\n')) | ||
return options_map[choice][1] | ||
|
||
def getData(base_url, option): | ||
complete_url = base_url + option | ||
soup = getResponse(complete_url) | ||
card_list = soup.find_all('span',{'class':'media-body media-vertical-align'}) #material card list | ||
result = [] | ||
count = 1 | ||
for card in card_list: | ||
try: | ||
name = card.find('h4').text.replace("\n"," ").lstrip("0123456789.- ") #removes order indexes for movies 1,2,3,4,... | ||
except: | ||
pass | ||
try: | ||
rating = card.find('p').text.strip() | ||
except: | ||
pass | ||
result.append([count,name,rating]) | ||
count += 1 | ||
print(tabulate(result, headers=["Index", "Name", "Ratings"], tablefmt="grid")) | ||
|
||
def main(): | ||
base_url = "http://m.imdb.com/chart/" | ||
choice = selectChoice() | ||
#print(choice) | ||
getData(base_url, choice) | ||
|
||
if __name__ == '__main__': | ||
main() | ||
|
||
|
||
''' | ||
#table formats | ||
- "plain" | ||
- "simple" | ||
- "grid" | ||
- "fancy_grid" | ||
- "pipe" | ||
- "orgtbl" | ||
- "jira" | ||
- "presto" | ||
- "psql" | ||
- "rst" | ||
- "mediawiki" | ||
- "moinmoin" | ||
- "youtrack" | ||
- "html" | ||
- "latex" | ||
- "latex_raw" | ||
- "latex_booktabs" | ||
- "textile" | ||
''' |
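For reference, with `tablefmt="grid"` a run prints something like the following (titles and ratings here are made-up placeholders, not real chart data):

```
+---------+-------------------+-----------+
|   Index | Name              |   Ratings |
+=========+===================+===========+
|       1 | Example Movie One |       9.2 |
|       2 | Example Movie Two |       9.1 |
+---------+-------------------+-----------+
```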
@@ -0,0 +1,35 @@ | ||
from selenium import webdriver | ||
from selenium.webdriver.common.keys import Keys | ||
import time | ||
|
||
driver = webdriver.Chrome() | ||
driver.get("https://www.imdb.com/title/tt0451279/reviews?ref_=tt_ql_3") | ||
|
||
elem = driver.find_element_by_id('load-more-trigger') | ||
|
||
while 1 == 1: | ||
try: | ||
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | ||
time.sleep(1) | ||
elem.click() | ||
continue | ||
except: | ||
print("Done") | ||
break | ||
|
||
|
||
nelem = driver.find_elements_by_class_name('expander-icon-wrapper') | ||
for i in nelem: | ||
try: | ||
i.click() | ||
time.sleep(0.2) | ||
except: | ||
print("Arrow not needed") | ||
continue | ||
|
||
print("Scraping complete...now attempting a save") | ||
|
||
page = driver.page_source | ||
file_ = open('npage.html', 'w') | ||
file_.write(page) | ||
file_.close() |
@@ -0,0 +1,5 @@
config.py
*.pyc
data/
checkpoint.txt
checkpoint_cs.txt
@@ -0,0 +1,30 @@
## Quickstart

A fast subreddit image scraper with no hard limit on how many images it fetches.

Create a `config.py` like this:

```python
SUBREDDIT = ''

USERNAME = ''
PASSWORD = ''
CLIENT_SECRET = ''
CLIENT_ID = ''
PLATFORM = 'python'
APP_ID = ''
VERSION = 'v0.0.1'
LIMIT = 200
DATA_DIR = './data'

USER_AGENT = '{platform}:{app_id}:{version} (by /u/{username})'.format(platform=PLATFORM,
                                                                       app_id=APP_ID,
                                                                       version=VERSION,
                                                                       username=USERNAME)
```

Run `python3 cloudsearch.py` to start.

## Dependencies

Depends on `praw` and `aiohttp` (e.g. `pip install praw aiohttp`).
@@ -0,0 +1,53 @@ | ||
import os | ||
import sys | ||
import praw | ||
from datetime import datetime | ||
from config import * | ||
from image_resolver import * | ||
from task_downloader import * | ||
|
||
if __name__ == '__main__': | ||
reddit = praw.Reddit(client_id=CLIENT_ID, | ||
client_secret=CLIENT_SECRET, | ||
password=PASSWORD, | ||
user_agent=USER_AGENT, | ||
username=USERNAME) | ||
|
||
last_upper = int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds()) | ||
download_data = [] | ||
|
||
# Checkpoint | ||
if os.path.isfile('checkpoint_cs.txt'): | ||
with open('checkpoint_cs.txt', 'r') as file: | ||
last_upper = int(file.read()) | ||
print("Loaded Checkpoint,", last_upper) | ||
|
||
while True: | ||
print("Collecting") | ||
download_data = [] | ||
|
||
upper = last_upper | ||
lower = upper - 86400 | ||
query = 'timestamp:%d..%d' % (lower, upper) | ||
|
||
generator = reddit.subreddit(SUBREDDIT).search(query, sort='new', limit=100, syntax='cloudsearch') | ||
|
||
for submission in generator: | ||
link = parse_url(submission.url) | ||
id_ = submission.fullname | ||
if link is not None: | ||
download_data.append((link, id_)) | ||
|
||
print("Downloading", len(download_data)) | ||
download_images(download_data) | ||
|
||
print('Done') | ||
|
||
with open('checkpoint_cs.txt', 'w') as file: | ||
file.write(str(last_upper)) | ||
|
||
print('Checkpointing') | ||
print('') | ||
|
||
last_upper = lower | ||
|
@@ -0,0 +1,49 @@
import os
import praw
from config import *
from image_resolver import *
from task_downloader import *

if __name__ == '__main__':
    reddit = praw.Reddit(client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         password=PASSWORD,
                         user_agent=USER_AGENT,
                         username=USERNAME)

    last_id = ''
    download_data = []

    # Resume from the last checkpointed submission fullname, if any.
    if os.path.isfile('checkpoint.txt'):
        with open('checkpoint.txt', 'r') as file:
            last_id = file.read()
            print("Loaded Checkpoint,", last_id)

    while True:
        print("Collecting")
        download_data = []

        # Page through the subreddit's top-of-the-week listing, LIMIT posts
        # at a time, continuing after the last seen submission.
        if last_id == '':
            generator = reddit.subreddit(SUBREDDIT).top('week', limit=LIMIT)
        else:
            generator = reddit.subreddit(SUBREDDIT).top('week', limit=LIMIT, params={'after': last_id})

        for submission in generator:
            link = parse_url(submission.url)
            last_id = submission.fullname
            if link is not None:
                download_data.append((link, last_id))

        print("Downloading", len(download_data))
        download_images(download_data)

        print('Done')

        with open('checkpoint.txt', 'w') as file:
            file.write(last_id)

        print('Checkpointing')
        print('')
@@ -0,0 +1,15 @@
'''
Contains rules to decide whether an image URL is worth downloading.
'''

def parse_url(url):
    parsed = None
    if url.endswith('.jpg'):
        # Direct image link: use as-is.
        parsed = url
    elif 'imgur.com' in url:
        # Single imgur image (not an album or gallery): rewrite to the
        # direct i.imgur.com JPEG link.
        if '/a/' not in url and '/gallery/' not in url:
            parsed = 'http://i.imgur.com/{id}.jpg'.format(id=url.split('/')[-1])
    elif 'reddituploads.com' in url:
        parsed = url

    return parsed
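A few illustrative calls, one per branch of `parse_url` (the URLs are made-up examples, not taken from the repository):

```python
# Hypothetical inputs showing each rule above.
print(parse_url('https://i.redd.it/abc123.jpg'))    # 'https://i.redd.it/abc123.jpg'
print(parse_url('https://imgur.com/xYz987'))        # 'http://i.imgur.com/xYz987.jpg'
print(parse_url('https://imgur.com/a/someAlbum'))   # None (albums are skipped)
print(parse_url('https://example.com/page.html'))   # None (not an image link)
```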
@@ -0,0 +1,23 @@ | ||
import asyncio | ||
import os | ||
from aiohttp import ClientSession | ||
from config import * | ||
|
||
async def download_image(url, id): | ||
if os.path.isfile(os.path.join(DATA_DIR, id + '.jpg')): | ||
return | ||
async with ClientSession() as session: | ||
async with session.get(url) as response: | ||
response = await response.read() | ||
with open(os.path.join(DATA_DIR, id + '.jpg'), 'wb') as f: | ||
f.write(response) | ||
|
||
def download_images(download_data): | ||
loop = asyncio.get_event_loop() | ||
|
||
tasks = [] | ||
for dl in download_data: | ||
task = asyncio.ensure_future(download_image(dl[0], dl[1])) | ||
tasks.append(task) | ||
|
||
loop.run_until_complete(asyncio.wait(tasks)) |
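For context, `download_images` expects a list of (url, id) tuples, where the id becomes the saved filename; a hypothetical direct call (values made up for illustration):

```python
# The id is a Reddit submission fullname, which the scraper scripts above
# obtain from submission.fullname.
download_images([
    ('http://i.imgur.com/xYz987.jpg', 't3_abc123'),
])
```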