diff --git a/Python/RanjanPaudel/.gitignore b/Python/RanjanPaudel/.gitignore index 51b4c71..98fd422 100644 --- a/Python/RanjanPaudel/.gitignore +++ b/Python/RanjanPaudel/.gitignore @@ -1,10 +1,15 @@ +#python caches */__pycache__ */*/__pycache__ */*/*/__pycache__ - .pytest_cache +#vscode configs .vscode +#environment files .env */.env + +#python virtual environments +*/*venv diff --git a/Python/RanjanPaudel/scraper/.env.example b/Python/RanjanPaudel/scraper/.env.example new file mode 100644 index 0000000..e12ba34 --- /dev/null +++ b/Python/RanjanPaudel/scraper/.env.example @@ -0,0 +1,23 @@ +#MYSQL_DB +MYSQL_DB_HOST= +MYSQL_DB_PORT= +MYSQL_DB_USER= +MYSQL_DB_PASSWORD= + +#MYSQL_TEST_DB +MYSQL_TEST_DB_HOST= +MYSQL_TEST_DB_PORT= +MYSQL_TEST_DB_USER= +MYSQL_TEST_DB_PASSWORD= + +#JWT +JWT_SECRET= +JWT_ALGORITHM= +JWT_ACCESS_TOKEN_LIFE= +JWT_REFRESH_TOKEN_LIFE= + +#COOKIE +COOKIE_LIFE= + +#SESSION +SESSION_SECRET="" diff --git a/Python/RanjanPaudel/scraper/.gitignore b/Python/RanjanPaudel/scraper/.gitignore new file mode 100644 index 0000000..98fd422 --- /dev/null +++ b/Python/RanjanPaudel/scraper/.gitignore @@ -0,0 +1,15 @@ +#python caches +*/__pycache__ +*/*/__pycache__ +*/*/*/__pycache__ +.pytest_cache + +#vscode configs +.vscode + +#environment files +.env +*/.env + +#python virtual environments +*/*venv diff --git a/Python/RanjanPaudel/scraper/README.md b/Python/RanjanPaudel/scraper/README.md new file mode 100644 index 0000000..6a5ef89 --- /dev/null +++ b/Python/RanjanPaudel/scraper/README.md @@ -0,0 +1,61 @@ +# IMDb Scraper: A Flask App +A simple server-side-rendered web app written in Python with Flask. An authorized user can scrape (or update the scraped data of) four IMDb pages:<br>
+[Top Rated Movies](https://www.imdb.com/chart/top/?ref_=nv_mv_250)
+[Most Popular Movies](https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm)
+[Top Rated TV Shows](https://www.imdb.com/chart/toptv/?ref_=nv_tvv_250)
+[Most Popular TV Shows](https://www.imdb.com/chart/tvmeter/?ref_=nv_tvv_mptv) + +## Install +Clone the repo. +``` +# clone this repo +$ git clone https://github.com/mrranjan31paudel/lf-training.git +$ cd lf-training/Python/RanjanPaudel/scraper +``` +Create a virtual environment and activate it (recommended). +``` +$ python3 -m venv scraper_app_venv +$ source scraper_app_venv/bin/activate + +# or in Windows cmd: +$ scraper_app_venv\Scripts\activate.bat +``` +Install the packages from requirements.txt. +``` +$ python3 -m pip install -r requirements.txt +``` +## Setup +Install MySQL on your system (follow [this guide](https://dev.mysql.com/doc/mysql-installation-excerpt/5.7/en/)).<br>
+After the installation is complete, copy `.env.example` to `.env`:<br>
+``` +$ cp .env.example .env +``` +Then fill in each parameter to match your setup, as in the example below.<br>
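*For reference, a filled-in `.env` might look like the following (illustrative values only: the secrets are placeholders you must replace, and `config.py` reads the token and cookie lifetimes as numbers of seconds):*
```
#MYSQL_DB
MYSQL_DB_HOST=localhost
MYSQL_DB_PORT=3306
MYSQL_DB_USER=scraper
MYSQL_DB_PASSWORD=change-me

#MYSQL_TEST_DB
MYSQL_TEST_DB_HOST=localhost
MYSQL_TEST_DB_PORT=3306
MYSQL_TEST_DB_USER=scraper
MYSQL_TEST_DB_PASSWORD=change-me

#JWT
JWT_SECRET=a-long-random-string
JWT_ALGORITHM=HS256
JWT_ACCESS_TOKEN_LIFE=300
JWT_REFRESH_TOKEN_LIFE=86400

#COOKIE
COOKIE_LIFE=86400

#SESSION
SESSION_SECRET=another-long-random-string
```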
+To run the migrations, first create the database `scraper_app` for development and `test_scraper_app` for test (these are the names `db_config.py` expects). Then run the simple migration CLI: +``` +$ python3 scraper_app/db_migrator.py --env {test|development} --action {create|drop} +``` +*Run `$ python3 scraper_app/db_migrator.py --help` for detailed info about the migration CLI.* +## Run +### Mode: Development +In the terminal, run: +``` +$ export FLASK_ENV=development +$ export FLASK_APP=scraper_app/app.py +$ flask run +``` +### Mode: Test +This runs the unit tests in the `/tests` folder: +``` +$ pytest -s +# '-s' shows the log output of each passing test. +``` +## Some screenshots +![Screenshot-1](./readme_pics/sc1.png) +![Screenshot-2](./readme_pics/sc2.png) +![Screenshot-3](./readme_pics/sc3.png) +![Screenshot-4](./readme_pics/sc4.png) +![Screenshot-5](./readme_pics/sc5.png) +![Screenshot-6](./readme_pics/sc6.png) +![Screenshot-7](./readme_pics/sc7.png) +![Screenshot-8](./readme_pics/sc8.png) \ No newline at end of file diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc1.png b/Python/RanjanPaudel/scraper/readme_pics/sc1.png new file mode 100644 index 0000000..e4d7aad Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc1.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc2.png b/Python/RanjanPaudel/scraper/readme_pics/sc2.png new file mode 100644 index 0000000..299ce73 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc2.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc3.png b/Python/RanjanPaudel/scraper/readme_pics/sc3.png new file mode 100644 index 0000000..3db5620 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc3.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc4.png b/Python/RanjanPaudel/scraper/readme_pics/sc4.png new file mode 100644 index 0000000..3297638 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc4.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc5.png b/Python/RanjanPaudel/scraper/readme_pics/sc5.png new file mode 100644 index 0000000..0b6a911 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc5.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc6.png b/Python/RanjanPaudel/scraper/readme_pics/sc6.png new file mode 100644 index 0000000..00e3db4 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc6.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc7.png b/Python/RanjanPaudel/scraper/readme_pics/sc7.png new file mode 100644 index 0000000..2d602b8 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc7.png differ diff --git a/Python/RanjanPaudel/scraper/readme_pics/sc8.png b/Python/RanjanPaudel/scraper/readme_pics/sc8.png new file mode 100644 index 0000000..8532b79 Binary files /dev/null and b/Python/RanjanPaudel/scraper/readme_pics/sc8.png differ diff --git a/Python/RanjanPaudel/scraper/requirements.txt b/Python/RanjanPaudel/scraper/requirements.txt new file mode 100644 index 0000000..8577411 --- /dev/null +++ b/Python/RanjanPaudel/scraper/requirements.txt @@ -0,0 +1,11 @@ +Flask==1.1.2 +PyJWT==1.7.1 +pytest==6.0.1 +requests==2.24.0 +watchdog==0.10.3 +cryptography==3.1 +Flask-WTF==0.14.3 +SQLAlchemy==1.3.19 +mysqlclient==2.0.1 +beautifulsoup4==4.9.1 +python-dotenv==0.14.0 diff --git a/Python/RanjanPaudel/scraper/scraper_app/__init__.py b/Python/RanjanPaudel/scraper/scraper_app/__init__.py new file mode 100644 index 0000000..e69de29 
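*Note on the Setup step above: the migrator assumes the `scraper_app` (development) and `test_scraper_app` (test) databases already exist; the names come from `db_config.py`. A minimal sketch of creating them in the MySQL shell follows, with an illustrative dedicated user whose placeholder credentials should match the `MYSQL_DB_*` values in `.env`:*

```sql
-- Create the databases the migrator expects (names from db_config.py)
CREATE DATABASE scraper_app;
CREATE DATABASE test_scraper_app;

-- Optional: a dedicated MySQL user for the app (illustrative credentials)
CREATE USER 'scraper'@'localhost' IDENTIFIED BY 'change-me';
GRANT ALL PRIVILEGES ON scraper_app.* TO 'scraper'@'localhost';
GRANT ALL PRIVILEGES ON test_scraper_app.* TO 'scraper'@'localhost';
FLUSH PRIVILEGES;
```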
diff --git a/Python/RanjanPaudel/scraper/scraper_app/app.py b/Python/RanjanPaudel/scraper/scraper_app/app.py new file mode 100644 index 0000000..bafb6d3 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/app.py @@ -0,0 +1,195 @@ +from flask import Flask, request, render_template, redirect, url_for, make_response, flash + +import scraper_app.config as config +import scraper_app.validators as validators +import scraper_app.services as services +from scraper_app.app_constants import ( + empty_signin_form, + empty_login_form, + tab_list, + list_table_columns, + list_table_column_keys, + tab_label_map) + +app = Flask(__name__) +app.secret_key = bytes(config.SESSION_SECRET, encoding='utf8') + + +@app.route('/') +def root_page(): + return redirect(url_for('home_page')) + + +@app.route('/home', methods=['GET']) +@app.route('/home/<tab_name>', methods=['GET']) +def home_page(tab_name=None): + authentication_info = services.authenticate_user( + request.cookies.copy().to_dict(flat=True)) + + if authentication_info == 'token_invalid': + return redirect(url_for('login_page')) + + if ('status' in authentication_info) and ( + authentication_info['status'] == 'token_expired'): + return redirect(url_for('refresh_tokens', tab_name=tab_name)) + + if request.method == 'GET': + scraped_list = '' + if tab_name: + scraped_list = services.get_scraped_list(tab_name) + if ('movie_list' not in scraped_list) or len(scraped_list['movie_list']) < 1: + flash( + f'Could not load the list for {tab_label_map[tab_name]}', 'error') + return render_template('home.html', + user_is_logged_in=True, + user=authentication_info['user'], + tab_list=tab_list, + selected_tab=tab_name, + tab_label_map=tab_label_map, + list_table_columns=list_table_columns, + list_table_column_keys=list_table_column_keys, + scraped_list=scraped_list) + + +@app.route('/scrape/<list_name>', methods=['GET']) +def scrape_list(list_name=None): + authentication_info = services.authenticate_user( + request.cookies.copy().to_dict(flat=True)) + + if authentication_info == 'token_invalid': + return redirect(url_for('login_page')) + + if list_name in tab_label_map.keys(): + try: + services.scrape(list_name) + flash( + f'{tab_label_map[list_name]} scraped successfully!', 'success') + except Exception as error: + flash( + f'Could not scrape {tab_label_map[list_name]} due to some internal errors!', 'error') + + return redirect(url_for('home_page', tab_name=list_name)) + else: + flash('Invalid request!', 'error') + return redirect(url_for('home_page')) + + +@app.route('/refresh', methods=['GET']) +def refresh_tokens(): + if request.method == 'GET': + new_tokens_info = services.refresh_tokens( + request.cookies.copy().to_dict(flat=True)) + + if new_tokens_info in ['token_expired', 'token_invalid', 'token_refresh_error']: + refresh_response = make_response(redirect(url_for('login_page'))) + refresh_response.set_cookie('access_token', '', + path='/', httponly=True, max_age=0) + refresh_response.set_cookie('refresh_token', '', + path='/refresh', httponly=True, max_age=0) + + return refresh_response + + query_params = request.args.copy().to_dict(flat=True) + tab_name = '' + if 'tab_name' in query_params: + tab_name = query_params['tab_name'] + response = make_response( + redirect(url_for('home_page', tab_name=tab_name))) + response.set_cookie('access_token', new_tokens_info['access_token'], + path='/', httponly=True, max_age=config.COOKIE_LIFE) + response.set_cookie('refresh_token', 
new_tokens_info['refresh_token'], + path='/refresh', httponly=True, max_age=config.COOKIE_LIFE) + + return response + + +@app.route('/login', methods=['GET', 'POST']) +def login_page(): + authentication_info = services.authenticate_user( + request.cookies.copy().to_dict(flat=True)) + + if ('status' in authentication_info) and ( + authentication_info['status'] == 'token_expired'): + return redirect(url_for('refresh_tokens')) + + if 'user' in authentication_info: + return redirect(url_for('home_page')) + + if request.method == 'GET': + return render_template('login.html', form_data=empty_login_form) + + if request.method == 'POST': + validation = validators.validate_login_form(request.form) + + if validation['has_error']: + return render_template('login.html', form_data=request.form, error=validation['error']) + + try: + tokens = services.log_in_user( + request.form.copy().to_dict(flat=True)) + + response = make_response(redirect(url_for('home_page'))) + response.set_cookie('access_token', tokens['access_token'], + path='/', httponly=True, max_age=config.COOKIE_LIFE) + response.set_cookie('refresh_token', tokens['refresh_token'], + path='/refresh', httponly=True, max_age=config.COOKIE_LIFE) + + return response + except Exception as error: + error_dict = error.args[0] + resp = make_response(render_template( + 'login.html', form_data=request.form, error=error_dict), error_dict['code']) + return resp + + +@app.route('/signin', methods=['GET', 'POST']) +def signin_page(): + authentication_info = services.authenticate_user( + request.cookies.copy().to_dict(flat=True)) + + if ('status' in authentication_info) and ( + authentication_info['status'] == 'token_expired'): + return redirect(url_for('refresh_tokens')) + + if 'user' in authentication_info: + return redirect(url_for('home_page')) + + if request.method == 'GET': + return render_template('signin.html', form_data=empty_signin_form) + + if request.method == 'POST': + validation = validators.validate_signin_form(request.form) + + if validation['has_error']: + resp = make_response(render_template( + 'signin.html', form_data=request.form, error=validation['error']), 400) + return resp + + try: + services.create_new_user( + request.form.copy().to_dict(flat=True)) + flash('You were signed in successfully!', 'success') + + return redirect(url_for('login_page')) + except Exception as error: + error_dict = error.args[0] + resp = make_response(render_template( + 'signin.html', form_data=request.form, error=error_dict), error_dict['code']) + return resp + + +@app.route('/logout', methods=['GET']) +def logout_page(): + authentication_info = services.authenticate_user( + request.cookies.copy().to_dict(flat=True)) + + if 'user' in authentication_info: + services.log_out_user(authentication_info['user']) + + response = make_response(redirect(url_for('login_page'))) + response.set_cookie('access_token', '', + path='/', httponly=True, max_age=0) + response.set_cookie('refresh_token', '', + path='/refresh', httponly=True, max_age=0) + + return response diff --git a/Python/RanjanPaudel/scraper/scraper_app/app_constants.py b/Python/RanjanPaudel/scraper/scraper_app/app_constants.py new file mode 100644 index 0000000..03bf7ef --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/app_constants.py @@ -0,0 +1,155 @@ +empty_signin_form = { + "full_name": '', + "username": '', + "dob": '', + "new_password": '', + "confirm_password": '' +} + +empty_login_form = { + "username": '', + "password": '' +} + +tab_list = [ + { + "label": 'Top rated movies', + "value": 
'top_rated_movies' + }, + { + "label": 'Top rated TV shows', + "value": 'top_rated_tv_shows' + }, + { + "label": 'Most popular movies', + "value": 'most_popular_movies' + }, + { + "label": 'Most popular TV shows', + "value": 'most_popular_tv_shows' + } +] + +tab_label_map = { + "top_rated_movies": 'Top rated movies', + "top_rated_tv_shows": 'Top rated TV shows', + "most_popular_movies": 'Most popular movies', + "most_popular_tv_shows": 'Most popular TV shows' +} + +list_table_columns = { + "top_rated_movies": { + "image": { + "label": '', + "class": '' + }, + "title": { + "label": 'Title', + "class": 'algn-lft' + }, + "rank": { + "label": 'Rank', + "class": 'algn-rht' + }, + "release_year": { + "label": 'Release Year', + "class": 'algn-rht' + }, + "imdb_rating": { + "label": 'IMDb Rating', + "class": 'algn-rht' + } + }, + "top_rated_tv_shows": { + "image": { + "label": '', + "class": '' + }, + "title": { + "label": 'Title', + "class": 'algn-lft' + }, + "rank": { + "label": 'Rank', + "class": 'algn-rht' + }, + "release_year": { + "label": 'Release Year', + "class": 'algn-rht' + }, + "imdb_rating": { + "label": 'IMDb Rating', + "class": 'algn-rht' + } + }, + "most_popular_movies": { + "image": { + "label": '', + "class": '' + }, + "title": { + "label": 'Title', + "class": 'algn-lft' + }, + "release_year": { + "label": 'Release Year', + "class": 'algn-rht' + }, + "imdb_rating": { + "label": 'IMDb Rating', + "class": 'algn-rht' + }, + "position": { + "label": 'Position (From)', + "class": 'algn-rht', + "popularity": { + "INCREASED": 'color-green', + "DECREASED": 'color-red', + "NO CHANGE": 'color-orange' + } + } + }, + "most_popular_tv_shows": { + "image": { + "label": '', + "class": '' + }, + "title": { + "label": 'Title', + "class": 'algn-lft' + }, + "release_year": { + "label": 'Release Year', + "class": 'algn-rht' + }, + "imdb_rating": { + "label": 'IMDb Rating', + "class": 'algn-rht' + }, + "position": { + "label": 'Position (From)', + "class": 'algn-rht', + "popularity": { + "INCREASED": 'color-green', + "DECREASED": 'color-red', + "NO CHANGE": 'color-orange' + } + } + } +} + +list_table_column_keys = { + "top_rated_movies": ['image', 'title', 'rank', 'release_year', 'imdb_rating'], + "top_rated_tv_shows": ['image', 'title', 'rank', 'release_year', 'imdb_rating'], + "most_popular_movies": ['image', 'title', 'release_year', 'imdb_rating', 'position'], + "most_popular_tv_shows": ['image', 'title', 'release_year', 'imdb_rating', 'position'] +} + +imdb_base_url = "https://www.imdb.com" + +search_map = { + "top_rated_movies": "/chart/top/?ref_=nv_mv_250", + "most_popular_movies": "/chart/moviemeter/?ref_=nv_mv_mpm", + "top_rated_tv_shows": "/chart/toptv/?ref_=nv_tvv_250", + "most_popular_tv_shows": "/chart/tvmeter/?ref_=nv_tvv_mptv" +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/auth.py b/Python/RanjanPaudel/scraper/scraper_app/auth.py new file mode 100644 index 0000000..3beb1be --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/auth.py @@ -0,0 +1,47 @@ +from werkzeug.security import generate_password_hash, check_password_hash +import jwt +from jwt import InvalidTokenError +import time +import traceback + +import scraper_app.config as config + + +def generate_token(user, _type): + jwt_payload = { + "id": user['id'] + } + + if _type == 'access_token': + jwt_payload['exp'] = time.time() + float(config.JWT_ACCESS_TOKEN_LIFE) + elif _type == 'refresh_token': + jwt_payload['exp'] = time.time() + float(config.JWT_REFRESH_TOKEN_LIFE) + + try: + return 
jwt.encode(payload=jwt_payload, + key=config.JWT_SECRET, algorithm=config.JWT_ALGORITHM) + except Exception as error: + raise Exception('Error genarating token.') + + +def decode_token(token, _type): + token_data = jwt.decode(jwt=token, key=config.JWT_SECRET, + algorithms=config.JWT_ALGORITHM, verify=False) + if time.time() > token_data['exp']: + return { + "status": 'token_expired', + "payload": token_data + } + + return { + "status": 'valid', + "payload": token_data + } + + +def encrypt_password(password): + return generate_password_hash(password, method='sha256', salt_length=10) + + +def decrypt_password(hashed_password, input_password): + return check_password_hash(hashed_password, input_password) diff --git a/Python/RanjanPaudel/scraper/scraper_app/config.py b/Python/RanjanPaudel/scraper/scraper_app/config.py new file mode 100644 index 0000000..053987d --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/config.py @@ -0,0 +1,11 @@ +import os +from dotenv import load_dotenv + +load_dotenv() + +JWT_SECRET = os.getenv('JWT_SECRET') +JWT_ALGORITHM = os.getenv('JWT_ALGORITHM') +JWT_ACCESS_TOKEN_LIFE = os.getenv('JWT_ACCESS_TOKEN_LIFE') +JWT_REFRESH_TOKEN_LIFE = os.getenv('JWT_REFRESH_TOKEN_LIFE') +COOKIE_LIFE = float(os.getenv('COOKIE_LIFE')) +SESSION_SECRET = os.getenv('SESSION_SECRET') diff --git a/Python/RanjanPaudel/scraper/scraper_app/db_config.py b/Python/RanjanPaudel/scraper/scraper_app/db_config.py new file mode 100644 index 0000000..7463a53 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/db_config.py @@ -0,0 +1,30 @@ +import os +from dotenv import load_dotenv +from sqlalchemy import create_engine + +load_dotenv() + +_MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST") +_MYSQL_DB_PORT = os.getenv("MYSQL_DB_PORT") +_MYSQL_DB_USER = os.getenv("MYSQL_DB_USER") +_MYSQL_DB_PASSWORD = os.getenv("MYSQL_DB_PASSWORD") + +_MYSQL_TEST_DB_HOST = os.getenv("MYSQL_TEST_DB_HOST") +_MYSQL_TEST_DB_PORT = os.getenv("MYSQL_TEST_DB_PORT") +_MYSQL_TEST_DB_USER = os.getenv("MYSQL_TEST_DB_USER") +_MYSQL_TEST_DB_PASSWORD = os.getenv("MYSQL_TEST_DB_PASSWORD") + +app_env = os.environ['FLASK_ENV'] +engine = '' + +print('APP ENV: ', app_env) + +if app_env == 'development': + engine = create_engine( + f"mysql+mysqldb://{_MYSQL_DB_USER}:{_MYSQL_DB_PASSWORD}@{_MYSQL_DB_HOST}:{_MYSQL_DB_PORT}/scraper_app") + +if app_env == 'test': + engine = create_engine( + f"mysql+mysqldb://{_MYSQL_TEST_DB_USER}:{_MYSQL_TEST_DB_PASSWORD}@{_MYSQL_TEST_DB_HOST}:{_MYSQL_TEST_DB_PORT}/test_scraper_app") + +MYSQL_DB_ENGINE = 'InnoDB' diff --git a/Python/RanjanPaudel/scraper/scraper_app/db_migrator.py b/Python/RanjanPaudel/scraper/scraper_app/db_migrator.py new file mode 100644 index 0000000..b109424 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/db_migrator.py @@ -0,0 +1,57 @@ +import os +import sys +import argparse + +new_sys_path = os.path.abspath('./') +sys.path.append(new_sys_path) + +parser = argparse.ArgumentParser(description='MySQL migrator', + formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument("--env", + help='Choose the environment.', + choices=("development", "test"), + required=True) +parser.add_argument("--action", + help='To create/drop all tables', + choices=("create", "drop"), + required=True) + + +def main(arg_dict): + import scraper_app.db_config as db_config + import scraper_app.db_models as db_models + + if arg_dict['action'] == 'create': + db_models.meta_data.create_all(db_config.engine) + + with db_models.conn.begin(): + inital_insert = 
db_models.tables['movie_list_meta'].insert().values([ + {"list_name": 'top_rated_movies'}, + {"list_name": 'top_rated_tv_shows'}, + {"list_name": 'most_popular_movies'}, + {"list_name": 'most_popular_tv_shows'} + ]) + db_models.conn.execute(inital_insert) + + if arg_dict['env'] == 'test': # Create a test user if the environment is 'test' + create_test_user = db_models.tables['users'].insert().values([ + { + 'full_name': 'Test User', + 'username': 'testuser', + 'dob': '2000-01-01', + 'password': 'sha256$e5S53XFBq6$9f6111fd4e448fffe3f2d6ea84dce01d59f6b26de925a39ec7134f51397bb6c8' + } + ]) + db_models.conn.execute(create_test_user) + + if arg_dict['action'] == 'drop': + db_models.meta_data.drop_all(db_config.engine) + + +if __name__ == "__main__": + opts = parser.parse_args() + dict_opts = opts.__dict__ + + os.environ['FLASK_ENV'] = str(dict_opts['env']) + + main(dict_opts) diff --git a/Python/RanjanPaudel/scraper/scraper_app/db_models.py b/Python/RanjanPaudel/scraper/scraper_app/db_models.py new file mode 100644 index 0000000..9d07477 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/db_models.py @@ -0,0 +1,236 @@ +from sqlalchemy import ( + Table, + Column, + MetaData, + UniqueConstraint, + ForeignKey, + text +) +from sqlalchemy.dialects.mysql import ( + INTEGER, + VARCHAR, + FLOAT, + ENUM, + DATE, + TIMESTAMP, + TEXT +) +from sqlalchemy.engine import Connection +from sqlalchemy.sql import Select, Delete, Update +from datetime import datetime + +import scraper_app.db_config as db_config + +meta_data = MetaData() + +users = Table('users', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('full_name', VARCHAR(100), nullable=False), + Column('username', VARCHAR(100), + unique=True, nullable=False), + Column('dob', DATE, nullable=False), + Column('password', VARCHAR(255), nullable=False), + Column('created_at', TIMESTAMP(), + nullable=False, server_default=text('CURRENT_TIMESTAMP')), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +refresh_tokens = Table('refresh_tokens', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('user_id', INTEGER(), ForeignKey( + 'users.id'), nullable=False), + Column('refresh_token', TEXT, nullable=False), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +top_rated_movies = Table('top_rated_movies', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('rank', INTEGER(), unique=True), + Column('title', VARCHAR(255)), + Column('release_year', INTEGER()), + Column('imdb_rating', FLOAT(2)), + Column('img_link', VARCHAR(255)), + Column('imdb_link', VARCHAR(255)), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +top_rated_tv_shows = Table('top_rated_tv_shows', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('rank', INTEGER(), unique=True), + Column('title', VARCHAR(255)), + Column('release_year', INTEGER()), + Column('imdb_rating', FLOAT(2)), + Column('img_link', VARCHAR(255)), + Column('imdb_link', VARCHAR(255)), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +most_popular_movies = Table('most_popular_movies', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('title', VARCHAR(255)), + Column('release_year', INTEGER()), + Column('imdb_rating', VARCHAR(255)), + Column('position', INTEGER()), + Column('pre_position', INTEGER()), + Column('popularity', + ENUM("INCREASED", "DECREASED", "NO CHANGE")), + Column('img_link', VARCHAR(255)), + Column('imdb_link', VARCHAR(255)), + UniqueConstraint( + "title", "release_year", "position"), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +most_popular_tv_shows = 
Table('most_popular_tv_shows', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('title', VARCHAR(255)), + Column('release_year', INTEGER()), + Column('imdb_rating', VARCHAR(255)), + Column('position', INTEGER()), + Column('pre_position', INTEGER()), + Column('popularity', + ENUM("INCREASED", "DECREASED", "NO CHANGE")), + Column('img_link', VARCHAR(255)), + Column('imdb_link', VARCHAR(255)), + UniqueConstraint( + "title", "release_year", "position"), + mysql_engine=db_config.MYSQL_DB_ENGINE) + +movie_list_meta = Table('movie_list_meta', meta_data, + Column('id', INTEGER(), primary_key=True), + Column('list_name', ENUM("top_rated_movies", "top_rated_tv_shows", + "most_popular_movies", "most_popular_tv_shows"), unique=True, nullable=False), + Column('last_updated', TIMESTAMP()), + mysql_engine=db_config.MYSQL_DB_ENGINE) + + +conn = Connection(db_config.engine) +tables = meta_data.tables + + +def create_user(user_data): + result = '' + with conn.begin(): + user_stmt = users.insert().values(user_data) + result = conn.execute(user_stmt) + + return result + + +def find_user_by_username(username): + result = [] + with conn.begin(): + user_stmt = Select([users]).where(users.c.username == username) + for row in conn.execute(user_stmt): + result.append(dict(row)) + + return result + + +def find_user_by_id(user_id): + result = [] + with conn.begin(): + user_stmt = Select([users]).where(users.c.id == user_id) + for row in conn.execute(user_stmt): + result.append(dict(row)) + + return result + + +def check_user_existance(username): + result = find_user_by_username(username) + if len(result) > 0: + return True + + return False + + +def create_token(user, refresh_token): + new_token = { + "user_id": user['id'], + "refresh_token": refresh_token + } + + result = '' + with conn.begin(): + token_stmt = refresh_tokens.insert().values(new_token) + result = conn.execute(token_stmt) + + return result + + +def find_token(user): + result = [] + with conn.begin(): + token_stmt = Select([refresh_tokens]).where( + refresh_tokens.c.user_id == user['id']) + for row in conn.execute(token_stmt): + result.append(dict(row)) + + return result + + +def update_token(user, old_token, new_token): + result = '' + with conn.begin(): + token_stmt = refresh_tokens.update().where(refresh_tokens.c.user_id == user['id']).where( + refresh_tokens.c.refresh_token == old_token).values( + refresh_token=new_token) + result = conn.execute(token_stmt) + + return result + + +def remove_token(user): + result = '' + with conn.begin(): + token_stmt = refresh_tokens.delete().where( + refresh_tokens.c.user_id == user['id']) + result = conn.execute(token_stmt) + + return result + + +def create_movie_list(table_name, movie_list): + result = '' + with conn.begin(): + list_stmt = tables[table_name].insert().values(movie_list) + result = conn.execute(list_stmt) + + return result + + +def update_movie_list(table_name, movie_list): + result = '' + with conn.begin(): + remove_all_stmt = tables[table_name].delete() + conn.execute(remove_all_stmt) + insert_many_stmt = tables[table_name].insert().values(movie_list) + result = conn.execute(insert_many_stmt) + + return result + + +def update_movie_meta(list_name): + result = '' + with conn.begin(): + update_stmt = movie_list_meta.update().where(movie_list_meta.c.list_name == + list_name).values(last_updated=datetime.utcnow()) + result = conn.execute(update_stmt) + + return result + + +def find_movie_meta(list_name): + result = [] + with conn.begin(): + stmt = 
movie_list_meta.select().where(movie_list_meta.c.list_name == list_name) + for row in conn.execute(stmt): + result.append(dict(row)) + + return result[0] + + +def find_all_movies(table_name): + result = [] + with conn.begin(): + list_stmt = tables[table_name].select() + for row in conn.execute(list_stmt): + result.append(dict(row)) + + return result diff --git a/Python/RanjanPaudel/scraper/scraper_app/errors.py b/Python/RanjanPaudel/scraper/scraper_app/errors.py new file mode 100644 index 0000000..2ba08c9 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/errors.py @@ -0,0 +1,32 @@ +import traceback + + +class BadRequest(Exception): + def __init__(self, key=None, message=None): + super().__init__({ + "code": 400, + "message": message, + f"{key}": message + }) + + +class InternalError(Exception): + def __init__(self, key=None, message=None): + super().__init__({ + "code": 500, + "message": message, + f"{key}": message + }) + + +class NotFound(Exception): + def __init__(self, key=None, message=None): + super().__init__({ + "code": 404, + "message": message, + f"{key}": message + }) + + +def print_traceback(error): + traceback.print_exception(type(error), error, error.__traceback__) diff --git a/Python/RanjanPaudel/scraper/scraper_app/scraper.py b/Python/RanjanPaudel/scraper/scraper_app/scraper.py new file mode 100644 index 0000000..04dfb1b --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/scraper.py @@ -0,0 +1,114 @@ +import re +from bs4 import BeautifulSoup + +imdb_base_url = "https://www.imdb.com" + +search_map = { + "top_rated_movies": "/chart/top/?ref_=nv_mv_250", + "most_popular_movies": "/chart/moviemeter/?ref_=nv_mv_mpm", + "top_rated_tv_shows": "/chart/toptv/?ref_=nv_tvv_250", + "most_popular_tv_shows": "/chart/tvmeter/?ref_=nv_tvv_mptv" +} + + +def get_top_rated_list(table_rows): + movie_list = [] + for tr in table_rows: + title_column = tr.find('td', {'class': 'titleColumn'}) + rank_column = tr.find('td', {'class': 'ratingColumn imdbRating'}) + if not (title_column and rank_column): + return + + title_texts = title_column.get_text( + separator='_//\\_', strip=True).split('_//\\_') + + rank = title_texts[0].replace('.', '') + title = title_texts[1] + year = re.sub(r"^\(|\)$", "", title_texts[2]) + rating_text = rank_column.get_text(strip=True) + movie_link = title_column.find('a') + movie_img = tr.find('img', {'alt': f'{title}'}) + + movie_list.append({ + "rank": rank, + "title": title, + "release_year": year, + "imdb_rating": rating_text, + "imdb_link": f"{imdb_base_url}{movie_link['href']}", + "img_link": movie_img['src'] + }) + + return movie_list + + +def get_most_popular_list(table_rows): + movie_list = [] + for tr in table_rows: + title_column = tr.find('td', {'class': 'titleColumn'}) + rank_column = tr.find('td', {'class': 'ratingColumn imdbRating'}) + if not (title_column and rank_column): + return + + title_texts = title_column.get_text( + separator='_//\\_', strip=True).replace('\n', '_//\\_').split('_//\\_') + + title = title_texts[0] + year = re.sub(r"^\(|\)$", "", title_texts[1]) + position = title_texts[2].replace(',', '') + pre_position = re.sub( + r"^\(|\)$", "", title_texts[len(title_texts) - 1].strip()).replace(',', '') + rating_text = rank_column.get_text(strip=True) or 'NOT RATED' + popularity = '' + + if pre_position == 'no change': + popularity = pre_position.upper() + pre_position = position + elif int(position) < int(pre_position): + popularity = 'INCREASED' + else: + popularity = 'DECREASED' + movie_link = title_column.find('a') + 
movie_img = tr.find('img', {'alt': f'{title}'}) + + movie_list.append({ + "title": title, + "release_year": year, + "imdb_rating": rating_text, + "position": position, + "pre_position": pre_position, + "popularity": popularity, + "imdb_link": f"{imdb_base_url}{movie_link['href']}", + "img_link": movie_img['src'] + }) + + return movie_list + + +def get_movie_list(content, _type): + soup = BeautifulSoup(content, 'html.parser') + + if not soup: + raise Exception(f'Could not generate the soup for {_type} list!') + + table_body = soup.find('tbody', {'class': 'lister-list'}) + if not table_body: + raise Exception( + f'Listing table not found in given URL: {imdb_base_url + search_map[_type]}.') + + table_rows = table_body.find_all('tr') + if not table_rows or len(table_rows) < 1: + raise Exception(f'No table rows found in table lister-list.') + + movie_list = '' + try: + if _type == 'top_rated_movies' or _type == 'top_rated_tv_shows': + movie_list = get_top_rated_list(table_rows) + elif _type == 'most_popular_movies' or _type == 'most_popular_tv_shows': + movie_list = get_most_popular_list(table_rows) + except Exception as error: + raise Exception(f'Error genarating {_type} list: ', error) + + if not movie_list or len(movie_list) < 1: + raise Exception(f'Could not prepare list for {_type}!') + + return movie_list diff --git a/Python/RanjanPaudel/scraper/scraper_app/services.py b/Python/RanjanPaudel/scraper/scraper_app/services.py new file mode 100644 index 0000000..b0d1242 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/services.py @@ -0,0 +1,218 @@ +from sqlalchemy.exc import DBAPIError +import traceback +import requests +from requests import RequestException +from datetime import datetime +from jwt import InvalidTokenError + +import scraper_app.db_models as db_models +import scraper_app.errors as errors +import scraper_app.auth as auth +import scraper_app.scraper as scraper +from scraper_app.app_constants import imdb_base_url, search_map + + +def create_new_user(user_data): + user_exists = db_models.check_user_existance( + user_data['username']) + + if user_exists: + raise errors.BadRequest( + key='username', message='Account for the user already exists!') + + try: + new_user = map_to_users_db_model(user_data) + result = db_models.create_user(new_user) + return result + except DBAPIError as error: + errors.print_traceback(error) + raise errors.InternalError( + key='other', message='Could not sign in now!') + + +def log_in_user(user_data): + user = [] + try: + user = db_models.find_user_by_username(user_data['username']) + except (DBAPIError, Exception) as error: + errors.print_traceback(error) + raise errors.InternalError( + key='other', message='Could not log in now!') + + if len(user) != 1: + raise errors.NotFound( + key='other', message='Account does not exist!') + + is_password_valid = auth.decrypt_password( + user[0]['password'], user_data['password']) + if not is_password_valid: + raise errors.BadRequest( + key='other', message='Username and password do not match') + + try: + access_token = auth.generate_token(user[0], 'access_token') + refresh_token = auth.generate_token(user[0], 'refresh_token') + + existing_token = db_models.find_token(user[0]) + if existing_token and len(existing_token) > 0: + db_models.remove_token(user[0]) + + db_models.create_token(user[0], refresh_token) + + return { + "access_token": access_token, + "refresh_token": refresh_token + } + except DBAPIError as db_error: + errors.print_traceback(db_error) + raise errors.InternalError( + key='other', 
message='Could not log in now!') + except Exception as error: + errors.print_traceback(error) + raise errors.InternalError( + key='other', message='Could not log in now!') + + +def log_out_user(user_data): + try: + db_models.remove_token(user_data) + except DBAPIError as error: + errors.print_traceback(error) + + +def get_token_info(cookies, _type): + if _type not in cookies: + return { + "status": 'token_invalid' + } + + try: + return auth.decode_token( + cookies[_type], _type) + except InvalidTokenError as token_error: + errors.print_traceback(token_error) + return { + "status": 'token_invalid' + } + + +def authenticate_user(cookies): + access_token_info = get_token_info(cookies, 'access_token') + + if access_token_info['status'] == 'token_invalid': + return access_token_info['status'] + + try: + user = db_models.find_user_by_id(access_token_info['payload']['id']) + if len(user) != 1: + return 'token_invalid' + + token_status = 'valid' + if access_token_info['status'] == 'token_expired': + token_status = 'token_expired' + + return { + "status": token_status, + "user": user[0] + } + except DBAPIError as db_error: + errors.print_traceback(db_error) + return 'token_invalid' + + +def refresh_tokens(cookies): + access_token_info = get_token_info(cookies, 'access_token') + if access_token_info['status'] == 'token_invalid': + return access_token_info['status'] + + refresh_token_info = get_token_info(cookies, 'refresh_token') + if refresh_token_info['status'] == 'token_invalid': + return refresh_token_info['status'] + + if access_token_info['payload']['id'] != refresh_token_info['payload']['id']: + return 'token_invalid' + + try: + user = db_models.find_user_by_id(refresh_token_info['payload']['id']) + if len(user) != 1: + return 'token_invalid' + + user_refresh_token = db_models.find_token(user[0]) + if not (len(user_refresh_token) == 1 + and user_refresh_token[0]['refresh_token'] == cookies['refresh_token']): + return 'token_invalid' + + if refresh_token_info['status'] == 'token_expired': + db_models.remove_token(user[0]) + return refresh_token_info['status'] + + new_refresh_token = auth.generate_token(user[0], 'refresh_token') + db_models.update_token( + user[0], cookies['refresh_token'], new_refresh_token) + new_access_token = auth.generate_token(user[0], 'access_token') + + return { + "access_token": new_access_token, + "refresh_token": new_refresh_token + } + except DBAPIError as db_error: + errors.print_traceback(db_error) + return 'token_refresh_error' + except Exception as error: + errors.print_traceback(error) + return 'token_refresh_error' + + +def scrape(list_name): + search_url = imdb_base_url + search_map[list_name] + try: + response_content = '' + with requests.get(url=search_url) as req: + response_content = req.content + + movie_list = scraper.get_movie_list(response_content, list_name) + db_models.update_movie_list(list_name, movie_list) + db_models.update_movie_meta(list_name) + except (RequestException, DBAPIError) as error: + errors.print_traceback(error) + raise Exception + except Exception as other_error: + errors.print_traceback(other_error) + raise Exception + + +def get_scraped_list(list_name): + try: + movie_list = db_models.find_all_movies(list_name) + movie_meta = db_models.find_movie_meta(list_name) + last_updated = '' + if movie_meta['last_updated']: + last_updated = datetime.isoformat(movie_meta['last_updated']) + + return { + "movie_list": movie_list, + "last_updated": last_updated + } + except DBAPIError as error: + errors.print_traceback(error) + return { + 
"movie_list": [], + "last_updated": '' + } + + +def map_to_users_db_model(user_data): + return { + "full_name": user_data['full_name'], + "username": user_data['username'], + "dob": user_data['dob'], + "password": auth.encrypt_password(user_data['confirm_password']) + } + + +def map_db_to_user(user_data): # To exclude password + return { + "full_name": user_data['full_name'], + "username": user_data['username'], + "dob": user_data['dob'] + } diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/button.css b/Python/RanjanPaudel/scraper/scraper_app/static/button.css new file mode 100644 index 0000000..97294ab --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/button.css @@ -0,0 +1,54 @@ +.primary-button { + padding: 8px; + text-transform: uppercase; + border: unset; + border-radius: 4px; + background-color: #0d87b8; + color: white; + box-shadow: 2px 2px 2px grey +} + +.primary-button:focus { + outline: unset; +} + +.primary-button:hover { + cursor: pointer; + background-color: #0b6c92; +} + +.secondary-button { + padding: 8px; + text-transform: uppercase; + border-radius: 4px; + border: 1px solid #0d87b8; + background-color: white; + color: #0d87b8; + box-shadow: 2px 2px 2px grey +} + +.secondary-button:focus { + outline: unset; +} + +.secondary-button:hover { + cursor: pointer; + background-color: #f1f1f1; +} + +.small-button { + font-size: 0.75em; +} + +.disabled-button { + border: 1px solid #f1f1f1; + background-color: #f1f1f1; + color: white; +} + +.disabled-button:hover { + cursor: not-allowed; + border: 1px solid #f1f1f1; + background-color: #f1f1f1; + color: white; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/common.css b/Python/RanjanPaudel/scraper/scraper_app/static/common.css new file mode 100644 index 0000000..8d7373d --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/common.css @@ -0,0 +1,151 @@ +ul { + list-style-type: none; +} + +table { + border-spacing: 0; +} + +.container { + padding: 8px; + width: calc(100% - 16px); +} + +.row { + width: 100%; + display: flex; + flex-direction: row; +} + +.col { + display: flex; + flex-direction: column; + padding: 8px; +} + +.col-3 { + width: 33.33%; +} + +.col-9 { + width: 66.67%; +} + +.paper-class { + box-shadow: 0px 3px 6px grey, 0px 1px 6px grey; + border-radius: 8px; +} + +.card-class { + box-shadow: 0px 0px 2px grey; +} + +.hori-center { + display: block; + margin: 0px auto; +} + +.notification-container { + position: sticky; + top: 0; + left: 10%; + z-index: 8888; + width: 80%; + text-align: center; +} + +.notification { + font-size: 0.75em; + padding: 8px; + margin: 4px; + border: 1px solid #00000020; + border-radius: 4px; + position: relative; +} + +.notification .close-alert-button { + position: absolute; + top: calc(50% - 0.75em); + right: 0; + font-size: 1.5em; + height: 1.5em; + width: 1.5em; + background-color: transparent; + border: none; + color: inherit; + border-radius: 50%; +} + +.notification .close-alert-button:hover { + cursor: pointer; + background-color: #00000020; +} + +.notification .close-alert-button:focus { + outline: none; +} + +.success-notification { + background-color: #c1f9b7; + color: #005302; +} + +.error-notification { + background-color: #f9b7b7; + color: #7c0000 +} + +.warn-notification { + background-color: #f9e5b7; + color: #836d00; +} + +.info-notification { + background-color: #b7cbf9; + color: #002679 +} + +.m-8 { + margin: 8px; +} + +.m-l-20 { + margin-left: 20px; +} + +.m-b-20 { + margin-bottom: 20px; +} + +.m-b-10 { + margin-bottom: 
10px; +} + +.p-4 { + padding: 4px; +} + +.p-16 { + padding: 16px; +} + +.p-l-r-10 { + padding-left: 10px; + padding-right: 10px; +} + +.app-white { + color: #f7e6dd; +} + +.sticky-box { + position: sticky; + top: 8px; + left: 0; + z-index: 100; +} + +.caption-text { + font-size: 0.6em; + color: #5c5c5c +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/favicon.ico b/Python/RanjanPaudel/scraper/scraper_app/static/favicon.ico new file mode 100644 index 0000000..fbe6737 Binary files /dev/null and b/Python/RanjanPaudel/scraper/scraper_app/static/favicon.ico differ diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/form.css b/Python/RanjanPaudel/scraper/scraper_app/static/form.css new file mode 100644 index 0000000..1e38954 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/form.css @@ -0,0 +1,35 @@ +.form-container { + width: 30%; + min-width: 300px; + display: flex; + margin: 4px auto; + text-align: center; +} + +.form-fieldset { + padding: 8px; + border: unset; + text-align: left; +} + +.text-fields { + width: calc(100% - 16px); + padding: 8px; + border-radius: 4px; + display: block; + border: 1px solid #0d87b8; +} + +.text-fields:focus { + outline: unset; + box-shadow: 0px 0px 3px inset #0d87b8; +} + +.form-label { + font-size: 0.75em; +} + +.form-error-message { + font-size: 0.6em; + color: #f66255; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/home.css b/Python/RanjanPaudel/scraper/scraper_app/static/home.css new file mode 100644 index 0000000..fee5b86 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/home.css @@ -0,0 +1,53 @@ +.tab-link { + display: block; + text-decoration: unset; + color: #0d87b8; + border-bottom: 1px solid #afafaf; +} + +.tab-link:hover { + color: white; + background-color: #0d87b8; +} + +.tab-link-selected { + color: white; + background-color: #0d87b8; +} + +.table-header-cell { + background-color: #e9e9e9; + color: #555555 +} + +.table-content-cell { + font-size: 0.75em; +} + +.alter-color-tr:nth-child(even) { + background-color: #f3f3f3 +} + +.algn-lft { + text-align: left; +} + +.algn-rht { + text-align: right; +} + +.line-ht-0 { + line-height: 0; +} + +.color-green { + color: green; +} + +.color-red { + color: red; +} + +.color-orange { + color: orange; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/imdb.svg b/Python/RanjanPaudel/scraper/scraper_app/static/imdb.svg new file mode 100644 index 0000000..8aeb634 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/imdb.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/layout.css b/Python/RanjanPaudel/scraper/scraper_app/static/layout.css new file mode 100644 index 0000000..95d50ba --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/layout.css @@ -0,0 +1,8 @@ +* { + margin: 0px; + padding: 0px; +} + +body { + font-family: Arial, Helvetica, sans-serif; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/link.css b/Python/RanjanPaudel/scraper/scraper_app/static/link.css new file mode 100644 index 0000000..e666221 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/link.css @@ -0,0 +1,16 @@ +.custom-link { + color: #0d87b8; + font-size: 0.75em; + text-decoration: unset; +} + +.custom-link:hover { + color: #0b6c92; + text-decoration: underline; +} + +.button-link { + display: block; + color: unset; + text-decoration: unset; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/main.css 
b/Python/RanjanPaudel/scraper/scraper_app/static/main.css new file mode 100644 index 0000000..6a55ced --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/main.css @@ -0,0 +1,33 @@ +#appHeader { + background-color: #0d87b8; + padding: 8px; + display: flex; + flex-direction: row; + align-items: center; + justify-content: space-between +} + +#logoContainer, +#userContainer { + display: flex; + flex-direction: row; + align-items: center; +} + +.app-logo { + width: 50px; + height: 50px; +} + +#mainContainer { + min-height: 100vh; +} + +#appFooter { + background-color: #868686; + padding: 8px; + text-align: center; + color: white; + font-weight: bold; + font-size: 0.75em; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/movie_alt.png b/Python/RanjanPaudel/scraper/scraper_app/static/movie_alt.png new file mode 100644 index 0000000..8dcb9db Binary files /dev/null and b/Python/RanjanPaudel/scraper/scraper_app/static/movie_alt.png differ diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/power.svg b/Python/RanjanPaudel/scraper/scraper_app/static/power.svg new file mode 100644 index 0000000..234b15f --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/power.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/scraper_app_logo.png b/Python/RanjanPaudel/scraper/scraper_app/static/scraper_app_logo.png new file mode 100644 index 0000000..acaab69 Binary files /dev/null and b/Python/RanjanPaudel/scraper/scraper_app/static/scraper_app_logo.png differ diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/user-solid.svg b/Python/RanjanPaudel/scraper/scraper_app/static/user-solid.svg new file mode 100644 index 0000000..da11f5c --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/user-solid.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/Python/RanjanPaudel/scraper/scraper_app/static/user_logo.css b/Python/RanjanPaudel/scraper/scraper_app/static/user_logo.css new file mode 100644 index 0000000..0d6551b --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/static/user_logo.css @@ -0,0 +1,22 @@ +.app-user-icon { + width: 32px; + height: 32px; +} + +.logout-button { + width: 32px; + height: 32px; + padding: 4px; + background-color: #c04c4c; + border-radius: 50%; +} + +.logout-button:focus { + outline: unset; +} + +.logout-button:hover { + cursor: pointer; + background-color: #b33e3e; + box-shadow: 2px 2px 2px; +} diff --git a/Python/RanjanPaudel/scraper/scraper_app/templates/home.html b/Python/RanjanPaudel/scraper/scraper_app/templates/home.html new file mode 100644 index 0000000..49d5e26 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/templates/home.html @@ -0,0 +1,198 @@ +{% extends "index.html" %} +{% if user_is_logged_in and user %} +{% block page_title %} +Scraper - Home +{% endblock %} + +{% block style_content %} + + + + +{% endblock %} + +{% block header_content %} +
+

+ {{user['full_name']}} +

+
+ logout +
+ + logout + +
+{% endblock %} + +{% block content %} +
+
+
+
+ +
+
+ {% if not selected_tab %} +

+ Select a category! +

+
+ {% endif %} + {% if selected_tab %} +
+

+ IMDb's {{scraped_list['movie_list']|length}} {{tab_label_map[selected_tab]}} +

+ {% if scraped_list['movie_list'] and scraped_list['movie_list']|length %} + + {% endif %} + {% if not scraped_list['movie_list'] or not scraped_list['movie_list']|length %} + + {% endif %} + + +
+
+ + + + {% for column in list_table_column_keys[selected_tab] %} + + {% endfor %} + + + {% if selected_tab == 'top_rated_movies' or selected_tab == 'top_rated_tv_shows' %} + + {% for row_item in scraped_list['movie_list'] %} + + + + + + + + {% endfor %} + + {% endif %} + {% if selected_tab == 'most_popular_movies' or selected_tab == 'most_popular_tv_shows' %} + + {% for row_item in scraped_list['movie_list'] %} + + + + + + + + {% endfor %} + + {% endif %} +
+ {{list_table_columns[selected_tab][column]['label']}} +
+ {{row_item['title']}} + + {{row_item['title']}} + + {{row_item['rank']}} + + {{row_item['release_year']}} + + {{row_item['imdb_rating']}} +
+ {{row_item['title']}} + + {{row_item['title']}} + + {{row_item['release_year']}} + + {{row_item['imdb_rating']}} + + + {{row_item['position']}} + + + ({{row_item['pre_position']}}) + +
+ {% endif %} +
+
+
+
+{% endblock %} + +{% block footer_content %} +
+
+ Visit  + + + imdb + + +  for more. +
+
+{% endblock %} +{% endif %} + +{% block script_content %} + +{% endblock %} diff --git a/Python/RanjanPaudel/scraper/scraper_app/templates/index.html b/Python/RanjanPaudel/scraper/scraper_app/templates/index.html new file mode 100644 index 0000000..8cddf71 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/templates/index.html @@ -0,0 +1,64 @@ + + + + + + + {% block page_title %}{% endblock %} + + + + {% block style_content %}{% endblock %} + + + + +
+
+ +

+ Scraper App +

+
+ {% block header_content %}{% endblock %} +
+
+ {% with messages=get_flashed_messages(with_categories=true) %} + {% if messages %} +
+ {% for category, message in messages %} + + {% endfor %} +
+ {% endif %} + {% endwith %} + + {% block content %}{% endblock %} +
+ {% block footer_content %} + {% endblock %} + {% with messages=get_flashed_messages(with_categories=true) %} + {% if messages %} + + {% endif %} + {% endwith %} + {% block script_content %} + {% endblock %} + + + diff --git a/Python/RanjanPaudel/scraper/scraper_app/templates/login.html b/Python/RanjanPaudel/scraper/scraper_app/templates/login.html new file mode 100644 index 0000000..8dee55b --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/templates/login.html @@ -0,0 +1,58 @@ +{% extends "index.html" %} +{% block page_title %} +Scraper - Log In +{% endblock %} +{% block style_content %} + + + +{% endblock %} +{% block content %} +
+
+

+ Log In +

+
+
+ + + {% if error and error['username'] %} + {{ error['username'] }} + {% endif %} +
+
+ + + {% if error and error['password'] %} + {{ error['password'] }} + {% endif %} +
+ {% if error and error['other'] %} + {{ error['other'] }} + {% endif %} + +
+
+ Don't have an account? Sign in here. +
+
+
+{% endblock %} +{% block script_content %} + +{% endblock %} diff --git a/Python/RanjanPaudel/scraper/scraper_app/templates/result.html b/Python/RanjanPaudel/scraper/scraper_app/templates/result.html new file mode 100644 index 0000000..c2016e1 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/templates/result.html @@ -0,0 +1,13 @@ +{% if signin_successful %} +
+
+
+

+ Sign In Successful! +

+ Go to Log in + page. +
+
+
+{% endif %} diff --git a/Python/RanjanPaudel/scraper/scraper_app/templates/signin.html b/Python/RanjanPaudel/scraper/scraper_app/templates/signin.html new file mode 100644 index 0000000..130ec72 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/templates/signin.html @@ -0,0 +1,90 @@ +{% extends "index.html" %} +{% block page_title %} +Scraper - Sign In +{% endblock %} +{% block style_content %} + + + +{% endblock %} +{% block content %} +
+
+

+ Sign In +

+
+
+ + + {% if error and error['full_name'] %} + {{ error['full_name'] }} + {% endif %} +
+
+ + + {% if error and error['dob'] %} + {{ error['dob'] }} + {% endif %} +
+
+ + + {% if error and error['username'] %} + {{ error['username'] }} + {% endif %} +
+
+ + + {% if error and error['new_password'] %} + {{ error['new_password'] }} + {% endif %} +
+
+ + + {% if error and error['confirm_password'] %} + {{ error['confirm_password'] }} + {% endif %} +
+ + {% if error and error['other'] %} + {{ error['other'] }} + {% endif %} +
+
+ Already have an account? Log in here. +
+
+
+{% endblock %} +{% block script_content %} + +{% endblock %} diff --git a/Python/RanjanPaudel/scraper/scraper_app/validators.py b/Python/RanjanPaudel/scraper/scraper_app/validators.py new file mode 100644 index 0000000..830ce09 --- /dev/null +++ b/Python/RanjanPaudel/scraper/scraper_app/validators.py @@ -0,0 +1,71 @@ +import datetime + + +def validate_signin_form(signin_data): + if len(signin_data['full_name']) < 3: + return { + "has_error": True, + "error": { + "full_name": "Full name length should be 3 to 100 characters" + } + } + + try: + datetime.datetime.strptime(signin_data['dob'], '%Y-%m-%d') + except ValueError: + return { + "has_error": True, + "error": { + "dob": "Date format should be YYYY-MM-DD" + } + } + + if len(signin_data['username']) < 6 or len(signin_data['username']) > 100: + return { + "has_error": True, + "error": { + "username": "Username length should be 6 to 100 characters" + } + } + + if len(signin_data['new_password']) < 8: + return { + "has_error": True, + "error": { + "new_password": "Password length should be at least 8 characters" + } + } + + if signin_data['new_password'] != signin_data['confirm_password']: + return { + "has_error": True, + "error": { + "confirm_password": "Passwords do not match" + } + } + + return { + "has_error": False + } + + +def validate_login_form(login_data): + if len(login_data['username']) < 1: + return { + "has_error": True, + "error": { + "username": "Username is required" + } + } + + if len(login_data['password']) < 1: + return { + "has_error": True, + "error": { + "password": "Password is required" + } + } + + return { + "has_error": False + } diff --git a/Python/RanjanPaudel/scraper/tests/__init__.py b/Python/RanjanPaudel/scraper/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Python/RanjanPaudel/scraper/tests/test_app.py b/Python/RanjanPaudel/scraper/tests/test_app.py new file mode 100644 index 0000000..d924465 --- /dev/null +++ b/Python/RanjanPaudel/scraper/tests/test_app.py @@ -0,0 +1,351 @@ +import os +import sys +import pytest +from bs4 import BeautifulSoup + +os.environ['FLASK_ENV'] = 'test' + + +@pytest.fixture +def client(): + from scraper_app import app + + app.app.config['TESTING'] = True + + with app.app.test_client(use_cookies=True) as client: + yield client + + +def test_empty_path_redirection_to_home(client): + res = client.get('/') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /home. If not click the link.' + + print('\n1. App redirects to Home or suggests to click the link "/home" for path "/".') + + +def test_login_page_when_not_logged_in(client): + res = client.get('/login') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + + assert title == 'Scraper - Log In' + + print('\n2. App renders Login page for route "/login" when user is not logged in.') + + +def test_signin_page_when_not_logged_in(client): + res = client.get('/signin') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + + assert title == 'Scraper - Sign In' + + print('\n3. 
App renders Signin page for route "/signin" when user is not logged in.') + + +def test_home_page_when_not_logged_in(client): + res = client.get('/home') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n4. App redirects to Login page or suggests to click the link "/login" for route "/home" if the user is not logged in.') + + +def test_top_rated_movies_page_when_not_logged_in(client): + res = client.get('/home/top_rated_movies') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n5. App redirects to Login page or suggests to click the link "/login" for route "/home/top_rated_movies" if the user is not logged in.') + + +def test_top_rated_tv_shows_page_when_not_logged_in(client): + res = client.get('/home/top_rated_tv_shows') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n6. App redirects to Login page or suggests to click the link "/login" for route "/home/top_rated_tv_shows" if the user is not logged in.') + + +def test_most_popular_movies_page_when_not_logged_in(client): + res = client.get('/home/most_popular_movies') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n7. App redirects to Login page or suggests to click the link "/login" for route "/home/most_popular_movies" if the user is not logged in.') + + +def test_most_popular_tv_shows_page_when_not_logged_in(client): + res = client.get('/home/most_popular_tv_shows') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n8. App redirects to Login page or suggests to click the link "/login" for route "/home/most_popular_tv_shows" if the user is not logged in.') + + +def test_scrape_top_rated_movies_when_not_logged_in(client): + res = client.get('/scrape/top_rated_movies') + + soup = BeautifulSoup(res.data, 'html.parser') + title = soup.find(name='title').get_text() + redirection_text = soup.find(name='p').get_text() + + assert title == 'Redirecting...' + assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.' + + print('\n9. 
+def test_scrape_top_rated_tv_shows_when_not_logged_in(client):
+    res = client.get('/scrape/top_rated_tv_shows')
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    title = soup.find(name='title').get_text()
+    redirection_text = soup.find(name='p').get_text()
+
+    assert title == 'Redirecting...'
+    assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.'
+
+    print('\n10. App redirects to the Login page (or suggests clicking the "/login" link) when the user tries to scrape top_rated_tv_shows while not logged in.')
+
+
+def test_scrape_most_popular_movies_when_not_logged_in(client):
+    res = client.get('/scrape/most_popular_movies')
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    title = soup.find(name='title').get_text()
+    redirection_text = soup.find(name='p').get_text()
+
+    assert title == 'Redirecting...'
+    assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.'
+
+    print('\n11. App redirects to the Login page (or suggests clicking the "/login" link) when the user tries to scrape most_popular_movies while not logged in.')
+
+
+def test_scrape_most_popular_tv_shows_when_not_logged_in(client):
+    res = client.get('/scrape/most_popular_tv_shows')
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    title = soup.find(name='title').get_text()
+    redirection_text = soup.find(name='p').get_text()
+
+    assert title == 'Redirecting...'
+    assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.'
+
+    print('\n12. App redirects to the Login page (or suggests clicking the "/login" link) when the user tries to scrape most_popular_tv_shows while not logged in.')
+
+
+def test_logout_when_not_logged_in(client):
+    res = client.get('/logout')
+    soup = BeautifulSoup(res.data, 'html.parser')
+    title = soup.find(name='title').get_text()
+    redirection_text = soup.find(name='p').get_text()
+
+    assert title == 'Redirecting...'
+    assert redirection_text == 'You should be redirected automatically to target URL: /login. If not click the link.'
+
+    print('\n13. App redirects to the Login page (or suggests clicking the "/login" link) for route "/logout" if the user is not logged in.')
+
+
+def test_signin(client):
+    print('\n14. Signing in:')
+    # **************************************************
+    res = client.post('/signin', data={
+        'full_name': 'T',
+        'dob': '2000-03-21',
+        'username': 'testuser1',
+        'new_password': 'thisistest1',
+        'confirm_password': 'thisistest1'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    full_name_error = soup.find(
+        name="span", attrs={'id': 'full_name_error'}).get_text()
+
+    assert res.status_code == 400
+    assert full_name_error == 'Full name length should be 3 to 100 characters'
+
+    print('\n\ta) With full_name shorter than 3 characters, responds with code 400 and the corresponding error message.')
+    del res, soup, full_name_error
+    # **************************************************
+    res = client.post('/signin', data={
+        'full_name': 'Test User1',
+        'dob': '2000-03-2a',
+        'username': 'testuser1',
+        'new_password': 'thisistest1',
+        'confirm_password': 'thisistest1'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    dob_error = soup.find(name="span", attrs={'id': 'dob_error'}).get_text()
+
+    assert res.status_code == 400
+    assert dob_error == 'Date format should be YYYY-MM-DD'
+
+    print('\n\tb) With dob not in YYYY-MM-DD format, responds with code 400 and the corresponding error message.')
+    del res, soup, dob_error
+    # **************************************************
+    res = client.post('/signin', data={
+        'full_name': 'Test User1',
+        'dob': '2000-03-23',
+        'username': 'testuser1',
+        'new_password': 'this',
+        'confirm_password': 'thisistest1'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    new_password_error = soup.find(
+        name="span", attrs={'id': 'new_password_error'}).get_text()
+
+    assert res.status_code == 400
+    assert new_password_error == 'Password length should be at least 8 characters'
+
+    print('\n\tc) With a password shorter than 8 characters, responds with code 400 and the corresponding error message.')
+    del res, soup, new_password_error
+    # **************************************************
+    res = client.post('/signin', data={
+        'full_name': 'Test User1',
+        'dob': '2000-03-23',
+        'username': 'testuser1',
+        'new_password': 'thisistest1',
+        'confirm_password': 'thisistest'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    confirm_password_error = soup.find(
+        name="span", attrs={'id': 'confirm_password_error'}).get_text()
+
+    assert res.status_code == 400
+    assert confirm_password_error == 'Passwords do not match'
+
+    print('\n\td) With a confirmation password that does not match the new password, responds with code 400 and the corresponding error message.')
+    del res, soup, confirm_password_error
+    # **************************************************
+    res = client.post('/signin', data={
+        'full_name': 'Test User1',
+        'dob': '2000-03-23',
+        'username': 'testuser',
+        'new_password': 'thisistest1',
+        'confirm_password': 'thisistest1'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    username_error = soup.find(
+        name="span", attrs={'id': 'username_error'}).get_text()
+
+    assert res.status_code == 400
+    assert username_error == 'Account for the user already exists!'
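+    # NOTE: the duplicate-username check above relies on a 'testuser' account
+    # already existing in the test database.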
+
+    print('\n\te) With an already-used username, responds with code 400 and the corresponding error message.')
+    del res, soup, username_error
+    # **************************************************
+    new_username = 'testuser1'
+    redirection_text = ''
+    title = ''
+    while True:
+        res = client.post('/signin', data={
+            'full_name': 'Test User1',
+            'dob': '2000-03-23',
+            'username': new_username,
+            'new_password': 'thisistest1',
+            'confirm_password': 'thisistest1'
+        })
+
+        soup = BeautifulSoup(res.data, 'html.parser')
+        title = soup.find(name='title').get_text()
+        redirection_text = soup.find(name='p')
+
+        if redirection_text and redirection_text.get_text() == 'You should be redirected automatically to target URL: /login. If not click the link.':
+            break
+
+        new_username = input(
+            f'\n\t- {new_username} seems to be taken already; please provide a new username to test user creation: ')
+
+    assert title == 'Redirecting...'
+    assert redirection_text.get_text() == 'You should be redirected automatically to target URL: /login. If not click the link.'
+
+    print('\n\tf) After successful user creation, redirects to the login page.')
+    del res, soup, title, redirection_text
+
+
+def test_login(client):
+    print('\n15. Logging in:')
+    # **************************************************
+    res = client.post('/login', data={
+        'username': 'testuse',
+        'password': 'thisistest'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    error_message = soup.find(
+        name="span", attrs={'id': 'error_message'}).get_text()
+
+    assert res.status_code == 404
+    assert error_message == 'Account does not exist!'
+
+    print('\n\ta) With a username that is not registered, responds with code 404 and the corresponding error message.')
+    del res, soup, error_message
+    # **************************************************
+    res = client.post('/login', data={
+        'username': 'testuser',
+        'password': 'thisistes'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    error_message = soup.find(
+        name="span", attrs={'id': 'error_message'}).get_text()
+
+    assert res.status_code == 400
+    assert error_message == 'Username and password do not match'
+
+    print('\n\tb) With a wrong password, responds with code 400 and the corresponding error message.')
+    del res, soup, error_message
+    # **************************************************
+    res = client.post('/login', data={
+        'username': 'testuser',
+        'password': 'qwertyuiop'
+    })
+
+    soup = BeautifulSoup(res.data, 'html.parser')
+    title = soup.find(name='title').get_text()
+    redirection_text = soup.find(name="p").get_text()
+
+    assert title == 'Redirecting...'
+    assert redirection_text == 'You should be redirected automatically to target URL: /home. If not click the link.'
+
+    print('\n\tc) With a matching username and password, redirects to the home page with the required tokens.')
+    del res, soup, redirection_text
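+
+# NOTE: test_login assumes a pre-registered 'testuser' account whose password
+# is 'qwertyuiop'; adjust these credentials to match your local test database.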