diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/.DS_Store differ
diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
new file mode 100644
index 0000000..3590d08
--- /dev/null
+++ b/.github/workflows/workflow.yml
@@ -0,0 +1,28 @@
+name: API workflow
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    name: Run tests
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: local files
+        run: ls -al
+
+      - name: Set up Python
+        uses: actions/setup-python@v4.6.0
+        with:
+          python-version: '3.12.3'
+
+      - name: Install requirements
+        run: pip install .
+
+      - name: Run tests and collect coverage
+        run: pytest --cov .
+
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v3
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fbd6c23
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,274 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JupyterNotebooks template
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+#
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+.idea/*
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# plan files
+*-plan.out
+
+# Crash log files
+crash.log
+crash.*.log
+
+# Exclude all .tfvars files, which are likely to contain sensitive data, such as
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
+# to change depending on the environment.
+*.tfvars
+*.tfvars.json
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Include override files you do wish to add to version control using negated pattern
+# !example_override.tf
+
+# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
+# example: *tfplan*
+
+# Ignore CLI configuration files
+.terraformrc
+terraform.rc
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+
+#######################
+# Unique to this repo #
+#######################
+scripts/zippedPythonPackages/*
+!scripts/zippedPythonPackages/.gitkeep
+
+scripts/zippedLambdaFunction/*
+!scripts/zippedLambdaFunction/.gitkeep
+
+*reddit*.cfg
+!example_reddit.cfg
+
+model/pickledModels/latestModel.sav
diff --git a/.terraform.lock.hcl b/.terraform.lock.hcl
new file mode 100644
index 0000000..7912892
--- /dev/null
+++ b/.terraform.lock.hcl
@@ -0,0 +1,63 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/archive" {
+  version = "2.4.2"
+  hashes = [
+    "h1:1eOz9vM/55vnQjxk23RhnYga7PZq8n2rGxG+2Vx2s6w=",
+    "zh:08faed7c9f42d82bc3d406d0d9d4971e2d1c2d34eae268ad211b8aca57b7f758",
+    "zh:3564112ed2d097d7e0672378044a69b06642c326f6f1584d81c7cdd32ebf3a08",
+    "zh:53cd9afd223c15828c1916e68cb728d2be1cbccb9545568d6c2b122d0bac5102",
+    "zh:5ae4e41e3a1ce9d40b6458218a85bbde44f21723943982bca4a3b8bb7c103670",
+    "zh:5b65499218b315b96e95c5d3463ea6d7c66245b59461217c99eaa1611891cd2c",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:7f45b35a8330bebd184c2545a41782ff58240ed6ba947274d9881dd5da44b02e",
+    "zh:87e67891033214e55cfead1391d68e6a3bf37993b7607753237e82aa3250bb71",
+    "zh:de3590d14037ad81fc5cedf7cfa44614a92452d7b39676289b704a962050bc5e",
+    "zh:e7e6f2ea567f2dbb3baa81c6203be69f9cd6aeeb01204fd93e3cf181e099b610",
+    "zh:fd24d03c89a7702628c2e5a3c732c0dede56fa75a08da4a1efe17b5f881c88e2",
+    "zh:febf4b7b5f3ff2adff0573ef6361f09b6638105111644bdebc0e4f575373935f",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/aws" {
+  version     = "5.45.0"
+  constraints = "< 6.0.0"
+  hashes = [
+    "h1:8m3+C1VNevzU/8FsABoKp2rTOx3Ue7674INfhfk0TZY=",
+    "zh:1379bcf45aef3d486ee18b4f767bfecd40a0056510d26107f388be3d7994c368",
+    "zh:1615a6f5495acfb3a0cb72324587261dd4d72711a3cc51aff13167b14531501e",
+    "zh:18b69a0f33f8b1862fbd3f200756b7e83e087b73687085f2cf9c7da4c318e3e6",
+    "zh:2c5e7aecd197bc3d3b19290bad8cf4c390c2c6a77bb165da4e11f53f2dfe2e54",
+    "zh:3794da9bef97596e3bc60e12cdd915bda5ec2ed62cd1cd93723d58b4981905fe",
+    "zh:40a5e45ed91801f83db76dffd467dcf425ea2ca8642327cf01119601cb86021c",
+    "zh:4abfc3f53d0256a7d5d1fa5e931e4601b02db3d1da28f452341d3823d0518f1a",
+    "zh:4eb0e98078f79aeb06b5ff6115286dc2135d12a80287885698d04036425494a2",
+    "zh:75470efbadea4a8d783642497acaeec5077fc4a7f3df3340defeaa1c7de29bf7",
+    "zh:8861a0b4891d5fa2fa7142f236ae613cea966c45b5472e3915a4ac3abcbaf487",
+    "zh:8bf6f21cd9390b742ca0b4393fde92616ca9e6553fb75003a0999006ad233d35",
+    "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
+    "zh:ad73008a044e75d337acda910fb54d8b81a366873c8a413fec1291034899a814",
+    "zh:bf261713b0b8bebfe8c199291365b87d9043849f28a2dc764bafdde73ae43693",
+    "zh:da3bafa1fd830be418dfcc730e85085fe67c0d415c066716f2ac350a2306f40a",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/null" {
+  version = "3.2.2"
+  hashes = [
+    "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=",
+    "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7",
+    "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a",
+    "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3",
+    "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606",
+    "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546",
+    "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539",
+    "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422",
+    "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae",
+    "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1",
+    "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e",
+  ]
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..de58d93
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Kenneth Myers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e88cc57
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+![Python](https://img.shields.io/badge/python-3.12.3-blue.svg)
+
+# Reddit Scraping
+
+The purpose of this repo is to deploy an AWS Lambda function that scrapes rising and hot Reddit posts.
+
+# How to use
+
+1. First ensure the DynamoDB tables are set up via [DynamoDB-Setup](https://github.com/ViralRedditPosts/DynamoDB-Setup).
+2. Install the tooling - see the [prerequisites section on this page](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/aws-build#prerequisites) for additional information; the steps are essentially:
+   1. Install the Terraform CLI.
+   2. Install the AWS CLI, run `aws configure`, and enter your AWS credentials.
+3. Clone this repository.
+4. You can run the tests locally by doing the following (it is recommended that you manage your Python environments with something like [asdf](https://asdf-vm.com/) and use python==3.12.3 as your local runtime):
+
+   ```sh
+   python -m venv venv         # sets up a local virtual env using the current python runtime
+   source ./venv/bin/activate  # activates the virtual env
+   pip install -e .            # installs this package in the local env with dependencies
+   pytest . -r f -s            # -r f shows extra info for failures, -s disables output capturing
+   ```
+
+5. From within this repository run the following (note that a saved plan already bakes in its variables, so `terraform apply` takes only the plan file):
+
+   ```sh
+   terraform init
+   terraform workspace new dev  # this should switch you to the dev workspace
+   terraform plan -var-file="dev.tfvars" -out=dev-plan.out
+   terraform apply dev-plan.out
+   ```
+
+   For deploying to prd:
+
+   ```sh
+   terraform workspace new prd  # or terraform workspace select prd if already created
+   terraform plan -var-file="prd.tfvars" -out=prd-plan.out
+   terraform apply prd-plan.out
+   ```
+
+   On subsequent updates you don't need to `init` or make a new workspace again.
\ No newline at end of file
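For context on the config file that follows: the Lambda function (`lambda_function.py` below) reads `reddit.cfg` from `s3://$AWS_BUCKET/reddit.cfg`, and `main.tf` names that bucket `data-<name>-<env>-<account id>`. A sketch of uploading a filled-in config, assuming that bucket already exists and using a placeholder account ID:

```sh
# hypothetical bucket name following the pattern in main.tf; substitute your own account ID
aws s3 cp reddit.cfg s3://data-viralredditposts-dev-123456789012/reddit.cfg
```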
diff --git a/example_reddit.cfg b/example_reddit.cfg
new file mode 100644
index 0000000..e07e9e7
--- /dev/null
+++ b/example_reddit.cfg
@@ -0,0 +1,22 @@
+# rename this file to reddit.cfg
+[reddit_api]
+CLIENTID: "XXXX"
+CLIENTSECRET: "XXXX"
+PASSWORD: "XXXX"
+USERNAME: "XXXX"
+
+[S3_access]
+ACCESSKEY: "XXXX"
+SECRETKEY: "XXXX"
+
+[Discord]
+BOTTOKEN: "XXXX"
+MYSNOWFLAKEID: "XXXX"
+CHANNELSNOWFLAKEID: [123456789,987654321]
+
+[Postgres]
+USERNAME: "XXXX"
+PASSWORD: "XXXX"
+HOST: "XXXX"
+PORT: "XXXX"
+DATABASE: "XXXX"
diff --git a/lambdaFunctions/getRedditDataFunction/lambda_function.py b/lambdaFunctions/getRedditDataFunction/lambda_function.py
new file mode 100644
index 0000000..37844d8
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/lambda_function.py
@@ -0,0 +1,61 @@
+import redditUtils as ru
+import viral_reddit_posts_utils.configUtils as cu
+import tableDefinition
+import praw
+import boto3
+import os
+
+
+dynamodb_resource = boto3.resource('dynamodb')
+
+
+def lambda_handler(event, context):
+    # Initializations
+    subreddits = ["pics", "gaming", "worldnews", "news", "aww", "funny", "todayilearned", "movies"]
+
+    # cfg_file = cu.findConfig()
+    cfg_file = "s3://" + os.environ['AWS_BUCKET'] + "/reddit.cfg"  # e.g. 's3://data-kennethmyers/reddit.cfg'
+    cfg = cu.parseConfig(cfg_file)
+
+    CLIENTID = cfg['reddit_api']['CLIENTID']
+    CLIENTSECRET = cfg['reddit_api']['CLIENTSECRET']
+    PASSWORD = cfg['reddit_api']['PASSWORD']
+    USERNAME = cfg['reddit_api']['USERNAME']
+
+    reddit = praw.Reddit(
+        client_id=f"{CLIENTID}",
+        client_secret=f"{CLIENTSECRET}",
+        password=f"{PASSWORD}",
+        user_agent=f"Post Extraction (by u/{USERNAME})",
+        username=f"{USERNAME}",
+    )
+
+    for subreddit in subreddits:
+        print(f"Gathering data for {subreddit}")
+        # Get Rising Reddit data
+        print("\tGetting Rising Data")
+        schema = tableDefinition.schema
+        topN = 25
+        view = 'rising'
+        risingData = ru.getRedditData(reddit=reddit, subreddit=subreddit, view=view, schema=schema, topN=topN)
+        risingData = ru.deduplicateRedditData(risingData)
+
+        # Push to DynamoDB
+        tableName = f"{view}-{os.environ['ENV']}"
+        risingTable = ru.getTable(tableName, dynamodb_resource)
+        ru.batchWriter(risingTable, risingData, schema)
+
+        # Get Hot Reddit data
+        print("\tGetting Hot Data")
+        schema = tableDefinition.schema
+        topN = 3
+        view = 'hot'
+        hotData = ru.getRedditData(reddit=reddit, subreddit=subreddit, view=view, schema=schema, topN=topN)
+        hotData = ru.deduplicateRedditData(hotData)
+
+        # Push to DynamoDB
+        tableName = f"{view}-{os.environ['ENV']}"
+        hotTable = ru.getTable(tableName, dynamodb_resource)
+        ru.batchWriter(hotTable, hotData, schema)
+
+    return 200
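Since `lambda_handler` only depends on two environment variables, a local smoke test is straightforward. A minimal sketch with a hypothetical bucket name; it assumes the `pip install -e .` step from the README, AWS credentials with a default region, and a real `reddit.cfg` in the bucket, and note that it really will write to the dev tables:

```python
import os

os.environ["AWS_BUCKET"] = "data-viralredditposts-dev-123456789012"  # placeholder account ID
os.environ["ENV"] = "dev"  # targets the rising-dev and hot-dev tables

import lambda_function  # creates the boto3 DynamoDB resource on import

print(lambda_function.lambda_handler(event={}, context=None))  # prints 200 on success
```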
diff --git a/lambdaFunctions/getRedditDataFunction/redditUtils.py b/lambdaFunctions/getRedditDataFunction/redditUtils.py
new file mode 100644
index 0000000..d7ddac7
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/redditUtils.py
@@ -0,0 +1,119 @@
+from datetime import datetime, timezone
+from collections import namedtuple
+import tableDefinition
+import json
+from decimal import Decimal
+import pickle
+
+
+def saveTestReddit(reddit, filename):
+    pickle.dump(reddit, open(filename, 'wb'))
+
+
+def getRedditData(reddit, subreddit, topN=25, view='rising', schema=tableDefinition.schema, time_filter=None, verbose=False):
+    """
+    Uses PRAW to pull posts from a subreddit with the given parameters. Returns the data as a list of rows.
+
+    :param reddit: PRAW reddit object
+    :param subreddit: subreddit name
+    :param topN: number of posts to return
+    :param view: view to look at the subreddit with: rising, top, hot
+    :param schema: schema that describes the data. Dynamo is technically schema-less
+    :param time_filter: range of time to look at the data: all, day, hour, month, week, year
+    :param verbose: if True, prints each row as it is collected
+    :return: list[Row[schema]], where Row is a namedtuple defined by the schema
+    """
+    assert topN <= 25  # some views, like rising, cap out at 25, and this also limits the data you're working with
+    assert view in {'rising', 'top', 'hot'}
+    topN += 2  # increment by 2 because of sticky posts
+    if view == 'top':
+        assert time_filter in {"all", "day", "hour", "month", "week", "year"}
+    subredditObject = reddit.subreddit(subreddit)
+    if view == 'rising':
+        topNposts = subredditObject.rising(limit=topN)
+    elif view == 'hot':
+        topNposts = subredditObject.hot(limit=topN)
+    elif view == 'top':
+        topNposts = subredditObject.top(time_filter=time_filter, limit=topN)
+
+    # datetime.utcnow() is deprecated as of Python 3.12; take an aware UTC timestamp and strip the tzinfo
+    now = datetime.now(timezone.utc).replace(tzinfo=None, microsecond=0)
+    columns = schema.keys()
+    Row = namedtuple("Row", columns)
+    dataCollected = []
+    subscribers = subredditObject.subscribers
+    activeUsers = subredditObject.accounts_active
+    print(f'\tSubscribers: {subscribers}\n\tActive users: {activeUsers}')
+    for submission in topNposts:
+        if submission.stickied:
+            continue  # skip stickied posts
+        createdTSUTC = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).replace(tzinfo=None)
+        timeSincePost = now - createdTSUTC
+        timeElapsedMin = int(timeSincePost.total_seconds()) // 60  # total_seconds so posts older than a day aren't wrapped back into 0-24h minutes
+        timeElapsedDays = timeSincePost.days
+        if view == 'rising' and (timeElapsedMin > 60 or timeElapsedDays > 0):  # sometimes rising has posts that are already older than an hour or a day; we don't want those
+            continue
+        postId = submission.id
+        title = submission.title
+        score = submission.score
+        numComments = submission.num_comments
+        upvoteRatio = submission.upvote_ratio
+        gildings = submission.gildings
+        numGildings = sum(gildings.values())
+        row = Row(
+            postId=postId, subreddit=subreddit, subscribers=subscribers, activeUsers=activeUsers,
+            title=title, createdTSUTC=str(createdTSUTC),
+            timeElapsedMin=timeElapsedMin, score=score, numComments=numComments,
+            upvoteRatio=upvoteRatio, numGildings=numGildings,
+            loadTSUTC=str(now), loadDateUTC=str(now.date()), loadTimeUTC=str(now.time()))
+        dataCollected.append(row)
+        if verbose:
+            print(row)
+            print()
+    return dataCollected[:topN - 2]  # trim the +2 sticky-post padding back off
+
+
+def deduplicateRedditData(data):
+    """
+    Deduplicates the reddit data. Sometimes there are duplicate keys, which throws an error
+    when writing to Dynamo. It is unclear why this happens, but I suspect it is an issue with PRAW.
+
+    :param data: list[Row[schema]]
+    :return: deduplicated data
+    """
+    postIds = set()
+    newData = []
+    # there really shouldn't be more than 1 loadTSUTC for a postId since that is generated
+    # on our side, but I wanted to handle it since it is part of the key
+    data = sorted(data, key=lambda x: x.loadTSUTC, reverse=True)  # keep the most recent load for each postId
+    for d in data:
+        if d.postId not in postIds:
+            postIds.add(d.postId)
+            newData.append(d)
+    return newData
+
+
+def getTable(tableName, dynamodb_resource):
+    table = dynamodb_resource.Table(tableName)
+
+    # Print out some data about the table.
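+    # item_count is served from a DescribeTable call (covered by the dynamodb:DescribeTable grant in main.tf) and is only approximate.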
+    print(f"Item count in table: {table.item_count}")  # this only updates every 6 hours
+    return table
+
+
+def batchWriter(table, data, schema):
+    """
+    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/dynamodb.html#batch-writing
+    I didn't bother handling duplicates because they shouldn't be a problem with this type of data.
+    There is no built-in way to get responses with batch_writer: https://peppydays.medium.com/getting-response-of-aws-dynamodb-batchwriter-request-2aa3f81019fa
+
+    :param table: boto3 table object
+    :param data: list[Row[schema]]
+    :param schema: OrderedDict containing the dynamodb schema (dynamo is technically schema-less)
+    :return: None
+    """
+    columns = schema.keys()
+    with table.batch_writer() as batch:
+        for row in data:  # for each row obtained
+            batch.put_item(
+                Item=json.loads(json.dumps({k: getattr(row, k) for k in columns}), parse_float=Decimal)  # round-trips through JSON to coerce floats to Decimal
+            )
\ No newline at end of file
diff --git a/lambdaFunctions/getRedditDataFunction/tableDefinition.py b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
new file mode 100644
index 0000000..04eb9f4
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
@@ -0,0 +1,76 @@
+from collections import OrderedDict
+
+
+# schema is mainly needed for defining columns and what the column types should be when building additional indices
+schema = OrderedDict()
+schema["loadDateUTC"] = "S"
+schema["loadTimeUTC"] = "S"
+schema["loadTSUTC"] = "S"
+schema["postId"] = "S"
+schema["subreddit"] = "S"
+schema["subscribers"] = "N"
+schema["activeUsers"] = "N"
+schema["title"] = "S"
+schema["createdTSUTC"] = "S"
+schema["timeElapsedMin"] = "N"
+schema["score"] = "N"
+schema["numComments"] = "N"
+schema["upvoteRatio"] = "N"
+schema["numGildings"] = "N"
+
+baseTableDefinition = dict(
+    AttributeDefinitions=[
+        {
+            'AttributeName': k,
+            'AttributeType': schema[k]
+        } for k in ['postId', 'loadDateUTC', 'loadTimeUTC', 'loadTSUTC']  # only the attributes used in keys and indexes need to be defined
+    ],
+    KeySchema=[
+        {
+            'AttributeName': 'postId',
+            'KeyType': 'HASH'
+        },
+        {
+            'AttributeName': 'loadTSUTC',
+            'KeyType': 'RANGE'
+        }
+    ],
+    GlobalSecondaryIndexes=[  # I wanted to future-proof other ways I might look at the table (by subreddit)
+        {
+            'IndexName': 'byLoadDate',
+            'KeySchema': [
+                {
+                    'AttributeName': 'loadDateUTC',
+                    'KeyType': 'HASH'
+                },
+                {
+                    'AttributeName': 'loadTimeUTC',
+                    'KeyType': 'RANGE'
+                },
+            ],
+            'Projection': {
+                'ProjectionType': 'INCLUDE',
+                'NonKeyAttributes': [
+                    'timeElapsedMin',
+                ]
+            },
+            'ProvisionedThroughput': {  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html#GSI.ThroughputConsiderations
+                'ReadCapacityUnits': 6,  # 1 RCU = 1 strongly consistent read/s for items up to 4KB
+                'WriteCapacityUnits': 1  # 1 WCU = 1 write/s for items up to 1KB
+            }
+        },
+    ],
+    BillingMode='PROVISIONED',  # recommended for consistent workloads
+    ProvisionedThroughput={  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#default-limits-throughput-capacity-modes
+        'ReadCapacityUnits': 6,
+        'WriteCapacityUnits': 1
+    },
+    TableClass='STANDARD',
+    DeletionProtectionEnabled=False
+)
+
+
+def getTableDefinition(tableName, tableDefinition=baseTableDefinition):
+    tableDefinition['TableName'] = tableName
+    return tableDefinition
diff --git a/lambdaFunctions/getRedditDataFunction/test_lambda.py b/lambdaFunctions/getRedditDataFunction/test_lambda.py
new file mode 100644
index 0000000..b618aa2
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/test_lambda.py
@@ -0,0 +1,152 @@
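+# Tests for the scraper helpers: the PRAW tests need a local reddit.cfg with credentials,
+# while the TestBatchWriter cases run entirely against moto's mocked DynamoDB.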
+import pytest
+import redditUtils as ru
+import praw
+import tableDefinition
+from collections import namedtuple
+import boto3
+import sys
+import os
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(THIS_DIR, '../../'))
+import viral_reddit_posts_utils.configUtils as cu
+import pickle
+from moto import mock_dynamodb
+
+
+IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+
+
+@pytest.fixture(scope='module')
+def cfg():
+    if IN_GITHUB_ACTIONS:  # skip here, before findConfig() fails looking for a local reddit.cfg
+        pytest.skip("Reddit credentials are not available in GitHub Actions.")
+    cfg_file = cu.findConfig()
+    cfg = cu.parseConfig(cfg_file)
+    return cfg
+
+
+@pytest.fixture(scope='module')
+def reddit(cfg):
+    redditcfg = cfg['reddit_api']
+    return praw.Reddit(
+        client_id=f"{redditcfg['CLIENTID']}",
+        client_secret=f"{redditcfg['CLIENTSECRET']}",
+        password=f"{redditcfg['PASSWORD']}",
+        user_agent=f"Post Extraction (by u/{redditcfg['USERNAME']})",
+        username=f"{redditcfg['USERNAME']}",
+    )
+
+
+def test_getRedditData(reddit):
+    subreddit = "pics"
+    ru.getRedditData(
+        reddit,
+        subreddit,
+        topN=25,
+        view='rising',
+        schema=tableDefinition.schema,
+        time_filter=None,
+        verbose=True)
+
+
+@pytest.fixture(scope='module')
+def duplicatedData():
+    schema = tableDefinition.schema
+    columns = schema.keys()
+    Row = namedtuple("Row", columns)
+    # these are identical examples except the second has a later loadTSUTC
+    return [
+        Row(subscribers=10000000, activeUsers=10000,
+            loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+            subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+            timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+        Row(subscribers=10000000, activeUsers=10000,
+            loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:06:44', postId='133fkqz',
+            subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+            timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+    ]
+
+
+def test_deduplicateRedditData(duplicatedData):
+    newData = ru.deduplicateRedditData(duplicatedData)
+    assert len(newData) == 1
+    print("test_deduplicateRedditData complete")
+
+
+@mock_dynamodb
+class TestBatchWriter:
+
+    def classSetUp(self):
+        """
+        If this lived at the top level of the class it would run even for tests skipped via
+        `skip` and `skipif`, and a test class can't define __init__, so each test calls this first.
+        :return:
+        """
+        dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
+        # create the table to write sample data to
+        tableName = 'rising'
+        td = tableDefinition.getTableDefinition(tableName=tableName)
+        self.testTable = dynamodb.create_table(**td)
+        self.schema = tableDefinition.schema
+        self.columns = self.schema.keys()
+        self.Row = namedtuple("Row", self.columns)
+
+    @pytest.mark.xfail(reason="BatchWriter fails on duplicate keys. This might xpass; possibly a fault in the mock object.")
+    def test_duplicateData(self):
+        self.classSetUp()
+        testTable = self.testTable
+        schema = self.schema
+        Row = self.Row
+
+        data = [
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+                subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+                timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+                subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+                timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+        ]
+        ru.batchWriter(table=testTable, data=data, schema=schema)
+        print("duplicateDataTester test complete")
+
+    def test_uniqueData(self):
+        self.classSetUp()
+        testTable = self.testTable
+        schema = self.schema
+        Row = self.Row
+
+        data = [
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+                subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+                timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fqj7',
+                subreddit='pics', title='A piece of wood sticking up in front of a fire.', createdTSUTC='2023-04-30 04:29:23',
+                timeElapsedMin=34, score=0, numComments=0, upvoteRatio=0.4, numGildings=0)
+        ]
+        ru.batchWriter(table=testTable, data=data, schema=schema)
+        print("uniqueDataTester test complete")
+
+    def test_diffPrimaryIndexSameSecondIndex(self):
+        self.classSetUp()
+        testTable = self.testTable
+        schema = self.schema
+        Row = self.Row
+
+        data = [
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+                subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+                timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+            Row(subscribers=10000000, activeUsers=10000,
+                loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqy',
+                subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+                timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+        ]
+        ru.batchWriter(table=testTable, data=data, schema=schema)
+        print("diffPrimaryIndexSameSecondIndexTester test complete")
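The moto-backed writer tests need no Reddit credentials, so they can be run on their own; one way to do that, assuming the `pip install -e .` step from the README:

```sh
pytest lambdaFunctions/getRedditDataFunction/test_lambda.py -k "TestBatchWriter" -s
```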
diff --git a/main.tf b/main.tf
new file mode 100644
index 0000000..3230f5a
--- /dev/null
+++ b/main.tf
@@ -0,0 +1,251 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "< 6.0"
+    }
+  }
+
+  required_version = ">= 1.2.0"
+}
+
+provider "aws" {
+  region = "us-east-2"
+}
+
+variable "info" {
+  type = map(string)
+  default = {
+    name      = "viralredditposts"
+    region    = "us-east-2"
+    pyversion = "3.12"
+  }
+}
+
+# get account id
+data "aws_caller_identity" "current" {}
+
+locals {
+  account_id = data.aws_caller_identity.current.account_id
+}
+
+# zip the lambda function
+# resource "null_resource" "zip_function" {
+#   # rebuild the zip each time; this is low cost and forces an upload on each terraform apply
+#   triggers = {
+#     build_number = timestamp()
+#   }
+#   provisioner "local-exec" {
+#     command    = "./scripts/zipLambdaFunction.sh -f getRedditDataFunction"
+#     on_failure = fail # OR continue
+#   }
+# }
+
+data "archive_file" "lambda_zip" {
+  type        = "zip"
+  source_dir  = "./lambdaFunctions/getRedditDataFunction/"
+  output_path = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+}
+
+# zip the PRAW and boto3 packages
+resource "null_resource" "zip_python_packages" {
+  # this is a bit slow but it forces this to rerun each time;
+  # it was easier than trying to track whether the zip was deleted for an environment change
+  triggers = {
+    build_number = timestamp()
+  }
+  provisioner "local-exec" {
+    command     = "source venv/bin/activate && ./scripts/zipPythonPackage.sh -v ${var.info.pyversion} praw==7.7.0 boto3==1.26.117 git+https://github.com/ViralRedditPosts/Utils.git@main"
+    interpreter = ["/bin/bash", "-c"] # `source` is a bashism, so don't rely on the default /bin/sh
+    on_failure  = fail # OR continue
+  }
+}
+
+# add PRAW zip to S3
+resource "aws_s3_object" "move_PRAW_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "praw==7.7.0.zip"
+  source = "./scripts/zippedPythonPackages/praw==7.7.0/praw==7.7.0.zip"
+  tags = {
+    Name        = "praw-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add boto3 zip to S3
+resource "aws_s3_object" "move_boto3_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "boto3==1.26.117.zip"
+  source = "./scripts/zippedPythonPackages/boto3==1.26.117/boto3==1.26.117.zip"
+  tags = {
+    Name        = "boto3-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add git+https://github.com/ViralRedditPosts/Utils.git@main to S3
+resource "aws_s3_object" "move_utils_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "Utils.git@main.zip"
+  source = "./scripts/zippedPythonPackages/Utils.git@main/Utils.git@main.zip"
+  tags = {
+    Name        = "utils-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# define policy for attaching role
+data "aws_iam_policy_document" "assume_role" {
+  statement {
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+
+    actions = [
+      "sts:AssumeRole",
+    ]
+  }
+}
+
+data "aws_iam_policy_document" "inline_policy" {
+  statement {
+    effect = "Allow"
+    actions = [
+      "s3:GetObject",
+      "s3:ListBucket",
+      "dynamodb:DescribeTable",
+      "dynamodb:BatchWriteItem"
+    ]
+    resources = [
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}",
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}/*",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/hot-${var.env}",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/rising-${var.env}"
+    ]
+  }
+}
+
+# create role
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role
+resource "aws_iam_role" "iam_for_lambda" {
+  name               = "iam-for-lambda-${var.env}"
+  assume_role_policy = data.aws_iam_policy_document.assume_role.json # policy that grants an entity permission to assume the role
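+  # the inline policy below grants only the S3 reads and DynamoDB writes the function needs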
+  inline_policy {
+    name   = "test-policy"
+    policy = data.aws_iam_policy_document.inline_policy.json
+  }
+
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+resource "aws_lambda_layer_version" "praw_layer" {
+  depends_on = [aws_s3_object.move_PRAW_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "praw==7.7.0.zip"
+  layer_name               = "praw-7_7_0"
+  description              = "python binaries for praw==7.7.0 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "boto3_layer" {
+  depends_on = [aws_s3_object.move_boto3_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "boto3==1.26.117.zip"
+  layer_name               = "boto3-1_26_117"
+  description              = "python binaries for boto3==1.26.117 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "utils_layer" {
+  depends_on = [aws_s3_object.move_utils_zip] # was move_boto3_zip, which looked like a copy-paste slip
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "Utils.git@main.zip"
+  layer_name               = "utils_layer"
+  description              = "python binaries for Utils.git@main library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+# make lambda function
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function
+resource "aws_lambda_function" "lambda_function" {
+  # depends_on = [resource.null_resource.zip_function]
+
+  filename      = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+  function_name = "lambda-reddit-scraping-${var.env}"
+  role          = aws_iam_role.iam_for_lambda.arn
+  handler       = "lambda_function.lambda_handler"
+  runtime       = "python${var.info.pyversion}"
+  timeout       = 60
+
+  ephemeral_storage {
+    size = 512 # min 512 MB, max 10240 MB
+  }
+
+  layers = [
+    aws_lambda_layer_version.praw_layer.arn,
+    aws_lambda_layer_version.boto3_layer.arn,
+    aws_lambda_layer_version.utils_layer.arn,
+  ]
+
+  source_code_hash = data.archive_file.lambda_zip.output_base64sha256
+
+  environment {
+    variables = {
+      AWS_BUCKET = "data-${var.info.name}-${var.env}-${local.account_id}",
+      ENV        = "${var.env}"
+    }
+  }
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# Attach event trigger to Lambda Function, see https://stackoverflow.com/questions/35895315/use-terraform-to-set-up-a-lambda-function-triggered-by-a-scheduled-event-source
+resource "aws_cloudwatch_event_rule" "every_one_minute" {
+  name                = "every-one-minute"
+  description         = "Fires every one minute"
+  schedule_expression = "rate(1 minute)"
+  state               = var.cloudwatch_state
+}
+
+resource "aws_cloudwatch_event_target" "scrape_reddit_every_minute" {
+  rule      = aws_cloudwatch_event_rule.every_one_minute.name
+  target_id = "scrape_reddit"
+  arn       = aws_lambda_function.lambda_function.arn
+}
+
+resource "aws_lambda_permission" "allow_cloudwatch_to_call_lambda_function" {
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.lambda_function.function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.every_one_minute.arn
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..158f31c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+# see https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
+[project]
+name = "Reddit-Scraping"
+
+dynamic = ["version"]
+
+dependencies = [
+  "boto3==1.26.117",
+  "moto[dynamodb,s3]==4.1.8",
+  "pre-commit==2.21.0",
+  "praw==7.7.0",
+  "pytest==7.3.1",
+  "pytest-cov==4.0.0",
+  "viral_reddit_posts_utils@git+https://github.com/ViralRedditPosts/Utils.git@main"
+]
+
+requires-python = "== 3.12.3"
+
+authors = [
+  {name = "Kenneth Myers", email = "myers.kenneth.james@gmail.com"},
+]
+
+description = "This project scrapes Reddit data and loads it into DynamoDB. It is intended to be run as an AWS Lambda function."
+
+readme = "README.md"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..7538128
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --ignore=scripts/
\ No newline at end of file
diff --git a/scripts/zipLambdaFunction.sh b/scripts/zipLambdaFunction.sh
new file mode 100755
index 0000000..0aef85f
--- /dev/null
+++ b/scripts/zipLambdaFunction.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# This zips a lambda function directory for deployment.
+# use:
+#   zipLambdaFunction.sh -f someFunction
+# saves the zip to zippedLambdaFunction/someFunction.zip
+# you may need to run chmod +x ./zipLambdaFunction.sh
+
+set -e
+
+while getopts f: flag
+do
+  case "${flag}" in
+    f) function_name=${OPTARG};; # i.e. someFunction located in ../lambdaFunctions/someFunction
+  esac
+done
+: ${function_name:?Missing -f} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+echo "lambda function: $function_name";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd) # was ${pwd}, which expands to an empty string rather than running pwd
+cd $SCRIPT_PATH
+
+[ -d "../lambdaFunctions/${function_name}" ] && echo "Directory ../lambdaFunctions/${function_name} exists." || { echo "Error: Directory ../lambdaFunctions/${function_name} does not exist."; exit 1; }
+
+cd ./zippedLambdaFunction/
+rm -r ./${function_name} || true
+cp -r ../../lambdaFunctions/${function_name} ./ # copy lambda function files here
+rm -rf ${function_name}.zip # remove first if it exists
+cd ./${function_name}/ # zip from within this folder, otherwise the archive wraps everything in an extra parent folder
+#rm -rf ./*.ipynb* # remove any notebook stuff
+zip -r ../${function_name}.zip * -x "*.ipynb*" "*pycache*" # zip of function
+cd ..
+rm -r ./${function_name} # clean up the unzipped copy
+
+cd $CWD # return to original place
diff --git a/scripts/zipPythonPackage.sh b/scripts/zipPythonPackage.sh
new file mode 100755
index 0000000..54fc07f
--- /dev/null
+++ b/scripts/zipPythonPackage.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# This is a script to download a python package's binaries and zip them.
+# The intention is to use that package as a layer for a lambda function (or something else).
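+# Lambda layers are extracted to /opt and Python looks in /opt/python, which is why each
+# package is installed into a python/ subfolder before zipping.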
+# use:
+#   sh zipPythonPackage.sh -v 3.7 praw==7.7.0 boto3==1.26.117
+# as you can see, packages are just listed as non-option arguments
+# based on https://www.linkedin.com/pulse/add-external-python-libraries-aws-lambda-using-layers-gabe-olokun/
+# Note: an old version of this script also moved the zip file to s3; this functionality has been removed.
+# you may need to run chmod +x ./zipPythonPackage.sh
+
+set -e
+
+while getopts v: flag
+do
+  case "${flag}" in
+    v) version=${OPTARG};; # for python, e.g. 3.12
+  esac
+done
+: ${version:?Missing -v} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+shift $(( OPTIND - 1 ))
+echo "packages: $@";
+echo "python version: $version";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd) # was ${pwd}, which expands to an empty string rather than running pwd
+cd $SCRIPT_PATH
+
+for package in "$@"; do
+  echo "Preparing ${package}..."
+  # format the zip file name; needed for the git packages which have lots of slashes
+  if [[ ${package} == "git+"* ]]; then
+    package_name=${package##*/} # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
+  else
+    package_name=${package}
+  fi
+  mkdir -p ./zippedPythonPackages/${package_name}/python
+
+  cd ./zippedPythonPackages/${package_name}/python
+
+  # install binaries for package
+  pip install \
+    --platform manylinux2014_x86_64 \
+    --target=. \
+    --implementation cp \
+    --python-version ${version} \
+    --only-binary=:all: \
+    --upgrade ${package}
+  # note: --python-version (not --python) is the flag that pairs with --platform/--implementation for cross-platform installs
+
+  rm -rf *dist-info # some cleanup of unnecessary stuff
+  # zip package
+  cd ..
+  rm -rf ${package_name}.zip # remove first if it exists
+  echo "Zipping ${package_name} at $(pwd)"
+  zip -r ${package_name}.zip python # zip contents of python to zip name
+  cd ../../ # go back out to scripts dir
+done
+
+cd $CWD # return to original location
diff --git a/scripts/zippedLambdaFunction/.gitkeep b/scripts/zippedLambdaFunction/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/zippedPythonPackages/.gitkeep b/scripts/zippedPythonPackages/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/variable.tf b/variable.tf
new file mode 100644
index 0000000..4bceb5b
--- /dev/null
+++ b/variable.tf
@@ -0,0 +1,11 @@
+variable "env" {
+  type        = string
+  default     = "dev"
+  description = "environment to deploy to"
+}
+
+variable "cloudwatch_state" {
+  type        = string
+  default     = "DISABLED"
+  description = "Whether the lambda function schedule is enabled. Valid values are DISABLED, ENABLED, and ENABLED_WITH_ALL_CLOUDTRAIL_MANAGEMENT_EVENTS."
+}
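The tfvars files referenced in the README are gitignored (`*.tfvars` above), so none appear in this diff. Based on the two variables defined here, a minimal `dev.tfvars` would presumably look like:

```hcl
env              = "dev"
cloudwatch_state = "DISABLED" # set to ENABLED once you want the 1-minute schedule running
```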