From 9048982e2689334090ff0ded97fc98af9c589bec Mon Sep 17 00:00:00 2001
From: Kenneth Myers <myers.kenneth.james@gmail.com>
Date: Sat, 13 Apr 2024 17:37:58 -0400
Subject: [PATCH] clean commit

---
 .DS_Store                                     | Bin 0 -> 6148 bytes
 .github/workflows/workflow.yml                |  28 ++
 .gitignore                                    | 274 ++++++++++++++++++
 .terraform.lock.hcl                           |  63 ++++
 LICENSE                                       |  21 ++
 README.md                                     |  40 +++
 example_reddit.cfg                            |  22 ++
 .../getRedditDataFunction/lambda_function.py  |  61 ++++
 .../getRedditDataFunction/redditUtils.py      | 119 ++++++++
 .../getRedditDataFunction/tableDefinition.py  |  76 +++++
 .../getRedditDataFunction/test_lambda.py      | 152 ++++++++++
 main.tf                                       | 251 ++++++++++++++++
 pyproject.toml                                |  33 +++
 pytest.ini                                    |   2 +
 scripts/zipLambdaFunction.sh                  |  35 +++
 scripts/zipPythonPackage.sh                   |  58 ++++
 scripts/zippedLambdaFunction/.gitkeep         |   0
 scripts/zippedPythonPackages/.gitkeep         |   0
 variable.tf                                   |  11 +
 19 files changed, 1246 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 .github/workflows/workflow.yml
 create mode 100644 .gitignore
 create mode 100644 .terraform.lock.hcl
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 example_reddit.cfg
 create mode 100644 lambdaFunctions/getRedditDataFunction/lambda_function.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/redditUtils.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/tableDefinition.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/test_lambda.py
 create mode 100644 main.tf
 create mode 100644 pyproject.toml
 create mode 100644 pytest.ini
 create mode 100755 scripts/zipLambdaFunction.sh
 create mode 100755 scripts/zipPythonPackage.sh
 create mode 100644 scripts/zippedLambdaFunction/.gitkeep
 create mode 100644 scripts/zippedPythonPackages/.gitkeep
 create mode 100644 variable.tf

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
[unreadable binary patch data omitted]

diff --git a/lambdaFunctions/getRedditDataFunction/redditUtils.py b/lambdaFunctions/getRedditDataFunction/redditUtils.py
+    if (timeElapsedMin > 60 or timeElapsedDays > 0):  # sometimes rising has data that is already older than an hour or a day; we don't want that
+      continue
+    postId = submission.id
+    title = submission.title
+    score = submission.score
+    numComments = submission.num_comments
+    upvoteRatio = submission.upvote_ratio
+    gildings = submission.gildings
+    numGildings = sum(gildings.values())
+    row = Row(
+      postId=postId, subreddit=subreddit, subscribers=subscribers, activeUsers=activeUsers,
+      title=title, createdTSUTC=str(createdTSUTC),
+      timeElapsedMin=timeElapsedMin, score=score, numComments=numComments,
+      upvoteRatio=upvoteRatio, numGildings=numGildings,
+      loadTSUTC=str(now), loadDateUTC=str(now.date()), loadTimeUTC=str(now.time()))
+    dataCollected.append(row)
+    if verbose:
+      print(row)
+      print()
+  return dataCollected[:topN-2]
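+
+
+# A minimal usage sketch (illustrative only, not part of the Lambda entry point;
+# assumes a praw.Reddit instance -- see test_lambda.py for the call signature):
+#
+#   import tableDefinition
+#   rows = getRedditData(reddit, "pics", topN=25, view="rising",
+#                        schema=tableDefinition.schema, time_filter=None)
+#   rows = deduplicateRedditData(rows)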
+
+
+def deduplicateRedditData(data):
+  """
+  Deduplicates the reddit data. Sometimes there are duplicate keys, which throw an error
+  when writing to DynamoDB. It is unclear why this happens, but I suspect it is an issue with PRAW.
+
+  :param data: list[Row[schema]]
+  :return: deduplicated data
+  """
+  postIds = set()
+  newData = []
+  # there really shouldn't be more than 1 loadTSUTC for a postId since that is generated
+  # on our side, but I wanted to handle it since it is part of the key
+  data = sorted(data, key=lambda x: x.loadTSUTC, reverse=True)  # keep the most recent load first
+  for d in data:
+    if d.postId not in postIds:
+      postIds.add(d.postId)
+      newData.append(d)
+  return newData
+
+
+def getTable(tableName, dynamodb_resource):
+  table = dynamodb_resource.Table(tableName)
+
+  # Print out some data about the table.
+  print(f"Item count in table: {table.item_count}")  # this only updates every 6 hours
+  return table
+
+
+def batchWriter(table, data, schema):
+  """
+  https://boto3.amazonaws.com/v1/documentation/api/latest/guide/dynamodb.html#batch-writing
+  I didn't bother dealing with duplicates because they shouldn't be a problem with this type of data.
+  There is no built-in way to get responses with batch_writer:
+  https://peppydays.medium.com/getting-response-of-aws-dynamodb-batchwriter-request-2aa3f81019fa
+
+  :param table: boto3 table object
+  :param data: list[Row[schema]]
+  :param schema: OrderedDict containing the dynamodb schema (dynamo is technically schema-less)
+  :return: None
+  """
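+  # Illustrative example of the float -> Decimal round trip performed below
+  # (DynamoDB rejects Python floats, so they are re-parsed as Decimals):
+  #   json.loads(json.dumps({"upvoteRatio": 0.4}), parse_float=Decimal)
+  #   -> {'upvoteRatio': Decimal('0.4')}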
+  columns = schema.keys()
+  with table.batch_writer() as batch:
+    for row in data:  # for each row obtained
+      batch.put_item(
+        Item=json.loads(json.dumps({k: getattr(row, k) for k in columns}), parse_float=Decimal)  # parses floats to Decimals
+      )
\ No newline at end of file
diff --git a/lambdaFunctions/getRedditDataFunction/tableDefinition.py b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
new file mode 100644
index 0000000..04eb9f4
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
@@ -0,0 +1,76 @@
+from collections import OrderedDict
+
+
+# the schema is mainly needed for defining columns and their types when building additional indices
+schema = OrderedDict()
+schema["loadDateUTC"] = "S"
+schema["loadTimeUTC"] = "S"
+schema["loadTSUTC"] = "S"
+schema["postId"] = "S"
+schema["subreddit"] = "S"
+schema["subscribers"] = "N"
+schema["activeUsers"] = "N"
+schema["title"] = "S"
+schema["createdTSUTC"] = "S"
+schema["timeElapsedMin"] = "N"
+schema["score"] = "N"
+schema["numComments"] = "N"
+schema["upvoteRatio"] = "N"
+schema["numGildings"] = "N"
+
+baseTableDefinition = dict(
+  AttributeDefinitions=[
+    {
+      'AttributeName': k,
+      'AttributeType': schema[k]
+    } for k in ['postId', 'loadDateUTC', 'loadTimeUTC', 'loadTSUTC']  # only the attributes used in keys and indexes need definitions
+  ],
+  KeySchema=[
+    {
+      'AttributeName': 'postId',
+      'KeyType': 'HASH'
+    },
+    {
+      'AttributeName': 'loadTSUTC',
+      'KeyType': 'RANGE'
+    }
+  ],
+  GlobalSecondaryIndexes=[  # I wanted to future-proof other ways I might look at the table (e.g. by subreddit)
+    {
+      'IndexName': 'byLoadDate',
+      'KeySchema': [
+        {
+          'AttributeName': 'loadDateUTC',
+          'KeyType': 'HASH'
+        },
+        {
+          'AttributeName': 'loadTimeUTC',
+          'KeyType': 'RANGE'
+        },
+      ],
+      'Projection': {
+        'ProjectionType': 'INCLUDE',
+        'NonKeyAttributes': [
+          'timeElapsedMin',
+        ]
+      },
+      'ProvisionedThroughput': {  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html#GSI.ThroughputConsiderations
+        'ReadCapacityUnits': 6,  # 1 RCU = 1 strongly consistent read per second for items up to 4 KB
+        'WriteCapacityUnits': 1  # 1 WCU = 1 write per second for items up to 1 KB
+      }
+    },
+  ],
+  BillingMode='PROVISIONED',  # recommended for predictable workloads
+  ProvisionedThroughput={  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#default-limits-throughput-capacity-modes
+    'ReadCapacityUnits': 6,
+    'WriteCapacityUnits': 1
+  },
+  TableClass='STANDARD',
+  DeletionProtectionEnabled=False
+)
+
+
+def getTableDefinition(tableName, tableDefinition=baseTableDefinition):
+  tableDefinition['TableName'] = tableName
+  return tableDefinition
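+
+
+# Usage sketch (mirrors how the tests below create a table against moto):
+#   import boto3
+#   dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
+#   table = dynamodb.create_table(**getTableDefinition('rising'))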
diff --git a/lambdaFunctions/getRedditDataFunction/test_lambda.py b/lambdaFunctions/getRedditDataFunction/test_lambda.py
new file mode 100644
index 0000000..b618aa2
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/test_lambda.py
@@ -0,0 +1,152 @@
+import pytest
+import redditUtils as ru
+import praw
+import tableDefinition
+from collections import namedtuple
+import boto3
+import sys
+import os
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(THIS_DIR, '../../'))
+import viral_reddit_posts_utils.configUtils as cu
+import pickle
+from moto import mock_dynamodb
+
+
+IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+
+
+@pytest.fixture(scope='module')
+def cfg():
+  cfg_file = cu.findConfig()
+  cfg = cu.parseConfig(cfg_file)
+  return cfg
+
+
+@pytest.fixture(scope='module')
+def reddit(cfg):
+  if IN_GITHUB_ACTIONS:
+    pytest.skip("Reddit API credentials are not available in GitHub Actions")
+  redditcfg = cfg['reddit_api']
+  return praw.Reddit(
+    client_id=f"{redditcfg['CLIENTID']}",
+    client_secret=f"{redditcfg['CLIENTSECRET']}",
+    password=f"{redditcfg['PASSWORD']}",
+    user_agent=f"Post Extraction (by u/{redditcfg['USERNAME']})",
+    username=f"{redditcfg['USERNAME']}",
+  )
+
+
+def test_getRedditData(reddit):
+  subreddit = "pics"
+  ru.getRedditData(
+    reddit,
+    subreddit,
+    topN=25,
+    view='rising',
+    schema=tableDefinition.schema,
+    time_filter=None,
+    verbose=True)
+
+
+@pytest.fixture(scope='module')
+def duplicatedData():
+  schema = tableDefinition.schema
+  columns = schema.keys()
+  Row = namedtuple("Row", columns)
+  # these are identical examples except the second has a later loadTSUTC
+  return [
+    Row(subscribers=10000000, activeUsers=10000,
+        loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+        subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+        timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+    Row(subscribers=10000000, activeUsers=10000,
+        loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:06:44', postId='133fkqz',
+        subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+        timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+  ]
+
+
+def test_deduplicateRedditData(duplicatedData):
+  newData = ru.deduplicateRedditData(duplicatedData)
+  assert len(newData) == 1
+  print("test_deduplicateRedditData complete")
+
+
+@mock_dynamodb
+class TestBatchWriter:
+
+  def classSetUp(self):
+    """
+    If this setup were left at the top level of the class it would not be skipped by
+    `skip` and `skipif`, and a test class cannot define __init__, so this is called
+    at the start of each test instead.
+    :return:
+    """
+    dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
+    # create the table and write sample data to it
+    tableName = 'rising'
+    td = tableDefinition.getTableDefinition(tableName=tableName)
+    self.testTable = dynamodb.create_table(**td)
+    self.schema = tableDefinition.schema
+    self.columns = self.schema.keys()
+    self.Row = namedtuple("Row", self.columns)
+
+  @pytest.mark.xfail(reason="BatchWriter fails on duplicate keys. This might xpass, possibly a fault in the mock object.")
+  def test_duplicateData(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_duplicateData complete")
+
+  def test_uniqueData(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fqj7',
+          subreddit='pics', title='A piece of wood sticking up in front of a fire.', createdTSUTC='2023-04-30 04:29:23',
+          timeElapsedMin=34, score=0, numComments=0, upvoteRatio=0.4, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_uniqueData complete")
+
+  def test_diffPrimaryIndexSameSecondIndex(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqy',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_diffPrimaryIndexSameSecondIndex complete")
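+
+
+# To run these tests locally (assumes a reddit config file discoverable by
+# viral_reddit_posts_utils.configUtils.findConfig):
+#   pytest lambdaFunctions/getRedditDataFunction/test_lambda.py -v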
diff --git a/main.tf b/main.tf
new file mode 100644
index 0000000..3230f5a
--- /dev/null
+++ b/main.tf
@@ -0,0 +1,251 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "< 6.0"
+    }
+  }
+
+  required_version = ">= 1.2.0"
+}
+
+provider "aws" {
+  region = "us-east-2"
+}
+
+variable "info" {
+  type = map(string)
+  default = {
+    name      = "viralredditposts"
+    region    = "us-east-2"
+    pyversion = "3.12"
+  }
+}
+
+# get account id
+data "aws_caller_identity" "current" {}
+
+locals {
+  account_id = data.aws_caller_identity.current.account_id
+}
+
+# zip the lambda function
+# resource "null_resource" "zip_function" {
+#   # rebuild the zip each time; this is low cost and forces an upload on each terraform apply
+#   triggers = {
+#     build_number = timestamp()
+#   }
+#   provisioner "local-exec" {
+#     command    = "./scripts/zipLambdaFunction.sh -f getRedditDataFunction"
+#     on_failure = fail # OR continue
+#   }
+# }
+
+data "archive_file" "lambda_zip" {
+  type        = "zip"
+  source_dir  = "./lambdaFunctions/getRedditDataFunction/"
+  output_path = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+}
+
+# zip the PRAW and boto3 packages
+resource "null_resource" "zip_python_packages" {
+  # this is a bit slow, but it forces the zips to be rebuilt on each run;
+  # that was easier than trying to track whether the zip was deleted for an environment change
+  triggers = {
+    build_number = timestamp()
+  }
+  provisioner "local-exec" {
+    command    = "source venv/bin/activate && ./scripts/zipPythonPackage.sh -v ${var.info.pyversion} praw==7.7.0 boto3==1.26.117 git+https://github.com/ViralRedditPosts/Utils.git@main"
+    on_failure = fail # OR continue
+  }
+}
+
+# add PRAW zip to S3
+resource "aws_s3_object" "move_PRAW_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "praw==7.7.0.zip"
+  source = "./scripts/zippedPythonPackages/praw==7.7.0/praw==7.7.0.zip"
+  tags = {
+    Name        = "praw-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add boto3 zip to S3
+resource "aws_s3_object" "move_boto3_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "boto3==1.26.117.zip"
+  source = "./scripts/zippedPythonPackages/boto3==1.26.117/boto3==1.26.117.zip"
+  tags = {
+    Name        = "boto3-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add the Utils package (git+https://github.com/ViralRedditPosts/Utils.git@main) zip to S3
+resource "aws_s3_object" "move_utils_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "Utils.git@main.zip"
+  source = "./scripts/zippedPythonPackages/Utils.git@main/Utils.git@main.zip"
+  tags = {
+    Name        = "utils-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# define the trust policy for attaching the role
+data "aws_iam_policy_document" "assume_role" {
+  statement {
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+
+    actions = [
+      "sts:AssumeRole",
+    ]
+  }
+}
+
+data "aws_iam_policy_document" "inline_policy" {
+  statement {
+    effect = "Allow"
+    actions = [
+      "s3:GetObject",
+      "s3:ListBucket",
+      "dynamodb:DescribeTable",
+      "dynamodb:BatchWriteItem"
+    ]
+    resources = [
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}",
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}/*",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/hot-${var.env}",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/rising-${var.env}"
+    ]
+  }
+}
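+
+# For example, with env=dev and an illustrative account id 123456789012, the
+# policy above resolves to resources like:
+#   arn:aws:dynamodb:us-east-2:123456789012:table/rising-dev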
+
+# create role
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role
+resource "aws_iam_role" "iam_for_lambda" {
+  name               = "iam-for-lambda-${var.env}"
+  assume_role_policy = data.aws_iam_policy_document.assume_role.json # policy that grants an entity permission to assume the role
+
+  inline_policy {
+    name   = "test-policy"
+    policy = data.aws_iam_policy_document.inline_policy.json
+  }
+
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+resource "aws_lambda_layer_version" "praw_layer" {
+  depends_on = [aws_s3_object.move_PRAW_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "praw==7.7.0.zip"
+  layer_name               = "praw-7_7_0"
+  description              = "python binaries for the praw==7.7.0 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "boto3_layer" {
+  depends_on = [aws_s3_object.move_boto3_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "boto3==1.26.117.zip"
+  layer_name               = "boto3-1_26_117"
+  description              = "python binaries for the boto3==1.26.117 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "utils_layer" {
+  depends_on = [aws_s3_object.move_utils_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "Utils.git@main.zip"
+  layer_name               = "utils_layer"
+  description              = "python binaries for the Utils.git@main library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+# make lambda function
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function
+resource "aws_lambda_function" "lambda_function" {
+  # depends_on = [resource.null_resource.zip_function]
+
+  filename      = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+  function_name = "lambda-reddit-scraping-${var.env}"
+  role          = aws_iam_role.iam_for_lambda.arn
+  handler       = "lambda_function.lambda_handler"
+  runtime       = "python${var.info.pyversion}"
+  timeout       = 60
+
+  ephemeral_storage {
+    size = 512 # min 512 MB, max 10240 MB
+  }
+
+  layers = [
+    aws_lambda_layer_version.praw_layer.arn,
+    aws_lambda_layer_version.boto3_layer.arn,
+    aws_lambda_layer_version.utils_layer.arn,
+  ]
+
+  source_code_hash = data.archive_file.lambda_zip.output_base64sha256
+
+  environment {
+    variables = {
+      AWS_BUCKET = "data-${var.info.name}-${var.env}-${local.account_id}",
+      ENV        = "${var.env}"
+    }
+  }
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# Attach an event trigger to the Lambda function, see https://stackoverflow.com/questions/35895315/use-terraform-to-set-up-a-lambda-function-triggered-by-a-scheduled-event-source
+resource "aws_cloudwatch_event_rule" "every_one_minute" {
+  name                = "every-one-minute"
+  description         = "Fires every minute"
+  schedule_expression = "rate(1 minute)"
+  state               = var.cloudwatch_state
+}
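+
+# rate(1 minute) could equivalently be written as a cron expression, e.g.
+#   schedule_expression = "cron(* * * * ? *)"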
+
+resource "aws_cloudwatch_event_target" "scrape_reddit_every_minute" {
+  rule      = aws_cloudwatch_event_rule.every_one_minute.name
+  target_id = "scrape_reddit"
+  arn       = aws_lambda_function.lambda_function.arn
+}
+
+resource "aws_lambda_permission" "allow_cloudwatch_to_call_lambda_function" {
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.lambda_function.function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.every_one_minute.arn
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..158f31c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+# see https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
+[project]
+name = "Reddit-Scraping"
+
+dynamic = ["version"]
+
+dependencies = [
+  "boto3==1.26.117",
+  "moto[dynamodb,s3]==4.1.8",
+  "pre-commit==2.21.0",
+  "praw==7.7.0",
+  "pytest==7.3.1",
+  "pytest-cov==4.0.0",
+  "viral_reddit_posts_utils@git+https://github.com/ViralRedditPosts/Utils.git@main"
+]
+
+requires-python = "== 3.12.3"
+
+authors = [
+  {name = "Kenneth Myers", email = "myers.kenneth.james@gmail.com"},
+]
+
+description = "This project scrapes Reddit data and loads it into DynamoDB. It is intended to be run as an AWS Lambda function."
+
+readme = "README.md"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..7538128
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --ignore=scripts/
\ No newline at end of file
diff --git a/scripts/zipLambdaFunction.sh b/scripts/zipLambdaFunction.sh
new file mode 100755
index 0000000..0aef85f
--- /dev/null
+++ b/scripts/zipLambdaFunction.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# This is meant to zip a lambda function with the reddit config
+# use:
+#   zipLambdaFunction.sh -f someFunction
+# saves the zip to zippedLambdaFunction/someFunction.zip
+# you may need to run chmod +x ./zipLambdaFunction.sh first
+
+set -e
+
+while getopts f: flag
+do
+  case "${flag}" in
+    f) function_name=${OPTARG};; # ie someFunction located in ../lambdaFunctions/someFunction
+  esac
+done
+: ${function_name:?Missing -f} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+echo "lambda function: $function_name";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd)
+cd $SCRIPT_PATH
+
+[ -d "../lambdaFunctions/${function_name}" ] && echo "Directory ../lambdaFunctions/${function_name} exists." || { echo "Error: Directory ../lambdaFunctions/${function_name} does not exist."; exit 1; }
+
+cd ./zippedLambdaFunction/
+rm -r ./${function_name} || true
+cp -r ../../lambdaFunctions/${function_name} ./ # copy lambda function files here
+rm -rf ${function_name}.zip # remove an existing zip first
+cd ./${function_name}/ # you have to zip from within this folder or it won't work; otherwise the files get wrapped in another folder
+#rm -rf ./*.ipynb* # remove any notebook stuff
+zip -r ../${function_name}.zip * -x "*.ipynb*" "*pycache*" # zip the function, excluding notebook and cache files
+cd ..
+rm -r ./${function_name} # clean up the unzipped copy
+
+cd $CWD # return to the original directory
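+
+# Resulting zip layout (illustrative, for -f getRedditDataFunction):
+#   zippedLambdaFunction/getRedditDataFunction.zip
+#     lambda_function.py, redditUtils.py, tableDefinition.py, test_lambda.py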
diff --git a/scripts/zipPythonPackage.sh b/scripts/zipPythonPackage.sh
new file mode 100755
index 0000000..54fc07f
--- /dev/null
+++ b/scripts/zipPythonPackage.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# This is a script to download a python package's binaries and zip them.
+# The intention is to use the package as a layer for a lambda function (or something else).
+# use:
+#   sh zipPythonPackage.sh -v 3.7 praw==7.7.0 boto3==1.26.117
+# as you can see, packages are just listed as non-option arguments
+# based on https://www.linkedin.com/pulse/add-external-python-libraries-aws-lambda-using-layers-gabe-olokun/
+# Note: an old version of this script also moved the zip file to s3; this functionality has been removed.
+# you may need to run chmod +x ./zipPythonPackage.sh first
+
+set -e
+
+while getopts v: flag
+do
+  case "${flag}" in
+    v) version=${OPTARG};; # python version, ie 3.7
+  esac
+done
+: ${version:?Missing -v} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+shift $(( OPTIND - 1 ))
+echo "packages: $@";
+echo "python version: $version";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd)
+cd $SCRIPT_PATH
+
+for package in "$@"; do
+  echo "Preparing ${package}..."
+  # format the zip file name. needed for the git packages which have lots of slashes.
+  if [[ ${package} == "git+"* ]]; then
+    package_name=${package##*/} # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
+  else
+    package_name=${package}
+  fi
+  mkdir -p ./zippedPythonPackages/${package_name}/python
+
+  cd ./zippedPythonPackages/${package_name}/python
+
+  # install binaries for package
+  pip install \
+    --platform manylinux2014_x86_64 \
+    --target=. \
+    --implementation cp \
+    --python-version ${version} \
+    --only-binary=:all: \
+    --upgrade ${package}
+
+  rm -rf *dist-info # some cleanup of unnecessary stuff
+  # zip package
+  cd ..
+  rm -rf ${package_name}.zip # remove an existing zip first
+  echo "Zipping ${package_name} at $(pwd)"
+  zip -r ${package_name}.zip python # zip the contents of python/ into the zip
+  cd ../../ # go back out to the scripts dir
+done
+
+cd $CWD # return to the original location
diff --git a/scripts/zippedLambdaFunction/.gitkeep b/scripts/zippedLambdaFunction/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/zippedPythonPackages/.gitkeep b/scripts/zippedPythonPackages/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/variable.tf b/variable.tf
new file mode 100644
index 0000000..4bceb5b
--- /dev/null
+++ b/variable.tf
@@ -0,0 +1,11 @@
+variable "env" {
+  type        = string
+  default     = "dev"
+  description = "environment to deploy to"
+}
+
+variable "cloudwatch_state" {
+  type        = string
+  default     = "DISABLED"
+  description = "Whether the lambda function schedule is enabled. Valid values are DISABLED, ENABLED, and ENABLED_WITH_ALL_CLOUDTRAIL_MANAGEMENT_EVENTS"
+}
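+
+# Example usage (assumes AWS credentials for the target account are configured):
+#   terraform apply -var env=dev -var cloudwatch_state=ENABLED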