diff --git a/.terraform.lock.hcl b/.terraform.lock.hcl index 6ebcb8c..153af9b 100644 --- a/.terraform.lock.hcl +++ b/.terraform.lock.hcl @@ -1,6 +1,25 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. +provider "registry.terraform.io/hashicorp/archive" { + version = "2.4.0" + hashes = [ + "h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=", + "zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594", + "zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b", + "zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff", + "zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c", + "zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44", + "zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410", + "zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18", + "zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc", + "zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df", + "zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f", + ] +} + provider "registry.terraform.io/hashicorp/aws" { version = "4.67.0" constraints = "~> 4.16" diff --git a/configUtils.py b/configUtils.py deleted file mode 100644 index 9883e7b..0000000 --- a/configUtils.py +++ /dev/null @@ -1,65 +0,0 @@ -from configparser import ConfigParser -import json -import boto3 -import os -from collections import defaultdict - - -def findConfig() -> str: - """ - Finds config file locally - - :return: string of config file location - """ - # searches for main file, falls back to example file if not found - fileList = [ - './reddit.cfg', - '../reddit.cfg', - '../../reddit.cfg', - './example_reddit.cfg', - '../example_reddit.cfg', - '../../example_reddit.cfg' - ] - for f in fileList: 
- if os.path.exists(f): - return f - raise RuntimeError("Reddit config file not found. Place it in either ./ or ../") - - -DEFAULT_KEYS = { - 'reddit_api': ['CLIENTID', 'CLIENTSECRET', 'PASSWORD', 'USERNAME'], - 'S3_access': ['ACCESSKEY', 'SECRETKEY', ], - 'Discord': ['BOTTOKEN', 'MYSNOWFLAKEID', 'CHANNELSNOWFLAKEID'], - 'Postgres': ['USERNAME', 'PASSWORD', 'HOST', 'PORT', 'DATABASE'] -} - - -def parseConfig( - cfgFile: str, - keysToRead: dict = None -) -> dict: - """ - Read in the config data from a location to a dictionary and return that dictionary. - - :param cfgFile: location of config file. Can be an S3 location - :param keysToRead: - :return: config dictionary - """ - if keysToRead is None: - keysToRead = DEFAULT_KEYS - parser = ConfigParser() - cfg = defaultdict(dict) - - if cfgFile[:2].lower() == 's3': - s3 = boto3.client('s3') - pathSplit = cfgFile.replace('s3://', '').split('/') - bucket = pathSplit[0] - objLoc = '/'.join(pathSplit[1:]) - obj = s3.get_object(Bucket=bucket, Key=objLoc) - _ = parser.read_string(obj['Body'].read().decode()) - else: - _ = parser.read(cfgFile) - for k, vList in keysToRead.items(): - for v in vList: - cfg[k][v] = json.loads(parser.get(k, v)) # json helps with list conversion - return cfg diff --git a/lambdaFunctions/getRedditDataFunction/lambda_function.py b/lambdaFunctions/getRedditDataFunction/lambda_function.py index 15f17a0..37844d8 100644 --- a/lambdaFunctions/getRedditDataFunction/lambda_function.py +++ b/lambdaFunctions/getRedditDataFunction/lambda_function.py @@ -1,5 +1,5 @@ import redditUtils as ru -import configUtils as cu +import viral_reddit_posts_utils.configUtils as cu import tableDefinition import praw import boto3 @@ -41,9 +41,8 @@ def lambda_handler(event, context): risingData = ru.deduplicateRedditData(risingData) # Push to DynamoDB - tableName = view - risingRawTableDefinition = tableDefinition.getTableDefinition(tableName) - risingTable = ru.getOrCreateTable(risingRawTableDefinition, dynamodb_resource) + 
tableName = f"{view}-{os.environ['ENV']}" + risingTable = ru.getTable(tableName, dynamodb_resource) ru.batchWriter(risingTable, risingData, schema) # Get Hot Reddit data @@ -55,9 +54,8 @@ def lambda_handler(event, context): hotData = ru.deduplicateRedditData(hotData) # Push to DynamoDB - tableName = view - hotTableDefinition = tableDefinition.getTableDefinition(tableName) - hotTable = ru.getOrCreateTable(hotTableDefinition, dynamodb_resource) + tableName = f"{view}-{os.environ['ENV']}" + hotTable = ru.getTable(tableName, dynamodb_resource) ru.batchWriter(hotTable, hotData, schema) return 200 diff --git a/lambdaFunctions/getRedditDataFunction/redditUtils.py b/lambdaFunctions/getRedditDataFunction/redditUtils.py index b2561a5..d7ddac7 100644 --- a/lambdaFunctions/getRedditDataFunction/redditUtils.py +++ b/lambdaFunctions/getRedditDataFunction/redditUtils.py @@ -92,22 +92,8 @@ def deduplicateRedditData(data): return newData -def getOrCreateTable(tableDefinition, dynamodb_resource): - existingTables = [a.name for a in dynamodb_resource.tables.all()] # client method: dynamodb_client.list_tables()['TableNames'] - tableName = tableDefinition['TableName'] - if tableName not in existingTables: - print(f"Table {tableName} not found, creating table") - # create table - # boto3: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/service-resource/create_table.html#DynamoDB.ServiceResource.create_table - # dynamodb keyschemas and secondary indexes: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.CoreComponents.html - table = dynamodb_resource.create_table(**tableDefinition) - - # Wait until the table exists. - table.wait_until_exists() - - else: - print(f"Table {tableName} exists, grabbing table...") - table = dynamodb_resource.Table(tableName) +def getTable(tableName, dynamodb_resource): + table = dynamodb_resource.Table(tableName) # Print out some data about the table. 
print(f"Item count in table: {table.item_count}") # this only updates every 6 hours diff --git a/main.tf b/main.tf index f937e59..9da605e 100644 --- a/main.tf +++ b/main.tf @@ -18,6 +18,7 @@ variable "info" { default = { name = "viralredditposts" env = "dev" + region = "us-east-2" pyversion = "3.7" } } @@ -30,11 +31,21 @@ locals { } # zip the lambda function -resource "null_resource" "zip_function" { - provisioner "local-exec" { - command = "./scripts/zipLambdaFunction.sh -f getRedditDataFunction" - on_failure = fail # OR continue - } +# resource "null_resource" "zip_function" { +# # rebuild zip each time, this is low cost and good for forcing it to upload each terraform apply +# triggers = { +# build_number = timestamp() +# } +# provisioner "local-exec" { +# command = "./scripts/zipLambdaFunction.sh -f getRedditDataFunction" +# on_failure = fail # OR continue +# } +# } + +data "archive_file" "lambda_zip" { + type = "zip" + source_dir = "./lambdaFunctions/getRedditDataFunction/" + output_path = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip" } # zip the PRAW and boto3 packages @@ -45,7 +56,7 @@ resource "null_resource" "zip_python_packages" { build_number = timestamp() } provisioner "local-exec" { - command = "source venv/bin/activate && ./scripts/zipPythonPackage.sh -v ${var.info.pyversion} praw==7.7.0 boto3==1.26.117" + command = "source venv/bin/activate && ./scripts/zipPythonPackage.sh -v ${var.info.pyversion} praw==7.7.0 boto3==1.26.117 git+https://github.com/ViralRedditPosts/Utils.git@main" on_failure = fail # OR continue } } @@ -78,6 +89,20 @@ resource "aws_s3_object" "move_boto3_zip" { } } +# add git+https://github.com/ViralRedditPosts/Utils.git@main to S3 +resource "aws_s3_object" "move_utils_zip" { + depends_on = [null_resource.zip_python_packages] + + bucket = "packages-${var.info.name}-${var.info.env}-${local.account_id}" + key = "Utils.git@main.zip" + source = "./scripts/zippedPythonPackages/Utils.git@main/Utils.git@main.zip" + tags = { + 
Name = "utils-zip"
+    Environment = "${var.info.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
 # define policy for attaching role
 data "aws_iam_policy_document" "assume_role" {
   statement {
@@ -99,11 +124,15 @@ data "aws_iam_policy_document" "inline_policy" {
     effect = "Allow"
     actions = [
       "s3:GetObject",
-      "s3:ListBucket"
+      "s3:ListBucket",
+      "dynamodb:DescribeTable",
+      "dynamodb:BatchWriteItem"
     ]
     resources = [
       "arn:aws:s3:::data-${var.info.name}-${var.info.env}-${local.account_id}",
-      "arn:aws:s3:::data-${var.info.name}-${var.info.env}-${local.account_id}/*"
+      "arn:aws:s3:::data-${var.info.name}-${var.info.env}-${local.account_id}/*",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/hot-${var.info.env}",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/rising-${var.info.env}"
     ]
   }
 }
@@ -146,15 +175,28 @@ resource "aws_lambda_layer_version" "boto3_layer" {
   s3_bucket   = "packages-${var.info.name}-${var.info.env}-${local.account_id}"
   s3_key      = "boto3==1.26.117.zip"
   layer_name  = "boto3-1_26_117"
-  description = "python binaries for pboto3==1.26.117 library"
+  description = "python binaries for boto3==1.26.117 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "utils_layer" {
+  depends_on = [aws_s3_object.move_utils_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename   = "lambda_layer_payload.zip"
+  s3_bucket   = "packages-${var.info.name}-${var.info.env}-${local.account_id}"
+  s3_key      = "Utils.git@main.zip"
+  layer_name  = "utils_layer"
+  description = "python binaries for Utils.git@main library"
   compatible_architectures = ["x86_64"]
   compatible_runtimes      = ["python${var.info.pyversion}"]
 }
 
 # make lambda function
 # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function
-resource
"aws_lambda_function" "test_lambda" { - depends_on = [resource.null_resource.zip_function] +resource "aws_lambda_function" "lambda_function" { + # depends_on = [resource.null_resource.zip_function] filename = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip" function_name = "lambda-reddit-scraping-${var.info.env}" @@ -167,11 +209,18 @@ resource "aws_lambda_function" "test_lambda" { size = 512 # Min 512 MB and the Max 10240 MB } - layers = [aws_lambda_layer_version.praw_layer.arn, aws_lambda_layer_version.boto3_layer.arn] + layers = [ + aws_lambda_layer_version.praw_layer.arn, + aws_lambda_layer_version.boto3_layer.arn, + aws_lambda_layer_version.utils_layer.arn, + ] + + source_code_hash = data.archive_file.lambda_zip.output_base64sha256 environment { variables = { - AWS_BUCKET = "data-${var.info.name}-${var.info.env}-${local.account_id}" + AWS_BUCKET = "data-${var.info.name}-${var.info.env}-${local.account_id}", + ENV = "${var.info.env}" } } tags = { diff --git a/scripts/zipPythonPackage.sh b/scripts/zipPythonPackage.sh index 42946bb..54fc07f 100755 --- a/scripts/zipPythonPackage.sh +++ b/scripts/zipPythonPackage.sh @@ -27,9 +27,15 @@ cd $SCRIPT_PATH for package in "$@"; do echo "Preparing ${package}..." - mkdir -p ./zippedPythonPackages/${package}/python + # format the zip file. needed for the git packages which have lots of slashes. + if [[ ${package} == "git+"* ]]; then + package_name=${package##*/} # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field + else + package_name=${package} + fi + mkdir -p ./zippedPythonPackages/${package_name}/python - cd ./zippedPythonPackages/${package}/python + cd ./zippedPythonPackages/${package_name}/python # install binaries for package pip install \ @@ -43,9 +49,9 @@ for package in "$@"; do rm -rf *dist-info # some cleanup of unnecessary stuff # zip package cd .. 
- rm -rf ${package}.zip # remove first if it exists - echo "Zipping ${package} at $(pwd)" - zip -r ${package}.zip python # zip contents of python to zip name + rm -rf ${package_name}.zip # remove first if it exists + echo "Zipping ${package_name} at $(pwd)" + zip -r ${package_name}.zip python # zip contents of python to zip name cd ../../ # go back out to scripts dir done