From 9048982e2689334090ff0ded97fc98af9c589bec Mon Sep 17 00:00:00 2001
From: Kenneth Myers <myers.kenneth.james@gmail.com>
Date: Sat, 13 Apr 2024 17:37:58 -0400
Subject: [PATCH] clean commit

---
 .DS_Store                                     | Bin 0 -> 6148 bytes
 .github/workflows/workflow.yml                |  28 ++
 .gitignore                                    | 274 ++++++++++++++++++
 .terraform.lock.hcl                           |  63 ++++
 LICENSE                                       |  21 ++
 README.md                                     |  40 +++
 example_reddit.cfg                            |  22 ++
 .../getRedditDataFunction/lambda_function.py  |  61 ++++
 .../getRedditDataFunction/redditUtils.py      | 119 ++++++++
 .../getRedditDataFunction/tableDefinition.py  |  76 +++++
 .../getRedditDataFunction/test_lambda.py      | 152 ++++++++++
 main.tf                                       | 251 ++++++++++++++++
 pyproject.toml                                |  33 +++
 pytest.ini                                    |   2 +
 scripts/zipLambdaFunction.sh                  |  35 +++
 scripts/zipPythonPackage.sh                   |  58 ++++
 scripts/zippedLambdaFunction/.gitkeep         |   0
 scripts/zippedPythonPackages/.gitkeep         |   0
 variable.tf                                   |  11 +
 19 files changed, 1246 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 .github/workflows/workflow.yml
 create mode 100644 .gitignore
 create mode 100644 .terraform.lock.hcl
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 example_reddit.cfg
 create mode 100644 lambdaFunctions/getRedditDataFunction/lambda_function.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/redditUtils.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/tableDefinition.py
 create mode 100644 lambdaFunctions/getRedditDataFunction/test_lambda.py
 create mode 100644 main.tf
 create mode 100644 pyproject.toml
 create mode 100644 pytest.ini
 create mode 100755 scripts/zipLambdaFunction.sh
 create mode 100755 scripts/zipPythonPackage.sh
 create mode 100644 scripts/zippedLambdaFunction/.gitkeep
 create mode 100644 scripts/zippedPythonPackages/.gitkeep
 create mode 100644 variable.tf

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
[unreadable binary patch data omitted]

diff --git a/lambdaFunctions/getRedditDataFunction/redditUtils.py b/lambdaFunctions/getRedditDataFunction/redditUtils.py
+    if (timeElapsedMin > 60 or timeElapsedDays > 0):  # sometimes rising has data that is already older than an hour or a day; we don't want that
+      continue
+    postId = submission.id
+    title = submission.title
+    score = submission.score
+    numComments = submission.num_comments
+    upvoteRatio = submission.upvote_ratio
+    gildings = submission.gildings
+    numGildings = sum(gildings.values())
+    row = Row(
+      postId=postId, subreddit=subreddit, subscribers=subscribers, activeUsers=activeUsers,
+      title=title, createdTSUTC=str(createdTSUTC),
+      timeElapsedMin=timeElapsedMin, score=score, numComments=numComments,
+      upvoteRatio=upvoteRatio, numGildings=numGildings,
+      loadTSUTC=str(now), loadDateUTC=str(now.date()), loadTimeUTC=str(now.time()))
+    dataCollected.append(row)
+    if verbose:
+      print(row)
+      print()
+  return dataCollected[:topN-2]
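+
+
+# A minimal usage sketch (illustrative only, not part of the Lambda entry point;
+# assumes a praw.Reddit instance -- see test_lambda.py for the call signature):
+#
+#   import tableDefinition
+#   rows = getRedditData(reddit, "pics", topN=25, view="rising",
+#                        schema=tableDefinition.schema, time_filter=None)
+#   rows = deduplicateRedditData(rows)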
+
+
+def deduplicateRedditData(data):
+  """
+  Deduplicates the reddit data. Sometimes there are duplicate keys, which throw an error
+  when writing to DynamoDB. It is unclear why this happens, but I suspect it is an issue with PRAW.
+
+  :param data: list[Row[schema]]
+  :return: deduplicated data
+  """
+  postIds = set()
+  newData = []
+  # there really shouldn't be more than 1 loadTSUTC for a postId since that is generated
+  # on our side, but I wanted to handle it since it is part of the key
+  data = sorted(data, key=lambda x: x.loadTSUTC, reverse=True)  # keep the most recent load first
+  for d in data:
+    if d.postId not in postIds:
+      postIds.add(d.postId)
+      newData.append(d)
+  return newData
+
+
+def getTable(tableName, dynamodb_resource):
+  table = dynamodb_resource.Table(tableName)
+
+  # Print out some data about the table.
+  print(f"Item count in table: {table.item_count}")  # this only updates every 6 hours
+  return table
+
+
+def batchWriter(table, data, schema):
+  """
+  https://boto3.amazonaws.com/v1/documentation/api/latest/guide/dynamodb.html#batch-writing
+  I didn't bother dealing with duplicates because they shouldn't be a problem with this type of data.
+  There is no built-in way to get responses with batch_writer:
+  https://peppydays.medium.com/getting-response-of-aws-dynamodb-batchwriter-request-2aa3f81019fa
+
+  :param table: boto3 table object
+  :param data: list[Row[schema]]
+  :param schema: OrderedDict containing the dynamodb schema (dynamo is technically schema-less)
+  :return: None
+  """
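+  # Illustrative example of the float -> Decimal round trip performed below
+  # (DynamoDB rejects Python floats, so they are re-parsed as Decimals):
+  #   json.loads(json.dumps({"upvoteRatio": 0.4}), parse_float=Decimal)
+  #   -> {'upvoteRatio': Decimal('0.4')}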
+  columns = schema.keys()
+  with table.batch_writer() as batch:
+    for row in data:  # for each row obtained
+      batch.put_item(
+        Item=json.loads(json.dumps({k: getattr(row, k) for k in columns}), parse_float=Decimal)  # parses floats to Decimals
+      )
\ No newline at end of file
diff --git a/lambdaFunctions/getRedditDataFunction/tableDefinition.py b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
new file mode 100644
index 0000000..04eb9f4
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/tableDefinition.py
@@ -0,0 +1,76 @@
+from collections import OrderedDict
+
+
+# the schema is mainly needed for defining columns and their types when building additional indices
+schema = OrderedDict()
+schema["loadDateUTC"] = "S"
+schema["loadTimeUTC"] = "S"
+schema["loadTSUTC"] = "S"
+schema["postId"] = "S"
+schema["subreddit"] = "S"
+schema["subscribers"] = "N"
+schema["activeUsers"] = "N"
+schema["title"] = "S"
+schema["createdTSUTC"] = "S"
+schema["timeElapsedMin"] = "N"
+schema["score"] = "N"
+schema["numComments"] = "N"
+schema["upvoteRatio"] = "N"
+schema["numGildings"] = "N"
+
+baseTableDefinition = dict(
+  AttributeDefinitions=[
+    {
+      'AttributeName': k,
+      'AttributeType': schema[k]
+    } for k in ['postId', 'loadDateUTC', 'loadTimeUTC', 'loadTSUTC']  # only the attributes used in keys and indexes need definitions
+  ],
+  KeySchema=[
+    {
+      'AttributeName': 'postId',
+      'KeyType': 'HASH'
+    },
+    {
+      'AttributeName': 'loadTSUTC',
+      'KeyType': 'RANGE'
+    }
+  ],
+  GlobalSecondaryIndexes=[  # I wanted to future-proof other ways I might look at the table (e.g. by subreddit)
+    {
+      'IndexName': 'byLoadDate',
+      'KeySchema': [
+        {
+          'AttributeName': 'loadDateUTC',
+          'KeyType': 'HASH'
+        },
+        {
+          'AttributeName': 'loadTimeUTC',
+          'KeyType': 'RANGE'
+        },
+      ],
+      'Projection': {
+        'ProjectionType': 'INCLUDE',
+        'NonKeyAttributes': [
+          'timeElapsedMin',
+        ]
+      },
+      'ProvisionedThroughput': {  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html#GSI.ThroughputConsiderations
+        'ReadCapacityUnits': 6,  # 1 RCU = 1 strongly consistent read per second for items up to 4 KB
+        'WriteCapacityUnits': 1  # 1 WCU = 1 write per second for items up to 1 KB
+      }
+    },
+  ],
+  BillingMode='PROVISIONED',  # recommended for predictable workloads
+  ProvisionedThroughput={  # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#default-limits-throughput-capacity-modes
+    'ReadCapacityUnits': 6,
+    'WriteCapacityUnits': 1
+  },
+  TableClass='STANDARD',
+  DeletionProtectionEnabled=False
+)
+
+
+def getTableDefinition(tableName, tableDefinition=baseTableDefinition):
+  tableDefinition['TableName'] = tableName
+  return tableDefinition
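+
+
+# Usage sketch (mirrors how the tests below create a table against moto):
+#   import boto3
+#   dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
+#   table = dynamodb.create_table(**getTableDefinition('rising'))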
diff --git a/lambdaFunctions/getRedditDataFunction/test_lambda.py b/lambdaFunctions/getRedditDataFunction/test_lambda.py
new file mode 100644
index 0000000..b618aa2
--- /dev/null
+++ b/lambdaFunctions/getRedditDataFunction/test_lambda.py
@@ -0,0 +1,152 @@
+import pytest
+import redditUtils as ru
+import praw
+import tableDefinition
+from collections import namedtuple
+import boto3
+import sys
+import os
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(THIS_DIR, '../../'))
+import viral_reddit_posts_utils.configUtils as cu
+import pickle
+from moto import mock_dynamodb
+
+
+IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+
+
+@pytest.fixture(scope='module')
+def cfg():
+  cfg_file = cu.findConfig()
+  cfg = cu.parseConfig(cfg_file)
+  return cfg
+
+
+@pytest.fixture(scope='module')
+def reddit(cfg):
+  if IN_GITHUB_ACTIONS:
+    pytest.skip("Reddit API credentials are not available in GitHub Actions")
+  redditcfg = cfg['reddit_api']
+  return praw.Reddit(
+    client_id=f"{redditcfg['CLIENTID']}",
+    client_secret=f"{redditcfg['CLIENTSECRET']}",
+    password=f"{redditcfg['PASSWORD']}",
+    user_agent=f"Post Extraction (by u/{redditcfg['USERNAME']})",
+    username=f"{redditcfg['USERNAME']}",
+  )
+
+
+def test_getRedditData(reddit):
+  subreddit = "pics"
+  ru.getRedditData(
+    reddit,
+    subreddit,
+    topN=25,
+    view='rising',
+    schema=tableDefinition.schema,
+    time_filter=None,
+    verbose=True)
+
+
+@pytest.fixture(scope='module')
+def duplicatedData():
+  schema = tableDefinition.schema
+  columns = schema.keys()
+  Row = namedtuple("Row", columns)
+  # these are identical examples except the second has a later loadTSUTC
+  return [
+    Row(subscribers=10000000, activeUsers=10000,
+        loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+        subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+        timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+    Row(subscribers=10000000, activeUsers=10000,
+        loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:06:44', postId='133fkqz',
+        subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+        timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+  ]
+
+
+def test_deduplicateRedditData(duplicatedData):
+  newData = ru.deduplicateRedditData(duplicatedData)
+  assert len(newData) == 1
+  print("test_deduplicateRedditData complete")
+
+
+@mock_dynamodb
+class TestBatchWriter:
+
+  def classSetUp(self):
+    """
+    If this setup were left at the top level of the class it would not be skipped by
+    `skip` and `skipif`, and a test class cannot define __init__, so this is called
+    at the start of each test instead.
+    :return:
+    """
+    dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
+    # create the table and write sample data to it
+    tableName = 'rising'
+    td = tableDefinition.getTableDefinition(tableName=tableName)
+    self.testTable = dynamodb.create_table(**td)
+    self.schema = tableDefinition.schema
+    self.columns = self.schema.keys()
+    self.Row = namedtuple("Row", self.columns)
+
+  @pytest.mark.xfail(reason="BatchWriter fails on duplicate keys. This might xpass, possibly a fault in the mock object.")
+  def test_duplicateData(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_duplicateData complete")
+
+  def test_uniqueData(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fqj7',
+          subreddit='pics', title='A piece of wood sticking up in front of a fire.', createdTSUTC='2023-04-30 04:29:23',
+          timeElapsedMin=34, score=0, numComments=0, upvoteRatio=0.4, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_uniqueData complete")
+
+  def test_diffPrimaryIndexSameSecondIndex(self):
+    self.classSetUp()
+    testTable = self.testTable
+    schema = self.schema
+    Row = self.Row
+
+    data = [
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqz',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0),
+      Row(subscribers=10000000, activeUsers=10000,
+          loadDateUTC='2023-04-30', loadTimeUTC='05:03:44', loadTSUTC='2023-04-30 05:03:44', postId='133fkqy',
+          subreddit='pics', title='Magnolia tree blooming in my friends yard', createdTSUTC='2023-04-30 04:19:43',
+          timeElapsedMin=44, score=3, numComments=0, upvoteRatio=1.0, numGildings=0)
+    ]
+    from redditUtils import batchWriter
+    batchWriter(table=testTable, data=data, schema=schema)
+    print("test_diffPrimaryIndexSameSecondIndex complete")
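+
+
+# To run these tests locally (assumes a reddit config file discoverable by
+# viral_reddit_posts_utils.configUtils.findConfig):
+#   pytest lambdaFunctions/getRedditDataFunction/test_lambda.py -v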
diff --git a/main.tf b/main.tf
new file mode 100644
index 0000000..3230f5a
--- /dev/null
+++ b/main.tf
@@ -0,0 +1,251 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "< 6.0"
+    }
+  }
+
+  required_version = ">= 1.2.0"
+}
+
+provider "aws" {
+  region = "us-east-2"
+}
+
+variable "info" {
+  type = map(string)
+  default = {
+    name      = "viralredditposts"
+    region    = "us-east-2"
+    pyversion = "3.12"
+  }
+}
+
+# get account id
+data "aws_caller_identity" "current" {}
+
+locals {
+  account_id = data.aws_caller_identity.current.account_id
+}
+
+# zip the lambda function
+# resource "null_resource" "zip_function" {
+#   # rebuild the zip each time; this is low cost and forces an upload on each terraform apply
+#   triggers = {
+#     build_number = timestamp()
+#   }
+#   provisioner "local-exec" {
+#     command    = "./scripts/zipLambdaFunction.sh -f getRedditDataFunction"
+#     on_failure = fail # OR continue
+#   }
+# }
+
+data "archive_file" "lambda_zip" {
+  type        = "zip"
+  source_dir  = "./lambdaFunctions/getRedditDataFunction/"
+  output_path = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+}
+
+# zip the PRAW and boto3 packages
+resource "null_resource" "zip_python_packages" {
+  # this is a bit slow, but it forces the zips to be rebuilt on each run;
+  # that was easier than trying to track whether the zip was deleted for an environment change
+  triggers = {
+    build_number = timestamp()
+  }
+  provisioner "local-exec" {
+    command    = "source venv/bin/activate && ./scripts/zipPythonPackage.sh -v ${var.info.pyversion} praw==7.7.0 boto3==1.26.117 git+https://github.com/ViralRedditPosts/Utils.git@main"
+    on_failure = fail # OR continue
+  }
+}
+
+# add PRAW zip to S3
+resource "aws_s3_object" "move_PRAW_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "praw==7.7.0.zip"
+  source = "./scripts/zippedPythonPackages/praw==7.7.0/praw==7.7.0.zip"
+  tags = {
+    Name        = "praw-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add boto3 zip to S3
+resource "aws_s3_object" "move_boto3_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "boto3==1.26.117.zip"
+  source = "./scripts/zippedPythonPackages/boto3==1.26.117/boto3==1.26.117.zip"
+  tags = {
+    Name        = "boto3-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# add the Utils package (git+https://github.com/ViralRedditPosts/Utils.git@main) zip to S3
+resource "aws_s3_object" "move_utils_zip" {
+  depends_on = [null_resource.zip_python_packages]
+
+  bucket = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  key    = "Utils.git@main.zip"
+  source = "./scripts/zippedPythonPackages/Utils.git@main/Utils.git@main.zip"
+  tags = {
+    Name        = "utils-zip"
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# define the trust policy for attaching the role
+data "aws_iam_policy_document" "assume_role" {
+  statement {
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+
+    actions = [
+      "sts:AssumeRole",
+    ]
+  }
+}
+
+data "aws_iam_policy_document" "inline_policy" {
+  statement {
+    effect = "Allow"
+    actions = [
+      "s3:GetObject",
+      "s3:ListBucket",
+      "dynamodb:DescribeTable",
+      "dynamodb:BatchWriteItem"
+    ]
+    resources = [
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}",
+      "arn:aws:s3:::data-${var.info.name}-${var.env}-${local.account_id}/*",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/hot-${var.env}",
+      "arn:aws:dynamodb:${var.info.region}:${local.account_id}:table/rising-${var.env}"
+    ]
+  }
+}
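+
+# For example, with env=dev and an illustrative account id 123456789012, the
+# policy above resolves to resources like:
+#   arn:aws:dynamodb:us-east-2:123456789012:table/rising-dev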
+
+# create role
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role
+resource "aws_iam_role" "iam_for_lambda" {
+  name               = "iam-for-lambda-${var.env}"
+  assume_role_policy = data.aws_iam_policy_document.assume_role.json # policy that grants an entity permission to assume the role
+
+  inline_policy {
+    name   = "test-policy"
+    policy = data.aws_iam_policy_document.inline_policy.json
+  }
+
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+resource "aws_lambda_layer_version" "praw_layer" {
+  depends_on = [aws_s3_object.move_PRAW_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "praw==7.7.0.zip"
+  layer_name               = "praw-7_7_0"
+  description              = "python binaries for the praw==7.7.0 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "boto3_layer" {
+  depends_on = [aws_s3_object.move_boto3_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "boto3==1.26.117.zip"
+  layer_name               = "boto3-1_26_117"
+  description              = "python binaries for the boto3==1.26.117 library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+resource "aws_lambda_layer_version" "utils_layer" {
+  depends_on = [aws_s3_object.move_utils_zip]
+  # you either have to specify a local filename or the s3 object
+  # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version
+  # filename = "lambda_layer_payload.zip"
+  s3_bucket                = "packages-${var.info.name}-${var.env}-${local.account_id}"
+  s3_key                   = "Utils.git@main.zip"
+  layer_name               = "utils_layer"
+  description              = "python binaries for the Utils.git@main library"
+  compatible_architectures = ["x86_64"]
+  compatible_runtimes      = ["python${var.info.pyversion}"]
+}
+
+# make lambda function
+# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function
+resource "aws_lambda_function" "lambda_function" {
+  # depends_on = [resource.null_resource.zip_function]
+
+  filename      = "./scripts/zippedLambdaFunction/getRedditDataFunction.zip"
+  function_name = "lambda-reddit-scraping-${var.env}"
+  role          = aws_iam_role.iam_for_lambda.arn
+  handler       = "lambda_function.lambda_handler"
+  runtime       = "python${var.info.pyversion}"
+  timeout       = 60
+
+  ephemeral_storage {
+    size = 512 # min 512 MB, max 10240 MB
+  }
+
+  layers = [
+    aws_lambda_layer_version.praw_layer.arn,
+    aws_lambda_layer_version.boto3_layer.arn,
+    aws_lambda_layer_version.utils_layer.arn,
+  ]
+
+  source_code_hash = data.archive_file.lambda_zip.output_base64sha256
+
+  environment {
+    variables = {
+      AWS_BUCKET = "data-${var.info.name}-${var.env}-${local.account_id}",
+      ENV        = "${var.env}"
+    }
+  }
+  tags = {
+    Environment = "${var.env}"
+    Project     = "viral-reddit-posts"
+  }
+}
+
+# Attach an event trigger to the Lambda function, see https://stackoverflow.com/questions/35895315/use-terraform-to-set-up-a-lambda-function-triggered-by-a-scheduled-event-source
+resource "aws_cloudwatch_event_rule" "every_one_minute" {
+  name                = "every-one-minute"
+  description         = "Fires every minute"
+  schedule_expression = "rate(1 minute)"
+  state               = var.cloudwatch_state
+}
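+
+# rate(1 minute) could equivalently be written as a cron expression, e.g.
+#   schedule_expression = "cron(* * * * ? *)"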
+
+resource "aws_cloudwatch_event_target" "scrape_reddit_every_minute" {
+  rule      = aws_cloudwatch_event_rule.every_one_minute.name
+  target_id = "scrape_reddit"
+  arn       = aws_lambda_function.lambda_function.arn
+}
+
+resource "aws_lambda_permission" "allow_cloudwatch_to_call_lambda_function" {
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.lambda_function.function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.every_one_minute.arn
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..158f31c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+# see https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
+[project]
+name = "Reddit-Scraping"
+
+dynamic = ["version"]
+
+dependencies = [
+  "boto3==1.26.117",
+  "moto[dynamodb,s3]==4.1.8",
+  "pre-commit==2.21.0",
+  "praw==7.7.0",
+  "pytest==7.3.1",
+  "pytest-cov==4.0.0",
+  "viral_reddit_posts_utils@git+https://github.com/ViralRedditPosts/Utils.git@main"
+]
+
+requires-python = "== 3.12.3"
+
+authors = [
+  {name = "Kenneth Myers", email = "myers.kenneth.james@gmail.com"},
+]
+
+description = "This project scrapes Reddit data and loads it into DynamoDB. It is intended to be run as an AWS Lambda function."
+
+readme = "README.md"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..7538128
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --ignore=scripts/
\ No newline at end of file
diff --git a/scripts/zipLambdaFunction.sh b/scripts/zipLambdaFunction.sh
new file mode 100755
index 0000000..0aef85f
--- /dev/null
+++ b/scripts/zipLambdaFunction.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# This is meant to zip a lambda function with the reddit config
+# use:
+#   zipLambdaFunction.sh -f someFunction
+# saves the zip to zippedLambdaFunction/someFunction.zip
+# you may need to run chmod +x ./zipLambdaFunction.sh first
+
+set -e
+
+while getopts f: flag
+do
+  case "${flag}" in
+    f) function_name=${OPTARG};; # ie someFunction located in ../lambdaFunctions/someFunction
+  esac
+done
+: ${function_name:?Missing -f} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+echo "lambda function: $function_name";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd)
+cd $SCRIPT_PATH
+
+[ -d "../lambdaFunctions/${function_name}" ] && echo "Directory ../lambdaFunctions/${function_name} exists." || { echo "Error: Directory ../lambdaFunctions/${function_name} does not exist."; exit 1; }
+
+cd ./zippedLambdaFunction/
+rm -r ./${function_name} || true
+cp -r ../../lambdaFunctions/${function_name} ./ # copy lambda function files here
+rm -rf ${function_name}.zip # remove an existing zip first
+cd ./${function_name}/ # you have to zip from within this folder or it won't work; otherwise the files get wrapped in another folder
+#rm -rf ./*.ipynb* # remove any notebook stuff
+zip -r ../${function_name}.zip * -x "*.ipynb*" "*pycache*" # zip the function, excluding notebook and cache files
+cd ..
+rm -r ./${function_name} # clean up the unzipped copy
+
+cd $CWD # return to the original directory
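+
+# Resulting zip layout (illustrative, for -f getRedditDataFunction):
+#   zippedLambdaFunction/getRedditDataFunction.zip
+#     lambda_function.py, redditUtils.py, tableDefinition.py, test_lambda.py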
diff --git a/scripts/zipPythonPackage.sh b/scripts/zipPythonPackage.sh
new file mode 100755
index 0000000..54fc07f
--- /dev/null
+++ b/scripts/zipPythonPackage.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# This is a script to download a python package's binaries and zip them.
+# The intention is to use the package as a layer for a lambda function (or something else).
+# use:
+#   sh zipPythonPackage.sh -v 3.7 praw==7.7.0 boto3==1.26.117
+# as you can see, packages are just listed as non-option arguments
+# based on https://www.linkedin.com/pulse/add-external-python-libraries-aws-lambda-using-layers-gabe-olokun/
+# Note: an old version of this script also moved the zip file to s3; this functionality has been removed.
+# you may need to run chmod +x ./zipPythonPackage.sh first
+
+set -e
+
+while getopts v: flag
+do
+  case "${flag}" in
+    v) version=${OPTARG};; # python version, ie 3.7
+  esac
+done
+: ${version:?Missing -v} # checks that this has been set https://unix.stackexchange.com/questions/621004/bash-getopts-mandatory-arguments
+shift $(( OPTIND - 1 ))
+echo "packages: $@";
+echo "python version: $version";
+
+SCRIPT_PATH=${0%/*} # https://stackoverflow.com/questions/6393551/what-is-the-meaning-of-0-in-a-bash-script
+CWD=$(pwd)
+cd $SCRIPT_PATH
+
+for package in "$@"; do
+  echo "Preparing ${package}..."
+  # format the zip file name. needed for the git packages which have lots of slashes.
+  if [[ ${package} == "git+"* ]]; then
+    package_name=${package##*/} # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
+  else
+    package_name=${package}
+  fi
+  mkdir -p ./zippedPythonPackages/${package_name}/python
+
+  cd ./zippedPythonPackages/${package_name}/python
+
+  # install binaries for package
+  pip install \
+    --platform manylinux2014_x86_64 \
+    --target=. \
+    --implementation cp \
+    --python-version ${version} \
+    --only-binary=:all: \
+    --upgrade ${package}
+
+  rm -rf *dist-info # some cleanup of unnecessary stuff
+  # zip package
+  cd ..
+  rm -rf ${package_name}.zip # remove an existing zip first
+  echo "Zipping ${package_name} at $(pwd)"
+  zip -r ${package_name}.zip python # zip the contents of python/ into the zip
+  cd ../../ # go back out to the scripts dir
+done
+
+cd $CWD # return to the original location
diff --git a/scripts/zippedLambdaFunction/.gitkeep b/scripts/zippedLambdaFunction/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/zippedPythonPackages/.gitkeep b/scripts/zippedPythonPackages/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/variable.tf b/variable.tf
new file mode 100644
index 0000000..4bceb5b
--- /dev/null
+++ b/variable.tf
@@ -0,0 +1,11 @@
+variable "env" {
+  type        = string
+  default     = "dev"
+  description = "environment to deploy to"
+}
+
+variable "cloudwatch_state" {
+  type        = string
+  default     = "DISABLED"
+  description = "Whether the lambda function schedule is enabled. Valid values are DISABLED, ENABLED, and ENABLED_WITH_ALL_CLOUDTRAIL_MANAGEMENT_EVENTS"
+}
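+
+# Example usage (assumes AWS credentials for the target account are configured):
+#   terraform apply -var env=dev -var cloudwatch_state=ENABLED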