walthamforest_etl

This commit is contained in:
Jun-te Kim 2025-09-09 10:46:58 +00:00
parent 382f37531c
commit 5c2a8f0755
10 changed files with 225 additions and 0 deletions

View file

@ -0,0 +1,21 @@
# Ignore junk and large files
#
# NOTE(review): assuming this is a .dockerignore — there, a bare pattern such as
# `*.csv` matches only at the root of the build context; use `**/*.csv` to match
# at any depth. The data/media patterns below are left root-only on purpose, in
# case the application package ships data files it needs at runtime — confirm.

# Documents, data sets and notebooks
*.pdf
*.csv
*.xml
*.parquet
*.ipynb

# Media
*.mp4
*.mov
*.jpg
*.png

# Archives
*.zip
*.tar.gz

# Python bytecode and caches: never useful in an image, so exclude at any depth
# (`**/` also matches zero directories, so root-level entries stay excluded).
**/__pycache__/
**/*.pyc
**/*.pyo
**/*.pyd

# Build output and local caches
build/
dist/
.etl_cache/

# Not needed at runtime
tests/
docs/

View file

@ -0,0 +1,25 @@
FROM public.ecr.aws/lambda/python:3.12

# Install Poetry with the official installer.
# (Consider pinning a version for reproducible builds.)
RUN curl -sSL https://install.python-poetry.org | python3 -

# Make the `poetry` executable installed above available on PATH.
ENV PATH="/root/.local/bin:$PATH"

# Set working directory
WORKDIR /var/task

# Copy only the dependency manifests first so the dependency layer below is
# reused from cache until pyproject.toml / poetry.lock change.
COPY pyproject.toml poetry.lock README.md ./

# Disable virtualenv creation so packages are installed into the image's own
# Python environment, then install third-party dependencies only (--no-root).
# The project source is deliberately not copied yet: copying it before this
# step would invalidate the cached dependency layer on every code change.
RUN poetry config virtualenvs.create false \
    && poetry install --only main --no-root --no-interaction --no-ansi

# Copy the project package and install the project itself; dependencies are
# already satisfied by the layer above, so this step is fast.
COPY etl/ etl/
RUN poetry install --only main --no-interaction --no-ansi

# Copy the Lambda entry-point module
COPY deployment/lambda/extractor_and_loader/docker/app.py ./

# Set Lambda handler (module `app`, function `handler`)
CMD ["app.handler"]

View file

@ -0,0 +1,2 @@
def handler(event, context):
    """Placeholder Lambda-style handler that prints a greeting.

    Args:
        event: Invocation payload; not used by this placeholder.
        context: Runtime context object; not used by this placeholder.

    Returns:
        None. The only effect is writing "hello world" to stdout.
    """
    print("hello world")

View file

@ -0,0 +1,62 @@
# ECR repository named "walthamforest_etl".
resource "aws_ecr_repository" "walthamforest_etl" {
  name = "walthamforest_etl"
}
# Repository policy granting the Lambda service principal permission to pull
# images from the repository declared in this module.
resource "aws_ecr_repository_policy" "walthamforest_etl_ecr_access" {
  repository = aws_ecr_repository.walthamforest_etl.name

  policy = jsonencode({
    Version = "2008-10-17",
    Statement = [
      {
        Sid       = "AllowLambdaPull",
        Effect    = "Allow",
        Principal = { Service = "lambda.amazonaws.com" },

        # Read-only image pull actions.
        Action = [
          "ecr:GetDownloadUrlForLayer",
          "ecr:BatchGetImage",
          "ecr:BatchCheckLayerAvailability"
        ]
      }
    ]
  })
}
# ECR lifecycle policy:
#   1. keep only the 5 most recent images whose tag starts with "feature";
#   2. expire untagged images 1 day after they were pushed.
resource "aws_ecr_lifecycle_policy" "walthamforest_etl_loader_lifecycle" {
  repository = aws_ecr_repository.walthamforest_etl.name

  # Rules are listed in ascending rulePriority order. ECR returns them sorted by
  # priority, and the AWS provider documentation warns that declaring them out
  # of order makes the resource show a diff on every plan.
  policy = jsonencode({
    rules = [
      {
        rulePriority = 1
        description  = "Keep last 5 images"
        selection = {
          tagStatus     = "tagged"
          tagPrefixList = ["feature"]
          countType     = "imageCountMoreThan"
          countNumber   = 5
        }
        action = {
          type = "expire"
        }
      },
      {
        rulePriority = 2
        # NOTE(review): this description previously said "older than 14 days"
        # while countNumber was 1. The text now matches the configured
        # behaviour; raise countNumber if 14 days was the intended retention.
        description = "Expire untagged images older than 1 day"
        selection = {
          tagStatus   = "untagged"
          countType   = "sinceImagePushed"
          countUnit   = "days"
          countNumber = 1
        }
        action = {
          type = "expire"
        }
      }
    ]
  })
}

View file

@ -0,0 +1,15 @@
# Terraform settings for the ECR stack: CLI/provider version constraints and
# the S3 location of this stack's state file.
terraform {
  # Minimum Terraform CLI version.
  required_version = ">= 1.2.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.3.0"
    }
  }

  # Remote state in S3.
  backend "s3" {
    bucket = "survey-extractor-tf-state"
    key    = "env:/dev/lambda/ecr/walthamforest_etl.tfstate"
    region = "eu-west-2"
  }
}

View file

@ -0,0 +1,15 @@
# Terraform settings for the extractor_and_loader Lambda stack: CLI/provider
# version constraints and the S3 location of this stack's state file.
terraform {
  # Minimum Terraform CLI version.
  required_version = ">= 1.2.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.3.0"
    }
  }

  # Remote state in S3.
  backend "s3" {
    bucket = "survey-extractor-tf-state"
    key    = "env:/dev/lambda/eachlambda/extractor_and_loader_lambda.tfstate"
    region = "eu-west-2"
  }
}

View file

@ -0,0 +1,5 @@
# Tag of the container image to deploy; falls back to "local-dev-latest" when
# no value is supplied.
variable "lambda_image_tag" {
  type        = string
  default     = "local-dev-latest"
  description = "Docker image tag (e.g. GitHub SHA)"
}

View file

@ -0,0 +1,80 @@
# Look up the IAM role named "lambda-exec-role"; it is read here as a data
# source rather than created by this configuration.
data "aws_iam_role" "lambda_exec_role" {
  name = "lambda-exec-role"
}
# Reference the existing ECR repository.
# The repository is created by the ECR stack with the name "walthamforest_etl";
# the lookup must use that exact name (the previous value
# "walthamforest_etl_ecr" does not correspond to any repository declared there).
data "aws_ecr_repository" "walthamforest_etl_ecr" {
  name = "walthamforest_etl"
}
# SQS queue for extractor_and_loader
resource "aws_sqs_queue" "walthamforest_etl_queue" {
  name = "walthamforest_etl-queue"

  # The queue's visibility timeout must be at least the timeout of the Lambda
  # function that consumes it (300 s in this configuration); with the SQS
  # default of 30 s the event source mapping is rejected. AWS recommends six
  # times the function timeout to leave room for retries: 6 * 300 = 1800.
  visibility_timeout_seconds = 1800
}
# IAM policy for this Lambda: consume messages from its SQS queue and pull its
# container image from ECR.
resource "aws_iam_policy" "walthamforest_etl_policy" {
  name = "walthamforest_etl-loader-policy"

  policy = jsonencode({
    Version = "2012-10-17",
    Statement = [
      # Receive and delete messages, and read attributes, on this stack's queue only.
      {
        Effect   = "Allow",
        Resource = aws_sqs_queue.walthamforest_etl_queue.arn
        Action = [
          "sqs:ReceiveMessage",
          "sqs:DeleteMessage",
          "sqs:GetQueueAttributes"
        ],
      },
      # Image pull actions, scoped to the referenced ECR repository.
      {
        Effect   = "Allow",
        Resource = data.aws_ecr_repository.walthamforest_etl_ecr.arn
        Action = [
          "ecr:GetDownloadUrlForLayer",
          "ecr:BatchGetImage",
          "ecr:BatchCheckLayerAvailability"
        ],
      },
      # Registry authentication token; granted on all resources ("*").
      {
        Effect   = "Allow",
        Resource = "*"
        Action   = ["ecr:GetAuthorizationToken"],
      }
    ]
  })
}
# Attach the Lambda-specific policy to the existing execution role.
resource "aws_iam_role_policy_attachment" "walthamforest_etl_policy_attach" {
  role = data.aws_iam_role.lambda_exec_role.name

  # Must reference the policy by its Terraform resource label
  # ("walthamforest_etl_policy"), not by its AWS name
  # ("walthamforest_etl-loader-policy"), which is not a declared resource.
  policy_arn = aws_iam_policy.walthamforest_etl_policy.arn
}
# Lambda function deployed from a container image in ECR.
resource "aws_lambda_function" "waltham_forest_etl" {
  function_name = "walthamforest_etl"
  role          = data.aws_iam_role.lambda_exec_role.arn
  package_type  = "Image"

  # <repository URL>:<tag>, with the tag supplied via var.lambda_image_tag.
  image_uri = "${data.aws_ecr_repository.walthamforest_etl_ecr.repository_url}:${var.lambda_image_tag}"

  # Increase timeout (max 900 sec / 15 min)
  timeout = 300 # e.g. 5 minutes

  # Increase memory (default 128 MB)
  memory_size = 2048 # try 1024 or 2048 MB to start

  # SECURITY: a commented-out DATABASE_URL containing a plaintext database
  # password used to live here. It has been redacted; the exposed credential
  # should be rotated, and the value supplied from a secret store or a
  # sensitive variable rather than hard-coded in source.
  # environment {
  #   variables = {
  #     DATABASE_URL = "<redacted>"
  #   }
  # }
}
# SQS trigger: invoke the Lambda function with one queue message per batch.
resource "aws_lambda_event_source_mapping" "extractor_and_loader_trigger" {
  event_source_arn = aws_sqs_queue.walthamforest_etl_queue.arn

  # The function resource is labelled "waltham_forest_etl"; the previous
  # reference to "walthamforest_etl" pointed at an undeclared resource.
  function_name = aws_lambda_function.waltham_forest_etl.arn

  batch_size = 1
}