From 549d7265cb3bac4ef258d270bdee99e17ffdd05f Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Mon, 14 Jul 2025 15:00:32 +0000
Subject: [PATCH] need to upload an image otherwise nothing will work

---
Review notes (placed after the "---" separator, outside the commit message):

* .github/workflows/push_docker_image_to_ecr.yml: added `id: login-ecr` to
  the "Log in to Amazon ECR" step. The build step reads
  `steps.login-ecr.outputs.registry`; without an id on the login step that
  expression is empty and IMAGE_URI becomes "/survey_extractor:latest".
* etl/epr_etl_example.py: dropped the unused
  `from sqlalchemy.dialects.postgresql import UUID` import.
* Hunk headers and the diffstat were updated for the two changes above. The
  `index` blob IDs of those two files are from the original patch and no
  longer match the new content.
* NOTE(review): `role-to-assume` has no account id or role name - it looks
  like a stripped placeholder and must be filled in before the workflow can
  assume a role.
* NOTE(review): this patch was rebuilt from a copy whose line breaks and
  indentation had been flattened. Line counts match every hunk header, but
  alignment whitespace inside lines is a best guess - confirm against the
  repository, or apply with `git apply --ignore-whitespace`.

 .../workflows/push_docker_image_to_ecr.yml    | 44 +++++++++
 deployment/lambda.tf                          | 90 +++++++++++++++----
 etl/epr_etl_example.py                        |  7 ++
 etl/fileReader/pdfReaderToText.py             |  3 +-
 4 files changed, 128 insertions(+), 16 deletions(-)
 create mode 100644 .github/workflows/push_docker_image_to_ecr.yml

diff --git a/.github/workflows/push_docker_image_to_ecr.yml b/.github/workflows/push_docker_image_to_ecr.yml
new file mode 100644
index 0000000..394f7da
--- /dev/null
+++ b/.github/workflows/push_docker_image_to_ecr.yml
@@ -0,0 +1,44 @@
+name: Build and Push Docker to ECR
+
+on:
+  push:
+    branches: [feature/energy_report_etl, main]
+
+env:
+  AWS_REGION: eu-west-2
+  ECR_REPOSITORY: survey_extractor
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    permissions:
+      id-token: write
+      contents: read
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam:::role/
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Log in to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Build, tag, and push Docker image to ECR
+        env:
+          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+          IMAGE_TAG: latest
+        run: |
+          IMAGE_URI=${{ env.ECR_REGISTRY }}/${{ env.ECR_REPOSITORY }}:${{ env.IMAGE_TAG }}
+
+          echo "Building Docker image..."
+          docker build -t $IMAGE_URI .
+
+          echo "Pushing Docker image to ECR..."
+          docker push $IMAGE_URI
diff --git a/deployment/lambda.tf b/deployment/lambda.tf
index dc24055..4b56763 100644
--- a/deployment/lambda.tf
+++ b/deployment/lambda.tf
@@ -3,16 +3,21 @@ resource "aws_sqs_queue" "my_queue" {
   name = "my-lambda-queue"
 }
 
-# IAM role that the Lambda function will assume to get permissions
+# Create an ECR repository to store the Docker image for the Lambda function
+resource "aws_ecr_repository" "lambda_repo" {
+  name = "survey_extractor"
+}
+
+# IAM role that the Lambda function will assume
 resource "aws_iam_role" "lambda_exec_role" {
   name = "lambda-exec-role"
 
   assume_role_policy = jsonencode({
-    Version = "2012-10-17"
+    Version = "2012-10-17",
     Statement = [
       {
-        Action = "sts:AssumeRole"
-        Effect = "Allow"
+        Action = "sts:AssumeRole",
+        Effect = "Allow",
         Principal = {
           Service = "lambda.amazonaws.com"
         }
@@ -21,22 +26,55 @@ resource "aws_iam_role" "lambda_exec_role" {
   })
 }
 
-# Attach the basic execution policy (writes logs to CloudWatch) to the Lambda role
+# Attach AWS-managed policy for basic Lambda execution (CloudWatch logging)
 resource "aws_iam_role_policy_attachment" "lambda_basic_execution" {
   role = aws_iam_role.lambda_exec_role.name
   policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
 }
 
-# Give Lambda permission to poll and process SQS messages
-resource "aws_iam_role_policy_attachment" "sqs_access" {
-  role = aws_iam_role.lambda_exec_role.name
-  policy_arn = "arn:aws:iam::aws:policy/AWSLambdaSQSQueueExecutionRole"
+# Custom policy: SQS access + ECR image pull permissions
+resource "aws_iam_policy" "lambda_custom_policy" {
+  name = "lambda-sqs-ecr-policy"
+
+  policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      # Allow Lambda to read from SQS
+      {
+        Effect = "Allow",
+        Action = [
+          "sqs:ReceiveMessage",
+          "sqs:DeleteMessage",
+          "sqs:GetQueueAttributes"
+        ],
+        Resource = aws_sqs_queue.my_queue.arn
+      },
+      # Allow Lambda to pull images from ECR
+      {
+        Effect = "Allow",
+        Action = [
+          "ecr:GetDownloadUrlForLayer",
+          "ecr:BatchGetImage",
+          "ecr:BatchCheckLayerAvailability"
+        ],
+        Resource = aws_ecr_repository.lambda_repo.arn
+      },
+      # Needed to authenticate to ECR (pulling the image)
+      {
+        Effect = "Allow",
+        Action = [
+          "ecr:GetAuthorizationToken"
+        ],
+        Resource = "*"
+      }
+    ]
+  })
 }
 
-
-# Create an ECR repository to store the Docker image for the Lambda function
-resource "aws_ecr_repository" "lambda_repo" {
-  name = "lambda-hello-world"
+# Attach the custom policy to the Lambda role
+resource "aws_iam_role_policy_attachment" "lambda_custom_policy_attach" {
+  role = aws_iam_role.lambda_exec_role.name
+  policy_arn = aws_iam_policy.lambda_custom_policy.arn
 }
 
 # Define the Lambda function using a Docker image from ECR
@@ -45,8 +83,7 @@ resource "aws_lambda_function" "lambda_docker" {
   role = aws_iam_role.lambda_exec_role.arn
   package_type = "Image"
   image_uri = "${aws_ecr_repository.lambda_repo.repository_url}:latest"
-
-  timeout = 10
+  timeout = 10
 }
 
 # Connect the SQS queue to the Lambda so it gets triggered by incoming messages
@@ -55,3 +92,26 @@ resource "aws_lambda_event_source_mapping" "sqs_trigger" {
   function_name = aws_lambda_function.lambda_docker.arn
   batch_size = 1
 }
+
+
+resource "aws_ecr_repository_policy" "lambda_ecr_access" {
+  repository = aws_ecr_repository.lambda_repo.name
+
+  policy = jsonencode({
+    Version = "2008-10-17",
+    Statement = [
+      {
+        Sid = "AllowLambdaPull",
+        Effect = "Allow",
+        Principal = {
+          Service = "lambda.amazonaws.com"
+        },
+        Action = [
+          "ecr:GetDownloadUrlForLayer",
+          "ecr:BatchGetImage",
+          "ecr:BatchCheckLayerAvailability"
+        ]
+      }
+    ]
+  })
+}
\ No newline at end of file
diff --git a/etl/epr_etl_example.py b/etl/epr_etl_example.py
index e69de29..bd7601e 100644
--- a/etl/epr_etl_example.py
+++ b/etl/epr_etl_example.py
@@ -0,0 +1,7 @@
+from etl.surveyedData.surveryedData import surveyedDataProcessor
+
+files = [
+    "/tmp/sharepoint/Sandwell/SANDWELL-001/26 Willow close B64 6EG/Content (13).pdf",
+]
+
+sdp = surveyedDataProcessor("fake address", files)
diff --git a/etl/fileReader/pdfReaderToText.py b/etl/fileReader/pdfReaderToText.py
index bc9643f..531c316 100644
--- a/etl/fileReader/pdfReaderToText.py
+++ b/etl/fileReader/pdfReaderToText.py
@@ -3,6 +3,7 @@ import logging
 import pymupdf
 from etl.fileReader.sitenotes import QuidosSiteNotesExtractor, CSR, WarmHomesConditionReport, ECOConditionReport, RDSAPEnergyReport
 from etl.fileReader.reportType import ReportType
+from pprint import pprint
 
 
 class pdfReaderToText():
@@ -24,7 +25,7 @@ class pdfReaderToText():
             self.all_text += text
 
         self.text_list = self.all_text.split('\n')
-        print(self.text_list)
+        pprint(self.text_list)
 
     def get_list_of_text(self):
         return self.text_list