From 37ba26a2feb4a630b9c432283401ba21fba0dae6 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Tue, 19 Aug 2025 12:05:40 +0000 Subject: [PATCH 01/10] database now links --- .db-env | 8 +- alembic/env.py | 1 + .../lambda/extractor_and_loader/docker/app.py | 168 ++++++++++++++--- etl/fileReader/sitenotes.py | 170 +++++++++--------- etl/models/topLevel.py | 5 +- etl/transform/conditionReportTypes.py | 8 +- migration_db.sh | 2 +- 7 files changed, 245 insertions(+), 117 deletions(-) diff --git a/.db-env b/.db-env index 9174c1b..6a6bf61 100644 --- a/.db-env +++ b/.db-env @@ -1,10 +1,10 @@ -POSTGRES_USER=postgres +OSTGRES_USER=postgres PGDATABASE=surveyDB POSTGRES_PASSWORD=makingwarmhomes POSTGRES_HOST=localhost POSTGRES_PORT=5432 PGADMIN_DEFAULT_EMAIL=junte@domna.homes PGADMIN_DEFAULT_PASSWORD=makingwarmhomes -DATABASE_URL=postgresql://postgres:makingwarmhomes@db:5432/postgres -# Prod(dev-aws) Database Don't use!!!! -#DATABASE_URL=postgresql://postgres:makingwarmhomes@terraform-20250331175522503500000002.cdgzupxvdyp0.eu-west-2.rds.amazonaws.com:5432/surveyDB \ No newline at end of file +# DATABASE_URL=postgresql://postgres:makingwarmhomes@db:5432/postgres +# Prod(dev-aws) Database Don't use +DATABASE_URL=postgresql://postgres:makingwarmhomes@terraform-20250331175522503500000002.cdgzupxvdyp0.eu-west-2.rds.amazonaws.com:5432/surveyDB \ No newline at end of file diff --git a/alembic/env.py b/alembic/env.py index a9d58f7..bc0c904 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -8,6 +8,7 @@ from sqlmodel import SQLModel from etl.models.topLevel import * from etl.models.preSiteNoteTypes import * from etl.models.conditionReport import * +from etl.fileReader.reportType import ReportType import os diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index 4a281ed..df0d585 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -1,27 +1,155 @@ -""" -A quick example of lambda working a function in python -""" -from etl.read_stuff_from_s3_example import print_hello_from_etl_module +import os +import tempfile +import requests +import boto3 +from urllib.parse import urlparse +from etl.fileReader.pdfReaderToText import pdfReaderToText +from etl.fileReader.sitenotes import ( + SiteNotesExtractor, + WarmHomesConditionReport +) + +from uuid import UUID +import json +from typing import Any +from etl.db.db import get_db_session, init_db +from typing import Union +import uuid +from datetime import datetime, timezone +from sqlmodel import select +from sqlalchemy import update +from etl.models.topLevel import uploaded_files + +def update_uploaded_file_json_uri_by_query( + db_session, + file_id: Union[str, uuid.UUID], + json_: str, +): + """ + Query uploaded_files by id, update s3_json_uri and s3_json_upload_timestamp, + commit, refresh, and return the ORM object. Raises ValueError if not found. + """ + try: + file_id_norm = uuid.UUID(str(file_id)) + except (ValueError, AttributeError, TypeError): + file_id_norm = file_id # leave as-is if not a UUID + + obj = ( + db_session + .query(uploaded_files) + .filter(uploaded_files.id == file_id_norm) + .first() + ) + obj.s3_json_uri = json_ + obj.s3_json_upload_timestamp = datetime.now(timezone.utc) + + db_session.add(obj) + db_session.commit() + db_session.refresh(obj) + return obj + +def serialize_model(model: Any): + """Recursively convert Pydantic models/lists into plain dicts.""" + if hasattr(model, "dict"): + return {k: serialize_model(v) for k, v in model.dict().items()} + elif isinstance(model, list): + return [serialize_model(item) for item in model] + else: + return model + +def make_final_json(rooms_obj, heating_system_obj): + # Convert to dict recursively + rooms_data = serialize_model(rooms_obj) + heating_data = serialize_model(heating_system_obj) + + # Combine into one big JSON-ready dict + final_data = { + "rooms": rooms_data, + "heating_system": heating_data + } + + # Convert to pretty JSON string + return final_data + +def parse_s3_uri(uri: str): + """ + Parse an S3 URI or HTTPS S3 URL into bucket and key. + Supports formats: + - s3://bucket-name/path/to/file + - https://bucket-name.s3.region.amazonaws.com/path/to/file + """ + parsed = urlparse(uri) + + if parsed.scheme == "s3": + # s3://bucket/key + bucket = parsed.netloc + key = parsed.path.lstrip("/") + elif parsed.scheme in ("http", "https"): + # https://bucket-name.s3.region.amazonaws.com/key + host_parts = parsed.netloc.split(".") + if len(host_parts) >= 3 and host_parts[1] == "s3": + bucket = host_parts[0] + else: + raise ValueError("Not a valid S3 HTTPS URL format") + key = parsed.path.lstrip("/") + else: + raise ValueError("Unsupported URI scheme") + + return bucket, key + + +def download_private_s3_file(uri) -> str: + + bucket_name, key = parse_s3_uri(uri) + """ + Download a private S3 file using hardcoded AWS credentials. + Saves it to /tmp and returns the local file path. + """ + + # Hardcoded AWS credentials (quick testing only) + aws_access_key = "AKIAU5A36PPNJMZZ3KRW" + aws_secret_key = "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" + aws_region = "eu-west-2" + + # Where to store the file locally + tmp_dir = tempfile.gettempdir() + filename = os.path.basename(key) + file_path = os.path.join(tmp_dir, filename) + + # Create S3 client with hardcoded creds + s3 = boto3.client( + "s3", + aws_access_key_id=aws_access_key, + aws_secret_access_key=aws_secret_key, + region_name=aws_region + ) + + # Download file + s3.download_file(bucket_name, key, file_path) + + return file_path def handler(event, context): - print("Outside try statment") - print_hello_from_etl_module() try: - print("show me something.. anything...") - s3_uri = event.get("file_location") - if not s3_uri: - print("failed to get s3_uri") - return { - "statusCode": 400, - "body": "Missing 'file_location' in event" - } - print(f"s3 uri is {s3_uri}") - - return { - "statusCode": 200, - "body": f"s3 uri {s3_uri}" - } + id_ = "2d17c316-65b1-4cc6-ba29-e813a9a989ef" + file_uri = "https://retrofit-energy-assessments-dev.s3.eu-west-2.amazonaws.com/documents/10034911080/osmosis_condition_pas_2035_report/20250819_120258.pdf" + local_path = download_private_s3_file(file_uri) + print(f"File saved to: {local_path}") + reader = pdfReaderToText(local_path) + print(reader.text_list) + obj = WarmHomesConditionReport(reader.text_list) + json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) + from pprint import pprint + pprint(json_) + # json_ = {} + init_db() + with get_db_session() as session: + update_uploaded_file_json_uri_by_query( + session, + id_, + json_ + ) except Exception as e: print(f"❌ Error: {e}") return { diff --git a/etl/fileReader/sitenotes.py b/etl/fileReader/sitenotes.py index 7799252..073a30a 100644 --- a/etl/fileReader/sitenotes.py +++ b/etl/fileReader/sitenotes.py @@ -105,32 +105,24 @@ class WarmHomesConditionReport(SiteNotesExtractor): def __init__(self, data_list): super().__init__(data_list) self.type = ReportType.WARM_HOMES_CONDITION_REPORT - self.master_obj = self.setup_condition_report() + room, heating_system = self.setup_condition_report() + self.master_obj = room, heating_system def setup_condition_report(self): - general_information = self.get_section_1() - access_and_elevations = self.get_section_2() + # general_information = self.get_section_1() + # access_and_elevations = self.get_section_2() rooms = self.get_section_3() heating_system = self.get_section_4() - occupant_assessment = self.get_section_5() - site_name, reference_code, address, postcode = self.get_section_0() + # occupant_assessment = self.get_section_5() + # site_name, reference_code, address, postcode = self.get_section_0() - return ConditionReportModel( - project_site_name=site_name, - property_reference_code=reference_code, - property_address=address, - postcode=postcode, - general_information=general_information, - access_and_elevations=access_and_elevations, - rooms=rooms, - heating_system=heating_system, - occupancy_assessment=occupant_assessment, - ) + return rooms, heating_system def get_section_0(self): data = self.get_data_between("Project Site Name", "1. General Information") site_name = self.get_next_value(data, "Project Site Name") - reference_code = self.get_next_value(data, "Property Reference Code") + # reference_code = self.get_next_value(data, "Property Reference Code") + reference_code = "Place holder, please delete me for prod" address = self.get_next_value(data, "Property Address") postcode = self.get_data_between("Postcode", "Main Image")[1:] postcode = " ".join(postcode) @@ -155,7 +147,8 @@ class WarmHomesConditionReport(SiteNotesExtractor): def get_assessor_details(self): data = self.get_data_between("1.1 Assessor details","1.2 Inspection & Project") assessor_name = self.get_next_value(data, "Assessor Name & ID") - elmhirst_id = self.get_next_value(data, "Please enter name & Elmhurst ID") + # elmhirst_id = self.get_next_value(data, "Property Reference Code") + elmhirst_id = "please remove me in prod" return AssessorDetails(assessor_name_and_id=assessor_name, elmhurst_id=elmhirst_id) def get_inspection_and_project(self): @@ -164,7 +157,7 @@ class WarmHomesConditionReport(SiteNotesExtractor): return InspectionAndProject(inspection_date=date) def get_the_property(self): - data = self.get_data_between("1.3 The property", "Main elevation") + data = self.get_data_between("1.3 The property", "3. Rooms") return TheProperty( house_type=self.get_next_value(data, "House Type"), on_which_floor_is_the_flat_located = self.get_next_value(data, "On which floor is the flat located?"), @@ -356,7 +349,7 @@ class WarmHomesConditionReport(SiteNotesExtractor): ventilation = VentilationInfo( is_there_a_ventilation_system_present_in_the_room=True if self.get_next_value(data, "Is there a ventilation system present in the room?").lower() == "yes" else False, are_there_any_visible_or_reported_signs_of_damp_mould_or_excessive_condensation_within_the_room=True if self.get_next_value(data, "excessive condensation within the room?").lower() == "yes" else False, - are_there_sufficient_undercuts_on_the_closed_door=self.get_next_value(data, "- min 7.6mm per 1000mm width door)?"), + are_there_sufficient_undercuts_on_the_closed_door=self.get_next_value(data, "- min 10mm)?"), is_there_any_open_flue_heating_appliances_within_the_room=True if self.get_next_value(data, "Is there any open flue heating appliances within the room?").lower() == "yes" else False, ) @@ -520,7 +513,7 @@ class WarmHomesConditionReport(SiteNotesExtractor): general_condition_of_heating_system = self.get_general_condition_of_heating_system() main_heating_one = self.get_main_heating_one() main_heating_two = self.get_main_heating_two() - secondary_heating = self.get_secondary_heating() + # secondary_heating = self.get_secondary_heating() heating_by_room = self.get_heating_by_room() renewables = self.get_renewables() @@ -528,7 +521,7 @@ class WarmHomesConditionReport(SiteNotesExtractor): general_condition=general_condition_of_heating_system, main_heating_one=main_heating_one, main_heating_two=main_heating_two, - secondary_heating=secondary_heating, + # secondary_heating=secondary_heating, heating_by_room=heating_by_room, renewables=renewables ) @@ -547,19 +540,19 @@ class WarmHomesConditionReport(SiteNotesExtractor): is_there_a_main_heating_two=True if self.get_next_value(data, "Is there a Main Heating 2?").lower() == "yes" else False, ) - def get_secondary_heating(self): - data = self.get_data_between("Secondary Heating", "Heating by room") - return SecondaryHeating( - is_there_a_secondary_heating=True if self.get_next_value(data, "Is there a Secondary Heating?").lower() == "yes" else False, - fuel=self.get_next_value(data, "Fuel"), - electric_heating_type=self.get_next_value_greedy(data, "Type", 2), - gas_heating_type=self.get_x_value(data, "Type", 1), - ) + # def get_secondary_heating(self): + # data = self.get_data_between("Secondary Heating", "Heating by room") + # return SecondaryHeating( + # is_there_a_secondary_heating=True if self.get_next_value(data, "Is there a Secondary Heating?").lower() == "yes" else False, + # fuel=self.get_next_value(data, "Fuel:"), + # electric_heating_type=self.get_next_value_greedy(data, "Type", 2), + # gas_heating_type=self.get_x_value(data, "Type", 1), + # ) def get_heating_by_room(self): data = self.get_data_between("Heating by room", "Renewables") - list_of_main_heating_system_by_one = self.get_data_between("Rooms heated by Main System 1:", "Rooms Heated by Main System 2")[1:] - list_of_main_heating_system_by_two = self.get_data_between("Rooms Heated by Main System 2", "Rooms heated by Secondary Heating:")[1:] + list_of_main_heating_system_by_one = self.get_data_between("Rooms heated by Main System 1:", "Rooms heated by Main System 2:")[1:] + list_of_main_heating_system_by_two = self.get_data_between("Rooms heated by Main System 2:", "Rooms heated by Secondary Heating:")[1:] list_of_rooms_heated_by_secondary_heating = self.get_data_between("Rooms heated by Secondary Heating:", "Are there any partially heated rooms?")[1:] are_there_any_partially_heated_rooms = True if self.get_next_value(data, "Are there any partially heated rooms?").lower() == "yes" else False are_there_any_unheated_rooms = True if self.get_next_value(data, "Are there any unheated rooms?").lower() == "yes" else False @@ -606,8 +599,9 @@ class WarmHomesConditionReport(SiteNotesExtractor): is_the_heating_system_in_working_order=True if self.get_next_value(data, "Is the Heating System in working order?") else False, does_the_occupant_have_a_smart_meter=True if self.get_next_value(data, "Does the occupant have a Smart Meter?") else False, are_there_any_smart_monitoring_devices=True if self.get_next_value(data, "Are there any smart monitoring devices (Switchee etc)?") else False, - are_the_gas_and_electricity_meters_accessible=True if self.get_next_value(data, "Are the Gas and Electricity Meters accessible?") else False, - dual_or_single_electric_meter=self.get_next_value(data, "Dual or single electric meter?"), + are_the_electricity_meters_accessible=True if self.get_next_value(data, "Is the Electricity Meter accessible?") else False, + are_the_gas_meters_accessible=True if self.get_next_value(data, "Is the Gas Meter accessible?") else False, + # dual_or_single_electric_meter=self.get_next_value(data, "Dual or single electric meter?"), ) def get_section_5(self): @@ -1373,62 +1367,62 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor): ) ) - def get_secondary_heating(self): - data = self.raw_data[self.raw_data.index("14.1 Main Heating2"):self.raw_data.index("14.2 Secondary Heating Type")] - main_titles = [ - "Second Main Heating Type", - "Main Heating System Controls", - ] - sub_titles = [ - "Percentage of Heated Floor Area Served (%)", - "Heating Source", - "Efficiency Source", - "Heating Fuel", - "SAP 2009 Table 4a/4b", - "Heating Type", - "Heating Description", - "Control Type", - "Flue Type", - "Fan Assisted Flue", - "Heat Emitter Type", - ] - list = self.two_column_with_extension_processor(data, sub_titles, main_titles) - dict_ = list[0] + # def get_secondary_heating(self): + # data = self.raw_data[self.raw_data.index("14.1 Main Heating2"):self.raw_data.index("14.2 Secondary Heating Type")] + # main_titles = [ + # "Second Main Heating Type", + # "Main Heating System Controls", + # ] + # sub_titles = [ + # "Percentage of Heated Floor Area Served (%)", + # "Heating Source", + # "Efficiency Source", + # "Heating Fuel", + # "SAP 2009 Table 4a/4b", + # "Heating Type", + # "Heating Description", + # "Control Type", + # "Flue Type", + # "Fan Assisted Flue", + # "Heat Emitter Type", + # ] + # list = self.two_column_with_extension_processor(data, sub_titles, main_titles) + # dict_ = list[0] - return Heating( - type="secondary", - percentage_of_heated_floor_area_served=dict_.get("percentage_of_heated_floor_area_served_(%)", ""), - heating_source=dict_.get("heating_source", "") if dict_["heating_source"] is not None else "", - efficiency_source=dict_.get("efficiency_source", "") if dict_["efficiency_source"] is not None else "", - heating_fuel=dict_.get("heating_fuel", "") if dict_.get("heating_fuel") is not None else "", - brand_name=dict_.get("brand_name", ""), - model_name=dict_.get("model_name", ""), - model_qualifer=dict_.get("model_qualifier", ""), - controls=HeatingSystemControls( - control_type=dict_.get("control_type", "") if dict_.get("control_type", "") is not None else "", - flue_type=dict_.get("flue_type","") if dict_.get("flue_type", "") is not None else "", - fan_assisted_flue=True if dict_.get("fan_assisted_flue", "NO").upper() == "YES" else False, - heat_emitter_type=dict_.get("heat_emitter_type", "") if dict_.get("heat_emitter_type") is not None else "", - electricity_meter_type=dict_.get("electricity_meter_type", "") if dict_.get("electricity_meter_type", "") is not None else "", - mains_gas_available=True if dict_.get("mains_gas_available", "NO").upper() == "YES" else False, - ) - ) + # return Heating( + # type="secondary", + # percentage_of_heated_floor_area_served=dict_.get("percentage_of_heated_floor_area_served_(%)", ""), + # heating_source=dict_.get("heating_source", "") if dict_["heating_source"] is not None else "", + # efficiency_source=dict_.get("efficiency_source", "") if dict_["efficiency_source"] is not None else "", + # heating_fuel=dict_.get("heating_fuel", "") if dict_.get("heating_fuel") is not None else "", + # brand_name=dict_.get("brand_name", ""), + # model_name=dict_.get("model_name", ""), + # model_qualifer=dict_.get("model_qualifier", ""), + # controls=HeatingSystemControls( + # control_type=dict_.get("control_type", "") if dict_.get("control_type", "") is not None else "", + # flue_type=dict_.get("flue_type","") if dict_.get("flue_type", "") is not None else "", + # fan_assisted_flue=True if dict_.get("fan_assisted_flue", "NO").upper() == "YES" else False, + # heat_emitter_type=dict_.get("heat_emitter_type", "") if dict_.get("heat_emitter_type") is not None else "", + # electricity_meter_type=dict_.get("electricity_meter_type", "") if dict_.get("electricity_meter_type", "") is not None else "", + # mains_gas_available=True if dict_.get("mains_gas_available", "NO").upper() == "YES" else False, + # ) + # ) - def get_secondary_heating_type(self): - data = self.raw_data[self.raw_data.index("14.2 Secondary Heating Type"):self.raw_data.index("15.0 Water Heating")] - avoid = [ - "14.2 Secondary Heating Type", - "15.0 Water Heating", - ] - sub_titles = [ - "Heating Type", - "Fuel Type", - ] - dict_ = self.two_columns_processor(data, sub_titles, avoid) - return HeatingType( - heating_type=dict_.get("heating_type", ""), - fuel_type=dict_.get("fuel_type", ""), - ) + # def get_secondary_heating_type(self): + # data = self.raw_data[self.raw_data.index("14.2 Secondary Heating Type"):self.raw_data.index("15.0 Water Heating")] + # avoid = [ + # "14.2 Secondary Heating Type", + # "15.0 Water Heating", + # ] + # sub_titles = [ + # "Heating Type", + # "Fuel Type", + # ] + # dict_ = self.two_columns_processor(data, sub_titles, avoid) + # return HeatingType( + # heating_type=dict_.get("heating_type", ""), + # fuel_type=dict_.get("fuel_type", ""), + # ) def get_water_heating(self): data = self.raw_data[self.raw_data.index("15.0 Water Heating"):self.raw_data.index("15.1 Hot Water Cylinder")] diff --git a/etl/models/topLevel.py b/etl/models/topLevel.py index 4a06b80..57a8f1e 100644 --- a/etl/models/topLevel.py +++ b/etl/models/topLevel.py @@ -8,6 +8,7 @@ from sqlalchemy import Column from sqlalchemy.dialects.postgresql import UUID from etl.fileReader.reportType import ReportType from sqlalchemy import DateTime +from sqlalchemy.dialects.postgresql import JSON class BaseModel(SQLModel): # Put primary_key=True in Column; don't pass primary_key to Field @@ -56,7 +57,9 @@ class BaseModel(SQLModel): class uploaded_files(BaseModel, table=True): __tablename__ = "uploaded_files" - s3_json_uri: Optional[str] = None + s3_json_uri: Optional[dict] = Field( + sa_column=Column(JSON, nullable=True) + ) s3_file_uri: str = Field(index=True) doc_type: ReportType = Field( diff --git a/etl/transform/conditionReportTypes.py b/etl/transform/conditionReportTypes.py index e064694..abd53f8 100644 --- a/etl/transform/conditionReportTypes.py +++ b/etl/transform/conditionReportTypes.py @@ -169,8 +169,10 @@ class GeneralConditionHeatingSystem(BaseModel): is_the_heating_system_in_working_order: bool does_the_occupant_have_a_smart_meter: bool are_there_any_smart_monitoring_devices: bool - are_the_gas_and_electricity_meters_accessible: bool - dual_or_single_electric_meter: str + are_the_gas_meters_accessible: bool + are_the_electricity_meters_accessible: bool + + # dual_or_single_electric_meter: str class SecondaryHeating(BaseModel): is_there_a_secondary_heating: bool @@ -210,7 +212,7 @@ class HeatingSystem(BaseModel): general_condition: GeneralConditionHeatingSystem main_heating_one: MainHeatingOne main_heating_two: MainHeatingTwo - secondary_heating: SecondaryHeating + # secondary_heating: SecondaryHeating heating_by_room: HeatingByRoom renewables: Renewables diff --git a/migration_db.sh b/migration_db.sh index 9eecf21..e9b4aec 100644 --- a/migration_db.sh +++ b/migration_db.sh @@ -1,4 +1,4 @@ -#poetry run alembic revision --autogenerate -m "update enum" +#poetry run alembic revision --autogenerate -m "enum things" poetry run alembic upgrade head From ff4268a05e1b51008793d49596900a9452aefaf2 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Tue, 19 Aug 2025 12:06:04 +0000 Subject: [PATCH 02/10] migrations scripts --- alembic/versions/29113d69989e_enum_things.py | 34 +++++++++++++++ .../2cf02c9f71f8_add_missing_report_type.py | 41 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 alembic/versions/29113d69989e_enum_things.py create mode 100644 alembic/versions/2cf02c9f71f8_add_missing_report_type.py diff --git a/alembic/versions/29113d69989e_enum_things.py b/alembic/versions/29113d69989e_enum_things.py new file mode 100644 index 0000000..428e6b8 --- /dev/null +++ b/alembic/versions/29113d69989e_enum_things.py @@ -0,0 +1,34 @@ +"""enum things + +Revision ID: 29113d69989e +Revises: 2cf02c9f71f8 +Create Date: 2025-08-19 11:40:52.712131 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '29113d69989e' +down_revision: Union[str, None] = '2cf02c9f71f8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade(): + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'osmosis_condition_pas_2035_report'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'warm_homes_condition_pas_2035_report'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'energy_performance_report_with_data'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'energy_performance_report_summary_information'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'lodgement_xml_needed_for_lodgement_to_like_trademark'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'reduce_xml_needed_to_generate_full_sap_xml'") + op.execute("ALTER TYPE reporttype ADD VALUE IF NOT EXISTS 'full_xml_needed_for_co_ordination'") + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### diff --git a/alembic/versions/2cf02c9f71f8_add_missing_report_type.py b/alembic/versions/2cf02c9f71f8_add_missing_report_type.py new file mode 100644 index 0000000..3dcaebb --- /dev/null +++ b/alembic/versions/2cf02c9f71f8_add_missing_report_type.py @@ -0,0 +1,41 @@ +"""add missing report type + +Revision ID: 2cf02c9f71f8 +Revises: 253a1047c623 +Create Date: 2025-08-19 11:36:16.006276 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy.dialects import postgresql as psql + +# revision identifiers, used by Alembic. +revision: str = '2cf02c9f71f8' +down_revision: Union[str, None] = '253a1047c623' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + + +def upgrade() -> None: + op.alter_column( + "uploaded_files", + "s3_json_uri", + type_=psql.JSON(), # or psql.JSONB() + postgresql_using="s3_json_uri::json", # or ::jsonb + existing_type=sa.VARCHAR(), + existing_nullable=True, + ) + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('uploaded_files', 's3_json_uri', + existing_type=postgresql.JSON(astext_type=sa.Text()), + type_=sa.VARCHAR(), + existing_nullable=True) + # ### end Alembic commands ### From 38f4ac151b92fdc33b60b3c4e7d9191e0cd00c0d Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Tue, 19 Aug 2025 12:11:24 +0000 Subject: [PATCH 03/10] push sqs to aws --- .github/workflows/lambda_main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lambda_main.yml b/.github/workflows/lambda_main.yml index 39b9e25..94e053a 100644 --- a/.github/workflows/lambda_main.yml +++ b/.github/workflows/lambda_main.yml @@ -2,7 +2,7 @@ name: Lambda Main Workflow on: push: - branches: [main, feature/seperate_terraform_with_different_states] + branches: [main, feature/document_upload] env: AWS_REGION: eu-west-2 From b790d650b0a5b54620093c54eca221c0cc9f4861 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Tue, 19 Aug 2025 12:44:40 +0000 Subject: [PATCH 04/10] s3 json body is now juplaoded as uri --- .../a8cc4a5fccb6_json_uri_is_a_string.py | 38 ++++++++++++++ .../lambda/extractor_and_loader/docker/app.py | 51 ++++++++++++++++--- etl/models/topLevel.py | 5 +- migration_db.sh | 2 +- 4 files changed, 86 insertions(+), 10 deletions(-) create mode 100644 alembic/versions/a8cc4a5fccb6_json_uri_is_a_string.py diff --git a/alembic/versions/a8cc4a5fccb6_json_uri_is_a_string.py b/alembic/versions/a8cc4a5fccb6_json_uri_is_a_string.py new file mode 100644 index 0000000..fa12c67 --- /dev/null +++ b/alembic/versions/a8cc4a5fccb6_json_uri_is_a_string.py @@ -0,0 +1,38 @@ +"""json_uri is a string + +Revision ID: a8cc4a5fccb6 +Revises: 29113d69989e +Create Date: 2025-08-19 12:35:59.456912 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'a8cc4a5fccb6' +down_revision: Union[str, None] = '29113d69989e' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('uploaded_files', 's3_json_uri', + existing_type=postgresql.JSON(astext_type=sa.Text()), + type_=sa.Text(), + existing_nullable=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('uploaded_files', 's3_json_uri', + existing_type=sa.Text(), + type_=postgresql.JSON(astext_type=sa.Text()), + existing_nullable=True) + # ### end Alembic commands ### diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index df0d585..087896b 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -23,7 +23,7 @@ from etl.models.topLevel import uploaded_files def update_uploaded_file_json_uri_by_query( db_session, file_id: Union[str, uuid.UUID], - json_: str, + json_uri: str, ): """ Query uploaded_files by id, update s3_json_uri and s3_json_upload_timestamp, @@ -40,7 +40,7 @@ def update_uploaded_file_json_uri_by_query( .filter(uploaded_files.id == file_id_norm) .first() ) - obj.s3_json_uri = json_ + obj.s3_json_uri = json_uri obj.s3_json_upload_timestamp = datetime.now(timezone.utc) db_session.add(obj) @@ -99,7 +99,6 @@ def parse_s3_uri(uri: str): def download_private_s3_file(uri) -> str: - bucket_name, key = parse_s3_uri(uri) """ Download a private S3 file using hardcoded AWS credentials. @@ -129,10 +128,48 @@ def download_private_s3_file(uri) -> str: return file_path +def upload_json_to_s3(json_obj, dest_uri: str) -> str: + """ + Upload a JSON-serializable object to S3 at the given s3:// or https S3 URL. + Returns the public-style HTTPS S3 URL (still private if bucket is private). + """ + bucket, pdf_key = parse_s3_uri(dest_uri) + base_folder = os.path.dirname(pdf_key) # e.g. ".../report" + + # Build jsonified folder + timestamp filename + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + json_key = f"{base_folder}/jsonified/{timestamp}.json" + + # Same region/creds you used for download + aws_access_key = "AKIAU5A36PPNJMZZ3KRW" + aws_secret_key = "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" + aws_region = "eu-west-2" + + s3 = boto3.client( + "s3", + aws_access_key_id=aws_access_key, + aws_secret_access_key=aws_secret_key, + region_name=aws_region + ) + + body = json.dumps(json_obj, ensure_ascii=False, indent=2).encode("utf-8") + + s3.put_object( + Bucket=bucket, + Key=json_key, + Body=body, + ContentType="application/json" + # Optional hardening: + # , ServerSideEncryption="AES256" + ) + + # Return an HTTPS-style S3 URL (matches your input style) + return f"https://{bucket}.s3.{aws_region}.amazonaws.com/{json_key}" + def handler(event, context): try: - id_ = "2d17c316-65b1-4cc6-ba29-e813a9a989ef" - file_uri = "https://retrofit-energy-assessments-dev.s3.eu-west-2.amazonaws.com/documents/10034911080/osmosis_condition_pas_2035_report/20250819_120258.pdf" + id_ = "aa0a22aa-2e08-40b9-8309-fa6df1efffbc" + file_uri = "https://retrofit-energy-assessments-dev.s3.eu-west-2.amazonaws.com/documents/10034911080/osmosis_condition_pas_2035_report/20250819_123831.pdf" local_path = download_private_s3_file(file_uri) print(f"File saved to: {local_path}") @@ -142,13 +179,13 @@ def handler(event, context): json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) from pprint import pprint pprint(json_) - # json_ = {} + json_uri = upload_json_to_s3(json_, file_uri) init_db() with get_db_session() as session: update_uploaded_file_json_uri_by_query( session, id_, - json_ + json_uri, ) except Exception as e: print(f"❌ Error: {e}") diff --git a/etl/models/topLevel.py b/etl/models/topLevel.py index 57a8f1e..98cb724 100644 --- a/etl/models/topLevel.py +++ b/etl/models/topLevel.py @@ -9,6 +9,7 @@ from sqlalchemy.dialects.postgresql import UUID from etl.fileReader.reportType import ReportType from sqlalchemy import DateTime from sqlalchemy.dialects.postgresql import JSON +from sqlalchemy import Text class BaseModel(SQLModel): # Put primary_key=True in Column; don't pass primary_key to Field @@ -57,8 +58,8 @@ class BaseModel(SQLModel): class uploaded_files(BaseModel, table=True): __tablename__ = "uploaded_files" - s3_json_uri: Optional[dict] = Field( - sa_column=Column(JSON, nullable=True) + s3_json_uri: Optional[str] = Field( + sa_column=Column(Text, nullable=True) ) s3_file_uri: str = Field(index=True) diff --git a/migration_db.sh b/migration_db.sh index e9b4aec..382ea6c 100644 --- a/migration_db.sh +++ b/migration_db.sh @@ -1,4 +1,4 @@ -#poetry run alembic revision --autogenerate -m "enum things" +#poetry run alembic revision --autogenerate -m "json_uri is a string" poetry run alembic upgrade head From c000ce4704322e33ee398dcf93d8388d2cf167c5 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 10:51:43 +0000 Subject: [PATCH 05/10] modified lambda job to have dynamic ip and file_uri --- .../lambda/extractor_and_loader/docker/app.py | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index 087896b..36cda55 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -166,27 +166,54 @@ def upload_json_to_s3(json_obj, dest_uri: str) -> str: # Return an HTTPS-style S3 URL (matches your input style) return f"https://{bucket}.s3.{aws_region}.amazonaws.com/{json_key}" +def get_file_uri(id): + with get_db_session() as session: + obj = ( + session + .query(uploaded_files) + .filter(uploaded_files.id == id) + .first() + ) + if obj is None: + raise RuntimeError(f"Failed to find uploaded_files record with id {id}") + + return obj.s3_file_uri + + def handler(event, context): try: - id_ = "aa0a22aa-2e08-40b9-8309-fa6df1efffbc" - file_uri = "https://retrofit-energy-assessments-dev.s3.eu-west-2.amazonaws.com/documents/10034911080/osmosis_condition_pas_2035_report/20250819_123831.pdf" - - local_path = download_private_s3_file(file_uri) - print(f"File saved to: {local_path}") - reader = pdfReaderToText(local_path) - print(reader.text_list) - obj = WarmHomesConditionReport(reader.text_list) - json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) - from pprint import pprint - pprint(json_) - json_uri = upload_json_to_s3(json_, file_uri) init_db() - with get_db_session() as session: - update_uploaded_file_json_uri_by_query( - session, - id_, - json_uri, - ) + for r in event.get("Records", []): + body = json.loads(r["body"]) + id_ = body.get("id") + if not id_: # covers None or empty string + raise ValueError(f"❌ Missing 'id' in SQS body: {body}") + + print(f"Retrieving file uri with id {id_}") + file_uri = get_file_uri(id_) + print(f"Retrieved file uri with {file_uri}") + + print("Downloading file locally for extraction...") + local_path = download_private_s3_file(file_uri) + + print("Extracting file...") + reader = pdfReaderToText(local_path) + obj = WarmHomesConditionReport(reader.text_list) + json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) + print("Extracted completed, made json") + + print("uploading json to s3 bucket...") + json_uri = upload_json_to_s3(json_, file_uri) + + print("Updating Database with json_uri") + with get_db_session() as session: + update_uploaded_file_json_uri_by_query( + session, + id_, + json_uri, + ) + print("job completed successfully") + except Exception as e: print(f"❌ Error: {e}") return { From 56f9ffb0edd83ae962e945ed773df458d99aecd8 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 11:04:47 +0000 Subject: [PATCH 06/10] more memory? --- deployment/lambda/extractor_and_loader/docker/app.py | 2 +- .../extractor_and_loader/extractor_and_loader_lambda.tf | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index 36cda55..b7cf827 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -213,7 +213,7 @@ def handler(event, context): json_uri, ) print("job completed successfully") - + except Exception as e: print(f"❌ Error: {e}") return { diff --git a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf index 455a192..ff15347 100644 --- a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf +++ b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf @@ -59,7 +59,11 @@ resource "aws_lambda_function" "extractor_and_loader" { role = data.aws_iam_role.lambda_exec_role.arn package_type = "Image" image_uri = "${data.aws_ecr_repository.extractor_and_loader.repository_url}:${var.lambda_image_tag}" - timeout = 30 + # Increase timeout (max 900 sec / 15 min) + timeout = 300 # e.g. 5 minutes + + # Increase memory (default 128 MB) + memory_size = 2048 # try 1024 or 2048 MB to start } # SQS trigger From ec6f0877f027902430baddc183b6d8d110d7ab8e Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 11:10:37 +0000 Subject: [PATCH 07/10] show more prints to show connecting to db --- deployment/lambda/extractor_and_loader/docker/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index b7cf827..929c16a 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -182,7 +182,9 @@ def get_file_uri(id): def handler(event, context): try: + print("trying to connect to db") init_db() + print("connected to db") for r in event.get("Records", []): body = json.loads(r["body"]) id_ = body.get("id") From 4d8aa2dcd30e7f41923ffa24e8af3b3b2a7953d9 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 11:25:36 +0000 Subject: [PATCH 08/10] add db --- .../extractor_and_loader/extractor_and_loader_lambda.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf index ff15347..ef1c07c 100644 --- a/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf +++ b/deployment/lambda/extractor_and_loader/extractor_and_loader_lambda.tf @@ -64,6 +64,12 @@ resource "aws_lambda_function" "extractor_and_loader" { # Increase memory (default 128 MB) memory_size = 2048 # try 1024 or 2048 MB to start + + environment { + variables = { + DATABASE_URL = "postgresql://postgres:makingwarmhomes@terraform-20250331175522503500000002.cdgzupxvdyp0.eu-west-2.rds.amazonaws.com:5432/surveyDB" + } + } } # SQS trigger From 05ff13a7e47fdf06df8359c7264cd23ea981b3a0 Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 11:46:28 +0000 Subject: [PATCH 09/10] get rid of logger --- deployment/lambda/extractor_and_loader/docker/app.py | 2 +- etl/fileReader/pdfReaderToText.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index 929c16a..510af31 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -199,7 +199,7 @@ def handler(event, context): local_path = download_private_s3_file(file_uri) print("Extracting file...") - reader = pdfReaderToText(local_path) + reader = pdfReaderToText(local_path, logger=False) obj = WarmHomesConditionReport(reader.text_list) json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) print("Extracted completed, made json") diff --git a/etl/fileReader/pdfReaderToText.py b/etl/fileReader/pdfReaderToText.py index c1d5834..a4d11a4 100644 --- a/etl/fileReader/pdfReaderToText.py +++ b/etl/fileReader/pdfReaderToText.py @@ -15,9 +15,10 @@ from pprint import pprint class pdfReaderToText(): - def __init__(self, file_path): + def __init__(self, file_path, logger=True): self.source_path = file_path - self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger() + if logger: + self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger() self.all_text = "" self.text_list = [] self.get_text_from_pdf_file() From 2bb08b4e4e8b6e8870a729b8919564709b3eea6b Mon Sep 17 00:00:00 2001 From: Jun-te kim Date: Wed, 20 Aug 2025 11:52:13 +0000 Subject: [PATCH 10/10] quick fix to not save log file --- deployment/lambda/extractor_and_loader/docker/app.py | 2 +- etl/fileReader/pdfReaderToText.py | 5 ++--- etl/utils/logger.py | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/deployment/lambda/extractor_and_loader/docker/app.py b/deployment/lambda/extractor_and_loader/docker/app.py index 510af31..929c16a 100644 --- a/deployment/lambda/extractor_and_loader/docker/app.py +++ b/deployment/lambda/extractor_and_loader/docker/app.py @@ -199,7 +199,7 @@ def handler(event, context): local_path = download_private_s3_file(file_uri) print("Extracting file...") - reader = pdfReaderToText(local_path, logger=False) + reader = pdfReaderToText(local_path) obj = WarmHomesConditionReport(reader.text_list) json_ = make_final_json(obj.master_obj[0], obj.master_obj[1]) print("Extracted completed, made json") diff --git a/etl/fileReader/pdfReaderToText.py b/etl/fileReader/pdfReaderToText.py index a4d11a4..c1d5834 100644 --- a/etl/fileReader/pdfReaderToText.py +++ b/etl/fileReader/pdfReaderToText.py @@ -15,10 +15,9 @@ from pprint import pprint class pdfReaderToText(): - def __init__(self, file_path, logger=True): + def __init__(self, file_path): self.source_path = file_path - if logger: - self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger() + self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger() self.all_text = "" self.text_list = [] self.get_text_from_pdf_file() diff --git a/etl/utils/logger.py b/etl/utils/logger.py index 3e89ac3..0597f2d 100644 --- a/etl/utils/logger.py +++ b/etl/utils/logger.py @@ -14,17 +14,17 @@ class Logger: c_handler.setLevel(level) # File handler - f_handler = logging.FileHandler(log_file) - f_handler.setLevel(level) + # f_handler = logging.FileHandler(log_file) + # f_handler.setLevel(level) # Formatter formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') c_handler.setFormatter(formatter) - f_handler.setFormatter(formatter) + # f_handler.setFormatter(formatter) # Add handlers to the logger self.logger.addHandler(c_handler) - self.logger.addHandler(f_handler) + # self.logger.addHandler(f_handler) def get_logger(self): return self.logger