diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
new file mode 100644
index 00000000..bd916494
--- /dev/null
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -0,0 +1,121 @@
+import os
+import re
+
+import PyPDF2
+
+FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
+
+# Patterns for the fields read from the text of a summary report.
+SAP_RATING_PATTERN = re.compile(r"Current SAP rating:\s*([A-Z] \d+)")
+# Accepts thousands separators and pence (e.g. "£1,234.56") so larger bills are not truncated.
+FUEL_BILL_PATTERN = re.compile(r"Fuel Bill:\s*£(\d+(?:,\d{3})*(?:\.\d+)?)")
+EMISSIONS_PATTERN = re.compile(r"Emissions \(t/year\):\s*([\d.]+) tonnes")
+
+
+def _parse_summary_text(text):
+    """
+    Pulls the SAP rating, fuel bill and emissions figures out of summary report text.
+
+    :param text: Text extracted from a summary report.
+    :return: Dictionary with the three fields; a field whose label is not found stays None.
+    """
+    data = {
+        "Current SAP rating": None,
+        "Fuel Bill": None,
+        "Emissions (t/year)": None,
+    }
+
+    # Extract Current SAP rating
+    sap_match = SAP_RATING_PATTERN.search(text)
+    if sap_match:
+        data["Current SAP rating"] = sap_match.group(1)
+
+    # Extract Fuel Bill
+    fuel_bill_match = FUEL_BILL_PATTERN.search(text)
+    if fuel_bill_match:
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+    # Extract Emissions
+    emissions_match = EMISSIONS_PATTERN.search(text)
+    if emissions_match:
+        data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
+
+    return data
+
+
+def extract_summary_report(pdf_path):
+    """
+    Extracts specific data from the provided PDF file.
+
+    Data includes:
+    - Current SAP rating
+    - Fuel Bill
+    - Emissions (t/year)
+
+    :param pdf_path: Path to the summary report PDF.
+    :return: Dictionary keyed by the three field names above; a field that cannot be found is None.
+    """
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        # Defensive: treat a page that yields no text as an empty string so the join cannot fail
+        text = "".join(page.extract_text() or "" for page in reader.pages)
+
+    return _parse_summary_text(text)
+
+
+def main():
+    """
+    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+
+    For each survey folder under FILE_PATH, the summary report PDF inside its "retrofit assessment"
+    subfolder is read with extract_summary_report.
+
+    :return: List of dictionaries, one per summary report found, holding the survey folder name and
+        the extracted fields.
+    :raises NotImplementedError: If a retrofit assessment folder has files but no summary report PDF.
+    """
+    # List only directories in the specified FILE_PATH
+    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+
+    extracted_data = []
+    for survey_folder in survey_folders:
+        survey_path = os.path.join(FILE_PATH, survey_folder)
+
+        # List the folders inside of the survey folder
+        survey_subfolders = [
+            name for name in os.listdir(survey_path) if os.path.isdir(os.path.join(survey_path, name))
+        ]
+
+        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment".
+        # If it exists, we will use the data from that folder; otherwise the survey is skipped.
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+        if retrofit_folder is None:
+            # Joining a None folder name into a path would raise TypeError
+            continue
+
+        # List contents of the retrofit folder
+        retrofit_path = os.path.join(survey_path, retrofit_folder)
+        retrofit_files = os.listdir(retrofit_path)
+        if not retrofit_files:
+            continue
+
+        # We now look for specific files:
+        # 1) The summary report - the file name contains the word "summary" and the extension is .pdf
+        #    (both compared case-insensitively)
+        summary_report = next(
+            (name for name in retrofit_files if "summary" in name.lower() and name.lower().endswith(".pdf")), None
+        )
+        if summary_report is not None:
+            summary_data = extract_summary_report(os.path.join(retrofit_path, summary_report))
+            extracted_data.append({"survey_folder": survey_folder, **summary_data})
+            continue
+
+        # NOTE(review): assumed to sit inside the loop (folders without a summary report are not
+        # handled yet) - confirm the intended placement.
+        raise NotImplementedError("IMPLEMENT ME!")
+
+    return extracted_data
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
new file mode 100644
index 00000000..e9a5c8ea
--- /dev/null
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -0,0 +1 @@
+PyPDF2