From c68e4f017e48f4cb12639cbd9f69ce40849e68fd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:43:59 +0000 Subject: [PATCH] additional data cleaning --- etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bc567bd2..c6736ba8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -86,12 +86,8 @@ def extract_epr(pdf_path): data["Address"] = address_match.group(1).strip() # Extract Total Floor Area - area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - data["Total Floor Area"] = area_match.group(1) - - # Extract Estimated Annual Costs - cost_match = re.search(r"TOTAL\s*£(\d+)", text) - data["Estimated Annual Costs"] = f"£{cost_match.group(1)}" + # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) + # data["Total Floor Area"] = area_match.group(1) # Extract Current SAP rating # Updated Regular Expression to find "GG (1-20)" followed by two numbers @@ -216,6 +212,5 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - -if __name__ == "__main__": - main() +# if __name__ == "__main__": +# main()