additional data cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 12:43:59 +00:00
parent 33ea47e71d
commit c68e4f017e

View file

@@ -86,12 +86,8 @@ def extract_epr(pdf_path):
data["Address"] = address_match.group(1).strip()
# Extract Total Floor Area
area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
data["Total Floor Area"] = area_match.group(1)
# Extract Estimated Annual Costs
cost_match = re.search(r"TOTAL\s*£(\d+)", text)
data["Estimated Annual Costs"] = f"£{cost_match.group(1)}"
# area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
# data["Total Floor Area"] = area_match.group(1)
# Extract Current SAP rating
# Updated Regular Expression to find "GG (1-20)" followed by two numbers
@@ -216,6 +212,5 @@ def main():
extracted_data = pd.DataFrame(extracted_data)
if __name__ == "__main__":
main()
# if __name__ == "__main__":
# main()