Add the basic structure of the extraction code

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 10:57:26 +00:00
parent 8325f1bf7a
commit 7513e475d3
4 changed files with 95 additions and 2 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -0,0 +1,92 @@
import os
import PyPDF2
import re
# Root folder that main() scans: each directory directly beneath it is treated as one survey folder.
# NOTE: this is an absolute path on one developer's machine, so the script only runs as-is on that machine.
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
def extract_summary_report(pdf_path):
    """
    Extracts specific data from the provided PDF file.

    Data includes:
    - Current SAP rating
    - Fuel Bill
    - Emissions (t/year)

    The text of every page is extracted and concatenated, then each value is located by searching
    for its label with a regular expression.

    :param pdf_path: Path of the PDF file to read.
    :return: A dict with the keys "Current SAP rating", "Fuel Bill" and "Emissions (t/year)".
        A value is None when its label/value pattern is not found in the extracted text.
        Errors raised while opening or parsing the file are not caught here.
    """
    data = {
        "Current SAP rating": None,
        "Fuel Bill": None,
        "Emissions (t/year)": None,
    }

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the text in one join; `or ""` keeps a page without extractable text from
        # breaking the concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    # Extract Current SAP rating: a band letter, a space, then the numeric score (e.g. "C 72")
    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
    if sap_match:
        data["Current SAP rating"] = sap_match.group(1)

    # Extract Fuel Bill: accept thousands separators and decimals so that "£1,234.56" is not
    # truncated to "£1"
    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+(?:,\d{3})*(?:\.\d+)?)", text)
    if fuel_bill_match:
        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    # Extract Emissions
    emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
    if emissions_match:
        data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"

    return data
def main(base_path=None):
    """
    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.

    For every survey folder under the root, the sub-folder whose name contains "retrofit assessment"
    (case-insensitive) is searched for a summary report PDF, and the values extracted from that
    report are collected together with the survey folder name.

    :param base_path: Root folder holding one directory per survey. Defaults to FILE_PATH when None.
    :raises NotImplementedError: When a non-empty retrofit assessment folder has no summary report
        PDF; handling for that case has not been written yet.
    """
    root = FILE_PATH if base_path is None else base_path

    # List only directories in the root folder
    survey_folders = [name for name in os.listdir(root) if os.path.isdir(os.path.join(root, name))]

    # Collected per-survey results; nothing consumes this list yet.
    extracted_data = []

    for survey_folder in survey_folders:
        survey_path = os.path.join(root, survey_folder)

        # List the folders inside of the survey folder
        survey_subfolders = [
            name for name in os.listdir(survey_path)
            if os.path.isdir(os.path.join(survey_path, name))
        ]
        if not survey_subfolders:
            continue

        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment".
        # If it exists, we will use the data from that folder
        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
        if retrofit_folder is None:
            # No retrofit assessment folder: skip the survey rather than passing None to os.path.join
            continue

        # List contents of the retrofit folder
        retrofit_path = os.path.join(survey_path, retrofit_folder)
        retrofit_files = os.listdir(retrofit_path)
        if not retrofit_files:
            continue

        # We now look for specific files:
        # 1) Check the summary report - the title will contain the word "summary" and the file extension is
        # .pdf (both compared case-insensitively)
        summary_report = next(
            (
                name for name in retrofit_files
                if "summary" in name.lower() and name.lower().endswith(".pdf")
            ),
            None,
        )
        if summary_report is not None:
            summary_data = extract_summary_report(os.path.join(retrofit_path, summary_report))
            extracted_data.append({"survey_folder": survey_folder, **summary_data})
            continue

        # No summary report was found and no alternative source is handled yet
        raise NotImplementedError("IMPLEMENT ME!")
# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1 @@
PyPDF2