mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
adding in the basic structure of the extraction code
This commit is contained in:
parent
8325f1bf7a
commit
7513e475d3
4 changed files with 95 additions and 2 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
92
etl/customers/stonewater/Wave 3 Preparation.py
Normal file
92
etl/customers/stonewater/Wave 3 Preparation.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
import os
|
||||
import PyPDF2
|
||||
import re
|
||||
|
||||
# Root directory that main() lists to find the per-survey folders.
# NOTE(review): hard-coded absolute path that looks machine-specific — confirm
# or make configurable before running this anywhere else.
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
|
||||
|
||||
|
||||
def _parse_summary_text(text):
    """
    Pull the headline figures out of the plain text of a summary report.

    Args:
        text: The concatenated text of every page of the report.

    Returns:
        dict with the keys "Current SAP rating", "Fuel Bill" and
        "Emissions (t/year)". A value is None when its label was not found
        in the text.
    """
    data = {
        "Current SAP rating": None,
        "Fuel Bill": None,
        "Emissions (t/year)": None,
    }

    # A single capital letter, a space, then the numeric score (e.g. "D 62").
    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
    if sap_match:
        data["Current SAP rating"] = sap_match.group(1)

    # Allow thousands separators and a decimal part so that an amount such as
    # "£1,234.50" is captured whole instead of being cut off at "£1".
    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+(?:,\d{3})*(?:\.\d+)?)", text)
    if fuel_bill_match:
        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
    if emissions_match:
        data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"

    return data


def extract_summary_report(pdf_path):
    """
    Extracts specific data from the provided PDF file.

    Data includes:
    - Current SAP rating
    - Fuel Bill
    - Emissions (t/year)

    Args:
        pdf_path: Path of the summary report PDF to read.

    Returns:
        dict keyed by the three labels above; a value is None when the
        corresponding label could not be found in the PDF text.

    Raises:
        OSError: If the file at pdf_path cannot be opened.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open. `or ""` guards against a page
        # that yields no text, and join avoids repeated string concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    return _parse_summary_text(text)
|
||||
|
||||
|
||||
def main():
    """
    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.

    For every survey folder directly under FILE_PATH, locates the subfolder
    whose name contains "retrofit assessment", finds the summary report PDF
    inside it and extracts the headline figures from that report.

    Returns:
        list of dicts, one per survey folder with a summary report. Each dict
        holds "survey_folder" plus the fields returned by
        extract_summary_report.

    Raises:
        NotImplementedError: If a retrofit assessment folder has files but no
            summary report PDF; that case is not handled yet.
    """
    # List only directories in the specified FILE_PATH
    survey_folders = [
        name for name in os.listdir(FILE_PATH)
        if os.path.isdir(os.path.join(FILE_PATH, name))
    ]

    extracted_data = []
    for survey_folder in survey_folders:
        survey_path = os.path.join(FILE_PATH, survey_folder)

        # List the folders inside of the survey folder
        survey_subfolders = [
            name for name in os.listdir(survey_path)
            if os.path.isdir(os.path.join(survey_path, name))
        ]
        if not survey_subfolders:
            continue

        # Check for a folder inside of the survey_subfolders containing the
        # phrase "retrofit assessment". If it exists, we use the data from it.
        retrofit_folder = next(
            (name for name in survey_subfolders if "retrofit assessment" in name.lower()),
            None,
        )
        if retrofit_folder is None:
            # Without this guard os.path.join() below would be handed None and
            # raise TypeError; skip the survey like the other "nothing to
            # read" cases.
            continue

        # List contents of the retrofit folder
        retrofit_path = os.path.join(survey_path, retrofit_folder)
        retrofit_files = os.listdir(retrofit_path)
        if not retrofit_files:
            continue

        # We now look for specific files:
        # 1) The summary report - the file name contains the word "summary"
        #    and has a .pdf extension (both compared case-insensitively).
        summary_report = next(
            (
                name for name in retrofit_files
                if "summary" in name.lower() and name.lower().endswith(".pdf")
            ),
            None,
        )
        if summary_report is not None:
            pdf_path = os.path.join(retrofit_path, summary_report)
            summary_data = extract_summary_report(pdf_path)
            extracted_data.append({"survey_folder": survey_folder, **summary_data})
            continue

        # NOTE(review): placeholder for surveys that have no summary report.
        # Kept inside the loop because the `continue` above implies more
        # per-survey handling was meant to follow — confirm the intended
        # placement.
        raise NotImplementedError("IMPLEMENT ME!")

    return extracted_data
|
||||
|
||||
|
||||
# Run the preparation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1 @@
|
|||
PyPDF2
|
||||
Loading…
Add table
Reference in a new issue