mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
extracting from ima
This commit is contained in:
parent
6396f081c1
commit
84d4070b49
1 changed files with 59 additions and 2 deletions
|
|
@ -125,13 +125,13 @@ def extract_summary_report(pdf_path):
|
|||
- Address
|
||||
"""
|
||||
|
||||
blah
|
||||
data = {
|
||||
"Address": None,
|
||||
"Postcode": None,
|
||||
"Current SAP Rating": None,
|
||||
"Current EPC Band": None,
|
||||
"Fuel Bill": None,
|
||||
"Main Building Age Band": None,
|
||||
"Number of Storeys": None,
|
||||
"Window Age Description": None,
|
||||
"Window Age Description Proportion (%)": None,
|
||||
|
|
@ -181,6 +181,10 @@ def extract_summary_report(pdf_path):
|
|||
sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
|
||||
data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
|
||||
|
||||
# Extract age
|
||||
age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text)
|
||||
data["Main Building Age Band"] = age_band_match.group(1)
|
||||
|
||||
# Number of storeys
|
||||
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
|
||||
data["Number of Storeys"] = int(storeys_match.group(1))
|
||||
|
|
@ -3027,6 +3031,7 @@ def revised_model():
|
|||
|
||||
# We now do a large pull of all of the data
|
||||
extracted_data = []
|
||||
mtp_extracted_data = [] # Additional data to extract from the medium term plans
|
||||
for survey_folder in tqdm(survey_folders):
|
||||
survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
|
||||
|
||||
|
|
@ -3048,6 +3053,58 @@ def revised_model():
|
|||
None
|
||||
)
|
||||
|
||||
mtp_folder = next(
|
||||
(name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()),
|
||||
None
|
||||
)
|
||||
if mtp_folder:
|
||||
# We have a mid term plan:
|
||||
mtp_folder_path = os.path.join(survey_folder_path, mtp_folder)
|
||||
# Get the contents - files and not folder
|
||||
mtp_contents = [
|
||||
os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path)
|
||||
if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file))
|
||||
]
|
||||
# We check for the IMA
|
||||
for file_name in mtp_contents:
|
||||
filepath = os.path.join(survey_folder_path, file_name)
|
||||
# We expect a pdf so try and parse it
|
||||
try:
|
||||
with open(filepath, "rb") as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
# Just the first page
|
||||
text = reader.pages[0].extract_text()
|
||||
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
# We check if this is an IMA
|
||||
ima_heading_search = re.search(
|
||||
r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text
|
||||
)
|
||||
|
||||
is_ima = bool(ima_heading_search)
|
||||
if not is_ima:
|
||||
continue
|
||||
|
||||
# Otherwise, extract: RIR, PV
|
||||
pv_search = re.search(r"PV \(\d+Kwp\)", text)
|
||||
has_pv = bool(pv_search)
|
||||
pv_system = pv_search.group(0) if has_pv else None
|
||||
|
||||
rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text)
|
||||
has_rir = bool(rir_search)
|
||||
rir_spec = rir_search.group(0) if has_rir else None
|
||||
|
||||
mtp_extracted_data.append({
|
||||
"survey_folder": survey_folder,
|
||||
"has_pv": has_pv,
|
||||
"PV System": pv_system,
|
||||
"RIR Specification": rir_spec,
|
||||
"has_rir": has_rir
|
||||
})
|
||||
continue
|
||||
|
||||
# If retrofit assessment folder exists, check if it has content
|
||||
if retrofit_folder or ra_folder:
|
||||
if retrofit_folder:
|
||||
|
|
@ -3094,7 +3151,7 @@ def revised_model():
|
|||
retrofit_assessment_data = pd.DataFrame(extracted_data)
|
||||
|
||||
# retrofit_assessment_data.to_csv(
|
||||
# os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False
|
||||
# os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False
|
||||
# )
|
||||
retrofit_assessment_data = pd.read_csv(
|
||||
os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue