From c7753c0ed676c1eeda71be6f656ba374e15d93a0 Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Thu, 13 Mar 2025 12:13:17 +0000
Subject: [PATCH] adding types and transforming my data

---
Review notes (ignored by `git am`):
- etl/transform/types.py: pydantic v2's `field_validator` does not accept
  the v1-style `pre=` / `always=` keywords and raises TypeError at import
  time; `mode='before'` is the v2 equivalent of `pre=True`. `always=True`
  is dropped because the field's default is already None, which the
  validator would return unchanged.
- Line structure, indentation and blank context lines were restored from a
  whitespace-collapsed copy of this patch; hunk line counts are unchanged.

 etl/development.py         |  1 +
 etl/pdfReader/sitenotes.py | 49 ++++++++++++++++++++++----------------
 etl/transform/types.py     | 40 ++++++++++++++++++++++++++-----
 3 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/etl/development.py b/etl/development.py
index 2f59087..bd28e13 100644
--- a/etl/development.py
+++ b/etl/development.py
@@ -16,6 +16,7 @@ pdfReader = pdfReaderToText(DATA_LOC_1)
 doc2 = pdfReader.get_reader()
 pdfReader2 = pdfReaderToText(DATA_LOC_2)
 doc1 = pdfReader2.get_reader()
+print(doc1.survey_information)
 
 # Transform
 
diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py
index ea862dc..941cc3e 100644
--- a/etl/pdfReader/sitenotes.py
+++ b/etl/pdfReader/sitenotes.py
@@ -1,5 +1,6 @@
 from etl.pdfReader.reportType import ReportType
-from transform.types import CompanyInformation
+from transform.types import CompanyInfo, SurverySummaryInfo, AssessorInfo
+from datetime import datetime
 
 class SiteNotesExtractor():
     def __init__(self, data_list):
@@ -20,8 +21,9 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
     def __init__(self, data_list):
         super().__init__(data_list)
         self.type = ReportType.QUIDOS_SITE_NOTE
-        self.setup()
         self.company_information = None
+        self.survey_information = None
+        self.setup()
 
     def setup(self):
         """
@@ -53,7 +55,7 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
         self.get_section_20()
         self.get_section_21()
         self.get_section_22()
-    
+
     def get_summary_information(self):
         # Summary Information
         avoid = [
@@ -85,28 +87,35 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
             'Emission figures including 9.92 emission factor of 0.925',
         ]
         get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1]
+
+        self.survey_information = SurverySummaryInfo(
+            reference_number = get_value('Reference Number'),
+            epc_language = get_value('EPC Language'),
+            uprn = get_value('UPRN'),
+            postcode = get_value('Postcode'),
+            region = get_value('Region'),
+            address = get_value('Address'),
+            town = get_value('Town'),
+            county = get_value('County'),
+            property_tenure = get_value('Property Tenure'),
+            transaction_type = get_value('Transaction Type'),
+            inspection_date = datetime.strptime(get_value('Inspection Date'), '%d %B %Y'),
+        )
 
-        self.reference_number = get_value('Reference Number')
-        self.epc_language = get_value('EPC Language')
-        self.UPRN = get_value('UPRN')
-        self.postcode = get_value('Postcode')
-        self.region = get_value('Region')
-        self.address = get_value('Address')
-        self.town = get_value('Town')
-        self.country = get_value('County')
-        self.property_tenure = get_value('Property Tenure')
-        self.transaction_type = get_value('Transaction Type')
-        self.inspection_date = get_value('Inspection Date')
-        self.assessor_accrediation_number = get_value("Assessor’s accreditation number")
-        self.company_info = CompanyInformation(
+
+        self.company_information = CompanyInfo(
             trading_name = get_value('Company name/trading name'),
             post_code = get_value('POST CODE'),
             fax_number = get_value('Fax number'),
-            related_party_disclosure= get_value("Related party disclosure")
+            related_party_disclosure = get_value("Related party disclosure")
+        )
+
+        self.assessor_information = AssessorInfo(
+            accreditation_number = get_value("Assessor’s accreditation number"),
+            name = get_value("Assessor’s name"),
+            phone_number = get_value("Phone number"),
+            email_address = get_value("E-mail address"),
         )
-        self.assessor_name = get_value("Assessor’s name")
-        self.assessor_phone_number = get_value("Phone number")
-        self.assessor_email_address = get_value("E-mail address")
 
         index = self.get_x_occurance(self.raw_data, "Address")
         if index:
diff --git a/etl/transform/types.py b/etl/transform/types.py
index 9fbf69c..65be368 100644
--- a/etl/transform/types.py
+++ b/etl/transform/types.py
@@ -1,8 +1,36 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, constr, field_validator, EmailStr
 from typing import Optional, List
+import re
+from datetime import datetime
+
+class CompanyInfo(BaseModel):
+    trading_name: str
+    post_code: str
+    fax_number: Optional[str] = None
+    related_party_disclosure: Optional[str] = None
+
+    @field_validator('related_party_disclosure', mode='before')
+    def set_none_if_none_of_the_above(cls, v):
+        if v == "None of the above":
+            return None
+        return v
+
+class SurverySummaryInfo(BaseModel):
+    reference_number: str
+    epc_language: str
+    uprn: str
+    postcode: str
+    region: str
+    address: str
+    town: str
+    county: Optional[str] = None
+    property_tenure: str
+    transaction_type: str
+    inspection_date: datetime
+
+class AssessorInfo(BaseModel):
+    accreditation_number: str
+    name: str
+    phone_number: Optional[str] = None
+    email_address: Optional[EmailStr] = None
 
-class CompanyInformation(BaseModel):
-    trading_name: Optional[str]
-    post_code: Optional[str]
-    fax_number: Optional[str]
-    related_party_disclosure: Optional[str]