adding types and transforming my data

This commit is contained in:
Jun-te Kim 2025-03-13 12:13:17 +00:00
parent 85b7b1d772
commit c7753c0ed6
3 changed files with 64 additions and 26 deletions

View file

@ -16,6 +16,7 @@ pdfReader = pdfReaderToText(DATA_LOC_1)
# Build a reader per input location and pull the extracted document out of each.
# NOTE(review): the numbering is crossed — `doc2` comes from `pdfReader`
# (created from DATA_LOC_1) while `doc1` comes from `pdfReader2` (DATA_LOC_2).
# Confirm this is intentional before relying on the names downstream.
doc2 = pdfReader.get_reader()
pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc1 = pdfReader2.get_reader()
# Print the survey summary attribute of the DATA_LOC_2 document.
print(doc1.survey_information)
# Transform

View file

@ -1,5 +1,6 @@
from etl.pdfReader.reportType import ReportType
from transform.types import CompanyInformation
from transform.types import CompanyInfo, SurverySummaryInfo, AssessorInfo
from datetime import datetime
class SiteNotesExtractor():
def __init__(self, data_list):
@ -20,8 +21,9 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
def __init__(self, data_list):
    """Create the extractor and immediately run extraction.

    Args:
        data_list: Raw extracted data, forwarded unchanged to the base
            class initialiser.
    """
    super().__init__(data_list)
    self.type = ReportType.QUIDOS_SITE_NOTE
    # Declare the result attributes *before* running setup(): assigning
    # None after setup() would wipe whatever the extraction stored.
    self.company_information = None
    self.survey_information = None
    self.assessor_information = None
    # Run the extraction exactly once, after the placeholders exist.
    self.setup()
def setup(self):
"""
@ -53,7 +55,7 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
self.get_section_20()
self.get_section_21()
self.get_section_22()
def get_summary_information(self):
# Summary Information
avoid = [
@ -85,28 +87,35 @@ class QuidosSiteNotesExtractor(SiteNotesExtractor):
'Emission figures including 9.92 emission factor of 0.925',
]
get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1]
self.survey_information = SurverySummaryInfo(
reference_number = get_value('Reference Number'),
epc_language = get_value('EPC Language'),
uprn = get_value('UPRN'),
postcode = get_value('Postcode'),
region = get_value('Region'),
address = get_value('Address'),
town = get_value('Town'),
county = get_value('County'),
property_tenure = get_value('Property Tenure'),
transaction_type = get_value('Transaction Type'),
inspection_date = datetime.strptime(get_value('Inspection Date'), '%d %B %Y'),
)
self.reference_number = get_value('Reference Number')
self.epc_language = get_value('EPC Language')
self.UPRN = get_value('UPRN')
self.postcode = get_value('Postcode')
self.region = get_value('Region')
self.address = get_value('Address')
self.town = get_value('Town')
self.country = get_value('County')
self.property_tenure = get_value('Property Tenure')
self.transaction_type = get_value('Transaction Type')
self.inspection_date = get_value('Inspection Date')
self.assessor_accrediation_number = get_value("Assessors accreditation number")
self.company_info = CompanyInformation(
self.company_information = CompanyInfo(
trading_name = get_value('Company name/trading name'),
post_code = get_value('POST CODE'),
fax_number = get_value('Fax number'),
related_party_disclosure= get_value("Related party disclosure")
related_party_disclosure = get_value("Related party disclosure")
)
self.assessor_information = AssessorInfo(
accreditation_number = get_value("Assessors accreditation number"),
name = get_value("Assessors name"),
phone_number = get_value("Phone number"),
email_address = get_value("E-mail address"),
)
self.assessor_name = get_value("Assessors name")
self.assessor_phone_number = get_value("Phone number")
self.assessor_email_address = get_value("E-mail address")
index = self.get_x_occurance(self.raw_data, "Address")
if index:

View file

@ -1,8 +1,36 @@
from pydantic import BaseModel
from pydantic import BaseModel, constr, field_validator, EmailStr
from typing import Optional, List
import re
from datetime import datetime
class CompanyInfo(BaseModel):
    """Company details taken from a report.

    ``trading_name`` and ``post_code`` are required strings; ``fax_number``
    and ``related_party_disclosure`` are optional and default to ``None``.
    """

    trading_name: str
    post_code: str
    fax_number: Optional[str] = None
    related_party_disclosure: Optional[str] = None

    # pydantic v2's ``field_validator`` takes ``mode='before'`` instead of the
    # v1-style ``pre=True`` and has no ``always`` keyword; passing either
    # raises TypeError when the class is defined. The field's default is
    # already None, so validating the default is not needed.
    @field_validator('related_party_disclosure', mode='before')
    @classmethod
    def set_none_if_none_of_the_above(cls, v):
        """Normalise the placeholder answer "None of the above" to ``None``.

        Args:
            v: Raw incoming value for ``related_party_disclosure``.

        Returns:
            ``None`` when ``v`` equals "None of the above", otherwise ``v``
            unchanged.
        """
        if v == "None of the above":
            return None
        return v
class SurverySummaryInfo(BaseModel):
    """Summary section of a survey report.

    Every field is a required string except ``county`` (optional, defaults
    to ``None``) and ``inspection_date`` (a ``datetime``).
    """

    # Report identification
    reference_number: str
    epc_language: str
    uprn: str

    # Property location
    postcode: str
    region: str
    address: str
    town: str
    county: Optional[str] = None

    # Survey context
    property_tenure: str
    transaction_type: str
    inspection_date: datetime
class AssessorInfo(BaseModel):
    """Assessor details attached to a report.

    ``accreditation_number`` and ``name`` are required; the contact fields
    are optional and default to ``None``.
    """

    # Identity (required)
    accreditation_number: str
    name: str

    # Contact details (optional); the e-mail is validated via EmailStr.
    phone_number: Optional[str] = None
    email_address: Optional[EmailStr] = None
class CompanyInformation(BaseModel):
    """Company details with every field nullable.

    NOTE(review): none of the fields declares a default. Under pydantic v2
    that presumably makes each one required-but-nullable (it must be passed,
    possibly as ``None``) — confirm against callers.
    """

    trading_name: Optional[str]
    post_code: Optional[str]
    fax_number: Optional[str]
    related_party_disclosure: Optional[str]