survey-extraction/etl/validator/validator.py
2025-03-17 21:25:17 +00:00

75 lines
2.2 KiB
Python

import logging
import os
import re
from datetime import date

from etl.pdfReader.pdfReaderToText import pdfReaderToText
from etl.pdfReader.reportType import ReportType
from etl.utils.logger import Logger
class DomnaSharePointValidator:
    """
    A simple class to check certain things are in certain format in the Domna
    SharePoint shared with surveyors.

    All checks are static methods returning a bool; the instance only carries
    a logger.
    """

    # Format requested by Nic: "W.C. DD.MM.YYYY".
    # Used with fullmatch() so nothing (not even a trailing newline) may follow
    # the year, and compiled with re.ASCII so \d only accepts the digits 0-9.
    _WEEK_COMMENCING_PATTERN = re.compile(
        r"W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})",
        re.ASCII,
    )

    # Housing association names supplied by Nic, stored upper-cased because
    # lookups upper-case the candidate before comparing.
    # NOTE(review): entries are matched exactly; confirm spellings such as
    # "MIDLANDS HEART" and "LIVE WEST" against the names used in SharePoint.
    _HOUSING_ORGANISATIONS = frozenset({
        "SOUTHERN HOUSING",
        "UNITAS",
        "SETTLE",
        "PLUS DANE",
        "MIDLANDS HEART",
        "EASTLIGHT",
        "ROOFTOP",
        "RMG GROUP",
        "BROMFORD",
        "PLACES FOR PEOPLE",
        "SOUTHEND-ON-SEA COMMUNITY HOUSING",
        "THRIVE HOUSING",
        "ANCHOR GROUP",
        "LAMBETH COUNCIL",
        "ACIS GROUP",
        "WATFORD HOUSING",
        "ASPIRE",
        "BROADLANDS",
        "LIVE WEST",
        "GUINNESS",
        "SOVEREIGN",
        "WHITE HORSE HOUSING",
        "PRIVATE HOUSING",
        "MUIR",
        "TOWER HAMLETS",
        "FOR HOUSING",
        "CAMBRIDGE",
        "PAPWORTH TRUST",
    })

    def __init__(self):
        self.logger = Logger(name='DomnaSharePointValidator', level=logging.DEBUG).get_logger()

    @staticmethod
    def _is_valid_week_commencing(value) -> bool:
        """Return True if *value* is a "W.C. DD.MM.YYYY" string naming a real date."""
        if not isinstance(value, str):
            return False
        found = DomnaSharePointValidator._WEEK_COMMENCING_PATTERN.fullmatch(value)
        if found is None:
            return False
        day, month, year = (int(part) for part in found.groups())
        try:
            # The regex only bounds the digits; date() rejects impossible
            # combinations such as 31.02, 29.02 in a non-leap year, or year 0000.
            date(year, month, day)
        except ValueError:
            return False
        return True

    @staticmethod
    def valid_dates(list_of_dates_to_check) -> bool:
        """
        Check that every entry is a week-commencing label of the form
        "W.C. DD.MM.YYYY" that refers to a real calendar date.

        Args:
            list_of_dates_to_check: iterable of candidate strings.

        Returns:
            True if all entries are valid (also for an empty iterable),
            False as soon as one entry is malformed, is not a string, or
            names a date that does not exist.
        """
        return all(
            DomnaSharePointValidator._is_valid_week_commencing(entry)
            for entry in list_of_dates_to_check
        )

    @staticmethod
    def is_quidos_presite(file_path) -> bool:
        """
        Report whether the PDF at *file_path* is a Quidos pre-site note.

        The file is wrapped in ``pdfReaderToText`` and the type it reports via
        ``get_file_type()`` is compared with ``ReportType.QUIDOS_PRESITE_NOTE``.
        """
        reader = pdfReaderToText(file_path)
        report_type = reader.get_file_type()
        return report_type == ReportType.QUIDOS_PRESITE_NOTE

    @staticmethod
    def valid_social_housing_associations(list_) -> bool:
        """
        Check that every name is one of the known housing associations.

        Nic supplied the list of housing association names; it will most
        likely be used in future, so it is kept here. Comparison is
        case-insensitive (names are upper-cased before lookup) but otherwise
        exact.

        Args:
            list_: iterable of housing association names.

        Returns:
            True if all names are known (also for an empty iterable),
            False if any name is unknown or is not a string.
        """
        known = DomnaSharePointValidator._HOUSING_ORGANISATIONS
        return all(
            isinstance(name, str) and name.upper() in known
            for name in list_
        )