mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on survey match for ha107
This commit is contained in:
parent
75102704cd
commit
32352bbde1
1 changed files with 32 additions and 13 deletions
|
|
@ -40,7 +40,9 @@ class DataLoader:
|
|||
UNMATCHED_CIGA = {
|
||||
# We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not
|
||||
# the asset list
|
||||
"HA14": 4
|
||||
"HA14": 4,
|
||||
# There are just too many unmatched addresses here to validate one by one - if we identify some homes that ... (TODO: this note was left unfinished; complete it)
|
||||
"HA6": 117
|
||||
}
|
||||
|
||||
def __init__(self, directories, use_cache):
|
||||
|
|
@ -78,11 +80,11 @@ class DataLoader:
|
|||
elif ha_name == "HA107":
|
||||
# Create matching_address by concatenating House No, Street, Town, District, Postcode
|
||||
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["Street"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Town"].str.lower().str.strip() + ", " + \
|
||||
asset_list["District"].str.lower().str.strip() + ", " + \
|
||||
asset_list["Postcode"].str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
|
||||
asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["District"].astype(str).str.lower().str.strip() + ", " + \
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||
else:
|
||||
raise NotImplementedError("implement me")
|
||||
|
||||
|
|
@ -155,6 +157,13 @@ class DataLoader:
|
|||
else:
|
||||
return "CIGA"
|
||||
|
||||
@staticmethod
|
||||
def get_survey_sheetname(workbook):
|
||||
if "ECO Surveys" in workbook.sheetnames:
|
||||
return "ECO Surveys"
|
||||
else:
|
||||
return "ECO surveys"
|
||||
|
||||
def load_asset_list(self, filepath, ha_name):
|
||||
workbook = openpyxl.load_workbook(filepath)
|
||||
asset_sheetname = self.get_asset_sheetname(workbook)
|
||||
|
|
@ -189,8 +198,13 @@ class DataLoader:
|
|||
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
|
||||
asset_list = asset_list_correction_function(asset_list)
|
||||
|
||||
# For HA1, there is an exception in the structure of the data. We don't have any survey or ciga lists, and so
|
||||
# we can return the asset list now
|
||||
if ha_name == "HA1":
|
||||
return asset_list, pd.DataFrame(), pd.DataFrame()
|
||||
|
||||
# We check if there is a survey list
|
||||
survey_sheetname = "ECO Surveys"
|
||||
survey_sheetname = self.get_survey_sheetname(workbook)
|
||||
survey_sheet = workbook[survey_sheetname]
|
||||
survey_rows = []
|
||||
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
|
||||
|
|
@ -217,6 +231,9 @@ class DataLoader:
|
|||
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
|
||||
# Remove columns that are None
|
||||
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
|
||||
# Remove rows with missing postcode which happens in a small number of cases
|
||||
ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
|
||||
|
||||
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
|
||||
# Perform ciga list merge
|
||||
if not ciga_list.empty:
|
||||
|
|
@ -414,6 +431,10 @@ class DataLoader:
|
|||
|
||||
return survey_list
|
||||
|
||||
@staticmethod
|
||||
def correct_ha107_survey_list(survey_list):
|
||||
return survey_list
|
||||
|
||||
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
|
||||
|
||||
# Correct the survey list
|
||||
|
|
@ -441,7 +462,7 @@ class DataLoader:
|
|||
|
||||
df = df[df["matching_address"].str.contains(str(house_number))]
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["HouseNo"] == str(house_number)]
|
||||
df = df[df["HouseNo"].astype(str) == str(house_number)]
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
|
||||
if df.shape[0] != 1:
|
||||
|
|
@ -506,6 +527,7 @@ class DataLoader:
|
|||
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
|
||||
matching_lookup = []
|
||||
unmatched_addresses = []
|
||||
|
||||
for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):
|
||||
|
||||
house_number = row["HouseNo"]
|
||||
|
|
@ -528,7 +550,7 @@ class DataLoader:
|
|||
}
|
||||
)
|
||||
continue
|
||||
|
||||
|
||||
if df.shape[0] != 1:
|
||||
|
||||
# We split house number and postcode out of the matched address for ciga
|
||||
|
|
@ -561,9 +583,6 @@ class DataLoader:
|
|||
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
|
||||
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
|
||||
|
||||
# In ciga: 35 Valley Drive, Leicester, LE3 3EE
|
||||
#
|
||||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
|
||||
# Merge onto the ciga list
|
||||
|
|
@ -612,7 +631,7 @@ class DataLoader:
|
|||
for filepath in self.directories:
|
||||
ha_name = filepath.split("/")[2]
|
||||
# Load asset list
|
||||
logger.info("Loading asset list for {}".format(ha_name))
|
||||
logger.info("Loading data for {}".format(ha_name))
|
||||
asset_list, survey_list, ciga_list = self.load_asset_list(
|
||||
filepath=filepath,
|
||||
ha_name=ha_name,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue