mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
matching algorithm wip
This commit is contained in:
parent
fd98721748
commit
231069f4e3
1 changed files with 274 additions and 1 deletions
|
|
@ -3077,7 +3077,280 @@ def revised_model():
|
|||
extracted_data.append(summary_data)
|
||||
|
||||
# One row per extracted retrofit-assessment summary.
retrofit_assessment_data = pd.DataFrame(extracted_data)
# TODO - Save this data

# Remove some definite duplicates.
# keep=False flags every occurrence of a repeated Address (not only the later
# repeats), which is what the previous duplicated() + isin() pair computed.
dupes = retrofit_assessment_data[
    retrofit_assessment_data["Address"].duplicated(keep=False)
].sort_values("Address")

# Among the duplicated addresses, drop every survey folder whose name ends with
# "ROSS". na=False keeps a missing folder name from poisoning the boolean mask.
to_drop = (
    dupes.loc[dupes["survey_folder"].str.endswith("ROSS", na=False), "survey_folder"]
    .unique()
    .tolist()
)

# Folders that are always excluded, in addition to the ones detected above.
known_duplicate_folders = [
    "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
    "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
    "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS",
]

# .copy() makes the filtered frame own its data, so the column assignment below
# is not a chained assignment on a slice (SettingWithCopyWarning).
retrofit_assessment_data = retrofit_assessment_data[
    ~retrofit_assessment_data["survey_folder"].isin(known_duplicate_folders + to_drop)
].copy()

# Replace \n with "" (literal replacement, independent of the pandas regex default).
retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace(
    "\n", "", regex=False
)
|
||||
|
||||
# retrofit_assessment_data.to_csv(
|
||||
# os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False
|
||||
# )
|
||||
|
||||
# We can read in the data as needed
|
||||
|
||||
# Next Step: Read in the coordinated measures and match to the extracted data
|
||||
############################################################
|
||||
# CCS
|
||||
#############################################################
|
||||
# Load the CCS installation-compliance workbook; header=4 takes the column
# labels from row index 4 (0-based) of the sheet.
ccs_workbook_path = os.path.join(
    CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"
)
ccs_coordination_sheet = pd.read_excel(ccs_workbook_path, header=4)
ccs_coordination_sheet["contractor"] = "CCS"

# The sheet is used as two positional sections: the trailing 21 rows (held in
# the "removed from programme" variable) and the leading 87 rows. Any rows in
# between are not carried over.
# NOTE(review): the row counts are tied to the current workbook layout - confirm
# whenever the spreadsheet is re-exported.
ccs_coordination_removed_from_programme = ccs_coordination_sheet.iloc[-21:]
ccs_coordination_sheet = ccs_coordination_sheet.iloc[:87]

ccs_sections = [ccs_coordination_removed_from_programme, ccs_coordination_sheet]
ccs_coordination = pd.concat(ccs_sections)
|
||||
|
||||
############################################################
|
||||
# WATES
|
||||
#############################################################
|
||||
# Load the Vinci-Wates installation-compliance workbook; header=4 takes the
# column labels from row index 4 (0-based) of the sheet.
wates_workbook_path = os.path.join(
    CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx"
)
wates_coordination_sheet = pd.read_excel(wates_workbook_path, header=4)
wates_coordination_sheet["contractor"] = "Wates"

# Break into the different sites. Every section is a fixed positional row range
# of the sheet; the rows that fall between two ranges are left out.
# NOTE(review): the ranges are tied to the current workbook layout - confirm
# whenever the spreadsheet is re-exported.
wates_coordination_sheet_wiltshere = wates_coordination_sheet.iloc[:267]
wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332]
wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409]
wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520]
wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567]
wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581]
wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926]
wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972]

# Stack the sections back into a single frame, in the order listed.
wates_sections = [
    wates_coordination_sheet_wiltshere,
    wates_coordination_sheet_herefordshire,
    wates_coordination_sheet_coventry,
    wates_coordination_sheet_bedfordshire,
    wates_coordination_sheet_bournemouth,
    wates_coordination_sheet_cambridgeshire,
    wates_coordination_sheet_removed_from_programme,
    wates_coordination_sheet_abeyance,
]
wates_coordination = pd.concat(wates_sections)
|
||||
|
||||
# Combine the data back
|
||||
|
||||
############################################################
|
||||
# NEW 450 COORDINATED RETROFIT ASSESSMENTS
|
||||
#############################################################
|
||||
|
||||
# Load the retrofit packages board export; header=4 takes the column labels
# from row index 4 (0-based) of the sheet.
board_workbook_path = os.path.join(
    CUSTOMER_FOLDER_PATH,
    "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx",
)
retrofit_packages_board = pd.read_excel(board_workbook_path, header=4)

# Discard rows that have no "Name" value.
has_name = retrofit_packages_board["Name"].notna()
retrofit_packages_board = retrofit_packages_board[has_name]

# Take just the rows that have been surveyed, i.e. whose "RA" column reads
# "Invoiced" or "Completed".
surveyed_statuses = ["Invoiced", "Completed"]
was_surveyed = retrofit_packages_board["RA"].isin(surveyed_statuses)
retrofit_packages_board = retrofit_packages_board[was_surveyed]
|
||||
|
||||
# Hand-maintained overrides: property "Name" -> the exact survey_folder value to
# use for that property. The matching loops look a home's "Name" up here before
# falling back to matching on postcode.
manual_filters = {
    "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
    "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
    "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
    "Flat 18, 1 Raglan Court": "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
    "14 Raglan Court, 1 Devizes Avenue": "StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT",
    "19 South Road": "StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY",
    "Flat 12 Pelican Lane": "StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN",
    "Flat C, 44 St Leonards Avenue": "StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB",
    "16 The Crescent, Kington": "StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS",
    "2 School Lane, Leominster": "StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA",
    "14 South Road": "StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY",
    "1 Groves Street": "StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW",
    "2 Calshot Walk": "StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS",
    "21 Constitution Hill": "StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX",
    "22 Constitution Hill": "StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX",
    # Same survey folder as "2 School Lane, Leominster" above.
    "2 Marches Cottages, School Lane, Leominster": "StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA",
    "26, Copthorn House, Brighton Road": "StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ",
    "4, Old St Marys, Ripley Lane": "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
    "1 Nelson House, Short Street": "StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX",
    # The embedded spaces in this folder name are reproduced exactly as given.
    "18 Nelson House, Short Street": "StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX",
    "3 Nelson House, Short Street": "StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX",
    "16, Copthorn House, Brighton Road": "StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ",
    "20 Nelson House, Short Street": "StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX",
    "7 Croft Street": "StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA",
}
|
||||
|
||||
# We now match this retrofit packages board to the extracted data
|
||||
import re  # local import: Python's str.replace is literal, so stripping punctuation needs re.sub

# Match every surveyed home on the retrofit packages board to exactly one
# survey_folder in retrofit_assessment_data. The result is a DataFrame with one
# row per matched home: survey_folder, Address ID, Name. Homes with no candidate
# are skipped; homes that stay ambiguous raise.
punctuation_pattern = r"[^\w\s]"  # anything that is neither a word character nor whitespace

matched_homes = []
for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
    name = home["Name"]

    if name in manual_filters:
        # Manual override: take the mapped folder as-is. Several overrides point at
        # folders that do not contain the board name (e.g. "19 South Road" ->
        # ".../19 The Oaks, South Road, ..."), so re-checking the name here would
        # throw the override away.
        # NOTE(review): the nesting of the name check was not recoverable from the
        # flattened source - confirm overrides were meant to bypass it.
        filtered = retrofit_assessment_data[
            retrofit_assessment_data["survey_folder"] == manual_filters[name]
        ].copy()
    else:
        filtered = retrofit_assessment_data[
            retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
        ].copy()

        # We check that home["Name"] is contained in the survey_folder, after
        # removing punctuation from both sides. regex=True is explicit because the
        # pandas default for Series.str.replace is version dependent; the name is
        # compared literally (regex=False) so characters such as "(" or "+" in an
        # address cannot be interpreted as a pattern.
        folders_clean = filtered["survey_folder"].str.replace(punctuation_pattern, "", regex=True)
        name_clean = re.sub(punctuation_pattern, "", name)

        # First attempt: ignore the word "Flat" in the board name.
        to_filter = folders_clean.str.contains(
            name_clean.replace("Flat", "").lstrip(), case=False, regex=False, na=False
        )
        if to_filter.sum() == 0:
            # Second attempt: keep "Flat" in the name.
            to_filter = folders_clean.str.contains(name_clean, case=False, regex=False, na=False)
        filtered = filtered[to_filter]

    if filtered.empty:
        continue

    if filtered.shape[0] > 1:
        # Several candidates left: home["Name"] should be contained verbatim in the survey_folder.
        filtered = filtered[
            filtered["survey_folder"].str.contains(name, case=False, regex=False, na=False)
        ]

        # We have an edge case where some properties have two outputs in Sharepoint.
        # The two raises below are deliberate tripwires until the right folder is chosen.
        if name == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
            # TODO: filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
            raise Exception("Fix me1")

        if name == "1 Cluny Way" and home["Postcode"] == "SG15 6ZB":
            # TODO: filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
            raise Exception("Fix me2")

        if name == "2 Bromyard Road" and home["Postcode"] == "WR15 8BZ":
            filtered = filtered[
                filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"
            ]

        if filtered.empty:
            continue
        if filtered.shape[0] != 1:
            raise Exception("something went wrong")

    # Exactly one candidate remains at this point.
    matched_homes.append(
        {
            "survey_folder": filtered["survey_folder"].values[0],
            "Address ID": home["Address ID"],
            "Name": name,
        }
    )

matching_lookup = pd.DataFrame(matched_homes)
|
||||
|
||||
import re  # local import: Python's str.replace is literal, so stripping punctuation needs re.sub

from fuzzywuzzy import fuzz

# Match every CCS coordination row to exactly one survey_folder in
# retrofit_assessment_data. ccs_matching_lookup collects one dict per matched
# home: survey_folder, Asset ID.1, Name. A home with no match or with several
# matches raises, so it can be resolved by hand (typically via ccs_manual_filters).
ccs_coordination = ccs_coordination.rename(columns={"Post Code": "Postcode"})
ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]

ccs_punctuation_pattern = r"[^\w\s]"  # anything that is neither a word character nor whitespace

# fuzz.partial_ratio scores similarity from 0 to 100. The previous cut-off of 9
# accepted practically every address in the postcode.
# NOTE(review): 90 is assumed to be the intended value - confirm.
ccs_fuzzy_threshold = 90

# CCS-specific overrides (Name -> survey_folder). They take precedence over the
# board-level manual_filters, which were the only overrides consulted before.
ccs_manual_filters = {}
ccs_overrides = {**manual_filters, **ccs_manual_filters}

ccs_matching_lookup = []
for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
    name = home["Name"]

    if name in ccs_overrides:
        # Manual override: take the mapped folder as-is, without re-checking the name.
        # NOTE(review): the nesting of the name checks was not recoverable from the
        # flattened source - confirm overrides were meant to bypass them.
        filtered = retrofit_assessment_data[
            retrofit_assessment_data["survey_folder"] == ccs_overrides[name]
        ].copy()
    else:
        filtered = retrofit_assessment_data[
            retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
        ].copy()

        # Punctuation is removed from both sides before comparing. regex=True is
        # explicit because the pandas default for Series.str.replace is version
        # dependent; the name is compared literally (regex=False) so characters
        # such as "(" or "+" cannot be interpreted as a pattern.
        folders_clean = filtered["survey_folder"].str.replace(ccs_punctuation_pattern, "", regex=True)
        name_clean = re.sub(ccs_punctuation_pattern, "", name)
        # First two comma-separated parts of the survey address, joined without a separator.
        short_address = filtered["Address"].str.split(",").str[0:2].str.join("")

        # 1) Folder contains the name, ignoring the word "Flat".
        to_filter = folders_clean.str.contains(
            name_clean.replace("Flat", "").lstrip(), case=False, regex=False, na=False
        )
        if to_filter.sum() == 0:
            # 2) Folder contains the name with "Flat" kept.
            to_filter = folders_clean.str.contains(name_clean, case=False, regex=False, na=False)
        if to_filter.sum() == 0:
            # 3) The start of the survey address equals the name exactly.
            to_filter = short_address == name
        if to_filter.sum() == 0:
            # 4) Do a fuzzy match on the name. Non-string addresses (e.g. missing
            #    values) never match instead of breaking the scorer.
            to_filter = short_address.apply(
                lambda x: isinstance(x, str) and fuzz.partial_ratio(name, x) > ccs_fuzzy_threshold
            )
        filtered = filtered[to_filter.astype(bool)]

    # These two checks replace the undefined names `blah` / `blah2`, which stopped
    # the run with a NameError. The run still stops, but says why.
    if filtered.empty:
        raise Exception(f"No survey folder matched CCS home {name!r} ({home['Postcode']})")
    if filtered.shape[0] != 1:
        raise Exception(
            f"{filtered.shape[0]} survey folders matched CCS home {name!r} ({home['Postcode']}): "
            f"{filtered['survey_folder'].tolist()}"
        )

    ccs_matching_lookup.append(
        {
            "survey_folder": filtered["survey_folder"].values[0],
            "Asset ID.1": home["Asset ID.1"],
            "Name": name,
        }
    )
|
||||
|
||||
# home["Name"] should be contained in the survey_folder
|
||||
# filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
|
||||
# # We have an edge case where some properties have two outputs in Sharepoint
|
||||
# if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
|
||||
# raise Exception("Fix me1")
|
||||
# # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
|
||||
#
|
||||
# if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
|
||||
# raise Exception("Fix me2")
|
||||
# # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
|
||||
#
|
||||
# if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
|
||||
# filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
|
||||
#
|
||||
# if filtered.empty:
|
||||
# continue
|
||||
# if filtered.shape[0] != 1:
|
||||
# raise Exception("something went wrong")
|
||||
#
|
||||
# matching_lookup.append(
|
||||
# {
|
||||
# "survey_folder": filtered["survey_folder"].values[0],
|
||||
# "Address ID": home["Address ID"],
|
||||
# "Name": home["Name"]
|
||||
# }
|
||||
# )
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# main()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue