completed creation of matching tables

2026-06-08 11:17:27 +00:00 · 2024-02-23 15:54:28 +00:00 · 2024-02-23 15:54:28 +00:00 · 75183902c1
commit 75183902c1
parent 5a451f2f82
1 changed files with 48 additions and 15 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -43,7 +43,8 @@ class DataLoader:
        # the asset list
        "HA14": 4,
        # There's just too many unmatched here - if we identify some homes that
-        "HA6": 117
+        "HA6": 117,
+        "HA107": 52
    }

    def __init__(self, directories, use_cache):
@ -130,7 +131,7 @@ class DataLoader:
        :return:
        """

-        if ha_name in ["HA6", "HA14"]:
+        if ha_name in ["HA6", "HA14", "HA107"]:
            split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
            house_numbers = split_addresses[0].str.split(' ', expand=True)
            # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how
@ -153,8 +154,11 @@ class DataLoader:

    @staticmethod
    def get_ciga_sheetname(workbook):
+
        if "CIGA Checks" in workbook.sheetnames:
            return "CIGA Checks"
+        elif "CIGA checks" in workbook.sheetnames:
+            return "CIGA checks"
        else:
            return "CIGA"

@ -490,6 +494,22 @@ class DataLoader:

        return survey_list

+    @staticmethod
+    def levenstein_match(matching_string, df):
+        match_to = df["matching_address"].tolist()
+        # Strip out punctuation and spaces
+        match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
+        match_to = [x.replace(" ", "") for x in match_to]
+
+        # Perform matching between full key and match_to
+        distances = [Levenshtein.distance(matching_string, s) for s in match_to]
+        best_match_index = distances.index(min(distances))
+        # We might want to consider a threshold for the distance, however for the momeny,
+        # we don't consider this for the moment
+        df = df.iloc[best_match_index:best_match_index + 1]
+
+        return df
+
    def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):

        # Correct the survey list
@ -544,17 +564,7 @@ class DataLoader:
                    # Remove any spaces from the full key
                    full_key = full_key.replace(" ", "")

-                    match_to = df["matching_address"].tolist()
-                    # Strip out punctuation and spaces
-                    match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
-                    match_to = [x.replace(" ", "") for x in match_to]
-
-                    # Perform matching between full key and match_to
-                    distances = [Levenshtein.distance(full_key, s) for s in match_to]
-                    best_match_index = distances.index(min(distances))
-                    # We might want to consider a threshold for the distance, however for the momeny,
-                    # we don't consider this for the moment
-                    df = df.iloc[best_match_index:best_match_index + 1]
+                    df = self.levenstein_match(full_key, df)

                    if df.shape[0] != 1:
                        print(row["Street / Block Name"])
@ -623,7 +633,7 @@ class DataLoader:
                asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
            ].copy()

-            df = df[df["HouseNo"] == str(house_number)]
+            df = df[df["HouseNo"].astype(str) == str(house_number)]
            # For ciga, we skip
            if df.empty:
                unmatched_addresses.append(
@ -641,7 +651,9 @@ class DataLoader:
                street_name = self.extract_streetname(
                    address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
                )
-                df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
+                # We check if any of the rows contains the street name and if they do, filter
+                if any(df["matching_address"].str.replace(",", "").str.contains(street_name)):
+                    df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]

                if df.shape[0] != 1:
                    # The final check we do here is to check for the presence of flat in the address
@ -650,6 +662,13 @@ class DataLoader:
                    else:
                        df = df[df["matching_address"].str.contains("flat") == False]

+                    if df.shape[0] != 1:
+                        full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
+                            "Matched Postcode"].lower().strip()
+                        # Remove any spaces from the full key
+                        full_key = full_key.replace(" ", "")
+                        df = self.levenstein_match(full_key, df)
+
                    if df.shape[0] != 1:
                        print(row["Street / Block Name"])
                        print(house_number)
@ -737,6 +756,19 @@ class DataLoader:
            s3_file_name="ha-analysis/batch3-inputs.pickle",
        )

+    def ha_facts_and_figures(self):
+        """
+        This function will return a dictionary of facts and figures for each HA
+        :return:
+        """
+        ha_facts_and_figures = []
+        for ha_name, data_assets in self.data.items():
+            asset_list = data_assets["asset_list"]
+            survey_list = data_assets["survey_list"]
+            ciga_list = data_assets["ciga_list"]
+
+        return ha_facts_and_figures
+

 def get_epc_data(
    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
@ -1511,6 +1543,7 @@ def app():

    loader = DataLoader(directories, use_cache)
    loader.load()
+    loader.ha_facts_and_figures()

    # TODO: We probably need to make sure that we have all of the columns that we need