diff --git a/.gitignore b/.gitignore index 947e0397..93f6d970 100644 --- a/.gitignore +++ b/.gitignore @@ -237,3 +237,5 @@ fabric.properties # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser +# Locally stored data +/model_data/local_data/* diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 00000000..bc6d25e9 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/model_data/LandRegistryClient.py b/model_data/LandRegistryClient.py new file mode 100644 index 00000000..67dfc0d3 --- /dev/null +++ b/model_data/LandRegistryClient.py @@ -0,0 +1,76 @@ +from typing import List, Dict +import pandas as pd +from tqdm import tqdm +import string +from model_data.utils import setup_logger +from fuzzywuzzy import fuzz +import numpy as np + +logger = setup_logger() + + +class LandRegistryClient: + COLUMN_NAMES = [ + "transaction_id", + "price", + "date_of_transfer", + "postcode", + "property_type", + "old_new", + "duration", + "paon", + "saon", + "street", + "locality", + "town_city", + "district", + "county", + "ppd_category_type", + "record_status", + ] + + # A score of 70-100 is a high match + SIMILARITY_THRESHOLD = 70 + + def __init__(self, paths: List[str], addresses: List[Dict[str, str]]): + self.paths = paths + self.addresses = pd.DataFrame(addresses) + + translation_table = str.maketrans("", "", string.punctuation) + # Use the translation table to remove punctuation from the text + self.addresses['address'] = self.addresses['address'].str.translate(translation_table) + + def read(self): + """Read the land registry CSVs and return the rows whose address closely matches a known address.""" + logger.info("Reading in land registry data") + res = [] + + for path in tqdm(self.paths): + df = pd.read_csv(path, header=None) + df.columns = self.COLUMN_NAMES + df = df[df["postcode"].isin(self.addresses["postcode"])] + res.append(df) + del df + + res = pd.concat(res) + res = res.reset_index(drop=True) + res["id"] = res.index + + # We want to remove records that were + # 1) 
not sold at market value (this is when ppd_category_type is not A) + # 2) property type is other (this is when property_type is O) + res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")] + # Construct address + res['address'] = res[ + ['paon', 'saon', 'street', 'locality', 'town_city', 'district', 'county'] + ].fillna('').agg(' '.join, axis=1) + + # We now want to fuzzy match between res and self.addresses on postcode and take the + # best fuzzy match + res = res.merge(self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc")) + + res = res[res.apply(lambda row: row['paon'] in row['address_epc'], axis=1)] + + res['match_similarity'] = np.vectorize(fuzz.ratio)(res['address_land_registry'], res['address_epc']) + + return res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD] diff --git a/model_data/Need.py b/model_data/Need.py new file mode 100644 index 00000000..50e30e54 --- /dev/null +++ b/model_data/Need.py @@ -0,0 +1,32 @@ +import pandas as pd +from model_data.utils import setup_logger + +logger = setup_logger() + + +class Need: + """ + Contains methods to read and interface with the NEED dataset. + + Current iterations of this data is the 2021 anonymised dataset, which can be found here: + https://www.gov.uk/government/statistics/national-energy-efficiency-data-framework-need-anonymised-data-2021 + """ + + def __init__(self, local_authorities, path): + self.local_authorities = local_authorities + self.path = path + + def read(self) -> pd.DataFrame: + """ + Reads the NEED dataset from a csv file. 
+ :param path: path to the csv file + :return: pandas dataframe containing the data + """ + logger.info("Reading NEED data - could take a moment") + df = pd.read_csv(self.path) + df = df[df["REGION"].isin(self.local_authorities)] + + z = df[df["REGION"].str.contains("E9")] + + type(df["REGION"].values[0]) + return df diff --git a/model_data/app.py b/model_data/app.py index 479fa419..6d900001 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -39,37 +39,78 @@ def handler(): cleaner.clean() + import pickle + import os + with open(os.path.abspath(os.path.dirname(__file__)) + "/data.pkl", "rb") as f: + data = pickle.load(f) + + postcodes = [x["postcode"].upper() for x in data] + address_meta = [ + { + "postcode": x["postcode"].upper(), + "address1": x["address1"].upper(), + "address": x["address"].upper() + } for x in data + ] + # For testing: - from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes - from collections import Counter - count = Counter([x["main-fuel"] for x in data]) - descriptions = {x["hotwater-description"] for x in data} - out = [] - for description in descriptions: - res = HotWaterAttributes(description).process() - out.append( - { - "original_description": description, - **res - } - ) - df = pd.DataFrame(out) - df = df.sort_values("original_description") - df = df.reset_index(drop=True) + # from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes + # from collections import Counter + # count = Counter([x["main-fuel"] for x in data]) + # descriptions = {x["hotwater-description"] for x in data} + # out = [] + # for description in descriptions: + # res = HotWaterAttributes(description).process() + # out.append( + # { + # "original_description": description, + # **res + # } + # ) + # df = pd.DataFrame(out) + # df = df.sort_values("original_description") + # df = df.reset_index(drop=True) + # + # import numpy as np + # idx = 1 + # record = df[df.index == idx].to_dict("records")[0] + # record = {k: v for k, 
v in record.items() if v not in [None, np.nan]} + # from pprint import pprint + # pprint(record) + # + # # Issues: + # # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and + # # temperature zone control + # # and we only pick up temperature zone control at the moment. Can we capture this too + # # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room + # # stats and how should + # # we capture this? + # + # df.to_dict("records") - import numpy as np - idx = 1 - record = df[df.index == idx].to_dict("records")[0] - record = {k: v for k, v in record.items() if v not in [None, np.nan]} - from pprint import pprint - pprint(record) + from model_data.Need import Need + import os - # Issues: - # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and - # temperature zone control - # and we only pick up temperature zone control at the moment. Can we capture this too - # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room - # stats and how should - # we capture this? 
+ need_client = Need( + local_authorities=local_authorities, + path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/need_2021_anon_dataset_4million.csv" + ) + need_data = need_client.read() - df.to_dict("records") + ## Land registry + from model_data.LandRegistryClient import LandRegistryClient + import os + + land_registry_client = LandRegistryClient( + paths=[ + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv", ], + addresses=address_meta + ) diff --git a/model_data/requirements.txt b/model_data/requirements.txt index ad0cd707..ff906389 100644 --- a/model_data/requirements.txt +++ b/model_data/requirements.txt @@ -6,4 +6,6 @@ mypy pytest mock pytest-cov -pytest-mock \ No newline at end of file +pytest-mock +fuzzywuzzy +python-Levenshtein \ No newline at end of file diff --git a/model_data/utils.py b/model_data/utils.py new file mode 100644 index 00000000..9fe04c89 --- /dev/null +++ b/model_data/utils.py @@ -0,0 +1,31 @@ +import logging + + +def setup_logger(log_file=None, level=logging.INFO): + # Create a logger and set the logging level + logger = logging.getLogger() + logger.setLevel(level) + + # Define the log message format + log_format = "%(asctime)s [%(levelname)s] %(message)s" + date_format = "%Y-%m-%d %H:%M:%S" + formatter = logging.Formatter(log_format, datefmt=date_format) + + # 
Create a file handler and set the file path and format + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(level) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + # Create a console handler and set the format + console_handler = logging.StreamHandler() + console_handler.setLevel(level) + + # Set the formatter for the handlers + console_handler.setFormatter(formatter) + + # Add the handlers to the logger + logger.addHandler(console_handler) + + return logger