set up basic data preparation process

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-06 11:18:35 +01:00
parent 9217ef67f4
commit 461cdd2367
3 changed files with 77 additions and 2 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -0,0 +1,75 @@
import numpy as np
import pandas as pd
# TODO: Temp read from local machine - move to s3
_DEFAULT_ASSET_LIST_PATH = "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx"


def _build_full_address(asset_list: pd.DataFrame) -> np.ndarray:
    """
    Join the address parts of every row into one comma-separated string.

    ``address2`` is left out of the result for rows where it is null. A null in any
    other part makes the whole address null for that row.
    :param asset_list: frame holding the ``address1``, ``address2``, ``city_town``,
        ``county`` and ``postcode`` columns
    :return: one address (or null) per row, in row order
    """
    # NOTE(review): house_number is not part of the full address - confirm that the
    # "Name" column (address1) already carries it.
    # The town is title-cased; every other part is used verbatim.
    locality = (
        asset_list["city_town"].str.title() + ", "
        + asset_list["county"] + ", "
        + asset_list["postcode"]
    )
    with_address2 = asset_list["address1"] + ", " + asset_list["address2"] + ", " + locality
    without_address2 = asset_list["address1"] + ", " + locality
    return np.where(asset_list["address2"].notna(), with_address2, without_address2)


def app(asset_list_path: str = _DEFAULT_ASSET_LIST_PATH) -> None:
    """
    This script handles the preparation of the data from Stonewater, to archetype a collection
    of 5.3k properties and reduce that down to a representative set of 450 properties.
    Here, we prepare the input data for clustering
    :param asset_list_path: location of the asset list workbook; defaults to the
        temporary local copy
    :return: None
    :raises ValueError: if a full address could not be built for at least one row
    """
    # The column headers sit on the fifth row of the sheet
    asset_list = pd.read_excel(asset_list_path, header=4)
    # Drop the bottom 4 rows, which are completely missing
    asset_list = asset_list.head(-4)
    # Keep just the columns we're interested in, under our own names
    column_names = {
        "Osm. ID": "internal_id",
        "Org. ref.": "customer_asset_id",
        "Postcode": "postcode",
        "House no": "house_number",
        "Name": "address1",
        "Address line 2": "address2",
        "City/Town": "city_town",
        "County": "county",
        "Address ID": "external_address_id",  # This is not uprn
    }
    asset_list = asset_list[list(column_names)].rename(columns=column_names)
    # Create full address
    # TODO: only a missing address2 is handled; a null in any other part is rejected below
    asset_list["full_address"] = _build_full_address(asset_list)
    if asset_list["full_address"].isna().any():
        raise ValueError("Missing full addresses")