mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
set up basic data preparation process
This commit is contained in:
parent
9217ef67f4
commit
461cdd2367
3 changed files with 77 additions and 2 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
75
etl/customers/stonewater/shdf_3_clustering.py
Normal file
75
etl/customers/stonewater/shdf_3_clustering.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def app(source_path=None):
    """
    Prepare the Stonewater asset list as input data for clustering.

    This is the data-preparation step of the work to archetype a collection
    of 5.3k Stonewater properties and reduce it down to a representative set
    of 450 properties.

    The function reads the asset list workbook, trims the empty trailing rows,
    keeps and renames the address/identifier columns, and derives a
    ``full_address`` column. The prepared frame is not yet persisted or
    returned.

    :param source_path: path to the asset list workbook. When omitted, the
        temporary local-machine path is used.
    :return: None
    :raises ValueError: if any row is left without a full address
    """
    if source_path is None:
        # TODO: Temp read from local machine - move to s3
        source_path = (
            "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx"
        )

    # header=4: the column names are taken from row index 4 of the sheet
    asset_list = pd.read_excel(source_path, header=4)

    # Drop the bottom 4 rows, which are completely missing
    asset_list = asset_list.head(-4)

    asset_list = _select_asset_columns(asset_list)
    asset_list["full_address"] = _build_full_address(asset_list)

    # Only address2 is allowed to be null; a null in any other address part
    # propagates through the concatenation and is rejected here.
    if asset_list["full_address"].isna().any():
        raise ValueError("Missing full addresses")


def _select_asset_columns(asset_list):
    """Keep just the columns we're interested in, renamed to internal names."""
    column_names = {
        "Osm. ID": "internal_id",
        "Org. ref.": "customer_asset_id",
        "Postcode": "postcode",
        "House no": "house_number",
        "Name": "address1",
        "Address line 2": "address2",
        "City/Town": "city_town",
        "County": "county",
        "Address ID": "external_address_id",  # This is not uprn
    }
    # A single mapping drives both the selection and the rename, so the two
    # lists of column names cannot drift apart.
    return asset_list[list(column_names)].rename(columns=column_names)


def _build_full_address(asset_list):
    """Return the comma-separated full address per row, skipping a null address2."""
    # NOTE(review): house_number is not part of the full address - confirm
    # that address1 already carries it.
    separator = ", "
    leading = asset_list["address1"] + separator
    trailing = (
        asset_list["city_town"].str.title() + separator +
        asset_list["county"] + separator +
        asset_list["postcode"]
    )
    return np.where(
        asset_list["address2"].notna(),
        leading + asset_list["address2"] + separator + trailing,
        leading + trailing,
    )
|
||||
Loading…
Add table
Reference in a new issue