mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
add this in a sensible branch
This commit is contained in:
parent
ad189b4cac
commit
1b53b47048
2 changed files with 14 additions and 9 deletions
|
|
@ -5,10 +5,11 @@ Before you run:
|
|||
|
||||
Step 1) Get the list and ensure the following columns exist
|
||||
|
||||
I believe lower and upper case matter:
|
||||
* Address 1
|
||||
* Address 2
|
||||
* Address 3
|
||||
* postcode
|
||||
* Postcode
|
||||
|
||||
And save it as a .csv file
|
||||
|
||||
|
|
@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal
|
|||
|
||||
Go to Ara DB and make a new task_id with a randomly generated UUID as the primary key
|
||||
|
||||
task_id = a7b70a02-4df4-45b5-a50b-196e095910bb
|
||||
sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e
|
||||
task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09
|
||||
sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f
|
||||
s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv
|
||||
|
||||
Step 3) Alright, now let's make the input for the postcode-splitter SQS queue to get the ball rolling
|
||||
postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev
|
||||
|
||||
{
|
||||
"task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
|
||||
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
|
||||
"task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
|
||||
"sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv"
|
||||
}
|
||||
Each CSV batch should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
|
||||
|
||||
|
|
|
|||
|
|
@ -351,9 +351,9 @@ def handler(event, context, local=False):
|
|||
{
|
||||
"body": json.dumps(
|
||||
{
|
||||
"task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
|
||||
"sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
|
||||
"task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
|
||||
"sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
|
||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv",
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
@ -441,6 +441,9 @@ def handler(event, context, local=False):
|
|||
# Process the rows
|
||||
logger.info(f"Processing {len(df)} rows for task {task_id}")
|
||||
|
||||
df["postcode_clean"] = (
|
||||
df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "")
|
||||
)
|
||||
clean_df = df.dropna(subset=["postcode_clean"])
|
||||
|
||||
postcode_to_addresses = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue