mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added missing __init__.py in ordnance survey and helper extensions
This commit is contained in:
parent
41d817b218
commit
5b15cb5518
5 changed files with 5379 additions and 2 deletions
|
|
@ -27,7 +27,9 @@
|
||||||
"GrapeCity.gc-excelviewer",
|
"GrapeCity.gc-excelviewer",
|
||||||
"jakobhoeg.vscode-pokemon",
|
"jakobhoeg.vscode-pokemon",
|
||||||
"github.vscode-github-actions",
|
"github.vscode-github-actions",
|
||||||
"me-dutour-mathieu.vscode-github-actions"
|
"me-dutour-mathieu.vscode-github-actions",
|
||||||
|
"anthropic.claude-code",
|
||||||
|
"eamodio.gitlens"
|
||||||
],
|
],
|
||||||
"settings": {
|
"settings": {
|
||||||
"files.defaultWorkspace": "/workspaces/model",
|
"files.defaultWorkspace": "/workspaces/model",
|
||||||
|
|
|
||||||
|
|
@ -34,4 +34,22 @@ postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?re
|
||||||
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
|
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
|
||||||
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
|
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
|
||||||
}
|
}
|
||||||
Each batch of csv should be saved in retrofit-data-dev/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
|
Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
|
||||||
|
|
||||||
|
Outputs of address2uprn (which is automatically triggered by the postcode splitter) will be saved to retrofit-data-dev/ara_raw_outputs/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
|
||||||
|
|
||||||
|
|
||||||
|
Run the script in backend/scripts/combine_address2uprn_outputs.py with <task-id>.
|
||||||
|
This will combine all the outputs of the files for each address2uprn into one big file
|
||||||
|
|
||||||
|
Find out which rows have a missing UPRN, save them as a separate sheet, and upload it somewhere in s3://retrofit-data-dev
|
||||||
|
|
||||||
|
I uploaded the missing uprn here: s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv
|
||||||
|
|
||||||
|
ordnance_survey sqs is => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2FordnanceSurvey-queue-dev
|
||||||
|
|
||||||
|
{
|
||||||
|
"s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/missinguprn.csv",
|
||||||
|
"task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
|
||||||
|
"sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e"
|
||||||
|
}
|
||||||
|
|
|
||||||
65
backend/scripts/combine_address2uprn_outputs.py
Normal file
65
backend/scripts/combine_address2uprn_outputs.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import boto3
|
||||||
|
import pandas as pd
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
BUCKET = "retrofit-data-dev"
|
||||||
|
|
||||||
|
|
||||||
|
def list_csv_files(task_id):
    """Return the S3 keys of every ``.csv`` object under a task's output folder.

    Objects are listed from the ``BUCKET`` bucket below
    ``ara_raw_outputs/<task_id>/``, following pagination so that folders
    with more objects than a single listing page are fully covered.

    Args:
        task_id: Identifier of the task folder inside ``ara_raw_outputs``.
            A trailing ``/`` is tolerated.

    Returns:
        A list of object keys (strings) ending in ``.csv``, in the order S3
        returned them. Empty when nothing matches.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    # S3 prefixes are plain string-prefix matches, not directory lookups.
    # Without the trailing "/" the prefix for task "abc" would also match
    # sibling folders such as "abc-old/" or "abc2/", silently mixing other
    # tasks' outputs into the combined file.
    folder = str(task_id).rstrip("/")
    prefix = f"ara_raw_outputs/{folder}/"

    csv_files = []
    for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
        # "Contents" is absent from a page when no object matches the prefix.
        csv_files.extend(
            obj["Key"]
            for obj in page.get("Contents", [])
            if obj["Key"].endswith(".csv")
        )

    return csv_files
|
||||||
|
|
||||||
|
|
||||||
|
def download_csv(key):
    """Fetch one CSV object from ``BUCKET`` and parse it into a DataFrame.

    Args:
        key: Full S3 object key of the CSV file to download.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        options on the object's bytes.
    """
    response = boto3.client("s3").get_object(Bucket=BUCKET, Key=key)
    # Read the whole body into memory so pandas gets a seekable buffer.
    raw_bytes = response["Body"].read()
    buffer = BytesIO(raw_bytes)
    return pd.read_csv(buffer)
|
||||||
|
|
||||||
|
|
||||||
|
def main(task_id, output):
    """Combine every CSV output of a task into a single local CSV file.

    Lists the task's CSV objects via ``list_csv_files``, downloads each one
    via ``download_csv``, concatenates them row-wise, and writes the result
    to ``output`` without the index column. Progress is printed to stdout.

    Args:
        task_id: Task identifier whose outputs should be combined.
        output: Path (or file-like target) handed to ``DataFrame.to_csv``.

    Returns:
        None. Returns early, writing nothing, when no CSV files are found.
    """
    print(f"Scanning task: {task_id}")

    keys = list_csv_files(task_id)
    if not keys:
        print("No CSV files found")
        return

    print(f"Found {len(keys)} CSV files")

    def _fetch(object_key):
        # Announce each download just before it happens, one key at a time.
        print(f"Downloading {object_key}")
        return download_csv(object_key)

    frames = [_fetch(object_key) for object_key in keys]

    # ignore_index=True renumbers rows 0..n-1 across all source files.
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(output, index=False)

    print(f"Combined CSV saved to {output}")
    print(f"Total rows: {len(merged)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: one required positional task id and an
    # optional --output path that defaults to "combined.csv".
    cli = argparse.ArgumentParser()
    cli.add_argument("task_id", help="Task ID folder in S3")
    cli.add_argument("--output", default="combined.csv")

    options = cli.parse_args()

    main(task_id=options.task_id, output=options.output)
|
||||||
5292
backend/scripts/combined.csv
Normal file
5292
backend/scripts/combined.csv
Normal file
File diff suppressed because it is too large
Load diff
0
backend/utils/__init__.py
Normal file
0
backend/utils/__init__.py
Normal file
Loading…
Add table
Reference in a new issue