Merge pull request #95 from Hestia-Homes/main

Restructuring repo and trying to complete the fastapi plan trigger endpoint
This commit is contained in:
KhalimCK 2023-07-20 13:49:37 +01:00 committed by GitHub
commit ab932eb1cc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 7886 additions and 93 deletions

9
.gitignore vendored
View file

@ -170,7 +170,7 @@ cython_debug/
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
.idea/misc.xml
# Generated files
.idea/**/contentModel.xml
@ -245,3 +245,10 @@ infrastructure/terraform/.terraform*
# Don't commit serverless packages
.serverless
backend/node_modules
node_modules/
backend/.idea
open_uprn/.idea/
conservation_areas/.idea/

3
.idea/Model.iml generated
View file

@ -4,8 +4,9 @@
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/backend" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/model_data" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 fastapi" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 fastapi" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -1,10 +1,11 @@
from fastapi import APIRouter, Depends
from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.utils import read_csv_from_s3, setup_logger
from backend.app.utils import read_csv_from_s3
from backend.app.config import get_settings
from model_data.Property import Property
from epc_api.client import EpcClient
from utils.logger import setup_logger
logger = setup_logger()
@ -15,6 +16,37 @@ router = APIRouter(
responses={404: {"description": "Not found"}}
)
# TODO: Load this data from db
open_uprn_data = [
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
'LONGITUDE': -0.0540506},
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
'LONGITUDE': -0.0498772},
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
'LONGITUDE': -0.226392},
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
'LONGITUDE': -0.0468833},
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
'LONGITUDE': -0.1362513},
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
'LONGITUDE': -0.0823165}
]
in_conservation_area_data = [
{'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
{'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
]
@router.post("/trigger")
async def trigger_plan(body: PlanTriggerRequest):
@ -22,30 +54,33 @@ async def trigger_plan(body: PlanTriggerRequest):
# Read in the trigger file from s3
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
print(plan_input)
# TODO: Add validation to the file
print("What's the token")
print(get_settings().EPC_AUTH_TOKEN)
logger.info("Getting EPC data")
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
input_properties = [
Property(postcode=config['postcode'], address1=config['address'], epc_client=epc_client)
for config in plan_input
]
logger.info("Getting EPC data")
for p in input_properties:
p.search_address_epc()
p.set_year_built()
logger.info("Parsing and validating the file")
logger.info("Getting coordinates")
# This is placeholder, until the full dataset is loaded into the database
for p in input_properties:
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
p.set_coordinates(coordinate_data)
logger.info("properties")
logger.info(input_properties)
logger.info("Check if property is in conservation area")
for p in input_properties:
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
"is_in_conservation_area"
)
p.set_is_in_conservation_area(in_conservation_area)
# TODO: get co-ordinates
logger.info()
logger.info("Reading in EPC data")
return {"message": "Plan triggered"}
return {"message": "Plan complete"}

View file

@ -1,7 +1,8 @@
from enum import Enum
import geopandas as gpd
from shapely.geometry import Point
from model_data.utils import setup_logger
from utils.logger import setup_logger
from datatypes.datatypes import OpenUprnCoordinateData
logger = setup_logger()
@ -39,14 +40,20 @@ class ConservationAreaClient:
self.gov_data = gpd.read_file(self.gov_path)
self.gov_data = self.gov_data.drop(columns=["dataset"])
def is_in_conservation_area(self, coordinates: dict):
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
"""
Check if a property is in a conservation area
:param coordinates: dictionary, which should have the OpenUprnCoordinateData format
:return:
"""
if not coordinates:
raise ValueError("Coordinates have not been set, run get_coordinates() first")
is_in_conservation_area = self.is_in_conservation_area_historic_england(
x_bng=coordinates["x_coordinate"],
y_bng=coordinates["y_coordinate"]
x_bng=coordinates.X_COORDINATE,
y_bng=coordinates.Y_COORDINATE
)
if is_in_conservation_area != "unknown":
@ -55,8 +62,8 @@ class ConservationAreaClient:
if is_in_conservation_area == "unknown":
# We double check the secondary data source
backup = self.is_in_conservation_area_historic_gov(
longitude=coordinates["longitude"],
latitude=coordinates["latitude"]
longitude=coordinates.LONGITUDE,
latitude=coordinates.LATITUDE
)
if backup:

51
conservation_areas/app.py Normal file
View file

@ -0,0 +1,51 @@
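"""
This application reads in the conservation area datasets from a static location and checks whether
each property in the open uprn data is in a conservation area, so that the results can be loaded
into our database for querying from other services
"""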
import os
from conservation_areas.ConservationAreaClient import ConservationAreaClient
from datatypes.datatypes import OpenUprnCoordinateData
def app():
    """
    Check each property in the open uprn sample data against the conservation area datasets

    A ConservationAreaClient is built from the Historic England shapefile and the gov geojson,
    read() is called on it, and every hard-coded open uprn record is then passed to
    is_in_conservation_area().

    :return: list of dictionaries, one per property, with the keys "uprn" and
        "is_in_conservation_area"
    """
    # NOTE(review): both data paths are resolved relative to the directory that contains this
    # file - confirm that model_data/local_data really lives underneath it
    base_dir = os.path.abspath(os.path.dirname(__file__))

    conservation_area_client = ConservationAreaClient(
        historic_england_path=(
            base_dir + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp"
        ),
        gov_path=base_dir + "/model_data/local_data/gov-conservation-area.geojson",
    )
    conservation_area_client.read()

    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
    # Placeholder sample of open uprn records
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
         'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
         'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
         'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
         'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
         'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165}
    ]

    result = [
        {
            "uprn": coordinates["UPRN"],
            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
                OpenUprnCoordinateData(**coordinates)
            ),
        }
        for coordinates in open_uprn_data
    ]

    # TODO: Add a method to write to the database
    # Until the database write exists, hand the result back so the work is not thrown away
    return result

View file

@ -0,0 +1,16 @@
attrs==23.1.0
certifi==2023.5.7
click==8.1.6
click-plugins==1.1.1
cligj==0.7.2
fiona==1.9.4.post1
geopandas==0.13.2
numpy==1.25.1
packaging==23.1
pandas==2.0.3
pyproj==3.6.0
python-dateutil==2.8.2
pytz==2023.3
shapely==2.0.1
six==1.16.0
tzdata==2023.3

0
datatypes/__init__.py Normal file
View file

10
datatypes/datatypes.py Normal file
View file

@ -0,0 +1,10 @@
from dataclasses import dataclass
@dataclass
class OpenUprnCoordinateData:
    """
    Coordinate record for a single property, identified by its UPRN

    The field names are upper case so that a record held as a dictionary with the same keys
    can be unpacked straight into the constructor, i.e. OpenUprnCoordinateData(**record)
    """
    # Unique Property Reference Number of the property
    UPRN: int
    # Planar x/y position - presumably British National Grid easting/northing, TODO confirm
    X_COORDINATE: float
    Y_COORDINATE: float
    # Geographic position - presumably decimal degrees, TODO confirm
    LATITUDE: float
    LONGITUDE: float

View file

@ -1,7 +1,7 @@
import math
from tqdm import tqdm
from dbfread import DBF
from model_data.utils import setup_logger
from utils.logger import setup_logger
logger = setup_logger()

View file

@ -2,7 +2,7 @@ from typing import List, Dict
import pandas as pd
from tqdm import tqdm
import string
from model_data.utils import setup_logger
from utils.logger import setup_logger
from fuzzywuzzy import fuzz
import numpy as np

View file

@ -68,24 +68,12 @@ class Property(BaseUtility):
self.data = response["rows"][0]
def get_coordinates(self, open_uprn_client):
def set_coordinates(self, coordinates):
"""
This method utlises the OpenOprnClient to get the coordinates of the property
The OpenOprnClient interfactes with the Ordinance Survey Open UPRN database to extract
property coordinates. This database holds lookups between UPRN and coordinates.
:param open_uprn_client: Instance of OpenOprnClient. This method expects the client to have already read
the data
This method sets the coordinates of the property, given the open uprn data
:param coordinates: dictionary
"""
if open_uprn_client.data is None:
raise ValueError("OpenUprnClient has not read data")
self.coordinates = (
open_uprn_client.data[open_uprn_client.data["UPRN"] == int(self.data["uprn"])]
.to_dict("records")[0]
)
self.coordinates = {key.lower(): value for key, value in self.coordinates.items()}
self.coordinates = {key.lower(): value for key, value in coordinates.items()}
def get_components(self, cleaner):
"""

View file

@ -13,7 +13,7 @@ from model_data.EpcClean import EpcClean
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
from model_data.utils import setup_logger
from utils.logger import setup_logger
logger = setup_logger()

View file

@ -2,7 +2,6 @@ from tqdm import tqdm
import os
from model_data.BoreholeClient import BoreholeClient
from model_data.LandRegistryClient import LandRegistryClient
from model_data.ConservationAreaClient import ConservationAreaClient
from model_data.temp_inputs import input_data
from model_data.Property import Property
@ -10,7 +9,7 @@ from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.OpenUprnClient import OpenUprnClient
from open_uprn.OpenUprnClient import OpenUprnClient
from model_data.analysis.UvalueEstimations import UvalueEstimations
LAND_REGISTRY_PATHS = [
@ -245,11 +244,6 @@ def handler():
print(results.summary())
import matplotlib.pyplot as plt
import numpy as np
import numpy as np
grouped_error = []
groupby = ["mainheat-description"]
for group, data in model_data.groupby(groupby, observed=True):
@ -304,3 +298,39 @@ def handler():
result = correct_spelling("excelent lighting in this hosehold")
print(result)
'excellent lighting in this household'
def app():
    """
    Download EPC data and run the cleaner over it together with the input properties

    For a pre-defined list of constituencies and property types, we'll download EPC data from the API
    and produce a dataset of cleaned fields so that when we get new properties, we can quickly
    sanitise any description data

    NOTE(review): constituencies, property_types, epc_client and input_properties are not defined
    inside this function - confirm they are available at module level before calling it

    :return: None
    """

    # We pull properties from local authorities, by property type. This will allow us to build
    # a dataset of up to 10k properties per local authority/property type combination

    # For particularly old EPC data, we have inconsistent records so we'll only include EPCs that were
    # conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
    # and Wales from 31 July 2014

    # Download data from August 2014 onwards
    data = []
    # One paginated download per constituency/property type pair, all collected into a single list
    for c in tqdm(constituencies):
        for pt in property_types:
            data.extend(
                pagenated_epc_download(
                    client=epc_client,
                    params={
                        "constituency": c,
                        "property-type": pt,
                        "from-month": 8,
                        "from-year": 2014,
                    },
                    page_size=5000,
                    n_pages=10,
                )
            )

    # Incorporate input data into cleaning
    cleaner = EpcClean(data + [p.data for p in input_properties])
    cleaner.clean()

View file

@ -10,4 +10,8 @@ pyproj
pint
mip
pyspellchecker
textblob
textblob
pandas==2.0.3
numpy==1.25.1
python-dateutil==2.8.2
six==1.16.0

View file

@ -1,6 +1,3 @@
geopandas
xgboost
statsmodels
scikit-learn
pandas==2.0.3
numpy==1.25.1

View file

@ -3,7 +3,7 @@ import pandas as pd
from unittest.mock import Mock
from epc_api.client import EpcClient
from model_data.Property import Property
from model_data.OpenUprnClient import OpenUprnClient
from open_uprn.OpenUprnClient import OpenUprnClient
from model_data.EpcClean import EpcClean
# Define some test data

View file

@ -1,7 +1,8 @@
import logging
from io import StringIO
from unittest.mock import patch
from model_data.utils import setup_logger, is_percentage_or_number, correct_spelling
from model_data.utils import is_percentage_or_number, correct_spelling
from utils.logger import setup_logger
class TestLogger:

View file

@ -1,4 +1,3 @@
import logging
import re
from textblob import TextBlob
@ -6,40 +5,6 @@ from textblob import TextBlob
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')
def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
# Create a logger and set the logging level
logger = logging.getLogger()
logger.setLevel(level)
# if logger already has handlers, just return it
if logger.hasHandlers() and not overwrite_handler:
return logger
# Define the log message format
log_format = "%(asctime)s [%(levelname)s] %(message)s"
date_format = "%Y-%m-%d %H:%M:%S"
formatter = logging.Formatter(log_format, datefmt=date_format)
# Create a file handler and set the file path and format
if log_file:
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(level)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
# Create a console handler and set the format
console_handler = logging.StreamHandler()
console_handler.setLevel(level)
# Set the formatter for the handlers
console_handler.setFormatter(formatter)
# Add the handlers to the logger
logger.addHandler(console_handler)
return logger
def is_percentage_or_number(s):
# re.match returns None if the string does not match the pattern
return PERCENTAGE_PATTERN.match(s) is not None

View file

@ -1,5 +1,5 @@
import pandas as pd
from model_data.utils import setup_logger
from utils.logger import setup_logger
logger = setup_logger()
@ -12,9 +12,9 @@ class OpenUprnClient:
# TODO: Document this
def __init__(self, path, uprns):
def __init__(self, path, uprns=None):
self.path = path
self.uprns = [int(x) for x in uprns]
self.uprns = [int(x) for x in uprns] if uprns else None
self.data = None
def read(self):
@ -25,6 +25,7 @@ class OpenUprnClient:
logger.info("Reading in open uprn data")
df = pd.read_csv(self.path)
df = df[df["UPRN"].isin(self.uprns)]
if self.uprns:
df = df[df["UPRN"].isin(self.uprns)]
self.data = df

0
open_uprn/__init__.py Normal file
View file

18
open_uprn/app.py Normal file
View file

@ -0,0 +1,18 @@
"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
import os
from open_uprn.OpenUprnClient import OpenUprnClient
def app():
    """
    Read the open uprn csv through an OpenUprnClient

    The client is created without a list of uprns and read() is called on it. Nothing is
    returned yet, see the TODO below.
    """
    # The csv location is built from the directory that contains this file
    module_dir = os.path.abspath(os.path.dirname(__file__))
    csv_path = module_dir + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv"

    client = OpenUprnClient(path=csv_path)
    client.read()

    # TODO: Add a method to write to the database

View file

@ -0,0 +1,13 @@
numpy==1.25.1
pandas==2.0.3
python-dateutil==2.8.2
pytz==2023.3
six==1.16.0
tzdata==2023.3
click==8.1.6
joblib==1.3.1
nltk==3.8.1
regex==2023.6.3
textblob==0.17.1
tqdm==4.65.0

7606
package-lock.json generated Normal file

File diff suppressed because it is too large Load diff

6
package.json Normal file
View file

@ -0,0 +1,6 @@
{
"devDependencies": {
"serverless-domain-manager": "^7.1.0",
"serverless-python-requirements": "^6.0.0"
}
}

View file

@ -38,6 +38,8 @@ package:
- infrastructure/**
- data_collection/**
- node_modules/**
- conservation_areas/**
- open_uprn/**
plugins:
- serverless-python-requirements

0
utils/__init__.py Normal file
View file

35
utils/logger.py Normal file
View file

@ -0,0 +1,35 @@
import logging
def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
    """
    Configure the root logger and return it

    The level is always applied. If the root logger already has handlers they are kept and the
    logger is returned unchanged, unless overwrite_handler is set, in which case the existing
    handlers are removed and closed before the new ones are attached.

    :param log_file: optional path, when given a file handler writing to it is added
    :param level: logging level applied to the logger and to every handler created here
    :param overwrite_handler: replace handlers that are already attached instead of keeping them
    :return: the root logger
    """
    # Create a logger and set the logging level
    logger = logging.getLogger()
    logger.setLevel(level)

    if logger.hasHandlers():
        # if logger already has handlers, just return it
        if not overwrite_handler:
            return logger
        # Overwrite means replace: without dropping the old handlers every call would stack
        # another console handler and each record would be written multiple times
        for existing in list(logger.handlers):
            logger.removeHandler(existing)
            existing.close()

    # Define the log message format
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # The file handler is optional, the console handler is always attached
    new_handlers = []
    if log_file:
        new_handlers.append(logging.FileHandler(log_file))
    new_handlers.append(logging.StreamHandler())

    for handler in new_handlers:
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger