This commit is contained in:
Khalim Conn-Kowlessar 2024-08-10 02:18:27 +01:00
parent 8c711c9658
commit d65ce731c0
2 changed files with 49 additions and 44 deletions

View file

@ -132,51 +132,56 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the first 18 directories
if i < 18:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
data.columns = [c.replace("_", "-").lower() for c in data.columns]
# Keep only the EPCs lodged on or after the earliest allowed date
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
data = data[~pd.isnull(data["uprn"])]
# Take just the newest EPC per uprn, based on lodgement-date
data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
data = data.sample(sample_size, replace=False)
# We use the address data to find the related information
collected_data = []
for _, property_data in data.iterrows():
time.sleep(np.random.uniform(0.2, 1.5))
uprn = int(property_data["uprn"])
address = property_data["address1"]
postcode = property_data["postcode"]
expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
response = retrieve_find_my_epc_data(
uprn=uprn,
postcode=postcode,
address=address,
expected_expiry_date=expected_expiry_date
)
if response is None:
try:
# Skip the first 40 directories
if i < 40:
continue
collected_data.append(
{
**response,
"epc": property_data.to_dict(),
"epc_directory": str(directory)
}
)
energy_consumption_data.extend(collected_data)
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
data.columns = [c.replace("_", "-").lower() for c in data.columns]
# Keep only the EPCs lodged on or after the earliest allowed date
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
data = data[~pd.isnull(data["uprn"])]
# Take just the newest EPC per uprn, based on lodgement-date
data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
data = data.sample(sample_size, replace=False)
# We use the address data to find the related information
collected_data = []
for _, property_data in data.iterrows():
time.sleep(np.random.uniform(0.2, 1.5))
uprn = int(property_data["uprn"])
address = property_data["address1"]
postcode = property_data["postcode"]
expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
response = retrieve_find_my_epc_data(
uprn=uprn,
postcode=postcode,
address=address,
expected_expiry_date=expected_expiry_date
)
if response is None:
continue
collected_data.append(
{
**response,
"epc": property_data.to_dict(),
"epc_directory": str(directory)
}
)
energy_consumption_data.extend(collected_data)
except Exception as e:
print(f"Error for directory {directory}: {e}")
# If we have an error, then we wait for a bit since it's likely due to timeout
time.sleep(300)
continue
# Store the pickle in s3
save_time = datetime.now()

View file

@ -1,7 +1,7 @@
from pprint import pprint
import msgpack
from utils.s3 import read_from_s3
from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
def handler():