Model/etl/customers/stonewater/data_cleaning.py
2025-03-08 15:38:05 +00:00

155 lines
6.8 KiB
Python

import os
import shutil
from tqdm import tqdm
from etl.access_reporting.app import SharePointClient
def delete_large_files(
    folder_path="/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys",
):
    """
    Delete photos, designs and other bulky files which we don't need from the local survey folders.

    The layout walked is ``folder_path/<sub-folder>/<property folder>/...``. Entries that are
    not directories are ignored at both levels. For every property folder this function:

    * removes the sub-folders listed in ``folders_to_delete`` together with their contents;
    * picks "2. RA Coordinator Info" if present, otherwise "1. RA Coordinator Info", and inside
      it removes files whose names end in ".MOV" or ".jpg" (case-sensitive match) as well as a
      "Property Pics" sub-folder.

    Properties that have neither coordinator folder are skipped rather than aborting the run.

    :param folder_path: root folder that holds the survey sub-folders. Defaults to the
        location that was previously hard-coded.
    :raises FileNotFoundError: if ``folder_path`` itself does not exist.
    :return: None
    """
    # Folder names to remove from each property folder. Several entries are spelling or
    # numbering variants of one another, kept verbatim so every variant on disk is matched.
    folders_to_delete = (
        "1. RA Property Pics", "4. Air Tightness Tests", "5. RD Design Info",
        "1. RA Property PIcs", "Post EPC Photos", "4. RD Design Info",
        "5. Installer Info", "6. Trustmark lodgement", "7.Post Install Inspection Photos",
        "6. Trustmark Lodgement", "7. Post Inspection Photos",
    )
    # Files with these suffixes are removed from the coordinator info folder
    suffixes_to_delete = (".MOV", ".jpg")

    # Each sub-folder of the root contains the property folders
    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        for property_name in tqdm(os.listdir(subfolder_path)):
            property_path = os.path.join(subfolder_path, property_name)
            if not os.path.isdir(property_path):
                continue

            property_contents = os.listdir(property_path)

            for folder_to_delete in folders_to_delete:
                if folder_to_delete not in property_contents:
                    continue
                folder_to_delete_path = os.path.join(property_path, folder_to_delete)
                if os.path.isdir(folder_to_delete_path):
                    # Delete the folder, even if it's not empty
                    shutil.rmtree(folder_to_delete_path)

            # The coordinator info folder is numbered "2." in some properties and "1." in others
            if "2. RA Coordinator Info" in property_contents:
                coordinator_folder = "2. RA Coordinator Info"
            else:
                coordinator_folder = "1. RA Coordinator Info"
            coordinator_info_path = os.path.join(property_path, coordinator_folder)
            if not os.path.isdir(coordinator_info_path):
                # Neither variant exists for this property: nothing to clean, and listing the
                # missing folder would raise and stop the whole run part-way through.
                continue

            coordinator_info_contents = os.listdir(coordinator_info_path)
            for file_name in coordinator_info_contents:
                if file_name.endswith(suffixes_to_delete):
                    os.remove(os.path.join(coordinator_info_path, file_name))

            property_pics_path = os.path.join(coordinator_info_path, "Property Pics")
            # rmtree only works on directories, so guard against a plain file with this name
            if "Property Pics" in coordinator_info_contents and os.path.isdir(property_pics_path):
                # Delete folder and contents
                shutil.rmtree(property_pics_path)
def download_data_from_sharepoint(
    sharepoint_root="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders",
    download_root="/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2",
):
    """
    Download the retrofit assessment / mid-term plan folders for each property from SharePoint.

    The SharePoint client is configured from the environment variables SHAREPOINT_TENANT_ID,
    SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET and OSMOSIS_SHAREPOINT_SITE_ID. The function
    lists ``sharepoint_root``, keeps only the sub-folders named in ``folders_to_keep``, lists
    the entries inside each of them, and for every entry downloads the sub-folders whose
    name contains one of ``sub_folder_markers`` (case-insensitive). Files of type "MOV" and
    "jpg" are passed to the client as excluded file types.

    Downloads are written to ``download_root/<kept folder>/<property folder>/<sub-folder>``.

    :param sharepoint_root: path of the folder inside the SharePoint document drive whose
        sub-folders are scanned. Defaults to the location that was previously hard-coded.
    :param download_root: local folder the data is written under. Defaults to the location
        that was previously hard-coded.
    :return: None
    """
    # NOTE(review): unset environment variables are passed through as None; presumably the
    # client rejects them - confirm against SharePointClient.
    sharepoint_client = SharePointClient(
        tenant_id=os.getenv("SHAREPOINT_TENANT_ID"),
        client_id=os.getenv("SHAREPOINT_CLIENT_ID"),
        client_secret=os.getenv("SHAREPOINT_CLIENT_SECRET"),
        site_id=os.getenv("OSMOSIS_SHAREPOINT_SITE_ID"),
    )
    drive_id = sharepoint_client.document_drive["id"]

    # Only these sub-folders of the root are pulled
    folders_to_keep = {
        "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth",
        "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire",
        "9. Guildford", "10. Little Island", "11. CCS Dorset",
    }
    # Lower-case fragments identifying the retrofit assessment or mtp folders of a property
    sub_folder_markers = ("ra coordinator info", "retrofit assessment", "ra info", "mtp", "mid-term")

    # Retrieve the data from Sharepoint and write to local machine
    contents = sharepoint_client.list_folder_contents(
        drive_id=drive_id,
        folder_path=sharepoint_root,
    )
    folders_to_pull = [
        folder for folder in contents["value"] if folder["name"] in folders_to_keep
    ]

    for folder_to_pull in folders_to_pull:
        # Remote paths are always joined with "/" (never os.path.join) so that they stay
        # valid regardless of the operating system running this script.
        remote_folder_path = f"{sharepoint_root}/{folder_to_pull['name']}"
        # NOTE(review): page_size is only supplied for this listing - confirm that
        # list_folder_contents returns every entry when a folder holds more than one page.
        folder_contents = sharepoint_client.list_folder_contents(
            drive_id=drive_id,
            folder_path=remote_folder_path,
            page_size=100,
        )

        for property_folder in folder_contents["value"]:
            # We go into each property folder and get the contents
            remote_property_path = f"{remote_folder_path}/{property_folder['name']}"
            property_folder_contents = sharepoint_client.list_folder_contents(
                drive_id=drive_id,
                folder_path=remote_property_path,
            )
            if not property_folder_contents.get("value"):
                continue

            # We look for the retrofit assessment folder or mtp folders
            property_sub_folders = [
                entry for entry in property_folder_contents["value"]
                if any(marker in entry["name"].lower() for marker in sub_folder_markers)
            ]

            for property_sub_folder in property_sub_folders:
                # Local destination mirrors the remote folder hierarchy below download_root
                download_dir = os.path.join(
                    download_root,
                    folder_to_pull["name"],
                    property_folder["name"],
                    property_sub_folder["name"],
                )
                # We download the folder
                sharepoint_client.download_sharepoint_folder(
                    drive_id=drive_id,
                    folder_path=f"{remote_property_path}/{property_sub_folder['name']}",
                    download_dir=download_dir,
                    excluded_file_types=["MOV", "jpg"],
                )