mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
94 lines
3.3 KiB
Python
94 lines
3.3 KiB
Python
import os
|
|
# Non-secret SharePoint identifiers. They are applied only as defaults, so a
# value already exported in the process environment takes precedence.
# NOTE(review): presumably consumed by etl.scraper.scraper, which is imported
# right after this block -- confirm which of these it actually reads.
SHAREPOINT_DEFAULTS = {
    "SHAREPOINT_CLIENT_ID": "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf",
    "SHAREPOINT_TENANT_ID": "c3f7519c-2719-4547-af04-6da6cbfd8f8f",
    "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID": "b5a51507-9427-4ee0-b03e-90ec7681e2d3",
    "JJC_SERVICE_SHAREPOINT_ID": "7fdd0485-bbf3-4b29-b30f-98c81c2a6284",
}
for _setting_name, _setting_value in SHAREPOINT_DEFAULTS.items():
    os.environ.setdefault(_setting_name, _setting_value)

# The client secret is a credential and must not live in source control:
# supply it through the environment (shell export, secret store, CI variable).
# The value that used to be hard-coded on this line is in the repository
# history and should be rotated.
if not os.environ.get("SHAREPOINT_CLIENT_SECRET"):
    import warnings

    warnings.warn(
        "SHAREPOINT_CLIENT_SECRET is not set; export it before running this "
        "script, otherwise SharePoint access is expected to fail.",
        RuntimeWarning,
    )
|
|
from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING
|
|
import pandas as pd
|
|
import hashlib
|
|
|
|
# Root-level folders whose name contains one of these markers are skipped.
_EXCLUDED_ROOT_MARKERS = ("Khalim", ".Training")

# Column order of the frame returned by get_photos_name.
_PHOTO_COLUMNS = ["Date", "path", "Photo Name"]


def _subfolder_paths(scraper, parent_path):
    """Return the full paths of the folders directly inside *parent_path*."""
    listing = scraper.get_folders_in_path(parent_path)
    # Entries carrying a 'file' key are files; everything else is a folder.
    return [
        parent_path + "/" + entry["name"]
        for entry in listing["value"]
        if "file" not in entry
    ]


def _descend_one_level(scraper, parent_paths, progress_label):
    """Return every sub-folder path found under any of *parent_paths*."""
    children = []
    for index, parent in enumerate(parent_paths):
        print(f"getting {progress_label} {index}")
        children.extend(_subfolder_paths(scraper, parent))
    return children


def _week_commencing_label(path):
    """Return the last '/'-separated segment starting with 'W.C.', or None."""
    label = None
    for segment in path.split("/"):
        if segment.startswith("W.C."):
            label = segment  # e.g. "W.C. 17.03.2025"
    return label


def get_photos_name(installer) -> pd.DataFrame:
    """List the files stored five folder levels below an installer's root.

    Starting at ``'/'``, the folder tree is walked one level at a time. Going
    by the original variable names the levels are: top-level folder, week
    ("W.C. ..." date), housing association, address, pictures folder. Files
    sitting directly in the fifth-level folders are reported; files at any
    shallower level are ignored.

    Args:
        installer: Value handed unchanged to ``SharePointScraper``.

    Returns:
        A DataFrame with one row per file and the columns ``Date`` (the
        "W.C." path segment, or None when the path has none), ``path`` and
        ``Photo Name``. The columns are present even when no file was found,
        so callers can select ``'Photo Name'`` on an empty result.
    """
    scraper = SharePointScraper(installer)

    # The root is handled separately: its children are addressed as "/<name>"
    # and it is the only level where folders are excluded by name.
    current_level = []
    for entry in scraper.get_folders_in_path('/')['value']:
        if any(marker in entry["name"] for marker in _EXCLUDED_ROOT_MARKERS):
            continue
        if 'file' not in entry:
            current_level.append("/" + entry["name"])

    # Each pass replaces the list of folders with the list of their children.
    # The labels only feed the progress output and are kept verbatim.
    for progress_label in ("dates", "housing assoication", "address", "pictures"):
        current_level = _descend_one_level(scraper, current_level, progress_label)

    print(current_level)

    rows = []
    for index, folder_path in enumerate(current_level):
        print(f"for finali list {index}")

        listing = scraper.get_folders_in_path(folder_path)
        week_label = _week_commencing_label(folder_path)
        for entry in listing['value']:
            if 'file' in entry:
                rows.append({
                    "Date": week_label,
                    # NOTE(review): this stores the whole listing entry, not a
                    # path string -- kept as-is for compatibility; confirm
                    # whether folder_path + "/" + name was intended.
                    "path": entry,
                    "Photo Name": entry['name'],
                })

    # Explicit columns keep the schema stable when nothing was found.
    return pd.DataFrame(rows, columns=_PHOTO_COLUMNS)
|
|
|
|
# Scrape each installer's SharePoint and stack the results into one table.
jjc_df = get_photos_name(SharePointInstaller.JJC)
scis_df = get_photos_name(SharePointInstaller.SOUTH_COAST_INSULATION)
all_df = [jjc_df, scis_df]
final_df = pd.concat(all_df, ignore_index=True)

# Persist the combined listing (written with the default index column).
final_df.to_csv("photos_name.csv")

# Rows whose photo name occurs more than once; keep=False marks every
# occurrence rather than only the repeats.
df = final_df
duplicate_names = final_df[df.duplicated("Photo Name", keep=False)]

# Same rows, ordered so that identical names sit next to each other.
dupe_names_df = duplicate_names.sort_values("Photo Name")
|