survey-extraction/etl/imagefilenamechcker.py
2025-04-23 14:35:45 +00:00

94 lines
3.3 KiB
Python

import os
# SharePoint configuration. These variables are set before the
# ``etl.scraper`` import below -- presumably because that module reads them
# at import time (TODO confirm), so keep this block above that import.
#
# The identifiers are only defaults: a value already present in the
# environment wins, so the script can be pointed at another tenant/site
# without editing the source.
os.environ.setdefault("SHAREPOINT_CLIENT_ID", "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf")
os.environ.setdefault("SHAREPOINT_TENANT_ID", "c3f7519c-2719-4547-af04-6da6cbfd8f8f")
os.environ.setdefault(
    "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID",
    "b5a51507-9427-4ee0-b03e-90ec7681e2d3",
)
os.environ.setdefault(
    "JJC_SERVICE_SHAREPOINT_ID",
    "7fdd0485-bbf3-4b29-b30f-98c81c2a6284",
)

# The client secret is a credential and must never live in source control.
# NOTE(review): a secret was previously committed on this line; treat it as
# leaked and rotate it in the identity provider.
if not os.environ.get("SHAREPOINT_CLIENT_SECRET"):
    raise RuntimeError(
        "SHAREPOINT_CLIENT_SECRET is not set; export it in the environment "
        "(or load it from a secret store) before running this script."
    )
from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING
import pandas as pd
import hashlib
# Top-level folders whose name contains any of these substrings are skipped.
_ROOT_FOLDER_SKIP_MARKERS = ("Khalim", ".Training")

# Column layout of the frame returned by get_photos_name. Pinning it keeps the
# columns present even when no photo is found, so callers can still select
# "Photo Name" on an empty result.
_PHOTO_COLUMNS = ["Date", "path", "Photo Name"]


def _child_folder_paths(scraper, parent_paths, label):
    """Return the paths of all sub-folders one level below *parent_paths*.

    Each parent is listed with ``scraper.get_folders_in_path``; entries that
    carry a ``'file'`` key are treated as files and ignored. *label* is only
    used in the progress line printed for every parent.
    """
    child_paths = []
    for index, parent in enumerate(parent_paths):
        print(f"getting {label} {index}")
        listing = scraper.get_folders_in_path(parent)
        child_paths.extend(
            parent + "/" + entry["name"]
            for entry in listing["value"]
            if "file" not in entry
        )
    return child_paths


def _week_commencing_label(path):
    """Return the last ``/``-separated part of *path* starting with "W.C.".

    Example of a matching part: ``W.C. 17.03.2025``. Returns ``None`` when no
    part matches.
    """
    label = None
    for part in path.split("/"):
        if part.startswith("W.C."):
            label = part
    return label


def get_photos_name(installer) -> pd.DataFrame:
    """Collect the names of all files found five folder levels below the root.

    A ``SharePointScraper`` is created for *installer* and the folder tree is
    walked level by level. The progress output labels the levels below the
    top-level folders as dates, housing associations, addresses and pictures.
    Files sitting at any shallower level are ignored; only files directly
    inside a fifth-level folder are reported.

    Args:
        installer: Value passed unchanged to ``SharePointScraper``.

    Returns:
        A DataFrame with one row per file and the columns ``Date`` (the
        "W.C. ..." part of the containing folder path, or ``None``), ``path``
        and ``Photo Name``. The frame is empty, but still has these columns,
        when no file is found.
    """
    scraper = SharePointScraper(installer)

    # The root is handled separately: its children are prefixed with a single
    # "/" and some top-level folders are excluded by name.
    root_listing = scraper.get_folders_in_path('/')
    top_level_folders = [
        "/" + entry["name"]
        for entry in root_listing["value"]
        if not any(marker in entry["name"] for marker in _ROOT_FOLDER_SKIP_MARKERS)
        and "file" not in entry
    ]

    date_folders = _child_folder_paths(scraper, top_level_folders, "dates")
    association_folders = _child_folder_paths(
        scraper, date_folders, "housing assoication"
    )
    address_folders = _child_folder_paths(scraper, association_folders, "address")
    picture_folders = _child_folder_paths(scraper, address_folders, "pictures")
    print(picture_folders)

    rows = []
    for index, folder in enumerate(picture_folders):
        print(f"for finali list {index}")
        listing = scraper.get_folders_in_path(folder)
        week_label = _week_commencing_label(folder)
        for entry in listing["value"]:
            if "file" in entry:
                rows.append({
                    "Date": week_label,
                    # NOTE(review): this stores the whole listing entry, not a
                    # path string. Kept as-is so the exported CSV does not
                    # change -- confirm whether folder + "/" + name was meant.
                    "path": entry,
                    "Photo Name": entry["name"],
                })

    return pd.DataFrame(rows, columns=_PHOTO_COLUMNS)
# Scrape both installers' sites, JJC first, and stack the results into one
# frame with a fresh 0..n-1 index.
jjc_df = get_photos_name(SharePointInstaller.JJC)
scis_df = get_photos_name(SharePointInstaller.SOUTH_COAST_INSULATION)
all_df = [jjc_df, scis_df]
final_df = pd.concat(all_df, ignore_index=True)

# Persist the full listing (index column included) next to the script.
final_df.to_csv("photos_name.csv")

# Every row whose photo name occurs more than once; keep=False marks all
# occurrences, not just the repeats.
df = final_df
_repeated_name_mask = df.duplicated(subset='Photo Name', keep=False)
duplicate_names = df[_repeated_name_mask]

# Same rows, ordered so identical names sit next to each other.
dupe_names_df = df[_repeated_name_mask].sort_values(by='Photo Name')