save scraped files

This commit is contained in:
Jun-te Kim 2025-03-10 18:55:17 +00:00
parent 704d34d1d9
commit 298ccdbc38
2 changed files with 30 additions and 15 deletions

View file

@ -69,6 +69,16 @@ class SharePointScraper():
)
return sharepoint_client.list_folder_contents(path)
def get_file_content(self, url):
    """Download the file behind ``url`` through a freshly built SharePoint client.

    Args:
        url: Download URL handed straight to
            ``SharePointClient.download_sharepoint_file``.

    Returns:
        Whatever ``download_sharepoint_file`` returns.
        NOTE(review): callers in this file call ``.getvalue()`` on the
        result, so it is presumably an ``io.BytesIO``-like buffer - confirm.
    """
    # A new client is built on every call from the scraper's stored credentials.
    client_settings = {
        "tenant_id": self.sharepoint_tenant_id,
        "client_id": self.sharepoint_client_id,
        "client_secret": self.sharepoint_client_secret,
        "site_id": self.sharepoint_drive.value,
    }
    client = SharePointClient(**client_settings)
    return client.download_sharepoint_file(url)
def upload_names_to_memory(self):
housing_assosiaction_folders = self.get_folders_in_path("/")
@ -203,15 +213,30 @@ class SharePointScraper():
if 'value' not in files_to_download_sharepoint_info:
raise RuntimeError("Failed to get files to download")
else:
file_names_to_download = []
file_names_to_download = {}
avoid = [".jpg",".mov"]
for file in files_to_download_sharepoint_info['value']:
if 'file' in file:
file_names_to_download.append(file['name'])
if any(file["name"].endswith(ext) for ext in avoid):
continue
file_names_to_download.update({file["name"]: file['@microsoft.graph.downloadUrl']})
for file_name, url in file_names_to_download.items():
self.logger.info(f"Downloading {file_name} from {url}")
content = self.get_file_content(url)
self.create_temp_file(content, f"{name}/{WEEK_COMMENCING}/{house_ass}/{address}/{file_name}")
filtered_files = [f for f in file_names_to_download if not f.endswith(tuple(avoid))]
self.logger.warning(filtered_files)
def create_temp_file(self, content, path):
    """Write ``content`` to a file located under ``/tmp`` and return its path.

    Args:
        content: Either a buffer exposing ``getvalue()`` (for example
            ``io.BytesIO``) or a raw ``bytes``/``bytearray`` payload.
        path: Destination relative to ``/tmp``. Leading path separators are
            dropped so an absolute-looking value still lands under ``/tmp``.

    Returns:
        The normalised path of the file that was written.

    Raises:
        ValueError: If ``path`` would resolve to a location outside ``/tmp``
            (for example through ``..`` segments).
    """
    base_dir = os.path.normpath("/tmp")

    # os.path.join() throws away "/tmp" when the second argument is
    # absolute, so strip leading separators before joining.
    separators = os.sep + (os.altsep or "")
    target = os.path.normpath(os.path.join(base_dir, path.lstrip(separators)))

    # Path components come from remote file/folder names; refuse anything
    # that climbs out of the base directory after normalisation.
    if os.path.commonpath([base_dir, target]) != base_dir:
        raise ValueError(f"Refusing to write outside {base_dir}: {path!r}")

    # Ensure the parent directory exists
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # Resolve the payload before opening so a bad ``content`` object does
    # not leave an empty file behind.
    if isinstance(content, (bytes, bytearray)):
        data = content
    else:
        data = content.getvalue()

    # Write-only binary mode is sufficient; the file is never read back here.
    with open(target, "wb") as temp_file:
        temp_file.write(data)

    self.logger.info(f"Temporary file created at: {target}")
    return target

View file

@ -271,13 +271,3 @@ class SharePointClient:
return file_content
def create_temp_file(self, content, path):
    """Write ``content`` followed by a newline to ``path`` as text.

    Missing parent directories are created first. The file is opened in
    text mode with the platform's default encoding.

    Args:
        content: Text to store; a trailing ``"\\n"`` is appended.
        path: Destination file path.

    Returns:
        The same ``path`` that was passed in.
    """
    # Make sure the destination directory is there before opening the file.
    parent_dir = os.path.dirname(path)
    os.makedirs(parent_dir, exist_ok=True)

    # Truncate/create the file and store the newline-terminated text.
    with open(path, 'w+') as handle:
        handle.write(content + "\n")

    logger.info(f"Temporary file created at: {path}")
    return path