From 298ccdbc381ecc1e3cdb641c69e7f32de39b0bb9 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 10 Mar 2025 18:55:17 +0000 Subject: [PATCH] save scraped files --- etl/scraper/scraper.py | 35 +++++++++++++++++++++++++----- etl/utils/sharepoint/sharepoint.py | 10 --------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 50ac9c6..7867c7f 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -69,6 +69,16 @@ class SharePointScraper(): ) return sharepoint_client.list_folder_contents(path) + + def get_file_content(self, url): + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + return sharepoint_client.download_sharepoint_file(url) def upload_names_to_memory(self): housing_assosiaction_folders = self.get_folders_in_path("/") @@ -203,15 +213,30 @@ class SharePointScraper(): if 'value' not in files_to_download_sharepoint_info: raise RuntimeError("Failed to get files to download") else: - file_names_to_download = [] + file_names_to_download = {} avoid = [".jpg",".mov"] for file in files_to_download_sharepoint_info['value']: if 'file' in file: - file_names_to_download.append(file['name']) + if any(file["name"].endswith(ext) for ext in avoid): + continue + file_names_to_download.update({file["name"]: file['@microsoft.graph.downloadUrl']}) + + for file_name, url in file_names_to_download.items(): + self.logger.info(f"Downloading {file_name} from {url}") + content = self.get_file_content(url) + self.create_temp_file(content, f"{name}/{WEEK_COMMENCING}/{house_ass}/{address}/{file_name}") - filtered_files = [f for f in file_names_to_download if not f.endswith(tuple(avoid))] - self.logger.warning(filtered_files) - + def create_temp_file(self, content, path): + # Ensure the path is under /tmp/ + path = os.path.join("/tmp", path) + # Ensure the parent directory exists + os.makedirs(os.path.dirname(path), exist_ok=True) + # Write content to the specified file + with open(path, 'wb+') as temp_file: + temp_file.write(content.getvalue()) + + self.logger.info(f"Temporary file created at: {path}") + return path \ No newline at end of file diff --git a/etl/utils/sharepoint/sharepoint.py b/etl/utils/sharepoint/sharepoint.py index 5b5a921..b649997 100644 --- a/etl/utils/sharepoint/sharepoint.py +++ b/etl/utils/sharepoint/sharepoint.py @@ -271,13 +271,3 @@ class SharePointClient: return file_content - def create_temp_file(self, content, path): - # Ensure the directory exists - os.makedirs(os.path.dirname(path), exist_ok=True) - - # Write content to the specified temporary file - with open(path, 'w+') as temp_file: - temp_file.write(content + "\n") - - logger.info(f"Temporary file created at: {path}") - return path \ No newline at end of file