mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
save scraped files
This commit is contained in:
parent
704d34d1d9
commit
298ccdbc38
2 changed files with 30 additions and 15 deletions
|
|
@ -69,6 +69,16 @@ class SharePointScraper():
|
|||
)
|
||||
|
||||
return sharepoint_client.list_folder_contents(path)
|
||||
|
||||
def get_file_content(self, url):
    """Download the file behind ``url`` using a freshly built SharePoint client.

    Args:
        url: Location handed unchanged to
            ``SharePointClient.download_sharepoint_file``.

    Returns:
        Whatever ``download_sharepoint_file`` returns for ``url``.
    """
    # A new client is constructed on every call from the credentials and
    # drive held on this scraper instance.
    client_settings = dict(
        tenant_id=self.sharepoint_tenant_id,
        client_id=self.sharepoint_client_id,
        client_secret=self.sharepoint_client_secret,
        site_id=self.sharepoint_drive.value,
    )
    client = SharePointClient(**client_settings)
    return client.download_sharepoint_file(url)
|
||||
|
||||
def upload_names_to_memory(self):
|
||||
housing_assosiaction_folders = self.get_folders_in_path("/")
|
||||
|
|
@ -203,15 +213,30 @@ class SharePointScraper():
|
|||
if 'value' not in files_to_download_sharepoint_info:
|
||||
raise RuntimeError("Failed to get files to download")
|
||||
else:
|
||||
file_names_to_download = []
|
||||
file_names_to_download = {}
|
||||
avoid = [".jpg",".mov"]
|
||||
|
||||
for file in files_to_download_sharepoint_info['value']:
|
||||
if 'file' in file:
|
||||
file_names_to_download.append(file['name'])
|
||||
if any(file["name"].endswith(ext) for ext in avoid):
|
||||
continue
|
||||
file_names_to_download.update({file["name"]: file['@microsoft.graph.downloadUrl']})
|
||||
|
||||
for file_name, url in file_names_to_download.items():
|
||||
self.logger.info(f"Downloading {file_name} from {url}")
|
||||
content = self.get_file_content(url)
|
||||
self.create_temp_file(content, f"{name}/{WEEK_COMMENCING}/{house_ass}/{address}/{file_name}")
|
||||
|
||||
filtered_files = [f for f in file_names_to_download if not f.endswith(tuple(avoid))]
|
||||
self.logger.warning(filtered_files)
|
||||
|
||||
def create_temp_file(self, content, path):
    """Write downloaded file content to a location under ``/tmp``.

    Args:
        content: Buffer exposing ``getvalue()`` that returns the bytes to
            write (presumably an ``io.BytesIO`` produced by the download
            call -- confirm against ``download_sharepoint_file``).
        path: Destination relative to ``/tmp``. Leading slashes are
            ignored so the file always lands below ``/tmp``.

    Returns:
        The normalised absolute path of the file that was written.

    Raises:
        ValueError: If ``path`` resolves to a location outside ``/tmp``
            (for example through ``..`` segments).
    """
    base_dir = "/tmp"

    # os.path.join() throws away "/tmp" when the second argument is
    # absolute, so strip leading separators before joining.
    relative = path.lstrip("/")
    target = os.path.normpath(os.path.join(base_dir, relative))

    # Refuse "../" segments that would climb out of the base directory.
    if os.path.commonpath([base_dir, target]) != base_dir:
        raise ValueError(f"Refusing to write outside {base_dir}: {path!r}")

    # Ensure the parent directory exists
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # Write-only binary mode is enough; the file is never read back here.
    with open(target, "wb") as temp_file:
        temp_file.write(content.getvalue())

    self.logger.info(f"Temporary file created at: {target}")
    return target
|
||||
|
|
@ -271,13 +271,3 @@ class SharePointClient:
|
|||
|
||||
return file_content
|
||||
|
||||
def create_temp_file(self, content, path):
    """Write text ``content`` followed by a newline to ``path``.

    Args:
        content: String to write; a trailing ``"\n"`` is appended.
        path: Destination file path. Missing parent directories are
            created first.

    Returns:
        ``path``, unchanged.
    """
    parent_dir = os.path.dirname(path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write content to the specified temporary file
    with open(path, 'w+') as temp_file:
        temp_file.write(content + "\n")

    logger.info(f"Temporary file created at: {path}")
    return path
|
||||
Loading…
Add table
Reference in a new issue