From e157d0ce97681b28e262f5bf8f1f083a32a7bd6c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 12:52:23 +0000 Subject: [PATCH 01/20] added kahlim sharepoint code --- etl/src/etl/utils/sharepoint/__init__.py | 0 etl/src/etl/utils/sharepoint/sharepoint.py | 272 +++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 etl/src/etl/utils/sharepoint/__init__.py create mode 100644 etl/src/etl/utils/sharepoint/sharepoint.py diff --git a/etl/src/etl/utils/sharepoint/__init__.py b/etl/src/etl/utils/sharepoint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etl/src/etl/utils/sharepoint/sharepoint.py b/etl/src/etl/utils/sharepoint/sharepoint.py new file mode 100644 index 0000000..f40d765 --- /dev/null +++ b/etl/src/etl/utils/sharepoint/sharepoint.py @@ -0,0 +1,272 @@ +""" +This file contains the functions which enable interaction with SharePoint via the API. +""" +from msal import ConfidentialClientApplication +from datetime import datetime, timedelta +import requests +from functools import wraps +import time +import logging +from io import BytesIO + +# Configure logging +logger = logging.getLogger(__name__) +if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def handle_error(response): + """ + Handle errors based on HTTP status codes and log detailed information. + """ + try: + error_json = response.json().get('error', {}) + except ValueError: + error_json = {} + + error_code = error_json.get('code', 'unknownError') + error_message = error_json.get('message', 'No detailed error message provided.') + inner_error = error_json.get('innererror', {}) + details = error_json.get('details', []) + + logger.error(f"Error Code: {error_code}") + logger.error(f"Error Message: {error_message}") + if inner_error: + logger.error(f"Inner Error: {inner_error}") + if details: + logger.error(f"Error Details: {details}") + + if response.status_code == 401: + logger.error("Unauthorized. Token might be invalid.") + elif response.status_code == 403: + logger.error("Forbidden. Access denied to the requested resource.") + elif response.status_code == 404: + logger.error("Not Found. The requested resource doesn’t exist.") + elif response.status_code == 429: + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + elif response.status_code in (500, 503): + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.error(f"Server error. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + else: + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + +def api_call_decorator(func): + """ + Handles various aspects of the API call, including refreshing the access token if needed and handling pagination. + :param func: The function to be decorated. + :return: The wrapped function. + """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + try: + # Check and refresh the access token if needed + if self.is_access_token_expired(): + self.retrieve_access_token() + logger.info("Access token refreshed.") + + # Get the HTTP method, URL, and optionally data from the function + http_method, url, data = func(self, *args, **kwargs) + + # Initialize the results list and handle pagination if page_size is provided + results = [] + page_size = kwargs.get('page_size', None) + response_data = {} + + while url: + response = requests.request(http_method, url, headers=self.headers, json=data) + + # Handle the response + if response.status_code == 200: + response_json = response.json() # Store the response JSON + if page_size: + results.extend(response_json.get('value', [])) + url = response_json.get('@odata.nextLink', None) + else: + response_data = response_json # Capture the full response for consistency + break + else: + retry = handle_error(response) + if retry == 'retry': + continue + + if page_size: + response_data = {'value': results} + + return response_data + + except Exception as e: + logger.exception("An error occurred during the API call.") + raise e + + return wrapper + + +class SharePointClient: + access_token = None + access_token_request_timestamp = None + access_token_expiry = None + headers = None + + TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + + def __init__(self, tenant_id, client_id, client_secret, site_id, access_token=None, + access_token_expiration_details=None): + """ + Initializes the SharePointClient with necessary credentials and site information. + :param tenant_id: The tenant ID. + :param client_id: The client ID. + :param client_secret: The client secret. + :param site_id: The site ID. + :param access_token: The access token (optional) + :param access_token_expiration_details: The access token expiration details (optional) + """ + self.tenant_id = tenant_id + self.client_id = client_id + self.client_secret = client_secret + + if access_token: + if not access_token_expiration_details: + raise ValueError("Access token expiration details must be provided.") + self.access_token = access_token + self.set_access_token_expiration_details(access_token_expiration_details) + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + else: + self.retrieve_access_token() + + # Retrieve static identifiers + self.site_id = site_id + self.document_drive = self.get_documents_drive() + + def get_token_expiration_details(self): + """ + Returns the access token expiration details. Converts the datetime objects to strings for serialization. + :return: + """ + return { + 'access_token_request_timestamp': datetime.strftime( + self.access_token_request_timestamp, self.TIMESTAMP_FORMAT + ), + 'access_token_expiry': datetime.strftime(self.access_token_expiry, self.TIMESTAMP_FORMAT) + } + + def set_access_token_expiration_details(self, access_token_expiration_details): + """ + Sets the access token expiration details from a serialized dictionary. + :param access_token_expiration_details: The serialized access token expiration details. + :return: + """ + self.access_token_request_timestamp = datetime.strptime( + access_token_expiration_details['access_token_request_timestamp'], self.TIMESTAMP_FORMAT + ) + self.access_token_expiry = datetime.strptime( + access_token_expiration_details['access_token_expiry'], self.TIMESTAMP_FORMAT + ) + + def is_access_token_expired(self): + """ + Checks if the access token has expired. If it has, a new access token is retrieved. + :return: True if expired, False otherwise. + """ + return datetime.now() >= self.access_token_expiry + + def retrieve_access_token(self, refresh=False): + """ + Implements authentication using MSAL. + :param refresh: If True, force a refresh of the access token. + :return: None + """ + app = ConfidentialClientApplication( + self.client_id, + authority=f"https://login.microsoftonline.com/{self.tenant_id}", + client_credential=self.client_secret + ) + + scope = ["https://graph.microsoft.com/.default"] + + access_token_request_timestamp = datetime.now() + + if refresh: + logger.info("Forcing refresh of access token.") + token = app.acquire_token_for_client(scopes=scope) + else: + # Check if a token is already cached + token = app.acquire_token_silent(scope, account=None) + + if not token: + token = app.acquire_token_for_client(scopes=scope) + + if "access_token" not in token: + logger.error("Authentication failed.") + raise ValueError("Authentication failed") + + access_token_expiry = access_token_request_timestamp + timedelta( + seconds=token['expires_in'] - 20 + ) + + self.access_token = token + self.access_token_request_timestamp = access_token_request_timestamp + self.access_token_expiry = access_token_expiry + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + + logger.info("Access token retrieved successfully.") + + @api_call_decorator + def get_documents_drive(self): + """ + Get the document drive of the SharePoint site. + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive" + logger.info(f"Getting document drive from URL: {url}") + return 'GET', url, None + + @api_call_decorator + def list_folder_contents(self, drive_id, folder_path: str, page_size: int = 100): + """ + This function will list the contents of a folder in SharePoint. + :param drive_id: The ID of the drive. + :param folder_path: The path of the folder. + :param page_size: The number of items per page (default is 100). + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{folder_path}:/children?$top={page_size}" + logger.info(f"Listing folder contents from URL: {url}") + return 'GET', url, None + + @staticmethod + def download_sharepoint_file(download_url): + """ + Downloads a file from the given URL and returns its content. + + :param download_url: The URL to download the file from. + :return: The content of the downloaded file. + """ + response = requests.get(download_url, stream=True) + response.raise_for_status() # Check if the request was successful + + file_content = BytesIO() + + # Read the file content into memory + for chunk in response.iter_content(chunk_size=8192): + file_content.write(chunk) + + file_content.seek(0) # Reset the file pointer to the beginning + + return file_content \ No newline at end of file From cf29413211598f563186ca7ad652de39542be92b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 12:58:56 +0000 Subject: [PATCH 02/20] adde basic configuration for sharepoint incase I want to do that next --- etl/src/etl/main.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 6d95fe9..01e5c4b 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1 +1,31 @@ -print("Hello world") \ No newline at end of file +from utils.sharepoint.sharepoint import SharePointClient +import os + + +print("Hello world") + +### +''' +Place holder for SharePointClient - When khalim is back see if I can get this wokring +''' +### +# # Share point env variables most likely +# SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) +# SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) +# SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) +# WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None) + +# # Connect to the Sharepoint API, with the access token. +# sharepoint_client = SharePointClient( +# tenant_id=SHAREPOINT_TENANT_ID, +# client_id=SHAREPOINT_CLIENT_ID, +# client_secret=SHAREPOINT_CLIENT_SECRET, +# site_id=WARMFRONT_SHAREPOINT_SITE_ID, +# access_token=message["access_token"], +# access_token_expiration_details=message["access_token_expiration_details"] +# ) + + +### +Get local file read working +### \ No newline at end of file From 7bdd0f3ba3e0c9e2d07b0a0e026456006dc71e06 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:01:00 +0000 Subject: [PATCH 03/20] updated to do list --- etl/README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/etl/README.md b/etl/README.md index 2e024b0..e36d2cf 100644 --- a/etl/README.md +++ b/etl/README.md @@ -11,7 +11,22 @@ Definition of multiple places: Definition of one place: - into a CSV...today (03/03/2025) -- [] Read a file from what khalim has shared +- [x] Added sharepointclient that khalim made - Need to proof it works +- [x] Read a file from what khalim has shared + +Add a local file: +- [] mount a local folder directory wiht what Khalim sharepoint he has shared +- [] REad files???? + + +Once I have sharepoint api working: - [] Make validator for retro team - [] once validated, produce a csv file - [] show some cool productivity metric + + + +- With Khalim: +- [] Check if I have access to sharepoint +- [] Try and get his client API working and see if I can read files + From 24ca89eb489c1fdec9381af9e423f58d1deac975 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:04:10 +0000 Subject: [PATCH 04/20] comments --- etl/src/etl/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 01e5c4b..df3e805 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -27,5 +27,5 @@ Place holder for SharePointClient - When khalim is back see if I can get this wo ### -Get local file read working +#Get local file read working ### \ No newline at end of file From 60a3c0c9e85d46d3ae7a3e96046db71b7912665e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:06:41 +0000 Subject: [PATCH 05/20] added git ignore files --- .gitignore | 3 +++ etl/src/etl/main.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c791ff5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] \ No newline at end of file diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index df3e805..8b9e27d 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,4 +1,4 @@ -from utils.sharepoint.sharepoint import SharePointClient +# from utils.sharepoint.sharepoint import SharePointClient import os From c3c155bdb8c6f3d234b392050a55abdf19586e69 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:16:32 +0000 Subject: [PATCH 06/20] added data mounting --- .devcontainer/devcontainer.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 875a810..85de800 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -10,5 +10,9 @@ "ms-azuretools.vscode-docker" ] } - } + }, + // temporary mount local file from local computer. DELETE ME if you are not Jun-te Kim + "runArgs": [ + "--mount", "type=bind,source=/home/kimjunte/data,target=/workspaces/survey-extraction/data" + ] } From d778f79169ec1caea46b1278d79930aa11c116d7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:19:35 +0000 Subject: [PATCH 07/20] hid data file --- .gitignore | 3 ++- etl/README.md | 4 ++-- etl/src/etl/main.py | 16 ++++------------ 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index c791ff5..b55da5b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ # Byte-compiled / optimized / DLL files __pycache__/ -*.py[cod] \ No newline at end of file +*.py[cod] +data/ \ No newline at end of file diff --git a/etl/README.md b/etl/README.md index e36d2cf..d8e0725 100644 --- a/etl/README.md +++ b/etl/README.md @@ -15,8 +15,8 @@ Definition of one place: - [x] Read a file from what khalim has shared Add a local file: -- [] mount a local folder directory wiht what Khalim sharepoint he has shared -- [] REad files???? +- [x] mount a local folder directory wiht what Khalim sharepoint he has shared +- [x] REad files and do something with it Once I have sharepoint api working: diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 8b9e27d..5d3f3c9 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,13 +1,4 @@ # from utils.sharepoint.sharepoint import SharePointClient -import os - - -print("Hello world") - -### -''' -Place holder for SharePointClient - When khalim is back see if I can get this wokring -''' ### # # Share point env variables most likely # SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) @@ -26,6 +17,7 @@ Place holder for SharePointClient - When khalim is back see if I can get this wo # ) -### -#Get local file read working -### \ No newline at end of file + +# Do some with local files and read stuff basic + +print("hello world ") \ No newline at end of file From 69b642e68c9b34e8e907444b8a568ab1029c88c2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:20:52 +0000 Subject: [PATCH 08/20] added comment on what to work on next --- etl/src/etl/main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 5d3f3c9..cc0f1a4 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -20,4 +20,9 @@ # Do some with local files and read stuff basic -print("hello world ") \ No newline at end of file +print("hello world ") + +# Read file from local file path directory +# proof of concept of some validator +# proof of concept of something i do with a particular flie +# ask khalim how sharepoint is going \ No newline at end of file From 965475647b53baf16568bfd867a2c5c65e873832 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 13:26:26 +0000 Subject: [PATCH 09/20] added basic class for validator --- etl/src/etl/validator/retrohome.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 etl/src/etl/validator/retrohome.py diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py new file mode 100644 index 0000000..40d1729 --- /dev/null +++ b/etl/src/etl/validator/retrohome.py @@ -0,0 +1,5 @@ + +class RetroHomeValidator(filepath): + def __init__(self): + pass + From 5606f3face7dcedf4974926b93bda9e66509c51b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 14:42:58 +0000 Subject: [PATCH 10/20] added some retro home class to do some retro home stuff --- etl/src/etl/main.py | 1 + etl/src/etl/validator/retrohome.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index cc0f1a4..d490a6f 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -25,4 +25,5 @@ print("hello world ") # Read file from local file path directory # proof of concept of some validator # proof of concept of something i do with a particular flie +# the important file at the moment is "Pre site notes" # ask khalim how sharepoint is going \ No newline at end of file diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py index 40d1729..e814361 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/validator/retrohome.py @@ -1,5 +1,6 @@ -class RetroHomeValidator(filepath): - def __init__(self): + +class RetroHomeValidator(): + def __init__(self, file_path): pass From 0b813ac51edd8c6cf541ce98b9a6e078ecfd5437 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 14:44:02 +0000 Subject: [PATCH 11/20] updated to do list --- etl/src/etl/main.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index d490a6f..0074267 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,25 +1,3 @@ -# from utils.sharepoint.sharepoint import SharePointClient -### -# # Share point env variables most likely -# SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) -# SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) -# SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) -# WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None) - -# # Connect to the Sharepoint API, with the access token. -# sharepoint_client = SharePointClient( -# tenant_id=SHAREPOINT_TENANT_ID, -# client_id=SHAREPOINT_CLIENT_ID, -# client_secret=SHAREPOINT_CLIENT_SECRET, -# site_id=WARMFRONT_SHAREPOINT_SITE_ID, -# access_token=message["access_token"], -# access_token_expiration_details=message["access_token_expiration_details"] -# ) - - - -# Do some with local files and read stuff basic - print("hello world ") # Read file from local file path directory From c2f94afde466ab2b50baf2f140e8d9c8426cc759 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 15:11:02 +0000 Subject: [PATCH 12/20] basic template of validator --- etl/src/etl/main.py | 13 ++++++++++++- etl/src/etl/validator/__init__.py | 0 etl/src/etl/validator/retrohome.py | 12 +++++++++--- 3 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 etl/src/etl/validator/__init__.py diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 0074267..282f405 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,5 +1,16 @@ -print("hello world ") +import os +from validator.retrohome import RetroHomeFileStructureValidator +DATA_LOC = "/workspaces/survey-extraction/data/" + +def main(): + RetroHomeFileStructureValidator(DATA_LOC) + +if __name__ == "__main__": + main() + + +# Make a file checker to see if retrohomes as sumbitted the correct structure # Read file from local file path directory # proof of concept of some validator # proof of concept of something i do with a particular flie diff --git a/etl/src/etl/validator/__init__.py b/etl/src/etl/validator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py index e814361..3918ed0 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/validator/retrohome.py @@ -1,6 +1,12 @@ +import os +class RetroHomeFileStructureValidator(): + def __init__(self, source_loc_path): + self.source_path = source_loc_path + self.correct_names = [] + self.incorrect_names = [] + self.validate() -class RetroHomeValidator(): - def __init__(self, file_path): - pass + def validate(self): + print("nothing to validate") From 8e6ba54ccbd345ff6d407fae65f1bc99fbe71b78 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 15:25:39 +0000 Subject: [PATCH 13/20] added a basic validator check --- etl/src/etl/utils/logger.py | 22 ++++++++++++++++++++++ etl/src/etl/validator/retrohome.py | 16 +++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 etl/src/etl/utils/logger.py diff --git a/etl/src/etl/utils/logger.py b/etl/src/etl/utils/logger.py new file mode 100644 index 0000000..b31375f --- /dev/null +++ b/etl/src/etl/utils/logger.py @@ -0,0 +1,22 @@ +import logging +import os + +class Logger: + def __init__(self, name, level=logging.INFO): + # Create a custom logger + self.logger = logging.getLogger(name) + self.logger.setLevel(level) + + # Create handlers + c_handler = logging.StreamHandler() + c_handler.setLevel(level) + + # Create formatters and add it to handlers + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + c_handler.setFormatter(formatter) + + # Add handlers to the logger + self.logger.addHandler(c_handler) + + def get_logger(self): + return self.logger \ No newline at end of file diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py index 3918ed0..9b8b395 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/validator/retrohome.py @@ -1,12 +1,22 @@ import os +from utils.logger import Logger class RetroHomeFileStructureValidator(): def __init__(self, source_loc_path): self.source_path = source_loc_path - self.correct_names = [] - self.incorrect_names = [] + self.logger = Logger(name='RetroHomeFileStructureValidator').get_logger() + self.innocent = [] + self.guilty = [] self.validate() def validate(self): - print("nothing to validate") + self.logger.info(f"Starting File Structure Validation on '{self.source_path}'") + for filepath in os.listdir(self.source_path): + if os.path.isdir(os.path.join(self.source_path, filepath)): + self.logger.info(f"Its a file {filepath}") + else: + self.logger.warning(f"Found a file when expecting directory. Ignoring file {filepath}") + + def valid_name(name): + pass \ No newline at end of file From adc7c541709ed941c3b9c61d869493a8b62a9a75 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 16:03:07 +0000 Subject: [PATCH 14/20] move on to file extration --- etl/src/etl/validator/retrohome.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py index 9b8b395..b6b8eac 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/validator/retrohome.py @@ -14,9 +14,28 @@ class RetroHomeFileStructureValidator(): for filepath in os.listdir(self.source_path): if os.path.isdir(os.path.join(self.source_path, filepath)): - self.logger.info(f"Its a file {filepath}") + self.innocent.append(filepath) else: self.logger.warning(f"Found a file when expecting directory. Ignoring file {filepath}") + + self.logger.info(self.innocent) + + self.valid_name() + + self.valid_file_structure() - def valid_name(name): - pass \ No newline at end of file + def valid_name(self): + for i, names in enumerate(self.innocent): + temp = names.split(" ") + if len(temp) > 2: + self.logger.warning(f"The name '{names}' is not in the correct format") + self.guilty.append(names) + self.innocent.remove(names) + + def valid_file_structure(self): + for names in self.innocent: + path_to_check = os.path.join(self.source_path, names) + + + def date_checker_extractor(self): + raise NotImplementedError("Please contact Jun-te Kim to make this feature") \ No newline at end of file From 828065f0b82bcd9c182dfc5831e9dbe5612f4318 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 3 Mar 2025 17:22:14 +0000 Subject: [PATCH 15/20] save work --- etl/README.md | 21 +++++++++++++++++++-- etl/src/etl/main.py | 5 +++-- etl/src/etl/validator/retrohome.py | 2 -- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/etl/README.md b/etl/README.md index d8e0725..666519c 100644 --- a/etl/README.md +++ b/etl/README.md @@ -16,7 +16,7 @@ Definition of one place: Add a local file: - [x] mount a local folder directory wiht what Khalim sharepoint he has shared -- [x] REad files and do something with it +- [x] REad files and file path Once I have sharepoint api working: @@ -24,9 +24,26 @@ Once I have sharepoint api working: - [] once validated, produce a csv file - [] show some cool productivity metric - +Currently working on: +- [] Validator + - [x] check names + - [in progress, blocked unitl sharepoint. Easy to add] check it has dates +- [] Useful file reader: + - [] Khalim showed me a useful pdf, that I should try to extract and get some information - With Khalim: - [] Check if I have access to sharepoint - [] Try and get his client API working and see if I can read files + +MVP: + Script we can run that will + Go to share point fetch all the data + provide some form of output + that shows the number of surverys done + +Flat table +
+ +Billing: +Billing table, left join \ No newline at end of file diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 282f405..588f4ef 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -2,17 +2,18 @@ import os from validator.retrohome import RetroHomeFileStructureValidator DATA_LOC = "/workspaces/survey-extraction/data/" +INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" def main(): RetroHomeFileStructureValidator(DATA_LOC) + if __name__ == "__main__": main() -# Make a file checker to see if retrohomes as sumbitted the correct structure # Read file from local file path directory # proof of concept of some validator # proof of concept of something i do with a particular flie # the important file at the moment is "Pre site notes" -# ask khalim how sharepoint is going \ No newline at end of file +# Ask khalim how sharepoint is going \ No newline at end of file diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/validator/retrohome.py index b6b8eac..6aa0b7a 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/validator/retrohome.py @@ -19,9 +19,7 @@ class RetroHomeFileStructureValidator(): self.logger.warning(f"Found a file when expecting directory. Ignoring file {filepath}") self.logger.info(self.innocent) - self.valid_name() - self.valid_file_structure() def valid_name(self): From 7537d0405b2ba34f3d2d31b3d9a901394bbb5a5c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 08:58:41 +0000 Subject: [PATCH 16/20] basic logging level changes --- etl/src/etl/{validator => filePathValidator}/__init__.py | 0 etl/src/etl/{validator => filePathValidator}/retrohome.py | 7 ++++--- etl/src/etl/main.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) rename etl/src/etl/{validator => filePathValidator}/__init__.py (100%) rename etl/src/etl/{validator => filePathValidator}/retrohome.py (87%) diff --git a/etl/src/etl/validator/__init__.py b/etl/src/etl/filePathValidator/__init__.py similarity index 100% rename from etl/src/etl/validator/__init__.py rename to etl/src/etl/filePathValidator/__init__.py diff --git a/etl/src/etl/validator/retrohome.py b/etl/src/etl/filePathValidator/retrohome.py similarity index 87% rename from etl/src/etl/validator/retrohome.py rename to etl/src/etl/filePathValidator/retrohome.py index 6aa0b7a..a4be660 100644 --- a/etl/src/etl/validator/retrohome.py +++ b/etl/src/etl/filePathValidator/retrohome.py @@ -1,16 +1,17 @@ import os +import logging from utils.logger import Logger class RetroHomeFileStructureValidator(): def __init__(self, source_loc_path): self.source_path = source_loc_path - self.logger = Logger(name='RetroHomeFileStructureValidator').get_logger() + self.logger = Logger(name='RetroHomeFileStructureValidator', level=logging.DEBUG).get_logger() self.innocent = [] self.guilty = [] self.validate() def validate(self): - self.logger.info(f"Starting File Structure Validation on '{self.source_path}'") + self.logger.debug(f"Starting File Structure Validation on '{self.source_path}'") for filepath in os.listdir(self.source_path): if os.path.isdir(os.path.join(self.source_path, filepath)): @@ -18,7 +19,7 @@ class RetroHomeFileStructureValidator(): else: self.logger.warning(f"Found a file when expecting directory. Ignoring file {filepath}") - self.logger.info(self.innocent) + self.logger.debug(self.innocent) self.valid_name() self.valid_file_structure() diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 588f4ef..49b20b7 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,5 +1,5 @@ import os -from validator.retrohome import RetroHomeFileStructureValidator +from filePathValidator.retrohome import RetroHomeFileStructureValidator DATA_LOC = "/workspaces/survey-extraction/data/" INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" From feed56054b83838c0f793a43edfccc93328c821f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 10:14:19 +0000 Subject: [PATCH 17/20] added file to read files --- etl/poetry.lock | 157 +++++++++++++++++++++++++++++- etl/pyproject.toml | 1 + etl/src/etl/pdfReader/__init__.py | 0 etl/src/etl/pdfReader/pdfTypes.py | 14 +++ 4 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 etl/src/etl/pdfReader/__init__.py create mode 100644 etl/src/etl/pdfReader/pdfTypes.py diff --git a/etl/poetry.lock b/etl/poetry.lock index 503fbc4..8d66d84 100644 --- a/etl/poetry.lock +++ b/etl/poetry.lock @@ -1,7 +1,160 @@ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. -package = [] + +[[package]] +name = "pillow" +version = "11.1.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8"}, + {file = "pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192"}, + {file = "pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2"}, + {file = "pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26"}, + {file = "pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07"}, + {file = "pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482"}, + {file = "pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e"}, + {file = "pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269"}, + {file = "pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49"}, + {file = "pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a"}, + {file = "pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65"}, + {file = "pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457"}, + {file = "pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35"}, + {file = "pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2"}, + {file = "pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070"}, + {file = "pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6"}, + {file = "pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1"}, + {file = "pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2"}, + {file = "pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96"}, + {file = "pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f"}, + {file = "pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761"}, + {file = "pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71"}, + {file = "pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a"}, + {file = "pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b"}, + {file = "pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3"}, + {file = "pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a"}, + {file = "pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1"}, + {file = "pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f"}, + {file = "pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91"}, + {file = "pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c"}, + {file = "pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6"}, + {file = "pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf"}, + {file = "pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5"}, + {file = "pillow-11.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae98e14432d458fc3de11a77ccb3ae65ddce70f730e7c76140653048c71bfcbc"}, + {file = "pillow-11.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cc1331b6d5a6e144aeb5e626f4375f5b7ae9934ba620c0ac6b3e43d5e683a0f0"}, + {file = "pillow-11.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:758e9d4ef15d3560214cddbc97b8ef3ef86ce04d62ddac17ad39ba87e89bd3b1"}, + {file = "pillow-11.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b523466b1a31d0dcef7c5be1f20b942919b62fd6e9a9be199d035509cbefc0ec"}, + {file = "pillow-11.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:9044b5e4f7083f209c4e35aa5dd54b1dd5b112b108648f5c902ad586d4f945c5"}, + {file = "pillow-11.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:3764d53e09cdedd91bee65c2527815d315c6b90d7b8b79759cc48d7bf5d4f114"}, + {file = "pillow-11.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:31eba6bbdd27dde97b0174ddf0297d7a9c3a507a8a1480e1e60ef914fe23d352"}, + {file = "pillow-11.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b5d658fbd9f0d6eea113aea286b21d3cd4d3fd978157cbf2447a6035916506d3"}, + {file = "pillow-11.1.0-cp313-cp313-win32.whl", hash = "sha256:f86d3a7a9af5d826744fabf4afd15b9dfef44fe69a98541f666f66fbb8d3fef9"}, + {file = "pillow-11.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:593c5fd6be85da83656b93ffcccc2312d2d149d251e98588b14fbc288fd8909c"}, + {file = "pillow-11.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:11633d58b6ee5733bde153a8dafd25e505ea3d32e261accd388827ee987baf65"}, + {file = "pillow-11.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:70ca5ef3b3b1c4a0812b5c63c57c23b63e53bc38e758b37a951e5bc466449861"}, + {file = "pillow-11.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8000376f139d4d38d6851eb149b321a52bb8893a88dae8ee7d95840431977081"}, + {file = "pillow-11.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ee85f0696a17dd28fbcfceb59f9510aa71934b483d1f5601d1030c3c8304f3c"}, + {file = "pillow-11.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:dd0e081319328928531df7a0e63621caf67652c8464303fd102141b785ef9547"}, + {file = "pillow-11.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e63e4e5081de46517099dc30abe418122f54531a6ae2ebc8680bcd7096860eab"}, + {file = "pillow-11.1.0-cp313-cp313t-win32.whl", hash = "sha256:dda60aa465b861324e65a78c9f5cf0f4bc713e4309f83bc387be158b077963d9"}, + {file = "pillow-11.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ad5db5781c774ab9a9b2c4302bbf0c1014960a0a7be63278d13ae6fdf88126fe"}, + {file = "pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756"}, + {file = "pillow-11.1.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:bf902d7413c82a1bfa08b06a070876132a5ae6b2388e2712aab3a7cbc02205c6"}, + {file = "pillow-11.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c1eec9d950b6fe688edee07138993e54ee4ae634c51443cfb7c1e7613322718e"}, + {file = "pillow-11.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e275ee4cb11c262bd108ab2081f750db2a1c0b8c12c1897f27b160c8bd57bbc"}, + {file = "pillow-11.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db853948ce4e718f2fc775b75c37ba2efb6aaea41a1a5fc57f0af59eee774b2"}, + {file = "pillow-11.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:ab8a209b8485d3db694fa97a896d96dd6533d63c22829043fd9de627060beade"}, + {file = "pillow-11.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:54251ef02a2309b5eec99d151ebf5c9904b77976c8abdcbce7891ed22df53884"}, + {file = "pillow-11.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5bb94705aea800051a743aa4874bb1397d4695fb0583ba5e425ee0328757f196"}, + {file = "pillow-11.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:89dbdb3e6e9594d512780a5a1c42801879628b38e3efc7038094430844e271d8"}, + {file = "pillow-11.1.0-cp39-cp39-win32.whl", hash = "sha256:e5449ca63da169a2e6068dd0e2fcc8d91f9558aba89ff6d02121ca8ab11e79e5"}, + {file = "pillow-11.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:3362c6ca227e65c54bf71a5f88b3d4565ff1bcbc63ae72c34b07bbb1cc59a43f"}, + {file = "pillow-11.1.0-cp39-cp39-win_arm64.whl", hash = "sha256:b20be51b37a75cc54c2c55def3fa2c65bb94ba859dde241cd0a4fd302de5ae0a"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73"}, + {file = "pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0"}, + {file = "pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"] +typing = ["typing-extensions ; python_version < \"3.10\""] +xmp = ["defusedxml"] + +[[package]] +name = "pycryptodome" +version = "3.21.0" +description = "Cryptographic library for Python" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] +files = [ + {file = "pycryptodome-3.21.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:dad9bf36eda068e89059d1f07408e397856be9511d7113ea4b586642a429a4fd"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:a1752eca64c60852f38bb29e2c86fca30d7672c024128ef5d70cc15868fa10f4"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ba4cc304eac4d4d458f508d4955a88ba25026890e8abff9b60404f76a62c55e"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cb087b8612c8a1a14cf37dd754685be9a8d9869bed2ffaaceb04850a8aeef7e"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:26412b21df30b2861424a6c6d5b1d8ca8107612a4cfa4d0183e71c5d200fb34a"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-win32.whl", hash = "sha256:cc2269ab4bce40b027b49663d61d816903a4bd90ad88cb99ed561aadb3888dd3"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-win_amd64.whl", hash = "sha256:0fa0a05a6a697ccbf2a12cec3d6d2650b50881899b845fac6e87416f8cb7e87d"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6cce52e196a5f1d6797ff7946cdff2038d3b5f0aba4a43cb6bf46b575fd1b5bb"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:a915597ffccabe902e7090e199a7bf7a381c5506a747d5e9d27ba55197a2c568"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e74c522d630766b03a836c15bff77cb657c5fdf098abf8b1ada2aebc7d0819"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:a3804675283f4764a02db05f5191eb8fec2bb6ca34d466167fc78a5f05bbe6b3"}, + {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:2480ec2c72438430da9f601ebc12c518c093c13111a5c1644c82cdfc2e50b1e4"}, + {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:de18954104667f565e2fbb4783b56667f30fb49c4d79b346f52a29cb198d5b6b"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de4b7263a33947ff440412339cb72b28a5a4c769b5c1ca19e33dd6cd1dcec6e"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0714206d467fc911042d01ea3a1847c847bc10884cf674c82e12915cfe1649f8"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d85c1b613121ed3dbaa5a97369b3b757909531a959d229406a75b912dd51dd1"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:8898a66425a57bcf15e25fc19c12490b87bd939800f39a03ea2de2aea5e3611a"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:932c905b71a56474bff8a9c014030bc3c882cee696b448af920399f730a650c2"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:18caa8cfbc676eaaf28613637a89980ad2fd96e00c564135bf90bc3f0b34dd93"}, + {file = "pycryptodome-3.21.0-cp36-abi3-win32.whl", hash = "sha256:280b67d20e33bb63171d55b1067f61fbd932e0b1ad976b3a184303a3dad22764"}, + {file = "pycryptodome-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b7aa25fc0baa5b1d95b7633af4f5f1838467f1815442b22487426f94e0d66c53"}, + {file = "pycryptodome-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2cb635b67011bc147c257e61ce864879ffe6d03342dc74b6045059dfbdedafca"}, + {file = "pycryptodome-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:4c26a2f0dc15f81ea3afa3b0c87b87e501f235d332b7f27e2225ecb80c0b1cdd"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d5ebe0763c982f069d3877832254f64974139f4f9655058452603ff559c482e8"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee86cbde706be13f2dec5a42b52b1c1d1cbb90c8e405c68d0755134735c8dc6"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fd54003ec3ce4e0f16c484a10bc5d8b9bd77fa662a12b85779a2d2d85d67ee0"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5dfafca172933506773482b0e18f0cd766fd3920bd03ec85a283df90d8a17bc6"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:590ef0898a4b0a15485b05210b4a1c9de8806d3ad3d47f74ab1dc07c67a6827f"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35e442630bc4bc2e1878482d6f59ea22e280d7121d7adeaedba58c23ab6386b"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff99f952db3db2fbe98a0b355175f93ec334ba3d01bbde25ad3a5a33abc02b58"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8acd7d34af70ee63f9a849f957558e49a98f8f1634f86a59d2be62bb8e93f71c"}, + {file = "pycryptodome-3.21.0.tar.gz", hash = "sha256:f7787e0d469bdae763b876174cf2e6c0f7be79808af26b1da96f1a64bcf47297"}, +] + +[[package]] +name = "pypdf2" +version = "3.0.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, + {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, +] + +[package.dependencies] +Pillow = {version = "*", optional = true, markers = "extra == \"full\""} +PyCryptodome = {version = "*", optional = true, markers = "extra == \"full\""} + +[package.extras] +crypto = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow", "PyCryptodome"] +image = ["Pillow"] [metadata] lock-version = "2.1" python-versions = ">=3.12" -content-hash = "75265641fd1a3f2a4d608312a3879427b7141ac2a51d0873da5711cbc8ead28e" +content-hash = "653eb47a984d886bcc8cd1bf551ecc1a4efa26d3335667bf70b535d99e5bcccd" diff --git a/etl/pyproject.toml b/etl/pyproject.toml index 845e955..34e774c 100644 --- a/etl/pyproject.toml +++ b/etl/pyproject.toml @@ -8,6 +8,7 @@ authors = [ readme = "README.md" requires-python = ">=3.12" dependencies = [ + "pypdf2[full] (>=3.0.1,<4.0.0)" ] [tool.poetry] diff --git a/etl/src/etl/pdfReader/__init__.py b/etl/src/etl/pdfReader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etl/src/etl/pdfReader/pdfTypes.py b/etl/src/etl/pdfReader/pdfTypes.py new file mode 100644 index 0000000..bb55ac7 --- /dev/null +++ b/etl/src/etl/pdfReader/pdfTypes.py @@ -0,0 +1,14 @@ +from utils.logger import Logger +import logging + + +class pdfReader(): + + def __init__(self, file_path): + self.source_path = file_path + self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() + + def get_text_from_pdf_file(self): + self.logger.debug(f"Extrating text from {self.source_path}") + + return \ No newline at end of file From e5ea08fa0c451fd134bcd82ac5bb67cd37efcb5c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 12:21:44 +0000 Subject: [PATCH 18/20] added pdf reader --- .gitignore | 3 +- etl/poetry.lock | 159 ++---------------- etl/pyproject.toml | 2 +- etl/src/etl/main.py | 6 +- .../{pdfTypes.py => pdfReaderToText.py} | 11 +- 5 files changed, 26 insertions(+), 155 deletions(-) rename etl/src/etl/pdfReader/{pdfTypes.py => pdfReaderToText.py} (55%) diff --git a/.gitignore b/.gitignore index b55da5b..bbdd514 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -data/ \ No newline at end of file +data/ +.env \ No newline at end of file diff --git a/etl/poetry.lock b/etl/poetry.lock index 8d66d84..30f5041 100644 --- a/etl/poetry.lock +++ b/etl/poetry.lock @@ -1,160 +1,23 @@ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] -name = "pillow" -version = "11.1.0" -description = "Python Imaging Library (Fork)" +name = "pymupdf" +version = "1.25.3" +description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8"}, - {file = "pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192"}, - {file = "pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2"}, - {file = "pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26"}, - {file = "pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07"}, - {file = "pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482"}, - {file = "pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e"}, - {file = "pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269"}, - {file = "pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49"}, - {file = "pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a"}, - {file = "pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65"}, - {file = "pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457"}, - {file = "pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35"}, - {file = "pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2"}, - {file = "pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070"}, - {file = "pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6"}, - {file = "pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1"}, - {file = "pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2"}, - {file = "pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96"}, - {file = "pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f"}, - {file = "pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761"}, - {file = "pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71"}, - {file = "pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a"}, - {file = "pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b"}, - {file = "pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3"}, - {file = "pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a"}, - {file = "pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1"}, - {file = "pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f"}, - {file = "pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91"}, - {file = "pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c"}, - {file = "pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6"}, - {file = "pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf"}, - {file = "pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5"}, - {file = "pillow-11.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae98e14432d458fc3de11a77ccb3ae65ddce70f730e7c76140653048c71bfcbc"}, - {file = "pillow-11.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cc1331b6d5a6e144aeb5e626f4375f5b7ae9934ba620c0ac6b3e43d5e683a0f0"}, - {file = "pillow-11.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:758e9d4ef15d3560214cddbc97b8ef3ef86ce04d62ddac17ad39ba87e89bd3b1"}, - {file = "pillow-11.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b523466b1a31d0dcef7c5be1f20b942919b62fd6e9a9be199d035509cbefc0ec"}, - {file = "pillow-11.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:9044b5e4f7083f209c4e35aa5dd54b1dd5b112b108648f5c902ad586d4f945c5"}, - {file = "pillow-11.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:3764d53e09cdedd91bee65c2527815d315c6b90d7b8b79759cc48d7bf5d4f114"}, - {file = "pillow-11.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:31eba6bbdd27dde97b0174ddf0297d7a9c3a507a8a1480e1e60ef914fe23d352"}, - {file = "pillow-11.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b5d658fbd9f0d6eea113aea286b21d3cd4d3fd978157cbf2447a6035916506d3"}, - {file = "pillow-11.1.0-cp313-cp313-win32.whl", hash = "sha256:f86d3a7a9af5d826744fabf4afd15b9dfef44fe69a98541f666f66fbb8d3fef9"}, - {file = "pillow-11.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:593c5fd6be85da83656b93ffcccc2312d2d149d251e98588b14fbc288fd8909c"}, - {file = "pillow-11.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:11633d58b6ee5733bde153a8dafd25e505ea3d32e261accd388827ee987baf65"}, - {file = "pillow-11.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:70ca5ef3b3b1c4a0812b5c63c57c23b63e53bc38e758b37a951e5bc466449861"}, - {file = "pillow-11.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8000376f139d4d38d6851eb149b321a52bb8893a88dae8ee7d95840431977081"}, - {file = "pillow-11.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ee85f0696a17dd28fbcfceb59f9510aa71934b483d1f5601d1030c3c8304f3c"}, - {file = "pillow-11.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:dd0e081319328928531df7a0e63621caf67652c8464303fd102141b785ef9547"}, - {file = "pillow-11.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e63e4e5081de46517099dc30abe418122f54531a6ae2ebc8680bcd7096860eab"}, - {file = "pillow-11.1.0-cp313-cp313t-win32.whl", hash = "sha256:dda60aa465b861324e65a78c9f5cf0f4bc713e4309f83bc387be158b077963d9"}, - {file = "pillow-11.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ad5db5781c774ab9a9b2c4302bbf0c1014960a0a7be63278d13ae6fdf88126fe"}, - {file = "pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756"}, - {file = "pillow-11.1.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:bf902d7413c82a1bfa08b06a070876132a5ae6b2388e2712aab3a7cbc02205c6"}, - {file = "pillow-11.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c1eec9d950b6fe688edee07138993e54ee4ae634c51443cfb7c1e7613322718e"}, - {file = "pillow-11.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e275ee4cb11c262bd108ab2081f750db2a1c0b8c12c1897f27b160c8bd57bbc"}, - {file = "pillow-11.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db853948ce4e718f2fc775b75c37ba2efb6aaea41a1a5fc57f0af59eee774b2"}, - {file = "pillow-11.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:ab8a209b8485d3db694fa97a896d96dd6533d63c22829043fd9de627060beade"}, - {file = "pillow-11.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:54251ef02a2309b5eec99d151ebf5c9904b77976c8abdcbce7891ed22df53884"}, - {file = "pillow-11.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5bb94705aea800051a743aa4874bb1397d4695fb0583ba5e425ee0328757f196"}, - {file = "pillow-11.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:89dbdb3e6e9594d512780a5a1c42801879628b38e3efc7038094430844e271d8"}, - {file = "pillow-11.1.0-cp39-cp39-win32.whl", hash = "sha256:e5449ca63da169a2e6068dd0e2fcc8d91f9558aba89ff6d02121ca8ab11e79e5"}, - {file = "pillow-11.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:3362c6ca227e65c54bf71a5f88b3d4565ff1bcbc63ae72c34b07bbb1cc59a43f"}, - {file = "pillow-11.1.0-cp39-cp39-win_arm64.whl", hash = "sha256:b20be51b37a75cc54c2c55def3fa2c65bb94ba859dde241cd0a4fd302de5ae0a"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73"}, - {file = "pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0"}, - {file = "pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20"}, + {file = "pymupdf-1.25.3-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:96878e1b748f9c2011aecb2028c5f96b5a347a9a91169130ad0133053d97915e"}, + {file = "pymupdf-1.25.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:6ef753005b72ebfd23470f72f7e30f61e21b0b5e748045ec5b8f89e6e3068d62"}, + {file = "pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:46d90c4f9e62d1856e8db4b9f04a202ff4a7f086a816af73abdc86adb7f5e25a"}, + {file = "pymupdf-1.25.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a5de51efdbe4d486b6c1111c84e8a231cbfb426f3d6ff31ab530ad70e6f39756"}, + {file = "pymupdf-1.25.3-cp39-abi3-win32.whl", hash = "sha256:bca72e6089f985d800596e22973f79cc08af6cbff1d93e5bda9248326a03857c"}, + {file = "pymupdf-1.25.3-cp39-abi3-win_amd64.whl", hash = "sha256:4fb357438c9129fbf939b5af85323434df64e36759c399c376b62ad6da95498c"}, + {file = "pymupdf-1.25.3.tar.gz", hash = "sha256:b640187c64c5ac5d97505a92e836da299da79c2f689f3f94a67a37a493492193"}, ] -[package.extras] -docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] -fpx = ["olefile"] -mic = ["olefile"] -tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"] -typing = ["typing-extensions ; python_version < \"3.10\""] -xmp = ["defusedxml"] - -[[package]] -name = "pycryptodome" -version = "3.21.0" -description = "Cryptographic library for Python" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" -groups = ["main"] -files = [ - {file = "pycryptodome-3.21.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:dad9bf36eda068e89059d1f07408e397856be9511d7113ea4b586642a429a4fd"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:a1752eca64c60852f38bb29e2c86fca30d7672c024128ef5d70cc15868fa10f4"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ba4cc304eac4d4d458f508d4955a88ba25026890e8abff9b60404f76a62c55e"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cb087b8612c8a1a14cf37dd754685be9a8d9869bed2ffaaceb04850a8aeef7e"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:26412b21df30b2861424a6c6d5b1d8ca8107612a4cfa4d0183e71c5d200fb34a"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-win32.whl", hash = "sha256:cc2269ab4bce40b027b49663d61d816903a4bd90ad88cb99ed561aadb3888dd3"}, - {file = "pycryptodome-3.21.0-cp27-cp27m-win_amd64.whl", hash = "sha256:0fa0a05a6a697ccbf2a12cec3d6d2650b50881899b845fac6e87416f8cb7e87d"}, - {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6cce52e196a5f1d6797ff7946cdff2038d3b5f0aba4a43cb6bf46b575fd1b5bb"}, - {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:a915597ffccabe902e7090e199a7bf7a381c5506a747d5e9d27ba55197a2c568"}, - {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e74c522d630766b03a836c15bff77cb657c5fdf098abf8b1ada2aebc7d0819"}, - {file = "pycryptodome-3.21.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:a3804675283f4764a02db05f5191eb8fec2bb6ca34d466167fc78a5f05bbe6b3"}, - {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:2480ec2c72438430da9f601ebc12c518c093c13111a5c1644c82cdfc2e50b1e4"}, - {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:de18954104667f565e2fbb4783b56667f30fb49c4d79b346f52a29cb198d5b6b"}, - {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de4b7263a33947ff440412339cb72b28a5a4c769b5c1ca19e33dd6cd1dcec6e"}, - {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0714206d467fc911042d01ea3a1847c847bc10884cf674c82e12915cfe1649f8"}, - {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d85c1b613121ed3dbaa5a97369b3b757909531a959d229406a75b912dd51dd1"}, - {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:8898a66425a57bcf15e25fc19c12490b87bd939800f39a03ea2de2aea5e3611a"}, - {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:932c905b71a56474bff8a9c014030bc3c882cee696b448af920399f730a650c2"}, - {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:18caa8cfbc676eaaf28613637a89980ad2fd96e00c564135bf90bc3f0b34dd93"}, - {file = "pycryptodome-3.21.0-cp36-abi3-win32.whl", hash = "sha256:280b67d20e33bb63171d55b1067f61fbd932e0b1ad976b3a184303a3dad22764"}, - {file = "pycryptodome-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b7aa25fc0baa5b1d95b7633af4f5f1838467f1815442b22487426f94e0d66c53"}, - {file = "pycryptodome-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2cb635b67011bc147c257e61ce864879ffe6d03342dc74b6045059dfbdedafca"}, - {file = "pycryptodome-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:4c26a2f0dc15f81ea3afa3b0c87b87e501f235d332b7f27e2225ecb80c0b1cdd"}, - {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d5ebe0763c982f069d3877832254f64974139f4f9655058452603ff559c482e8"}, - {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee86cbde706be13f2dec5a42b52b1c1d1cbb90c8e405c68d0755134735c8dc6"}, - {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fd54003ec3ce4e0f16c484a10bc5d8b9bd77fa662a12b85779a2d2d85d67ee0"}, - {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5dfafca172933506773482b0e18f0cd766fd3920bd03ec85a283df90d8a17bc6"}, - {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:590ef0898a4b0a15485b05210b4a1c9de8806d3ad3d47f74ab1dc07c67a6827f"}, - {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35e442630bc4bc2e1878482d6f59ea22e280d7121d7adeaedba58c23ab6386b"}, - {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff99f952db3db2fbe98a0b355175f93ec334ba3d01bbde25ad3a5a33abc02b58"}, - {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8acd7d34af70ee63f9a849f957558e49a98f8f1634f86a59d2be62bb8e93f71c"}, - {file = "pycryptodome-3.21.0.tar.gz", hash = "sha256:f7787e0d469bdae763b876174cf2e6c0f7be79808af26b1da96f1a64bcf47297"}, -] - -[[package]] -name = "pypdf2" -version = "3.0.1" -description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" -optional = false -python-versions = ">=3.6" -groups = ["main"] -files = [ - {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, - {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, -] - -[package.dependencies] -Pillow = {version = "*", optional = true, markers = "extra == \"full\""} -PyCryptodome = {version = "*", optional = true, markers = "extra == \"full\""} - -[package.extras] -crypto = ["PyCryptodome"] -dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] -docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] -full = ["Pillow", "PyCryptodome"] -image = ["Pillow"] - [metadata] lock-version = "2.1" python-versions = ">=3.12" -content-hash = "653eb47a984d886bcc8cd1bf551ecc1a4efa26d3335667bf70b535d99e5bcccd" +content-hash = "0ff0789ceee91157e5f804e4e3248e78513ae898a14b1973e46da2e50c332ef6" diff --git a/etl/pyproject.toml b/etl/pyproject.toml index 34e774c..50c0828 100644 --- a/etl/pyproject.toml +++ b/etl/pyproject.toml @@ -8,7 +8,7 @@ authors = [ readme = "README.md" requires-python = ">=3.12" dependencies = [ - "pypdf2[full] (>=3.0.1,<4.0.0)" + "pymupdf (>=1.25.3,<2.0.0)" ] [tool.poetry] diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 49b20b7..ebdcf10 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,11 +1,13 @@ import os from filePathValidator.retrohome import RetroHomeFileStructureValidator - +from pdfReader.pdfReaderToText import pdfReaderToText DATA_LOC = "/workspaces/survey-extraction/data/" INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" def main(): - RetroHomeFileStructureValidator(DATA_LOC) + # RetroHomeFileStructureValidator(DATA_LOC) + + pdfReaderToText(INTERESTING_FILE_LOC).get_text_from_pdf_file() if __name__ == "__main__": diff --git a/etl/src/etl/pdfReader/pdfTypes.py b/etl/src/etl/pdfReader/pdfReaderToText.py similarity index 55% rename from etl/src/etl/pdfReader/pdfTypes.py rename to etl/src/etl/pdfReader/pdfReaderToText.py index bb55ac7..a050a34 100644 --- a/etl/src/etl/pdfReader/pdfTypes.py +++ b/etl/src/etl/pdfReader/pdfReaderToText.py @@ -1,14 +1,19 @@ from utils.logger import Logger import logging +import pymupdf - -class pdfReader(): +class pdfReaderToText(): def __init__(self, file_path): self.source_path = file_path self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() + self.text = "" def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") + pdf = pymupdf.open(self.source_path) - return \ No newline at end of file + for page in pdf: + text = page.get_text().encode("utf8") + self.logger('###') + self.logger.info(text) \ No newline at end of file From d25d699000aaf4e37d500a551f6868247b2ab130 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 12:32:23 +0000 Subject: [PATCH 19/20] made it into a prety list --- etl/src/etl/pdfReader/pdfReaderToText.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/etl/src/etl/pdfReader/pdfReaderToText.py b/etl/src/etl/pdfReader/pdfReaderToText.py index a050a34..1dfa2de 100644 --- a/etl/src/etl/pdfReader/pdfReaderToText.py +++ b/etl/src/etl/pdfReader/pdfReaderToText.py @@ -7,13 +7,16 @@ class pdfReaderToText(): def __init__(self, file_path): self.source_path = file_path self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() - self.text = "" + self.all_text = "" def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") pdf = pymupdf.open(self.source_path) for page in pdf: - text = page.get_text().encode("utf8") - self.logger('###') - self.logger.info(text) \ No newline at end of file + text = page.get_text() + self.all_text += text + + + from pprint import pprint + pprint(self.all_text.split('\n')) \ No newline at end of file From c9ee97d70af79fc20ae58a6816d8b7fb9e77a38f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 12:38:31 +0000 Subject: [PATCH 20/20] pdf reader works --- etl/src/etl/main.py | 4 +++- etl/src/etl/pdfReader/pdfReaderToText.py | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index ebdcf10..34c7b9c 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -1,13 +1,15 @@ import os from filePathValidator.retrohome import RetroHomeFileStructureValidator from pdfReader.pdfReaderToText import pdfReaderToText +from pprint import pprint DATA_LOC = "/workspaces/survey-extraction/data/" INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" def main(): # RetroHomeFileStructureValidator(DATA_LOC) - pdfReaderToText(INTERESTING_FILE_LOC).get_text_from_pdf_file() + list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() + pprint(list_) if __name__ == "__main__": diff --git a/etl/src/etl/pdfReader/pdfReaderToText.py b/etl/src/etl/pdfReader/pdfReaderToText.py index 1dfa2de..f10fc65 100644 --- a/etl/src/etl/pdfReader/pdfReaderToText.py +++ b/etl/src/etl/pdfReader/pdfReaderToText.py @@ -8,7 +8,9 @@ class pdfReaderToText(): self.source_path = file_path self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() self.all_text = "" - + self.text_list = [] + self.get_text_from_pdf_file() + def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") pdf = pymupdf.open(self.source_path) @@ -17,6 +19,7 @@ class pdfReaderToText(): text = page.get_text() self.all_text += text - - from pprint import pprint - pprint(self.all_text.split('\n')) \ No newline at end of file + self.text_list = self.all_text.split('\n') + + def get_list_of_test(self): + return self.text_list \ No newline at end of file