From 272837d6ef42866e1fad432a1618792b8fede7db Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 10:04:55 +0000 Subject: [PATCH] move extraction from spreadsheet to dedicated file --- backend/pashub_fetcher/handler/handler.py | 54 ++++------------------- backend/pashub_fetcher/spreadsheet.py | 43 ++++++++++++++++++ 2 files changed, 51 insertions(+), 46 deletions(-) create mode 100644 backend/pashub_fetcher/spreadsheet.py diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index b9df216e..a74f9a2d 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,8 +1,6 @@ from datetime import datetime, timezone import os -import re from typing import Any, Dict, List, Optional, Tuple, cast -from openpyxl import load_workbook from backend.app.config import get_settings from backend.app.db.connection import db_session @@ -14,9 +12,6 @@ from backend.app.db.models.uploaded_file import ( from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_pashub_site_notes from backend.pashub_fetcher.core_files import infer_file_type -from datatypes.epc.domain.epc_property_data import EpcPropertyData - -from backend.pashub_fetcher.job import Job from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, @@ -24,6 +19,7 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders from backend.pashub_fetcher.token_getter import get_token_from_local_storage from backend.utils.subtasks import task_handler +from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -33,45 +29,6 @@ from utils.sharepoint.domna_sites import DomnaSites logger = setup_logger() -def extract_jobs(filepath: str) -> List[Job]: - wb = load_workbook(filepath, data_only=True) - # ws = wb["watford warm homes (wave 3) mai"] - ws = wb["filtered"] - - HEADER_ROW = 3 - - headers: Dict[str, int] = {} - for col in range(1, ws.max_column + 1): - value = str(ws.cell(row=HEADER_ROW, column=col).value) - if value: - headers[value.strip()] = col - - name_col = headers["Name"] - # link_col = headers["Pashub Link"] - link_col = headers["PasHub Link"] - - jobs: List[Job] = [] - - for row in range(HEADER_ROW + 1, ws.max_row + 1): - name = ws.cell(row=row, column=name_col).value - link = ws.cell(row=row, column=link_col).value - - if not name or not link: - continue - - match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link)) - if not match: - continue - - jobs.append( - { - "id": match.group(1), - "address": str(name), - } - ) - - return jobs - def get_pashub_client(email: str, password: str) -> PashubClient: token = get_token_from_local_storage(email, password) @@ -143,9 +100,14 @@ def upload_job_to_s3_and_update_db( uploaded_files.append(uploaded_file) file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) - if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: + if ( + file_type is not None + and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE + ): try: - site_notes_pairs.append((uploaded_file, parse_pashub_site_notes(file_path))) + site_notes_pairs.append( + (uploaded_file, parse_pashub_site_notes(file_path)) + ) except Exception: logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) diff --git a/backend/pashub_fetcher/spreadsheet.py b/backend/pashub_fetcher/spreadsheet.py new file mode 100644 index 00000000..5f8f74f6 --- /dev/null +++ b/backend/pashub_fetcher/spreadsheet.py @@ -0,0 +1,43 @@ +import re +from typing import Dict, List +from openpyxl import load_workbook + +from backend.pashub_fetcher.job import Job + + +def extract_jobs(filepath: str) -> List[Job]: + wb = load_workbook(filepath, data_only=True) + ws = wb["filtered"] + + HEADER_ROW = 3 + + headers: Dict[str, int] = {} + for col in range(1, ws.max_column + 1): + value = str(ws.cell(row=HEADER_ROW, column=col).value) + if value: + headers[value.strip()] = col + + name_col = headers["Name"] + link_col = headers["PasHub Link"] + + jobs: List[Job] = [] + + for row in range(HEADER_ROW + 1, ws.max_row + 1): + name = ws.cell(row=row, column=name_col).value + link = ws.cell(row=row, column=link_col).value + + if not name or not link: + continue + + match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link)) + if not match: + continue + + jobs.append( + { + "id": match.group(1), + "address": str(name), + } + ) + + return jobs