Move spreadsheet job extraction (extract_jobs) to a dedicated file

This commit is contained in:
Daniel Roth 2026-04-29 10:04:55 +00:00
parent 252657a374
commit 272837d6ef
2 changed files with 51 additions and 46 deletions

View file

@ -1,8 +1,6 @@
from datetime import datetime, timezone from datetime import datetime, timezone
import os import os
import re
from typing import Any, Dict, List, Optional, Tuple, cast from typing import Any, Dict, List, Optional, Tuple, cast
from openpyxl import load_workbook
from backend.app.config import get_settings from backend.app.config import get_settings
from backend.app.db.connection import db_session from backend.app.db.connection import db_session
@ -14,9 +12,6 @@ from backend.app.db.models.uploaded_file import (
from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.db_writer import save_epc_property_data
from backend.documents_parser.parser import parse_pashub_site_notes from backend.documents_parser.parser import parse_pashub_site_notes
from backend.pashub_fetcher.core_files import infer_file_type from backend.pashub_fetcher.core_files import infer_file_type
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from backend.pashub_fetcher.job import Job
from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
PashubToAraTriggerRequest, PashubToAraTriggerRequest,
@ -24,6 +19,7 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders
from backend.pashub_fetcher.token_getter import get_token_from_local_storage from backend.pashub_fetcher.token_getter import get_token_from_local_storage
from backend.utils.subtasks import task_handler from backend.utils.subtasks import task_handler
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from utils.logger import setup_logger from utils.logger import setup_logger
from utils.s3 import upload_file_to_s3 from utils.s3 import upload_file_to_s3
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
@ -33,45 +29,6 @@ from utils.sharepoint.domna_sites import DomnaSites
logger = setup_logger() logger = setup_logger()
# Captures the job id (hex digits and dashes) from a ".../jobs/<id>/..." link.
_JOB_ID_PATTERN = re.compile(r"/jobs/([0-9a-fA-F\-]+)/")


def extract_jobs(
    filepath: str,
    *,
    sheet_name: str = "filtered",
    header_row: int = 3,
    name_header: str = "Name",
    link_header: str = "PasHub Link",
) -> List[Job]:
    """Extract jobs (id + address) from a spreadsheet.

    The worksheet is expected to have a header row containing a name column
    and a link column. Every row below the header that has both a name and a
    link of the form ``.../jobs/<id>/...`` yields one job.

    Args:
        filepath: Path of the workbook to load.
        sheet_name: Title of the worksheet to read.
        header_row: 1-based row number holding the column headers.
        name_header: Header text of the column holding the job address.
        link_header: Header text of the column holding the job link.

    Returns:
        One ``{"id": ..., "address": ...}`` entry per usable row, in sheet
        order. Rows with an empty name/link, or a link without a job id,
        are skipped.

    Raises:
        KeyError: If the worksheet, or one of the two required headers,
            is missing.
    """
    # data_only=True returns cached formula results instead of formula text.
    wb = load_workbook(filepath, data_only=True)
    ws = wb[sheet_name]

    # Map stripped header text -> 1-based column index.
    headers: Dict[str, int] = {}
    for col in range(1, ws.max_column + 1):
        raw = ws.cell(row=header_row, column=col).value
        # Test the raw value: str(None) is the truthy string "None", which
        # would otherwise register empty cells as a header named "None".
        if raw is None:
            continue
        title = str(raw).strip()
        if title:
            headers[title] = col

    missing = [h for h in (name_header, link_header) if h not in headers]
    if missing:
        raise KeyError(
            f"Missing header(s) {missing} in row {header_row} "
            f"of sheet {sheet_name!r}"
        )

    # iter_rows yields 0-based tuples, headers holds 1-based columns.
    name_idx = headers[name_header] - 1
    link_idx = headers[link_header] - 1

    jobs: List[Job] = []
    for values in ws.iter_rows(min_row=header_row + 1, values_only=True):
        name = values[name_idx]
        link = values[link_idx]
        if not name or not link:
            continue

        match = _JOB_ID_PATTERN.search(str(link))
        if match is None:
            continue

        jobs.append(
            {
                "id": match.group(1),
                "address": str(name),
            }
        )

    return jobs
def get_pashub_client(email: str, password: str) -> PashubClient: def get_pashub_client(email: str, password: str) -> PashubClient:
token = get_token_from_local_storage(email, password) token = get_token_from_local_storage(email, password)
@ -143,9 +100,14 @@ def upload_job_to_s3_and_update_db(
uploaded_files.append(uploaded_file) uploaded_files.append(uploaded_file)
file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type)
if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: if (
file_type is not None
and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE
):
try: try:
site_notes_pairs.append((uploaded_file, parse_pashub_site_notes(file_path))) site_notes_pairs.append(
(uploaded_file, parse_pashub_site_notes(file_path))
)
except Exception: except Exception:
logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) logger.warning(f"Failed to parse site notes {file_path}", exc_info=True)

View file

@ -0,0 +1,43 @@
import re
from typing import Dict, List
from openpyxl import load_workbook
from backend.pashub_fetcher.job import Job
# Captures the job id (hex digits and dashes) from a ".../jobs/<id>/..." link.
_JOB_ID_PATTERN = re.compile(r"/jobs/([0-9a-fA-F\-]+)/")


def extract_jobs(
    filepath: str,
    *,
    sheet_name: str = "filtered",
    header_row: int = 3,
    name_header: str = "Name",
    link_header: str = "PasHub Link",
) -> List[Job]:
    """Extract jobs (id + address) from a spreadsheet.

    The worksheet is expected to have a header row containing a name column
    and a link column. Every row below the header that has both a name and a
    link of the form ``.../jobs/<id>/...`` yields one job.

    Args:
        filepath: Path of the workbook to load.
        sheet_name: Title of the worksheet to read.
        header_row: 1-based row number holding the column headers.
        name_header: Header text of the column holding the job address.
        link_header: Header text of the column holding the job link.

    Returns:
        One ``{"id": ..., "address": ...}`` entry per usable row, in sheet
        order. Rows with an empty name/link, or a link without a job id,
        are skipped.

    Raises:
        KeyError: If the worksheet, or one of the two required headers,
            is missing.
    """
    # data_only=True returns cached formula results instead of formula text.
    wb = load_workbook(filepath, data_only=True)
    ws = wb[sheet_name]

    # Map stripped header text -> 1-based column index.
    headers: Dict[str, int] = {}
    for col in range(1, ws.max_column + 1):
        raw = ws.cell(row=header_row, column=col).value
        # Test the raw value: str(None) is the truthy string "None", which
        # would otherwise register empty cells as a header named "None".
        if raw is None:
            continue
        title = str(raw).strip()
        if title:
            headers[title] = col

    missing = [h for h in (name_header, link_header) if h not in headers]
    if missing:
        raise KeyError(
            f"Missing header(s) {missing} in row {header_row} "
            f"of sheet {sheet_name!r}"
        )

    # iter_rows yields 0-based tuples, headers holds 1-based columns.
    name_idx = headers[name_header] - 1
    link_idx = headers[link_header] - 1

    jobs: List[Job] = []
    for values in ws.iter_rows(min_row=header_row + 1, values_only=True):
        name = values[name_idx]
        link = values[link_idx]
        if not name or not link:
            continue

        match = _JOB_ID_PATTERN.search(str(link))
        if match is None:
            continue

        jobs.append(
            {
                "id": match.group(1),
                "address": str(name),
            }
        )

    return jobs