mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
load ecmk site notes to db
This commit is contained in:
parent
272837d6ef
commit
b347039b80
3 changed files with 37 additions and 6 deletions
|
|
@ -1,13 +1,28 @@
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||||
|
|
||||||
|
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
|
||||||
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
|
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
|
||||||
from backend.documents_parser.pdf import pdf_to_text_list
|
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
|
||||||
|
|
||||||
|
|
||||||
def parse_site_notes_pdf(file_path: str) -> EpcPropertyData:
    """Parse a site-notes PDF stored on disk into an ``EpcPropertyData``.

    The file is read as raw bytes and split into pages. If the text
    "Elmhurst Energy Systems" occurs in the newline-joined page text, the
    pages are handed to the Elmhurst path; otherwise the raw bytes are
    handed to the PasHub path.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The property data produced by whichever parsing path was selected.
    """
    with open(file_path, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    page_texts = pdf_to_pages(raw_pdf)
    combined_text = "\n".join(page_texts)

    # Anything that does not carry the Elmhurst marker is treated as PasHub.
    if "Elmhurst Energy Systems" not in combined_text:
        return _parse_pashub(raw_pdf)
    return _parse_elmhurst(page_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_elmhurst(pages: List[str]) -> EpcPropertyData:
    """Run the Elmhurst extractor over ``pages`` and map the result.

    Args:
        pages: Page texts of the PDF, as passed in by the caller.

    Returns:
        The output of ``EpcPropertyDataMapper.from_elmhurst_site_notes``
        applied to the extracted site notes.
    """
    extractor = ElmhurstSiteNotesExtractor(pages)
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extractor.extract())
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_pashub(pdf_bytes: bytes) -> EpcPropertyData:
    """Run the PasHub RdSAP extractor over a PDF's text and map the result.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The output of ``EpcPropertyDataMapper.from_site_notes`` applied to
        the extracted site notes.
    """
    extractor = PasHubRdSapSiteNotesExtractor(pdf_to_text_list(pdf_bytes))
    return EpcPropertyDataMapper.from_site_notes(extractor.extract())
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import os
|
import os
|
||||||
|
from typing import cast
|
||||||
|
|
||||||
from backend.app.db.connection import db_session
|
from backend.app.db.connection import db_session
|
||||||
from backend.app.db.models.uploaded_file import (
|
from backend.app.db.models.uploaded_file import (
|
||||||
|
|
@ -7,9 +8,14 @@ from backend.app.db.models.uploaded_file import (
|
||||||
FileTypeEnum,
|
FileTypeEnum,
|
||||||
UploadedFile,
|
UploadedFile,
|
||||||
)
|
)
|
||||||
|
from backend.documents_parser.db_writer import save_epc_property_data
|
||||||
|
from backend.documents_parser.parser import parse_site_notes_pdf
|
||||||
|
from utils.logger import setup_logger
|
||||||
from utils.s3 import upload_file_to_s3
|
from utils.s3 import upload_file_to_s3
|
||||||
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
|
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
def upload_file_to_sharepoint(
|
def upload_file_to_sharepoint(
|
||||||
client: DomnaSharepointClient,
|
client: DomnaSharepointClient,
|
||||||
|
|
@ -43,7 +49,7 @@ def upload_excel_to_sharepoint(
|
||||||
# TODO: this should be moved to somewhere common and called by pashub fetcher
|
# TODO: this should be moved to somewhere common and called by pashub fetcher
|
||||||
def upload_file_to_s3_and_update_db(
|
def upload_file_to_s3_and_update_db(
|
||||||
bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum
|
bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum
|
||||||
) -> None:
|
) -> int:
|
||||||
filename: str = os.path.basename(file_path)
|
filename: str = os.path.basename(file_path)
|
||||||
key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}/{filename}"
|
key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}/{filename}"
|
||||||
|
|
||||||
|
|
@ -61,4 +67,14 @@ def upload_file_to_s3_and_update_db(
|
||||||
with db_session() as session:
|
with db_session() as session:
|
||||||
# TODO: we should do multiple files at once to reduce db trips
|
# TODO: we should do multiple files at once to reduce db trips
|
||||||
session.add(uploaded_file)
|
session.add(uploaded_file)
|
||||||
session.commit()
|
session.flush()
|
||||||
|
uploaded_file_id: int = int(cast(int, uploaded_file.id))
|
||||||
|
|
||||||
|
if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE:
|
||||||
|
try:
|
||||||
|
epc_data = parse_site_notes_pdf(file_path)
|
||||||
|
save_epc_property_data(session, epc_data, uploaded_file_id=uploaded_file_id)
|
||||||
|
except Exception:
|
||||||
|
logger.warning(f"Failed to parse/save site notes {file_path}", exc_info=True)
|
||||||
|
|
||||||
|
return uploaded_file_id
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import (
|
||||||
UploadedFile,
|
UploadedFile,
|
||||||
)
|
)
|
||||||
from backend.documents_parser.db_writer import save_epc_property_data
|
from backend.documents_parser.db_writer import save_epc_property_data
|
||||||
from backend.documents_parser.parser import parse_pashub_site_notes
|
from backend.documents_parser.parser import parse_site_notes_pdf
|
||||||
from backend.pashub_fetcher.core_files import infer_file_type
|
from backend.pashub_fetcher.core_files import infer_file_type
|
||||||
from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
|
from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
|
||||||
from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
|
from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
|
||||||
|
|
@ -106,7 +106,7 @@ def upload_job_to_s3_and_update_db(
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
site_notes_pairs.append(
|
site_notes_pairs.append(
|
||||||
(uploaded_file, parse_pashub_site_notes(file_path))
|
(uploaded_file, parse_site_notes_pdf(file_path))
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning(f"Failed to parse site notes {file_path}", exc_info=True)
|
logger.warning(f"Failed to parse site notes {file_path}", exc_info=True)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue