Merge pull request #1086 from Hestia-Homes/feature/pashub-additional-files

Fetch coordination and design documents from pashub
This commit is contained in:
Daniel Roth 2026-05-14 11:59:43 +01:00 committed by GitHub
commit c98fc8452f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 381 additions and 16 deletions

View file

@ -18,6 +18,9 @@ class FileTypeEnum(enum.Enum):
ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note"
ECMK_SURVEY_XML = "ecmk_survey_xml"
MAGIC_PLAN_JSON = "magic_plan_json"
IMPROVEMENT_OPTION_EVALUATION = "improvement_option_evaluation"
MEDIUM_TERM_IMPROVEMENT_PLAN = "medium_term_improvement_plan"
RETROFIT_DESIGN_DOC = "retrofit_design_doc"
class FileSourceEnum(enum.Enum):

View file

@ -14,9 +14,12 @@ class CoreFiles(Enum):
PAR_PHOTOPACK = "PAR Photo Pack"
PAS2023_PROPERTY = "PAS 2023 Property Assessment Report"
PAS2023_OCCUPANCY = "PAS 2023 Occupancy Assessment Report"
IMPROVEMENT_OPTION_EVALUATION = "Improvement Option Evaluation"
MEDIUM_TERM_IMPROVEMENT_PLAN = "Medium Term Improvement Plan"
RETROFIT_DESIGN_DOC = "Retrofit Design Doc"
CORE_TO_FILETYPE_MAP = {
_CORE_FILE_TO_FILE_TYPE: dict[CoreFiles, str] = {
CoreFiles.PHOTOPACK: FileTypeEnum.PHOTO_PACK.value,
CoreFiles.SITENOTE: FileTypeEnum.SITE_NOTE.value,
CoreFiles.RDSAP_SITENOTE: FileTypeEnum.RD_SAP_SITE_NOTE.value,
@ -26,11 +29,49 @@ CORE_TO_FILETYPE_MAP = {
CoreFiles.PAR_PHOTOPACK: FileTypeEnum.PAR_PHOTO_PACK.value,
CoreFiles.PAS2023_PROPERTY: FileTypeEnum.PAS_2023_PROPERTY.value,
CoreFiles.PAS2023_OCCUPANCY: FileTypeEnum.PAS_2023_OCCUPANCY.value,
CoreFiles.IMPROVEMENT_OPTION_EVALUATION: FileTypeEnum.IMPROVEMENT_OPTION_EVALUATION.value,
CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN: FileTypeEnum.MEDIUM_TERM_IMPROVEMENT_PLAN.value,
CoreFiles.RETROFIT_DESIGN_DOC: FileTypeEnum.RETROFIT_DESIGN_DOC.value,
}
def infer_file_type(filename: str) -> Optional[str]:
for core_file, file_type in CORE_TO_FILETYPE_MAP.items():
def get_core_file_type(
    filename: str, evidence_category: Optional[str] = None
) -> Optional[CoreFiles]:
    """Identify which core document a file represents.

    Checks are applied in priority order and the first match wins:

    1. ``evidence_category`` equals ``"retrofit design"`` (case-insensitive).
    2. ``filename`` contains the Improvement Option Evaluation value, then the
       Medium Term Improvement Plan value, anywhere in the name.
    3. Only when no evidence category was supplied: ``filename`` contains both
       ``-OSM-`` and ``DR-N-A``.
    4. ``filename`` starts with the value of any remaining ``CoreFiles`` member.

    Args:
        filename: Name of the file to classify.
        evidence_category: Optional evidence category attached to the file.

    Returns:
        The matching ``CoreFiles`` member, or ``None`` when nothing matches.
    """
    # Identify retrofit design doc using evidence category as the name is
    # possibly unreliable. We might change to always use evidence category,
    # but that needs more investigation.
    if evidence_category is not None and evidence_category.lower() == "retrofit design":
        return CoreFiles.RETROFIT_DESIGN_DOC

    # These two documents carry their title in the middle of the filename, so
    # they are matched by substring rather than by prefix.
    if CoreFiles.IMPROVEMENT_OPTION_EVALUATION.value in filename:
        return CoreFiles.IMPROVEMENT_OPTION_EVALUATION
    if CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN.value in filename:
        return CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN

    # Filename-pattern fallback for retrofit design docs; deliberately skipped
    # when an evidence category is present (it was already checked above).
    if evidence_category is None and "-OSM-" in filename and "DR-N-A" in filename:
        return CoreFiles.RETROFIT_DESIGN_DOC

    # Members handled by the dedicated rules above must not take part in the
    # generic prefix match.
    _prefix_skip = {
        CoreFiles.RETROFIT_DESIGN_DOC,
        CoreFiles.IMPROVEMENT_OPTION_EVALUATION,
        CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN,
    }
    for core_file in CoreFiles:
        if core_file in _prefix_skip:
            continue
        if filename.startswith(core_file.value):
            return core_file
    return None
def get_file_type_string(
    filename: str, evidence_category: Optional[str] = None
) -> Optional[str]:
    """Return the file-type string for a file, if it is a recognised core file.

    Args:
        filename: Name of the file to classify.
        evidence_category: Optional evidence category of the file. It is
            forwarded to ``get_core_file_type`` so that documents recognised
            by category rather than by name resolve to the same core file
            type here as they do for callers that pass the category directly.
            Omitting it keeps the previous filename-only behaviour.

    Returns:
        The value mapped to the identified core file in
        ``_CORE_FILE_TO_FILE_TYPE``, or ``None`` when the file is not
        recognised.
    """
    core_file: Optional[CoreFiles] = get_core_file_type(filename, evidence_category)
    if core_file is None:
        return None
    return _CORE_FILE_TO_FILE_TYPE[core_file]

View file

@ -5,12 +5,11 @@ from datetime import datetime
import requests
from backend.pashub_fetcher.core_files import CoreFiles
from backend.pashub_fetcher.core_files import CoreFiles, get_core_file_type
from backend.pashub_fetcher.evidence_file_data import EvidenceFileData
from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata
from utils.logger import setup_logger
logger = setup_logger()
@ -86,12 +85,6 @@ class PashubClient:
except Exception:
return None
def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]:
for core_file in CoreFiles:
if file.file_name.startswith(core_file.value):
return core_file
return None
def _select_latest_core_files(
self,
files: List[EvidenceFileData],
@ -99,7 +92,9 @@ class PashubClient:
grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list)
for file in files:
core_type = self._get_core_file_type(file)
core_type: Optional[CoreFiles] = get_core_file_type(
file.file_name, file.evidence_category
)
if not core_type:
continue
grouped[core_type].append(file)
@ -107,6 +102,9 @@ class PashubClient:
latest_files: Dict[CoreFiles, EvidenceFileData] = {}
for core_type, group in grouped.items():
if core_type == CoreFiles.RETROFIT_DESIGN_DOC and len(group) > 1:
osm_candidates = [f for f in group if "-OSM-" in f.file_name]
group = osm_candidates if osm_candidates else group
latest = max(group, key=lambda f: datetime.fromisoformat(f.created_utc))
latest_files[core_type] = latest

View file

@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import (
)
from backend.documents_parser.db_writer import save_epc_property_data
from backend.documents_parser.parser import parse_site_notes_pdf
from backend.pashub_fetcher.core_files import infer_file_type
from backend.pashub_fetcher.core_files import get_file_type_string
from backend.pashub_fetcher.pashub_client import PashubClient
from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
PashubToAraTriggerRequest,
@ -109,7 +109,7 @@ class PashubService:
uprn=int(uprn) if uprn else None,
hubspot_deal_id=hubspot_deal_id,
file_source=FileSourceEnum.PAS_HUB.value,
file_type=infer_file_type(filename),
file_type=get_file_type_string(filename),
)
file_paths.append(file_path)
uploaded_files.append(uploaded_file)

View file

@ -0,0 +1,185 @@
from backend.pashub_fetcher.core_files import (
CoreFiles,
get_core_file_type,
get_file_type_string,
)
def test_file_type_for_photopack():
    """``Photopack_123456_V1.pdf`` should resolve to ``photo_pack``."""
    # Arrange
    filename = "Photopack_123456_V1.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "photo_pack"
def test_file_type_for_sitenote():
    """``SiteNote_123456_V1.pdf`` should resolve to ``site_note``."""
    # Arrange
    filename = "SiteNote_123456_V1.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "site_note"
def test_file_type_for_rdsap_sitenote():
    """An ``RdSAP_SiteNote`` filename should resolve to ``rd_sap_site_note``."""
    # Arrange
    filename = "RdSAP_SiteNote_9510890_V1_Assessmet.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "rd_sap_site_note"
def test_file_type_for_pas2023_ventilation():
    """A PAS 2023 ventilation report should resolve to ``pas_2023_ventilation``."""
    # Arrange
    filename = "PAS 2023 Ventilation Assessment Report_123456.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "pas_2023_ventilation"
def test_file_type_for_pas2023_condition():
    """A PAS 2023 condition report should resolve to ``pas_2023_condition``."""
    # Arrange
    filename = "PAS 2023 Condition Report_123456.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "pas_2023_condition"
def test_file_type_for_pas_significance():
    """A PAS Significance filename should resolve to ``pas_significance``."""
    # Arrange
    filename = "PAS Significance_123456.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "pas_significance"
def test_file_type_for_par_photopack():
    """A ``PAR Photo Pack`` filename should resolve to ``par_photo_pack``."""
    # Arrange
    filename = "PAR Photo Pack_95101890_V2_Assessment.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "par_photo_pack"
def test_file_type_for_pas2023_property():
    """A PAS 2023 property report should resolve to ``pas_2023_property``."""
    # Arrange
    filename = "PAS 2023 Property Assessment Report_123456.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "pas_2023_property"
def test_file_type_for_pas2023_occupancy():
    """A PAS 2023 occupancy report should resolve to ``pas_2023_occupancy``."""
    # Arrange
    filename = "PAS 2023 Occupancy Assessment Report_123456.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "pas_2023_occupancy"
def test_file_type_for_improvement_option_evaluation():
    """The document title appears mid-filename and should still be recognised."""
    # Arrange
    # filename: "{job_id} - {postcode} - Improvement Option Evaluation.pdf"
    filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "improvement_option_evaluation"
def test_file_type_for_medium_term_improvement_plan():
    """The document title appears mid-filename and should still be recognised."""
    # Arrange
    # filename: "{job_id} - {postcode} - Medium Term Improvement Plan IOE.pdf"
    filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf"

    # Act
    file_type = get_file_type_string(filename)

    # Assert
    assert file_type == "medium_term_improvement_plan"
def test_file_type_for_retrofit_design_doc():
    """Filenames with the ``-OSM-`` / ``DR-N-A`` pattern map to ``retrofit_design_doc``."""
    # Arrange
    filenames = (
        "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
        "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf",
    )

    # Act / Assert - checked in order, stopping at the first failure
    for filename in filenames:
        assert get_file_type_string(filename) == "retrofit_design_doc"
# ---------------------------------------------------------------------------
# get_core_file_type
# ---------------------------------------------------------------------------
def test_core_file_for_evidence_category_match_is_case_insensitive() -> None:
    """A title-cased ``Retrofit Design`` category is still recognised."""
    # Arrange
    osm_filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
    category = "Retrofit Design"

    # Act
    core_file = get_core_file_type(osm_filename, evidence_category=category)

    # Assert
    assert core_file == CoreFiles.RETROFIT_DESIGN_DOC
def test_core_file_for_evidence_category_returns_retrofit_design_doc() -> None:
    """A ``retrofit design`` evidence category yields ``RETROFIT_DESIGN_DOC``."""
    # Arrange
    osm_filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
    category = "retrofit design"

    # Act
    core_file = get_core_file_type(osm_filename, evidence_category=category)

    # Assert
    assert core_file == CoreFiles.RETROFIT_DESIGN_DOC
def test_core_file_for_ioe_substring_returns_improvement_option_evaluation() -> None:
    """The IOE title inside the filename yields ``IMPROVEMENT_OPTION_EVALUATION``."""
    # Arrange
    ioe_filename = "6000802 - NG4 4HD - Improvement Option Evaluation.pdf"

    # Act
    core_file = get_core_file_type(ioe_filename)

    # Assert
    assert core_file == CoreFiles.IMPROVEMENT_OPTION_EVALUATION
def test_core_file_for_mtip_substring_returns_medium_term_improvement_plan() -> None:
    """The MTIP title inside the filename yields ``MEDIUM_TERM_IMPROVEMENT_PLAN``."""
    # Arrange
    mtip_filename = "60800802 - NG4 4HD - Medium Term Improvement Plan IOE.pdf"

    # Act
    core_file = get_core_file_type(mtip_filename)

    # Assert
    assert core_file == CoreFiles.MEDIUM_TERM_IMPROVEMENT_PLAN
def test_core_file_for_osm_pattern_returns_retrofit_design_doc_without_evidence_category() -> (
    None
):
    """With no evidence category, the OSM filename pattern alone is enough."""
    # Arrange
    osm_filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"

    # Act
    core_file = get_core_file_type(osm_filename)

    # Assert
    assert core_file == CoreFiles.RETROFIT_DESIGN_DOC
def test_core_file_for_prefix_returns_photopack() -> None:
    """``Photopack_123456_V1.pdf`` is identified as ``CoreFiles.PHOTOPACK``."""
    # Arrange
    photopack_filename = "Photopack_123456_V1.pdf"

    # Act
    core_file = get_core_file_type(photopack_filename)

    # Assert
    assert core_file == CoreFiles.PHOTOPACK
def test_core_file_for_unknown_filename_returns_none() -> None:
    """A filename matching no rule is not classified."""
    # Arrange
    unrecognised_filename = "unknown_document_123.pdf"

    # Act
    core_file = get_core_file_type(unrecognised_filename)

    # Assert
    assert core_file is None
def test_core_file_for_osm_fallback_does_not_fire_when_evidence_category_present() -> (
    None
):
    """An unrelated evidence category suppresses the OSM filename fallback."""
    # Arrange — OSM+DR-N-A filename but evidence_category is something other than retrofit design
    osm_filename = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
    unrelated_category = "some other category"

    # Act
    core_file = get_core_file_type(osm_filename, evidence_category=unrelated_category)

    # Assert
    assert core_file is None

View file

@ -0,0 +1,117 @@
# pyright: reportPrivateUsage=false
from typing import Optional
from backend.pashub_fetcher.core_files import CoreFiles
from backend.pashub_fetcher.evidence_file_data import EvidenceFileData
from backend.pashub_fetcher.pashub_client import PashubClient
def make_client() -> PashubClient:
    """Build a ``PashubClient`` with a placeholder token for use in tests."""
    client = PashubClient(token="test-token")
    return client
def make_file(
    file_name: str = "unknown.pdf",
    evidence_category: Optional[str] = None,
    created_utc: str = "2024-01-01T00:00:00",
) -> EvidenceFileData:
    """Build an ``EvidenceFileData`` test fixture.

    Only the name, evidence category and creation timestamp vary between
    tests; the id, size and extension are fixed placeholder values.
    """
    evidence_file = EvidenceFileData(
        file_id="id-1",
        file_name=file_name,
        created_utc=created_utc,
        file_size=1024,
        file_extension="pdf",
        evidence_category=evidence_category,
    )
    return evidence_file
# ---------------------------------------------------------------------------
# _select_latest_core_files
# ---------------------------------------------------------------------------
def test_select_latest_core_files_returns_single_retrofit_design_doc() -> None:
    """A lone retrofit design file is returned under ``RETROFIT_DESIGN_DOC``."""
    # Arrange
    client = make_client()
    expected_name = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
    only_file = make_file(
        file_name=expected_name,
        evidence_category="retrofit design",
        created_utc="2024-06-01T00:00:00",
    )

    # Act
    selected = client._select_latest_core_files([only_file])

    # Assert
    assert selected[CoreFiles.RETROFIT_DESIGN_DOC].file_name == expected_name
def test_select_latest_core_files_osm_candidate_wins_over_non_osm() -> None:
    """An ``-OSM-`` named design doc is preferred over a newer non-OSM one."""
    # Arrange - the non-OSM file is newer but should lose to the OSM file
    client = make_client()
    osm_name = "2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf"
    older_osm_file = make_file(
        file_name=osm_name,
        evidence_category="retrofit design",
        created_utc="2024-01-01T00:00:00",
    )
    newer_non_osm_file = make_file(
        file_name="Retrofit Design Doc non-osm variant.pdf",
        evidence_category="retrofit design",
        created_utc="2024-06-01T00:00:00",
    )

    # Act
    selected = client._select_latest_core_files([older_osm_file, newer_non_osm_file])

    # Assert
    assert selected[CoreFiles.RETROFIT_DESIGN_DOC].file_name == osm_name
def test_select_latest_core_files_picks_latest_when_both_candidates_have_osm() -> None:
    """Between two ``-OSM-`` design docs, the more recently created one wins."""
    # Arrange
    client = make_client()
    newer_name = "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf"
    older_osm_file = make_file(
        file_name="2512-OSM-H21M900-XX-DR-N-A_Lord Nelson Street 018.pdf",
        evidence_category="retrofit design",
        created_utc="2024-01-01T00:00:00",
    )
    newer_osm_file = make_file(
        file_name=newer_name,
        evidence_category="retrofit design",
        created_utc="2024-06-01T00:00:00",
    )

    # Act
    selected = client._select_latest_core_files([older_osm_file, newer_osm_file])

    # Assert
    assert selected[CoreFiles.RETROFIT_DESIGN_DOC].file_name == newer_name
def test_select_latest_core_files_falls_back_to_latest_when_no_osm_candidates() -> None:
    """With no ``-OSM-`` named design docs, the most recent file is chosen."""
    # Arrange
    client = make_client()
    newer_name = "retrofit_design_v2.pdf"
    older_file = make_file(
        file_name="retrofit_design_v1.pdf",
        evidence_category="retrofit design",
        created_utc="2024-01-01T00:00:00",
    )
    newer_file = make_file(
        file_name=newer_name,
        evidence_category="retrofit design",
        created_utc="2024-06-01T00:00:00",
    )

    # Act
    selected = client._select_latest_core_files([older_file, newer_file])

    # Assert
    assert selected[CoreFiles.RETROFIT_DESIGN_DOC].file_name == newer_name

View file

@ -3,6 +3,27 @@ pythonpath = .
log_cli = true
log_cli_level = INFO
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests
testpaths =
recommendations/tests
backend/tests
backend/address2UPRN/tests
backend/app/db/functions/tests
backend/categorisation/tests
backend/condition/tests
backend/documents_parser/tests
backend/ecmk_fetcher/tests
backend/export/tests
backend/magic_plan/tests
backend/onboarders/tests
backend/pashub_fetcher/tests
datatypes/epc/domain/tests
datatypes/epc/schema/tests
datatypes/epc/surveys/tests
datatypes/magicplan/api/tests
datatypes/magicplan/domain/tests
etl/epc/tests
etl/epc_clean/tests
etl/hubspot/tests
etl/spatial/tests
markers =
integration: mark a test as an integration test