From 383b8b0c375c3b1d6e0971af3dfa64f196e7b0a3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 10:48:17 +0000 Subject: [PATCH 1/9] =?UTF-8?q?SharePoint=20renamer=20build=5Fcanonical=5F?= =?UTF-8?q?filename=20behaviour=20verified=20by=20tests=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytest.ini | 2 + scripts/tests/__init__.py | 0 .../tests/test_build_canonical_filename.py | 106 ++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 scripts/tests/__init__.py create mode 100644 scripts/tests/test_build_canonical_filename.py diff --git a/pytest.ini b/pytest.ini index 2bcd6178..a6eba3be 100644 --- a/pytest.ini +++ b/pytest.ini @@ -25,5 +25,7 @@ testpaths = etl/epc_clean/tests etl/hubspot/tests etl/spatial/tests + scripts/tests + ; tests/ markers = integration: mark a test as an integration test diff --git a/scripts/tests/__init__.py b/scripts/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/tests/test_build_canonical_filename.py b/scripts/tests/test_build_canonical_filename.py new file mode 100644 index 00000000..3890477c --- /dev/null +++ b/scripts/tests/test_build_canonical_filename.py @@ -0,0 +1,106 @@ +# scripts/tests/test_build_canonical_filename.py +from scripts.rename_sharepoint_files import build_canonical_filename + +UPRN = "10093456789" +ADDRESS = "1 High Street, Anytown" +POSTCODE = "SW1A 1AA" +STREET = "1 High Street" + + +def test_already_renamed_returns_none() -> None: + # Arrange + original = f"{UPRN}_High Street SW1A 1AA_EPC Report.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result is None + + +def test_address_postcode_prefix_stripped() -> None: + # Arrange + original = f"{ADDRESS} {POSTCODE} - EPC Report.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf" + + +def test_address_only_prefix_stripped() -> None: + # Arrange + original = f"{ADDRESS} - EPC Report.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf" + + +def test_street_postcode_prefix_stripped() -> None: + # Arrange + original = f"{STREET} {POSTCODE} - EPC Report.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf" + + +def test_street_only_prefix_stripped() -> None: + # Arrange + original = f"{STREET} - EPC Report.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf" + + +def test_dash_separator_removed_after_prefix_strip() -> None: + # Arrange – " - " separator between prefix and doc name + original = f"{STREET} {POSTCODE} - Floor Plan.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_Floor Plan.pdf" + + +def test_underscore_separator_removed_after_prefix_strip() -> None: + # Arrange – " _ " separator between prefix and doc name + original = f"{STREET} {POSTCODE} _ Floor Plan.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_Floor Plan.pdf" + + +def test_no_recognised_prefix_preserves_stem() -> None: + # Arrange + original = "Completely Different Name.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}_Completely Different Name.pdf" + + +def test_no_doc_name_after_strip_omits_trailing_separator() -> None: + # Arrange – stem is exactly the address prefix with no trailing doc name + original = f"{STREET} {POSTCODE}.pdf" + + # Act + result = build_canonical_filename(UPRN, ADDRESS, POSTCODE, original) + + # Assert + assert result == f"{UPRN}_{STREET} {POSTCODE}.pdf" From b3e9d858d9dbd2c20b83390f93366a4b013cf6b6 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 10:49:01 +0000 Subject: [PATCH 2/9] =?UTF-8?q?SharePoint=20renamer=20Lambda=20handler=20s?= =?UTF-8?q?tub=20created=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- applications/sharepoint_renamer/__init__.py | 0 applications/sharepoint_renamer/handler/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 applications/sharepoint_renamer/__init__.py create mode 100644 applications/sharepoint_renamer/handler/__init__.py diff --git a/applications/sharepoint_renamer/__init__.py b/applications/sharepoint_renamer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/applications/sharepoint_renamer/handler/__init__.py b/applications/sharepoint_renamer/handler/__init__.py new file mode 100644 index 00000000..e69de29b From 8cb0e986e65a669d1b384f93848bc8db6d0baa7e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 10:52:52 +0000 Subject: [PATCH 3/9] =?UTF-8?q?Deploy=20SharePoint=20renamer=20as=20Lambda?= =?UTF-8?q?=20with=20SQS=20trigger=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/deploy_terraform.yml | 39 +++++++++++++ .../sharepoint_renamer/handler/Dockerfile | 16 ++++++ .../sharepoint_renamer/handler/handler.py | 7 +++ .../handler/requirements.txt | 2 + .../lambda/sharepoint_renamer/main.tf | 22 ++++++++ .../lambda/sharepoint_renamer/outputs.tf | 9 +++ .../lambda/sharepoint_renamer/provider.tf | 20 +++++++ .../lambda/sharepoint_renamer/variables.tf | 55 +++++++++++++++++++ deployment/terraform/shared/main.tf | 14 +++++ scripts/__init__.py | 0 scripts/rename_sharepoint_files.py | 2 +- 11 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 applications/sharepoint_renamer/handler/Dockerfile create mode 100644 applications/sharepoint_renamer/handler/handler.py create mode 100644 applications/sharepoint_renamer/handler/requirements.txt create mode 100644 deployment/terraform/lambda/sharepoint_renamer/main.tf create mode 100644 deployment/terraform/lambda/sharepoint_renamer/outputs.tf create mode 100644 deployment/terraform/lambda/sharepoint_renamer/provider.tf create mode 100644 deployment/terraform/lambda/sharepoint_renamer/variables.tf create mode 100644 scripts/__init__.py diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 338ef11d..0780c580 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -495,6 +495,45 @@ jobs: TF_VAR_pashub_coordination_password: ${{ secrets.PASHUB_COORDINATION_PASSWORD }} + # ============================================================ + # Build SharePoint Renamer image and Push + # ============================================================ + sharepoint_renamer_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: sharepoint-renamer-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: applications/sharepoint_renamer/handler/Dockerfile + build_context: . + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + + # ============================================================ + # Deploy SharePoint Renamer Lambda + # ============================================================ + sharepoint_renamer_lambda: + needs: [sharepoint_renamer_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: sharepoint_renamer + lambda_path: deployment/terraform/lambda/sharepoint_renamer + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: sharepoint-renamer-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.sharepoint_renamer_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_sharepoint_client_id: ${{ secrets.SHAREPOINT_CLIENT_ID }} + TF_VAR_sharepoint_client_secret: ${{ secrets.SHAREPOINT_CLIENT_SECRET }} + TF_VAR_sharepoint_tenant_id: ${{ secrets.SHAREPOINT_TENANT_ID }} + TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID }} + + # ============================================================ # Deploy FastAPI Lambda # ============================================================ diff --git a/applications/sharepoint_renamer/handler/Dockerfile b/applications/sharepoint_renamer/handler/Dockerfile new file mode 100644 index 00000000..10c40e89 --- /dev/null +++ b/applications/sharepoint_renamer/handler/Dockerfile @@ -0,0 +1,16 @@ +FROM public.ecr.aws/lambda/python:3.11 + +WORKDIR /var/task + +COPY applications/sharepoint_renamer/handler/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/__init__.py backend/__init__.py +COPY backend/pashub_fetcher/ backend/pashub_fetcher/ +COPY applications/sharepoint_renamer/ applications/sharepoint_renamer/ +COPY scripts/__init__.py scripts/__init__.py +COPY scripts/rename_sharepoint_files.py scripts/rename_sharepoint_files.py +COPY scripts/sero_address_list.csv scripts/sero_address_list.csv + +CMD ["applications.sharepoint_renamer.handler.handler.handler"] diff --git a/applications/sharepoint_renamer/handler/handler.py b/applications/sharepoint_renamer/handler/handler.py new file mode 100644 index 00000000..850d1ae6 --- /dev/null +++ b/applications/sharepoint_renamer/handler/handler.py @@ -0,0 +1,7 @@ +from typing import Any + +from scripts.rename_sharepoint_files import main + + +def handler(event: dict[str, Any], context: Any) -> None: + main() diff --git a/applications/sharepoint_renamer/handler/requirements.txt b/applications/sharepoint_renamer/handler/requirements.txt new file mode 100644 index 00000000..94317b81 --- /dev/null +++ b/applications/sharepoint_renamer/handler/requirements.txt @@ -0,0 +1,2 @@ +msal +requests diff --git a/deployment/terraform/lambda/sharepoint_renamer/main.tf b/deployment/terraform/lambda/sharepoint_renamer/main.tf new file mode 100644 index 00000000..0c245061 --- /dev/null +++ b/deployment/terraform/lambda/sharepoint_renamer/main.tf @@ -0,0 +1,22 @@ +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "sharepoint_renamer" + stage = var.stage + + image_uri = local.image_uri + timeout = var.timeout + + reserved_concurrent_executions = var.reserved_concurrent_executions + + batch_size = var.batch_size + + environment = { + STAGE = var.stage + + SHAREPOINT_CLIENT_ID = var.sharepoint_client_id + SHAREPOINT_CLIENT_SECRET = var.sharepoint_client_secret + SHAREPOINT_TENANT_ID = var.sharepoint_tenant_id + SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID = var.social_housing_wave_3_sharepoint_id + } +} diff --git a/deployment/terraform/lambda/sharepoint_renamer/outputs.tf b/deployment/terraform/lambda/sharepoint_renamer/outputs.tf new file mode 100644 index 00000000..e71fac8b --- /dev/null +++ b/deployment/terraform/lambda/sharepoint_renamer/outputs.tf @@ -0,0 +1,9 @@ +output "sharepoint_renamer_queue_url" { + value = module.lambda.queue_url + description = "URL of the SharePoint Renamer SQS queue" +} + +output "sharepoint_renamer_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the SharePoint Renamer SQS queue" +} diff --git a/deployment/terraform/lambda/sharepoint_renamer/provider.tf b/deployment/terraform/lambda/sharepoint_renamer/provider.tf new file mode 100644 index 00000000..e6f8e32c --- /dev/null +++ b/deployment/terraform/lambda/sharepoint_renamer/provider.tf @@ -0,0 +1,20 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "sharepoint-renamer-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} + +provider "aws" { + region = "eu-west-2" +} diff --git a/deployment/terraform/lambda/sharepoint_renamer/variables.tf b/deployment/terraform/lambda/sharepoint_renamer/variables.tf new file mode 100644 index 00000000..97cca538 --- /dev/null +++ b/deployment/terraform/lambda/sharepoint_renamer/variables.tf @@ -0,0 +1,55 @@ +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "timeout" { + type = number + default = 900 + description = "Lambda timeout in seconds." +} + +variable "reserved_concurrent_executions" { + type = number + default = 1 + description = "Prevent parallel renames causing race conditions on SharePoint." +} + +variable "batch_size" { + type = number + default = 1 +} + +variable "sharepoint_client_id" { + type = string + sensitive = true +} + +variable "sharepoint_client_secret" { + type = string + sensitive = true +} + +variable "sharepoint_tenant_id" { + type = string + sensitive = true +} + +variable "social_housing_wave_3_sharepoint_id" { + type = string + sensitive = true +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} diff --git a/deployment/terraform/shared/main.tf b/deployment/terraform/shared/main.tf index 7ca116e7..3d6bbd39 100644 --- a/deployment/terraform/shared/main.tf +++ b/deployment/terraform/shared/main.tf @@ -844,3 +844,17 @@ module "audit_generator_registry" { stage = var.stage } +################################################ +# SharePoint Renamer – Lambda +################################################ +module "sharepoint_renamer_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "sharepoint-renamer-terraform-state" +} + +module "sharepoint_renamer_registry" { + source = "../modules/container_registry" + name = "sharepoint-renamer" + stage = var.stage +} + diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/rename_sharepoint_files.py b/scripts/rename_sharepoint_files.py index a7306d88..7ed126e3 100644 --- a/scripts/rename_sharepoint_files.py +++ b/scripts/rename_sharepoint_files.py @@ -17,7 +17,7 @@ from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites DRY_RUN: bool = False -CSV_PATH: str = "scripts/sero_address_list_test.csv" +CSV_PATH: str = "scripts/sero_address_list.csv" BASE_PATH = ( "Osmosis-ACD Projects/Sero-Clarion Housing/" From beb4e5d0d919744df6fb9b2263d8e579b3c53e93 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 11:01:51 +0000 Subject: [PATCH 4/9] Move SharePoint renamer logic from scripts/ into orchestrator and app-root handler --- applications/sharepoint_renamer/handler.py | 13 ++ .../sharepoint_renamer/handler/Dockerfile | 5 +- .../sharepoint_renamer/handler/handler.py | 7 - .../sharepoint_renamer_orchestrator.py | 113 +++++++++++++++ scripts/__init__.py | 0 scripts/rename_sharepoint_files.py | 137 ------------------ .../tests/test_build_canonical_filename.py | 2 +- 7 files changed, 129 insertions(+), 148 deletions(-) create mode 100644 applications/sharepoint_renamer/handler.py delete mode 100644 applications/sharepoint_renamer/handler/handler.py create mode 100644 orchestration/sharepoint_renamer_orchestrator.py delete mode 100644 scripts/__init__.py delete mode 100644 scripts/rename_sharepoint_files.py diff --git a/applications/sharepoint_renamer/handler.py b/applications/sharepoint_renamer/handler.py new file mode 100644 index 00000000..5a290878 --- /dev/null +++ b/applications/sharepoint_renamer/handler.py @@ -0,0 +1,13 @@ +from typing import Any + +from orchestration.sharepoint_renamer_orchestrator import SharepointRenamerOrchestrator +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient +from utils.sharepoint.domna_sites import DomnaSites + +CSV_PATH = "scripts/sero_address_list.csv" + + +def handler(event: dict[str, Any], context: Any) -> None: + sp_client = DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3) + orchestrator = SharepointRenamerOrchestrator(sp_client, CSV_PATH) + orchestrator.run() diff --git a/applications/sharepoint_renamer/handler/Dockerfile b/applications/sharepoint_renamer/handler/Dockerfile index 10c40e89..bb946cc2 100644 --- a/applications/sharepoint_renamer/handler/Dockerfile +++ b/applications/sharepoint_renamer/handler/Dockerfile @@ -8,9 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/__init__.py backend/__init__.py COPY backend/pashub_fetcher/ backend/pashub_fetcher/ +COPY orchestration/ orchestration/ COPY applications/sharepoint_renamer/ applications/sharepoint_renamer/ -COPY scripts/__init__.py scripts/__init__.py -COPY scripts/rename_sharepoint_files.py scripts/rename_sharepoint_files.py COPY scripts/sero_address_list.csv scripts/sero_address_list.csv -CMD ["applications.sharepoint_renamer.handler.handler.handler"] +CMD ["applications.sharepoint_renamer.handler.handler"] diff --git a/applications/sharepoint_renamer/handler/handler.py b/applications/sharepoint_renamer/handler/handler.py deleted file mode 100644 index 850d1ae6..00000000 --- a/applications/sharepoint_renamer/handler/handler.py +++ /dev/null @@ -1,7 +0,0 @@ -from typing import Any - -from scripts.rename_sharepoint_files import main - - -def handler(event: dict[str, Any], context: Any) -> None: - main() diff --git a/orchestration/sharepoint_renamer_orchestrator.py b/orchestration/sharepoint_renamer_orchestrator.py new file mode 100644 index 00000000..764776ae --- /dev/null +++ b/orchestration/sharepoint_renamer_orchestrator.py @@ -0,0 +1,113 @@ +import csv +import logging +import os +from typing import Optional + +from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + +BASE_PATH = ( + "Osmosis-ACD Projects/Sero-Clarion Housing/" + "Sero Project Documents/Property Folders" +) +ASSESSMENT_SUBFOLDER = "A. Assessment" + +logger = logging.getLogger(__name__) + + +def build_canonical_filename( + uprn: str, address: str, postcode: str, original_name: str +) -> Optional[str]: + """ + Returns the canonical filename, or None if the file is already renamed. + + Already-renamed: name starts with "{uprn}_". + Strips any existing address prefix (address+postcode first, then address alone) + before inserting the canonical prefix. + """ + if original_name.startswith(f"{uprn}_"): + return None + + stem, ext = os.path.splitext(original_name) + stem_lower = stem.lower() + + street = address.split(",")[0].strip() + prefixes = [ + f"{address} {postcode}", + address, + f"{street} {postcode}", + street, + ] + + doc_name = stem + for prefix in prefixes: + if stem_lower.startswith(prefix.lower()): + doc_name = stem[len(prefix) :] + break + + if doc_name.startswith(" - "): + doc_name = doc_name[3:] + elif doc_name.startswith(" _ "): + doc_name = doc_name[3:] + doc_name = doc_name.strip() + + street_post = f"{street} {postcode}" + if doc_name: + return f"{uprn}_{street_post}_{doc_name}{ext}" + return f"{uprn}_{street_post}{ext}" + + +class SharepointRenamerOrchestrator: + def __init__(self, sp_client: DomnaSharepointClient, csv_path: str) -> None: + self._sp_client = sp_client + self._csv_path = csv_path + + def run(self) -> None: + with open(self._csv_path, newline="", encoding="utf-8-sig") as f: + reader = csv.DictReader(f) + required = {"UPRN", "Address", "Postcode"} + if not reader.fieldnames or not required.issubset(set(reader.fieldnames)): + raise ValueError( + f"CSV missing required columns. Expected {required}, got {reader.fieldnames}" + ) + + for row in reader: + uprn = row["UPRN"].strip() + address = row["Address"].strip() + postcode = row["Postcode"].strip() + folder_path = ( + f"{BASE_PATH}/{address}, {postcode}" + f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}" + ) + self._process_folder(folder_path, uprn, address, postcode) + + def _process_folder( + self, folder_path: str, uprn: str, address: str, postcode: str + ) -> None: + try: + contents = self._sp_client.get_folders_in_path(folder_path) + except ValueError: + logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}") + return + + for item in contents.get("value", []): + if "folder" in item: + self._process_folder( + f"{folder_path}/{item['name']}", uprn, address, postcode + ) + elif "file" in item: + original_name: str = item["name"] + new_name = build_canonical_filename(uprn, address, postcode, original_name) + + if new_name is None: + continue + + try: + self._sp_client.rename_file(item["id"], new_name) + logger.info( + f'Renamed: "{original_name}" → "{new_name}" (UPRN: {uprn})' + ) + except Exception as e: + logger.error( + f'Failed to rename "{original_name}" → "{new_name}" (UPRN: {uprn}): {e}' + ) diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/scripts/rename_sharepoint_files.py b/scripts/rename_sharepoint_files.py deleted file mode 100644 index 7ed126e3..00000000 --- a/scripts/rename_sharepoint_files.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Rename files in SharePoint property folders to the canonical format: - {UPRN}_{Street} {Postcode}_{Document Name}.ext - -Set DRY_RUN = False when ready to commit. Run from repo root. -Required env vars: SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET, - SHAREPOINT_TENANT_ID, SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID -""" - -import csv -import os -from typing import Optional - -from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders -from utils.logger import setup_logger -from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient -from utils.sharepoint.domna_sites import DomnaSites - -DRY_RUN: bool = False -CSV_PATH: str = "scripts/sero_address_list.csv" - -BASE_PATH = ( - "Osmosis-ACD Projects/Sero-Clarion Housing/" - "Sero Project Documents/Property Folders" -) -ASSESSMENT_SUBFOLDER = "A. Assessment" - -logger = setup_logger() - - -def build_canonical_filename( - uprn: str, address: str, postcode: str, original_name: str -) -> Optional[str]: - """ - Returns the canonical filename, or None if the file is already renamed. - - Already-renamed: name starts with "{uprn}_". - Strips any existing address prefix (address+postcode first, then address alone) - before inserting the canonical prefix. - """ - if original_name.startswith(f"{uprn}_"): - return None - - stem, ext = os.path.splitext(original_name) - stem_lower = stem.lower() - - street = address.split(",")[0].strip() - prefixes = [ - f"{address} {postcode}", - address, - f"{street} {postcode}", - street, - ] - - doc_name = stem - for prefix in prefixes: - if stem_lower.startswith(prefix.lower()): - doc_name = stem[len(prefix) :] - break - - if doc_name.startswith(" - "): - doc_name = doc_name[3:] - elif doc_name.startswith(" _ "): - doc_name = doc_name[3:] - doc_name = doc_name.strip() - - street_post = f"{street} {postcode}" - if doc_name: - return f"{uprn}_{street_post}_{doc_name}{ext}" - return f"{uprn}_{street_post}{ext}" - - -def process_folder( - sp_client: DomnaSharepointClient, - folder_path: str, - uprn: str, - address: str, - postcode: str, -) -> None: - try: - contents = sp_client.get_folders_in_path(folder_path) - except ValueError: - logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}") - return - - for item in contents.get("value", []): - if "folder" in item: - process_folder( - sp_client, f"{folder_path}/{item['name']}", uprn, address, postcode - ) - elif "file" in item: - original_name: str = item["name"] - new_name = build_canonical_filename(uprn, address, postcode, original_name) - - if new_name is None: - continue - - if DRY_RUN: - logger.info( - f'[DRY RUN] Renaming: "{original_name}" → "{new_name}" (UPRN: {uprn})' - ) - else: - try: - sp_client.rename_file(item["id"], new_name) - logger.info( - f'Renamed: "{original_name}" → "{new_name}" (UPRN: {uprn})' - ) - except Exception as e: - logger.error( - f'Failed to rename "{original_name}" → "{new_name}" (UPRN: {uprn}): {e}' - ) - - -def main() -> None: - sp_client = DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3) - - with open(CSV_PATH, newline="", encoding="utf-8-sig") as f: - reader = csv.DictReader(f) - required = {"UPRN", "Address", "Postcode"} - if not reader.fieldnames or not required.issubset(set(reader.fieldnames)): - raise ValueError( - f"CSV missing required columns. Expected {required}, got {reader.fieldnames}" - ) - - for row in reader: - uprn = row["UPRN"].strip() - address = row["Address"].strip() - postcode = row["Postcode"].strip() - folder_path = ( - f"{BASE_PATH}/{address}, {postcode}" - f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}" - ) - process_folder(sp_client, folder_path, uprn, address, postcode) - - -if __name__ == "__main__": - main() diff --git a/scripts/tests/test_build_canonical_filename.py b/scripts/tests/test_build_canonical_filename.py index 3890477c..67d4fcae 100644 --- a/scripts/tests/test_build_canonical_filename.py +++ b/scripts/tests/test_build_canonical_filename.py @@ -1,5 +1,5 @@ # scripts/tests/test_build_canonical_filename.py -from scripts.rename_sharepoint_files import build_canonical_filename +from orchestration.sharepoint_renamer_orchestrator import build_canonical_filename UPRN = "10093456789" ADDRESS = "1 High Street, Anytown" From 38b9e6384446100d82420689861ceb90de34dbd1 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 11:02:48 +0000 Subject: [PATCH 5/9] revert pytest.ini --- pytest.ini | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index a6eba3be..2bcd6178 100644 --- a/pytest.ini +++ b/pytest.ini @@ -25,7 +25,5 @@ testpaths = etl/epc_clean/tests etl/hubspot/tests etl/spatial/tests - scripts/tests - ; tests/ markers = integration: mark a test as an integration test From 5c314e2914ae4fdf2e28d07a5605783348bf1e24 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 11:11:08 +0000 Subject: [PATCH 6/9] move tests out of scripts/ --- pytest.ini | 1 + scripts/tests/__init__.py | 0 .../orchestration}/test_build_canonical_filename.py | 0 3 files changed, 1 insertion(+) delete mode 100644 scripts/tests/__init__.py rename {scripts/tests => tests/orchestration}/test_build_canonical_filename.py (100%) diff --git a/pytest.ini b/pytest.ini index 2bcd6178..cb6af047 100644 --- a/pytest.ini +++ b/pytest.ini @@ -25,5 +25,6 @@ testpaths = etl/epc_clean/tests etl/hubspot/tests etl/spatial/tests + tests/ markers = integration: mark a test as an integration test diff --git a/scripts/tests/__init__.py b/scripts/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/scripts/tests/test_build_canonical_filename.py b/tests/orchestration/test_build_canonical_filename.py similarity index 100% rename from scripts/tests/test_build_canonical_filename.py rename to tests/orchestration/test_build_canonical_filename.py From 0fc81da4cf3108c6a0f267c47f9e5987055d1b08 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 11:14:09 +0000 Subject: [PATCH 7/9] move input files out of scripts/ --- applications/sharepoint_renamer/handler.py | 2 +- applications/sharepoint_renamer/handler/Dockerfile | 2 -- .../sharepoint_renamer}/sero_address_list.csv | 0 applications/sharepoint_renamer/sero_address_list_test.csv | 2 ++ 4 files changed, 3 insertions(+), 3 deletions(-) rename {scripts => applications/sharepoint_renamer}/sero_address_list.csv (100%) create mode 100644 applications/sharepoint_renamer/sero_address_list_test.csv diff --git a/applications/sharepoint_renamer/handler.py b/applications/sharepoint_renamer/handler.py index 5a290878..998458bc 100644 --- a/applications/sharepoint_renamer/handler.py +++ b/applications/sharepoint_renamer/handler.py @@ -4,7 +4,7 @@ from orchestration.sharepoint_renamer_orchestrator import SharepointRenamerOrche from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites -CSV_PATH = "scripts/sero_address_list.csv" +CSV_PATH = "applications/sharepoint_renamer/sero_address_list.csv" def handler(event: dict[str, Any], context: Any) -> None: diff --git a/applications/sharepoint_renamer/handler/Dockerfile b/applications/sharepoint_renamer/handler/Dockerfile index bb946cc2..a81294f9 100644 --- a/applications/sharepoint_renamer/handler/Dockerfile +++ b/applications/sharepoint_renamer/handler/Dockerfile @@ -10,6 +10,4 @@ COPY backend/__init__.py backend/__init__.py COPY backend/pashub_fetcher/ backend/pashub_fetcher/ COPY orchestration/ orchestration/ COPY applications/sharepoint_renamer/ applications/sharepoint_renamer/ -COPY scripts/sero_address_list.csv scripts/sero_address_list.csv - CMD ["applications.sharepoint_renamer.handler.handler"] diff --git a/scripts/sero_address_list.csv b/applications/sharepoint_renamer/sero_address_list.csv similarity index 100% rename from scripts/sero_address_list.csv rename to applications/sharepoint_renamer/sero_address_list.csv diff --git a/applications/sharepoint_renamer/sero_address_list_test.csv b/applications/sharepoint_renamer/sero_address_list_test.csv new file mode 100644 index 00000000..72b28047 --- /dev/null +++ b/applications/sharepoint_renamer/sero_address_list_test.csv @@ -0,0 +1,2 @@ +UPRN,Address,Postcode +U1014630,"118 Faringdon Avenue, Bromley",BR2 8BU \ No newline at end of file From a6050fc1c7b60d58be541a5355c5daa8d7d65257 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 12:04:33 +0000 Subject: [PATCH 8/9] remove tests/ from pytest.ini --- pytest.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index cb6af047..2bcd6178 100644 --- a/pytest.ini +++ b/pytest.ini @@ -25,6 +25,5 @@ testpaths = etl/epc_clean/tests etl/hubspot/tests etl/spatial/tests - tests/ markers = integration: mark a test as an integration test From b9cbea367db3057365e2c654f89645e1a8ff3c22 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 15 Jun 2026 12:21:32 +0000 Subject: [PATCH 9/9] correct import in test file --- tests/scripts/test_rename_sharepoint_files.py | 59 +++++++------------ 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/tests/scripts/test_rename_sharepoint_files.py b/tests/scripts/test_rename_sharepoint_files.py index 4525fe84..7b3e6587 100644 --- a/tests/scripts/test_rename_sharepoint_files.py +++ b/tests/scripts/test_rename_sharepoint_files.py @@ -1,10 +1,12 @@ from typing import Any -from unittest.mock import MagicMock, call, patch +from unittest.mock import MagicMock import pytest -import scripts.rename_sharepoint_files as module -from scripts.rename_sharepoint_files import build_canonical_filename, process_folder +from orchestration.sharepoint_renamer_orchestrator import ( + SharepointRenamerOrchestrator, + build_canonical_filename, +) def _make_file(name: str, item_id: str = "id-1") -> dict[str, Any]: @@ -19,6 +21,12 @@ def _make_package(name: str) -> dict[str, Any]: return {"name": name, "package": {}} +def _make_orchestrator(sp: MagicMock) -> SharepointRenamerOrchestrator: + orchestrator = SharepointRenamerOrchestrator.__new__(SharepointRenamerOrchestrator) + orchestrator._sp_client = sp + return orchestrator + + # --------------------------------------------------------------------------- # build_canonical_filename # --------------------------------------------------------------------------- @@ -39,7 +47,7 @@ def test_no_prefix_still_canonical() -> None: # --------------------------------------------------------------------------- -# process_folder — files only at root level +# _process_folder — files only at root level # --------------------------------------------------------------------------- @@ -52,8 +60,7 @@ def test_renames_top_level_files(caplog: pytest.LogCaptureFixture) -> None: ] } - with patch.object(module, "DRY_RUN", False): - process_folder(sp, "some/path", "100", "1 High St", "AB1 2CD") + _make_orchestrator(sp)._process_folder("some/path", "100", "1 High St", "AB1 2CD") assert sp.rename_file.call_count == 2 sp.rename_file.assert_any_call("id-1", "100_1 High St AB1 2CD_Survey.pdf") @@ -61,7 +68,7 @@ def test_renames_top_level_files(caplog: pytest.LogCaptureFixture) -> None: # --------------------------------------------------------------------------- -# process_folder — recursive two-level hierarchy +# _process_folder — recursive two-level hierarchy # --------------------------------------------------------------------------- @@ -84,8 +91,7 @@ def test_recurses_into_subfolders_and_renames_all_files() -> None: root_contents if path == "base/path" else suba_contents ) - with patch.object(module, "DRY_RUN", False): - process_folder(sp, "base/path", "200", "2 Main Rd", "XY9 8ZW") + _make_orchestrator(sp)._process_folder("base/path", "200", "2 Main Rd", "XY9 8ZW") assert sp.rename_file.call_count == 2 sp.rename_file.assert_any_call("root-file", "200_2 Main Rd XY9 8ZW_Root.pdf") @@ -95,25 +101,22 @@ def test_recurses_into_subfolders_and_renames_all_files() -> None: # --------------------------------------------------------------------------- -# process_folder — non-file, non-folder items are skipped +# _process_folder — non-file, non-folder items are skipped # --------------------------------------------------------------------------- def test_ignores_package_items() -> None: sp = MagicMock() - sp.get_folders_in_path.return_value = { - "value": [_make_package("Notebook")] - } + sp.get_folders_in_path.return_value = {"value": [_make_package("Notebook")]} - with patch.object(module, "DRY_RUN", False): - process_folder(sp, "some/path", "300", "3 Oak Ave", "ZZ1 1ZZ") + _make_orchestrator(sp)._process_folder("some/path", "300", "3 Oak Ave", "ZZ1 1ZZ") sp.rename_file.assert_not_called() assert sp.get_folders_in_path.call_count == 1 # --------------------------------------------------------------------------- -# process_folder — missing folder +# _process_folder — missing folder # --------------------------------------------------------------------------- @@ -121,31 +124,14 @@ def test_missing_folder_logs_warning_and_returns(caplog: pytest.LogCaptureFixtur sp = MagicMock() sp.get_folders_in_path.side_effect = ValueError("not found") - with patch.object(module, "DRY_RUN", False): - process_folder(sp, "missing/path", "400", "4 Elm St", "AA2 2BB") + _make_orchestrator(sp)._process_folder("missing/path", "400", "4 Elm St", "AA2 2BB") sp.rename_file.assert_not_called() assert any("Missing folder" in r.message and "400" in r.message for r in caplog.records) # --------------------------------------------------------------------------- -# process_folder — dry run -# --------------------------------------------------------------------------- - - -def test_dry_run_logs_without_renaming(caplog: pytest.LogCaptureFixture) -> None: - sp = MagicMock() - sp.get_folders_in_path.return_value = {"value": [_make_file("Doc.pdf", "id-x")]} - - with patch.object(module, "DRY_RUN", True): - process_folder(sp, "some/path", "500", "5 Pine Ln", "BB3 3CC") - - sp.rename_file.assert_not_called() - assert any("[DRY RUN]" in r.message for r in caplog.records) - - -# --------------------------------------------------------------------------- -# process_folder — already-canonical files are skipped +# _process_folder — already-canonical files are skipped # --------------------------------------------------------------------------- @@ -155,7 +141,6 @@ def test_skips_already_canonical_files() -> None: "value": [_make_file("500_Pine Ln BB3 3CC_Doc.pdf", "id-y")] } - with patch.object(module, "DRY_RUN", False): - process_folder(sp, "some/path", "500", "5 Pine Ln", "BB3 3CC") + _make_orchestrator(sp)._process_folder("some/path", "500", "5 Pine Ln", "BB3 3CC") sp.rename_file.assert_not_called()