Merge pull request #1230 from Hestia-Homes/feature/deploy-sharepoint-renamer

Deploy sharepoint renamer
This commit is contained in:
Daniel Roth 2026-06-15 13:45:52 +01:00 committed by GitHub
commit 17420408e4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 430 additions and 174 deletions

View file

@ -495,6 +495,45 @@ jobs:
TF_VAR_pashub_coordination_password: ${{ secrets.PASHUB_COORDINATION_PASSWORD }}
# ============================================================
# Build SharePoint Renamer image and Push
# ============================================================
# Builds the SharePoint Renamer container image with the reusable
# _build_image workflow and pushes it to the stage-specific ECR repository.
sharepoint_renamer_image:
  # NOTE(review): presumably shared_terraform is what creates the
  # sharepoint-renamer-<stage> repository this job pushes to — confirm.
  needs: [determine_stage, shared_terraform]
  uses: ./.github/workflows/_build_image.yml
  with:
    ecr_repo: sharepoint-renamer-${{ needs.determine_stage.outputs.stage }}
    dockerfile_path: applications/sharepoint_renamer/handler/Dockerfile
    # The Dockerfile copies top-level directories (utils/, orchestration/, ...),
    # so the build context has to be the repository root.
    build_context: .
  secrets:
    AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
    AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
# ============================================================
# Deploy SharePoint Renamer Lambda
# ============================================================
# Deploys the SharePoint Renamer Lambda from
# deployment/terraform/lambda/sharepoint_renamer via the reusable
# _deploy_lambda workflow, once the image job has finished.
sharepoint_renamer_lambda:
  needs: [sharepoint_renamer_image, determine_stage]
  uses: ./.github/workflows/_deploy_lambda.yml
  with:
    lambda_name: sharepoint_renamer
    lambda_path: deployment/terraform/lambda/sharepoint_renamer
    stage: ${{ needs.determine_stage.outputs.stage }}
    ecr_repo: sharepoint-renamer-${{ needs.determine_stage.outputs.stage }}
    # Digest of the image built by sharepoint_renamer_image, so the Lambda is
    # pinned to exactly that build.
    image_digest: ${{ needs.sharepoint_renamer_image.outputs.image_digest }}
    terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
  secrets:
    AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
    AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
    # These names match the Terraform variables sharepoint_client_id,
    # sharepoint_client_secret, sharepoint_tenant_id and
    # social_housing_wave_3_sharepoint_id of the Lambda stack.
    # NOTE(review): presumably _deploy_lambda.yml exports them as environment
    # variables for Terraform to pick up — confirm.
    TF_VAR_sharepoint_client_id: ${{ secrets.SHAREPOINT_CLIENT_ID }}
    TF_VAR_sharepoint_client_secret: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
    TF_VAR_sharepoint_tenant_id: ${{ secrets.SHAREPOINT_TENANT_ID }}
    TF_VAR_social_housing_wave_3_sharepoint_id: ${{ secrets.SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID }}
# ============================================================
# Deploy FastAPI Lambda
# ============================================================

View file

@ -0,0 +1,13 @@
from typing import Any
from orchestration.sharepoint_renamer_orchestrator import SharepointRenamerOrchestrator
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
from utils.sharepoint.domna_sites import DomnaSites
# Relative path of the bundled property list. The orchestrator opens it as-is,
# so it is resolved against the process working directory.
CSV_PATH = "applications/sharepoint_renamer/sero_address_list.csv"


def handler(event: dict[str, Any], context: Any) -> None:
    """Lambda entry point: rename files for every property in ``CSV_PATH``.

    ``event`` and ``context`` are accepted to satisfy the handler signature
    but are not read; each invocation processes the whole CSV against the
    Social Housing Wave 3 SharePoint site.
    """
    SharepointRenamerOrchestrator(
        DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3),
        CSV_PATH,
    ).run()

View file

@ -0,0 +1,13 @@
# Container image for the SharePoint Renamer Lambda, built on the AWS Lambda
# Python 3.11 base image.
FROM public.ecr.aws/lambda/python:3.11
WORKDIR /var/task
# Dependencies are installed before the source is copied so this layer can be
# reused from cache when only application code changes.
COPY applications/sharepoint_renamer/handler/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Source paths below are relative to the build context (the repository root).
COPY utils/ utils/
# Only the parts of backend/ that are copied here end up in the image:
# the package marker and pashub_fetcher/.
COPY backend/__init__.py backend/__init__.py
COPY backend/pashub_fetcher/ backend/pashub_fetcher/
COPY orchestration/ orchestration/
# Also brings in the CSV files stored under applications/sharepoint_renamer/.
COPY applications/sharepoint_renamer/ applications/sharepoint_renamer/
# Lambda handler in "<module path>.<function>" form.
# NOTE(review): this resolves to function `handler` in module
# applications.sharepoint_renamer.handler. This Dockerfile itself lives in
# applications/sharepoint_renamer/handler/, so confirm the function is
# importable from that package and not from a nested handler.py module.
CMD ["applications.sharepoint_renamer.handler.handler"]

View file

@ -0,0 +1,2 @@
msal
requests

View file

@ -0,0 +1,2 @@
UPRN,Address,Postcode
U1014630,"118 Faringdon Avenue, Bromley",BR2 8BU
1 UPRN Address Postcode
2 U1014630 118 Faringdon Avenue, Bromley BR2 8BU

View file

@ -0,0 +1,22 @@
# Deploys the SharePoint Renamer Lambda through the shared lambda_with_sqs
# module.
module "lambda" {
  source = "../../modules/lambda_with_sqs"

  name  = "sharepoint_renamer"
  stage = var.stage

  # Digest-pinned image reference assembled in locals from the ECR repository
  # URL and the image digest.
  image_uri = local.image_uri

  timeout                        = var.timeout
  reserved_concurrent_executions = var.reserved_concurrent_executions
  batch_size                     = var.batch_size

  # SharePoint credentials and site id are handed to the function as
  # environment variables.
  environment = {
    STAGE                               = var.stage
    SHAREPOINT_CLIENT_ID                = var.sharepoint_client_id
    SHAREPOINT_CLIENT_SECRET            = var.sharepoint_client_secret
    SHAREPOINT_TENANT_ID                = var.sharepoint_tenant_id
    SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID = var.social_housing_wave_3_sharepoint_id
  }
}

View file

@ -0,0 +1,9 @@
# Re-exports the queue_url output of the lambda module.
output "sharepoint_renamer_queue_url" {
  value       = module.lambda.queue_url
  description = "URL of the SharePoint Renamer SQS queue"
}
# Re-exports the queue_arn output of the lambda module.
output "sharepoint_renamer_queue_arn" {
  value       = module.lambda.queue_arn
  description = "ARN of the SharePoint Renamer SQS queue"
}

View file

@ -0,0 +1,20 @@
# Terraform settings for the SharePoint Renamer Lambda stack: provider
# constraints, remote state location and minimum Terraform version.
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.0"
    }
  }

  # Remote state is kept in a dedicated S3 bucket.
  # NOTE(review): the state key is a fixed "terraform.tfstate" while this stack
  # takes a `stage` variable. Confirm stages are separated some other way
  # (workspaces, -backend-config, ...); otherwise every stage would share one
  # state file.
  backend "s3" {
    bucket = "sharepoint-renamer-terraform-state"
    key    = "terraform.tfstate"
    region = "eu-west-2"
  }

  required_version = ">= 1.2.0"
}
# AWS provider for this stack; same region as the S3 state backend.
provider "aws" {
  region = "eu-west-2"
}

View file

@ -0,0 +1,55 @@
# Inputs of the SharePoint Renamer Lambda stack.

variable "stage" {
  description = "Deployment stage (e.g. dev, prod)"
  type        = string
}

variable "ecr_repo_url" {
  type        = string
  description = "ECR repository URL (no tag, no digest)"
}

variable "image_digest" {
  type        = string
  description = "Image digest (sha256:...)"
}

variable "timeout" {
  type        = number
  default     = 900
  description = "Lambda timeout in seconds."
}

variable "reserved_concurrent_executions" {
  type        = number
  default     = 1
  description = "Prevent parallel renames causing race conditions on SharePoint."
}

variable "batch_size" {
  type        = number
  default     = 1
  description = "Passed to the lambda_with_sqs module as batch_size."
}

variable "sharepoint_client_id" {
  type        = string
  sensitive   = true
  description = "Set on the Lambda as the SHAREPOINT_CLIENT_ID environment variable."
}

variable "sharepoint_client_secret" {
  type        = string
  sensitive   = true
  description = "Set on the Lambda as the SHAREPOINT_CLIENT_SECRET environment variable."
}

variable "sharepoint_tenant_id" {
  type        = string
  sensitive   = true
  description = "Set on the Lambda as the SHAREPOINT_TENANT_ID environment variable."
}

variable "social_housing_wave_3_sharepoint_id" {
  type        = string
  sensitive   = true
  description = "Set on the Lambda as the SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID environment variable."
}

locals {
  # Digest-pinned image reference: <repository url>@<sha256 digest>.
  image_uri = "${var.ecr_repo_url}@${var.image_digest}"
}

View file

@ -844,3 +844,17 @@ module "audit_generator_registry" {
stage = var.stage
}
################################################
# SharePoint Renamer Lambda
################################################
# S3 bucket that holds the SharePoint Renamer stack's Terraform state.
# The bucket name is the one referenced by that stack's S3 backend block, so
# the two must be changed together.
module "sharepoint_renamer_state_bucket" {
  source      = "../modules/tf_state_bucket"
  bucket_name = "sharepoint-renamer-terraform-state"
}
# Container registry for the SharePoint Renamer image.
# NOTE(review): the deploy workflow pushes to "sharepoint-renamer-<stage>";
# presumably the module builds the repository name from name + stage — confirm.
module "sharepoint_renamer_registry" {
  source = "../modules/container_registry"
  name   = "sharepoint-renamer"
  stage  = var.stage
}

View file

@ -0,0 +1,113 @@
import csv
import logging
import os
from typing import Optional
from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
# SharePoint path under which the per-property folders live; for each CSV row
# "<address>, <postcode>" is appended to it to locate the property folder.
BASE_PATH = (
    "Osmosis-ACD Projects/Sero-Clarion Housing/"
    "Sero Project Documents/Property Folders"
)
# Last path segment of the folder that is scanned for each property.
ASSESSMENT_SUBFOLDER = "A. Assessment"
logger = logging.getLogger(__name__)
def build_canonical_filename(
    uprn: str, address: str, postcode: str, original_name: str
) -> Optional[str]:
    """Build the canonical ``{uprn}_{street} {postcode}_{doc name}{ext}`` name.

    ``street`` is the part of ``address`` before the first comma. An existing
    address prefix on the file name is removed (case-insensitively) before the
    canonical prefix is added. A prefix only counts when it ends on a word
    boundary, so street "1 High St" is not stripped out of "1 High Street ...".

    Args:
        uprn: Property reference placed at the front of the new name.
        address: Full address, e.g. "1 High Street, Anytown".
        postcode: Postcode placed after the street in the new name.
        original_name: Current file name including extension.

    Returns:
        The canonical file name, or ``None`` when ``original_name`` already
        starts with ``"{uprn}_"`` and is therefore treated as renamed.
    """
    if original_name.startswith(f"{uprn}_"):
        return None

    stem, ext = os.path.splitext(original_name)
    street = address.split(",")[0].strip()

    # Most specific first, so the longest matching prefix is the one removed.
    # The ", " variants mirror the "<address>, <postcode>" form used for the
    # property folder names.
    prefixes = [
        f"{address}, {postcode}",
        f"{address} {postcode}",
        address,
        f"{street}, {postcode}",
        f"{street} {postcode}",
        street,
    ]

    doc_name = stem
    for prefix in prefixes:
        if not prefix:
            continue
        # Compare and slice on the same span of the original stem so the cut
        # position never depends on the length of a lowercased copy.
        head, tail = stem[: len(prefix)], stem[len(prefix) :]
        if head.lower() != prefix.lower():
            continue
        # Require a word boundary: reject matches that stop mid-word/number.
        if tail and tail[0].isalnum():
            continue
        doc_name = tail
        break

    # Drop the separator that conventionally follows the address prefix.
    if doc_name.startswith((" - ", " _ ")):
        doc_name = doc_name[3:]
    doc_name = doc_name.strip()

    street_post = f"{street} {postcode}"
    if doc_name:
        return f"{uprn}_{street_post}_{doc_name}{ext}"
    return f"{uprn}_{street_post}{ext}"
class SharepointRenamerOrchestrator:
    """Renames files in each listed property's assessment folder.

    Properties come from a CSV with ``UPRN``, ``Address`` and ``Postcode``
    columns. Every file found under a property's assessment folder, including
    nested folders, is renamed to the name produced by
    ``build_canonical_filename``.
    """

    def __init__(self, sp_client: DomnaSharepointClient, csv_path: str) -> None:
        """Store the collaborators; no I/O happens until ``run`` is called.

        Args:
            sp_client: Client used to list folder contents and rename files.
            csv_path: Path of the property CSV.
        """
        self._sp_client = sp_client
        self._csv_path = csv_path

    def run(self) -> None:
        """Process every property row of the CSV.

        Rows with a blank or missing UPRN, address or postcode are skipped
        with a warning instead of producing malformed paths and file names.

        Raises:
            ValueError: If the CSV header lacks a required column.
        """
        with open(self._csv_path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            required = {"UPRN", "Address", "Postcode"}
            if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
                raise ValueError(
                    f"CSV missing required columns. Expected {required}, got {reader.fieldnames}"
                )
            for row in reader:
                # DictReader fills fields missing from a short row with None.
                uprn = (row["UPRN"] or "").strip()
                address = (row["Address"] or "").strip()
                postcode = (row["Postcode"] or "").strip()
                if not (uprn and address and postcode):
                    logger.warning(
                        f"Skipping CSV line {reader.line_num}: "
                        "UPRN, Address and Postcode are all required"
                    )
                    continue
                folder_path = (
                    f"{BASE_PATH}/{address}, {postcode}"
                    f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}"
                )
                self._process_folder(folder_path, uprn, address, postcode)

    def _process_folder(
        self, folder_path: str, uprn: str, address: str, postcode: str
    ) -> None:
        """Rename the files in ``folder_path`` and, recursively, its subfolders.

        A ``ValueError`` from the folder listing is treated as "folder missing"
        and logged as a warning. A failed rename is logged and does not stop
        the remaining items. Items that are neither file nor folder are ignored.
        """
        try:
            contents = self._sp_client.get_folders_in_path(folder_path)
        except ValueError:
            logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}")
            return

        for item in contents.get("value", []):
            if "folder" in item:
                self._process_folder(
                    f"{folder_path}/{item['name']}", uprn, address, postcode
                )
            elif "file" in item:
                original_name: str = item["name"]
                new_name = build_canonical_filename(uprn, address, postcode, original_name)
                if new_name is None:
                    # Already in canonical form.
                    continue
                try:
                    self._sp_client.rename_file(item["id"], new_name)
                    logger.info(
                        f'Renamed: "{original_name}" -> "{new_name}" (UPRN: {uprn})'
                    )
                except Exception as e:
                    logger.error(
                        f'Failed to rename "{original_name}" -> "{new_name}" (UPRN: {uprn}): {e}'
                    )

View file

@ -1,137 +0,0 @@
"""
Rename files in SharePoint property folders to the canonical format:
{UPRN}_{Street} {Postcode}_{Document Name}.ext
Set DRY_RUN = False when ready to commit. Run from repo root.
Required env vars: SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET,
SHAREPOINT_TENANT_ID, SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID
"""
import csv
import os
from typing import Optional
from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders
from utils.logger import setup_logger
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
from utils.sharepoint.domna_sites import DomnaSites
# When True, process_folder only logs the rename it would perform.
DRY_RUN: bool = False
# Relative path of the property CSV; the script is meant to be run from the
# repository root.
CSV_PATH: str = "scripts/sero_address_list_test.csv"
# SharePoint path under which the per-property folders live.
BASE_PATH = (
    "Osmosis-ACD Projects/Sero-Clarion Housing/"
    "Sero Project Documents/Property Folders"
)
# Last path segment of the folder that is scanned for each property.
ASSESSMENT_SUBFOLDER = "A. Assessment"
logger = setup_logger()
def build_canonical_filename(
    uprn: str, address: str, postcode: str, original_name: str
) -> Optional[str]:
    """Build the canonical ``{uprn}_{street} {postcode}_{doc name}{ext}`` name.

    ``street`` is the part of ``address`` before the first comma. An existing
    address prefix on the file name is removed (case-insensitively) before the
    canonical prefix is added. A prefix only counts when it ends on a word
    boundary, so street "1 High St" is not stripped out of "1 High Street ...".

    Args:
        uprn: Property reference placed at the front of the new name.
        address: Full address, e.g. "1 High Street, Anytown".
        postcode: Postcode placed after the street in the new name.
        original_name: Current file name including extension.

    Returns:
        The canonical file name, or ``None`` when ``original_name`` already
        starts with ``"{uprn}_"`` and is therefore treated as renamed.
    """
    if original_name.startswith(f"{uprn}_"):
        return None

    stem, ext = os.path.splitext(original_name)
    street = address.split(",")[0].strip()

    # Most specific first, so the longest matching prefix is the one removed.
    # The ", " variants mirror the "<address>, <postcode>" form used for the
    # property folder names.
    prefixes = [
        f"{address}, {postcode}",
        f"{address} {postcode}",
        address,
        f"{street}, {postcode}",
        f"{street} {postcode}",
        street,
    ]

    doc_name = stem
    for prefix in prefixes:
        if not prefix:
            continue
        # Compare and slice on the same span of the original stem so the cut
        # position never depends on the length of a lowercased copy.
        head, tail = stem[: len(prefix)], stem[len(prefix) :]
        if head.lower() != prefix.lower():
            continue
        # Require a word boundary: reject matches that stop mid-word/number.
        if tail and tail[0].isalnum():
            continue
        doc_name = tail
        break

    # Drop the separator that conventionally follows the address prefix.
    if doc_name.startswith((" - ", " _ ")):
        doc_name = doc_name[3:]
    doc_name = doc_name.strip()

    street_post = f"{street} {postcode}"
    if doc_name:
        return f"{uprn}_{street_post}_{doc_name}{ext}"
    return f"{uprn}_{street_post}{ext}"
def process_folder(
    sp_client: DomnaSharepointClient,
    folder_path: str,
    uprn: str,
    address: str,
    postcode: str,
) -> None:
    """Rename the files in ``folder_path`` and, recursively, its subfolders.

    A ``ValueError`` from the folder listing is treated as "folder missing"
    and logged as a warning. With ``DRY_RUN`` set, the intended rename is only
    logged. A failed rename is logged and does not stop the remaining items.
    Items that are neither file nor folder are ignored.
    """
    try:
        contents = sp_client.get_folders_in_path(folder_path)
    except ValueError:
        logger.warning(f"Missing folder for UPRN {uprn}: {folder_path}")
        return

    for item in contents.get("value", []):
        if "folder" in item:
            process_folder(
                sp_client, f"{folder_path}/{item['name']}", uprn, address, postcode
            )
        elif "file" in item:
            original_name: str = item["name"]
            new_name = build_canonical_filename(uprn, address, postcode, original_name)
            if new_name is None:
                # Already in canonical form.
                continue
            if DRY_RUN:
                logger.info(
                    f'[DRY RUN] Renaming: "{original_name}" -> "{new_name}" (UPRN: {uprn})'
                )
            else:
                try:
                    sp_client.rename_file(item["id"], new_name)
                    logger.info(
                        f'Renamed: "{original_name}" -> "{new_name}" (UPRN: {uprn})'
                    )
                except Exception as e:
                    logger.error(
                        f'Failed to rename "{original_name}" -> "{new_name}" (UPRN: {uprn}): {e}'
                    )
def main() -> None:
    """Process every property row of ``CSV_PATH`` against the Wave 3 site.

    Rows with a blank or missing UPRN, address or postcode are skipped with a
    warning instead of producing malformed paths and file names.

    Raises:
        ValueError: If the CSV header lacks a required column.
    """
    sp_client = DomnaSharepointClient(DomnaSites.SOCIAL_HOUSING_WAVE_3)
    with open(CSV_PATH, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        required = {"UPRN", "Address", "Postcode"}
        if not reader.fieldnames or not required.issubset(set(reader.fieldnames)):
            raise ValueError(
                f"CSV missing required columns. Expected {required}, got {reader.fieldnames}"
            )
        for row in reader:
            # DictReader fills fields missing from a short row with None.
            uprn = (row["UPRN"] or "").strip()
            address = (row["Address"] or "").strip()
            postcode = (row["Postcode"] or "").strip()
            if not (uprn and address and postcode):
                logger.warning(
                    f"Skipping CSV line {reader.line_num}: "
                    "UPRN, Address and Postcode are all required"
                )
                continue
            folder_path = (
                f"{BASE_PATH}/{address}, {postcode}"
                f"/{SharepointSubfolders.ASSESSMENT.value}/{ASSESSMENT_SUBFOLDER}"
            )
            process_folder(sp_client, folder_path, uprn, address, postcode)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,106 @@
# scripts/tests/test_build_canonical_filename.py
from orchestration.sharepoint_renamer_orchestrator import build_canonical_filename
# Shared fixture values for the tests in this module.
UPRN = "10093456789"
ADDRESS = "1 High Street, Anytown"
POSTCODE = "SW1A 1AA"
# The part of ADDRESS before the first comma.
STREET = "1 High Street"
def test_already_renamed_returns_none() -> None:
    """A name that already starts with "<UPRN>_" yields None."""
    already_canonical = f"{UPRN}_High Street SW1A 1AA_EPC Report.pdf"

    assert build_canonical_filename(UPRN, ADDRESS, POSTCODE, already_canonical) is None
def test_address_postcode_prefix_stripped() -> None:
    """A "<address> <postcode> - " prefix is replaced by the canonical one."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{ADDRESS} {POSTCODE} - EPC Report.pdf"
    )

    assert actual == expected
def test_address_only_prefix_stripped() -> None:
    """A "<address> - " prefix is replaced by the canonical one."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{ADDRESS} - EPC Report.pdf"
    )

    assert actual == expected
def test_street_postcode_prefix_stripped() -> None:
    """A "<street> <postcode> - " prefix is replaced by the canonical one."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{STREET} {POSTCODE} - EPC Report.pdf"
    )

    assert actual == expected
def test_street_only_prefix_stripped() -> None:
    """A "<street> - " prefix is replaced by the canonical one."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_EPC Report.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{STREET} - EPC Report.pdf"
    )

    assert actual == expected
def test_dash_separator_removed_after_prefix_strip() -> None:
    """The " - " separator after the stripped prefix does not reach the output."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_Floor Plan.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{STREET} {POSTCODE} - Floor Plan.pdf"
    )

    assert actual == expected
def test_underscore_separator_removed_after_prefix_strip() -> None:
    """The " _ " separator after the stripped prefix does not reach the output."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_Floor Plan.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{STREET} {POSTCODE} _ Floor Plan.pdf"
    )

    assert actual == expected
def test_no_recognised_prefix_preserves_stem() -> None:
    """Without a recognised address prefix the whole stem becomes the doc name."""
    expected = f"{UPRN}_{STREET} {POSTCODE}_Completely Different Name.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, "Completely Different Name.pdf"
    )

    assert actual == expected
def test_no_doc_name_after_strip_omits_trailing_separator() -> None:
    """A stem that is only the address prefix yields no trailing "_"."""
    expected = f"{UPRN}_{STREET} {POSTCODE}.pdf"

    actual = build_canonical_filename(
        UPRN, ADDRESS, POSTCODE, f"{STREET} {POSTCODE}.pdf"
    )

    assert actual == expected

View file

@ -1,10 +1,12 @@
from typing import Any
from unittest.mock import MagicMock, call, patch
from unittest.mock import MagicMock
import pytest
import scripts.rename_sharepoint_files as module
from scripts.rename_sharepoint_files import build_canonical_filename, process_folder
from orchestration.sharepoint_renamer_orchestrator import (
SharepointRenamerOrchestrator,
build_canonical_filename,
)
def _make_file(name: str, item_id: str = "id-1") -> dict[str, Any]:
@ -19,6 +21,12 @@ def _make_package(name: str) -> dict[str, Any]:
return {"name": name, "package": {}}
def _make_orchestrator(sp: MagicMock) -> SharepointRenamerOrchestrator:
    """Build an orchestrator around the mocked SharePoint client ``sp``.

    Uses the real constructor so every attribute is initialised. The CSV path
    is a placeholder: constructing the orchestrator does not open the file,
    and these helpers are used to call ``_process_folder`` directly.
    """
    return SharepointRenamerOrchestrator(sp, csv_path="unused.csv")
# ---------------------------------------------------------------------------
# build_canonical_filename
# ---------------------------------------------------------------------------
@ -39,7 +47,7 @@ def test_no_prefix_still_canonical() -> None:
# ---------------------------------------------------------------------------
# process_folder — files only at root level
# _process_folder — files only at root level
# ---------------------------------------------------------------------------
@ -52,8 +60,7 @@ def test_renames_top_level_files(caplog: pytest.LogCaptureFixture) -> None:
]
}
with patch.object(module, "DRY_RUN", False):
process_folder(sp, "some/path", "100", "1 High St", "AB1 2CD")
_make_orchestrator(sp)._process_folder("some/path", "100", "1 High St", "AB1 2CD")
assert sp.rename_file.call_count == 2
sp.rename_file.assert_any_call("id-1", "100_1 High St AB1 2CD_Survey.pdf")
@ -61,7 +68,7 @@ def test_renames_top_level_files(caplog: pytest.LogCaptureFixture) -> None:
# ---------------------------------------------------------------------------
# process_folder — recursive two-level hierarchy
# _process_folder — recursive two-level hierarchy
# ---------------------------------------------------------------------------
@ -84,8 +91,7 @@ def test_recurses_into_subfolders_and_renames_all_files() -> None:
root_contents if path == "base/path" else suba_contents
)
with patch.object(module, "DRY_RUN", False):
process_folder(sp, "base/path", "200", "2 Main Rd", "XY9 8ZW")
_make_orchestrator(sp)._process_folder("base/path", "200", "2 Main Rd", "XY9 8ZW")
assert sp.rename_file.call_count == 2
sp.rename_file.assert_any_call("root-file", "200_2 Main Rd XY9 8ZW_Root.pdf")
@ -95,25 +101,22 @@ def test_recurses_into_subfolders_and_renames_all_files() -> None:
# ---------------------------------------------------------------------------
# process_folder — non-file, non-folder items are skipped
# _process_folder — non-file, non-folder items are skipped
# ---------------------------------------------------------------------------
def test_ignores_package_items() -> None:
sp = MagicMock()
sp.get_folders_in_path.return_value = {
"value": [_make_package("Notebook")]
}
sp.get_folders_in_path.return_value = {"value": [_make_package("Notebook")]}
with patch.object(module, "DRY_RUN", False):
process_folder(sp, "some/path", "300", "3 Oak Ave", "ZZ1 1ZZ")
_make_orchestrator(sp)._process_folder("some/path", "300", "3 Oak Ave", "ZZ1 1ZZ")
sp.rename_file.assert_not_called()
assert sp.get_folders_in_path.call_count == 1
# ---------------------------------------------------------------------------
# process_folder — missing folder
# _process_folder — missing folder
# ---------------------------------------------------------------------------
@ -121,31 +124,14 @@ def test_missing_folder_logs_warning_and_returns(caplog: pytest.LogCaptureFixtur
sp = MagicMock()
sp.get_folders_in_path.side_effect = ValueError("not found")
with patch.object(module, "DRY_RUN", False):
process_folder(sp, "missing/path", "400", "4 Elm St", "AA2 2BB")
_make_orchestrator(sp)._process_folder("missing/path", "400", "4 Elm St", "AA2 2BB")
sp.rename_file.assert_not_called()
assert any("Missing folder" in r.message and "400" in r.message for r in caplog.records)
# ---------------------------------------------------------------------------
# process_folder — dry run
# ---------------------------------------------------------------------------
def test_dry_run_logs_without_renaming(caplog: pytest.LogCaptureFixture) -> None:
sp = MagicMock()
sp.get_folders_in_path.return_value = {"value": [_make_file("Doc.pdf", "id-x")]}
with patch.object(module, "DRY_RUN", True):
process_folder(sp, "some/path", "500", "5 Pine Ln", "BB3 3CC")
sp.rename_file.assert_not_called()
assert any("[DRY RUN]" in r.message for r in caplog.records)
# ---------------------------------------------------------------------------
# process_folder — already-canonical files are skipped
# _process_folder — already-canonical files are skipped
# ---------------------------------------------------------------------------
@ -155,7 +141,6 @@ def test_skips_already_canonical_files() -> None:
"value": [_make_file("500_Pine Ln BB3 3CC_Doc.pdf", "id-y")]
}
with patch.object(module, "DRY_RUN", False):
process_folder(sp, "some/path", "500", "5 Pine Ln", "BB3 3CC")
_make_orchestrator(sp)._process_folder("some/path", "500", "5 Pine Ln", "BB3 3CC")
sp.rename_file.assert_not_called()