Merge pull request #101 from Hestia-Homes/feature/scrape_livewest_and_southern

Feature/scrape livewest and southern
This commit is contained in:
Jun-te Kim 2025-11-04 20:17:07 +00:00 committed by GitHub
commit 4ff233dbc9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 217 additions and 61 deletions

View file

@ -1,4 +1,4 @@
name: Hubspot Sync Abri
name: Hubspot Sync
on:
schedule:
@ -6,7 +6,7 @@ on:
workflow_dispatch:
jobs:
hubspot-sync-abri:
hubspot-sync:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@ -28,4 +28,4 @@ jobs:
run: |
pwd
ls -la
poetry run python etl/hubSpotClient/scripts/hubspot_update_abri_script.py
poetry run python etl/hubSpotClient/scripts/hubspot_update_script.py

View file

@ -8,7 +8,7 @@ class HubspotTodb():
def new_record_to_hubspot_data(self, deal_data, company, listing):
    """
    Legacy entry point: print a deprecation notice and forward the arguments
    to ``upsert_hubspot_deal``.

    Args:
        deal_data: Deal payload, passed through unchanged.
        company: Company value associated with the deal, passed through unchanged.
        listing: Listing payload for the deal, passed through unchanged.

    Returns:
        None. The result of ``upsert_hubspot_deal`` is not propagated.
    """
    print("This has been depreciated using new interface")
    # Call through the bound method only: passing ``self`` explicitly as well
    # shifts every argument by one and raises TypeError (too many positionals).
    self.upsert_hubspot_deal(deal_data, company, listing)
def new_record_company(self, company_data):

View file

@ -6,6 +6,8 @@ from hubspot.crm.associations import ApiException
class Companies(Enum):
    """HubSpot company IDs (as strings) for the companies this ETL handles."""

    # Each value is the ID string handed to the HubSpot client / DB lookups.
    ABRI = "237615001799"
    SOUTHERN_HOUSING_GROUP = "109343619305"
    LIVEWEST = "86205872354"
class DealStage(Enum):
SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914"
@ -143,3 +145,65 @@ class HubSpotClient():
company_info = company.properties
return company_info
def get_all_pipelines(self):
    """
    List every deal pipeline as a ``{"name": ..., "id": ...}`` dict.

    Returns:
        A list with one dict per pipeline, in the order the API returned them.
        If anything raises while fetching or reading the pipelines, the error
        is logged and an empty list is returned instead.
    """
    try:
        response = self.client.crm.pipelines.pipelines_api.get_all(object_type="deals")
        pipelines = []
        for entry in response.results:
            pipelines.append({"name": entry.label, "id": entry.id})
        self.logger.info(f"Retrieved {len(pipelines)} pipelines.")
        return pipelines
    except Exception as e:
        # Best-effort lookup: callers get an empty list rather than an exception.
        self.logger.error(f"Error retrieving pipelines: {e}")
        return []
def get_deal_stages(self, pipeline_id=None):
    """
    Collect the stages of deal pipelines as flat dicts.

    Args:
        pipeline_id: Optional pipeline to restrict the result to. It is
            compared against each pipeline's id via ``str(pipeline_id)``.
            Any falsy value (the default ``None`` included) selects every
            pipeline.

    Returns:
        A list of dicts with the keys ``pipeline_name``, ``pipeline_id``,
        ``stage_name`` and ``stage_id``. If anything raises along the way,
        the error is logged and an empty list is returned.
    """
    try:
        response = self.client.crm.pipelines.pipelines_api.get_all(object_type="deals")

        # One entry per (pipeline, stage) pair, keeping only the requested
        # pipeline when a truthy pipeline_id was supplied.
        all_stages = [
            {
                "pipeline_name": pipeline.label,
                "pipeline_id": pipeline.id,
                "stage_name": stage.label,
                "stage_id": stage.id,
            }
            for pipeline in response.results
            if not pipeline_id or pipeline.id == str(pipeline_id)
            for stage in pipeline.stages
        ]

        if all_stages:
            self.logger.info(f"Retrieved {len(all_stages)} deal stages.")
        else:
            self.logger.info(f"No deal stages found for pipeline {pipeline_id or 'ALL'}")
        return all_stages
    except Exception as e:
        # Best-effort lookup: callers get an empty list rather than an exception.
        self.logger.error(f"Error retrieving deal stages: {e}")
        return []

View file

@ -2,44 +2,72 @@ from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline
from tqdm import tqdm
from etl.db.hubSpotLoad import HubspotTodb
'''
# TODO:
get one deal from db, from db
for avri only so far
add it to the db
show in frontend
'''
# get ALL deals
hubspot = HubSpotClient()
hubspot.get_deal_stages()
db = HubspotTodb()
# All deals from a pipeline_id via filter
deals = hubspot.get_deal_ids_by_pipeline(
pipeline_id=Pipeline.OPERATIONS_SOCIAL_HOUSING.value,
)
# deals from companies we care about
valueable_deals = [
Companies.ABRI.value
companies = [
Companies.ABRI,
Companies.LIVEWEST,
Companies.SOUTHERN_HOUSING_GROUP,
]
deals_to_add = []
# Track all failures and summary data
all_failed_deals = []
summary_report = {}
deal_to_companies = {}
loader = HubspotTodb()
# Get all deals we care about
for i,deal in enumerate(tqdm(deals)):
company = hubspot.from_deal_get_associated_company_id(deal)
if company in valueable_deals:
deals_to_add.append(deal)
deal_to_companies.update({deal: company})
deal_data = hubspot.from_deal_get_info(deal_id=deal)
listing_data = hubspot.from_deal_get_associated_listing(deal_id=deal)
loader.new_record_to_hubspot_data(deal_data, deal_to_companies[deal], listing_data)
for company in companies:
records = db.find_all_deals_with_company_id(company.value)
updated_count = 0
checked_count = 0
failed_deals = []
# TODO: check if the database has Abri data
# make companies table
# make a script that updates the table
for deal in tqdm(records, desc=f"Checking HubSpot deals for {company.name}"):
checked_count += 1
try:
print(f"🔍 Working on deal {deal}")
was_up_to_date = db.update_deal(deal, hubspot)
if not was_up_to_date:
updated_count += 1
except Exception as e:
failed_info = {
"company": company.name,
"deal_id": deal,
"error": str(e)
}
failed_deals.append(failed_info)
all_failed_deals.append(failed_info)
print(f"❌ Failed to update deal {deal}: {e}")
# Store company-level summary (don't print yet)
summary_report[company.name] = {
"checked": checked_count,
"updated": updated_count,
"up_to_date": checked_count - updated_count - len(failed_deals),
"failed": len(failed_deals),
}
# ---- Final Summary Report ----
print("\n" + "="*100)
print("📊 FINAL SUMMARY REPORT")
print("="*100)
for company_name, stats in summary_report.items():
print(f"\n🏢 {company_name}")
print(f" - Total deals checked: {stats['checked']}")
print(f" - Updated deals: {stats['updated']}")
print(f" - Up-to-date deals: {stats['up_to_date']}")
print(f" - Failed deals: {stats['failed']}")
# ---- Global failure details ----
if all_failed_deals:
print("\n" + "="*100)
print("⚠️ FAILED DEALS DETAILS")
print("="*100)
for f in all_failed_deals:
print(f" - Company: {f['company']:<25} | Deal ID: {f['deal_id']} | Error: {f['error']}")
else:
print("\n🎉 No failed deals across any company!")

View file

@ -5,7 +5,7 @@ from etl.db.hubSpotLoad import HubspotTodb
hubspot = HubSpotClient()
# All deals from a pipeline_id via filter
company = hubspot.get_company_information(Companies.ABRI.value)
company = hubspot.get_company_information(Companies.SOUTHERN_HOUSING_GROUP.value)
loader = HubspotTodb()
loader.new_record_company(company)

View file

@ -1,23 +0,0 @@
from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline
from tqdm import tqdm
from etl.db.hubSpotLoad import HubspotTodb
hubspot = HubSpotClient()
db = HubspotTodb()

# Deal records looked up by the Abri company id.
records = db.find_all_deals_with_company_id(Companies.ABRI.value)

checked_count = 0  # total deals processed
updated_count = 0  # deals that needed updating

for checked_count, deal in enumerate(tqdm(records, desc="Checking HubSpot deals"), start=1):
    # update_deal() returns False when discrepancies are found
    if not db.update_deal(deal, hubspot):
        updated_count += 1

print(f"\n✅ Finished checking {checked_count} deals.")
print(f"🧩 {updated_count} deal(s) were updated.")
print(f"📈 {checked_count - updated_count} deal(s) were already up to date.")

View file

@ -0,0 +1,87 @@
from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline
from tqdm import tqdm
from etl.db.hubSpotLoad import HubspotTodb
hubspot = HubSpotClient()
hubspot.get_deal_stages()
db = HubspotTodb()

# Companies whose deals are checked, in processing order.
companies = [
    Companies.ABRI,
    Companies.LIVEWEST,
    Companies.SOUTHERN_HOUSING_GROUP,
]

# Accumulated across all companies for the final report.
all_failed_deals = []
summary_report = {}

print("\n🚀 Starting HubSpot deal consistency check...\n")

for company in companies:
    print(f"\n🏢 Processing company: {company.name}")
    records = db.find_all_deals_with_company_id(company.value)

    checked_count = updated_count = 0
    failed_deals = []

    progress = tqdm(records, desc=f"Checking HubSpot deals for {company.name}")
    for checked_count, deal in enumerate(progress, start=1):
        try:
            print(f"🔍 Working on deal {deal}")
            if db.update_deal(deal, hubspot):
                print(f"📈 Deal {deal} already up to date.")
            else:
                # A falsy result from update_deal() is counted as "updated".
                updated_count += 1
                print(f"🧩 Deal {deal} was updated.")
        except Exception as e:
            # One failing deal must not stop the run; record it and move on.
            failed_info = {
                "company": company.name,
                "deal_id": deal,
                "error": str(e),
            }
            failed_deals.append(failed_info)
            all_failed_deals.append(failed_info)
            print(f"❌ Failed to update deal {deal}: {e}")

    # Store per-company summary (the full report is printed at the end).
    failed_count = len(failed_deals)
    up_to_date_count = checked_count - updated_count - failed_count
    summary_report[company.name] = {
        "checked": checked_count,
        "updated": updated_count,
        "failed": failed_count,
        "up_to_date": up_to_date_count,
    }

    # Company-level quick summary
    print(f"\n✅ Finished checking {checked_count} deals for company {company.name}.")
    print(f" 🧩 {updated_count} deal(s) were updated.")
    print(f" 📈 {up_to_date_count} deal(s) were already up to date.")
    print(f" ⚠️ {failed_count} deal(s) failed.\n")

# ---- Final Summary Report ----
rule = "=" * 100
print("\n" + rule)
print("📊 FINAL SUMMARY REPORT (ALL COMPANIES)")
print(rule)

for company_name, stats in summary_report.items():
    print(f"\n🏢 {company_name}")
    print(f" - Total deals checked: {stats['checked']}")
    print(f" - Updated deals: {stats['updated']}")
    print(f" - Up-to-date deals: {stats['up_to_date']}")
    print(f" - Failed deals: {stats['failed']}")

# ---- Global Failed Deals ----
if not all_failed_deals:
    print("\n🎉 No failed deals across any company!")
else:
    print("\n" + rule)
    print("⚠️ FAILED DEALS DETAILS")
    print(rule)
    for failure in all_failed_deals:
        print(f" - Company: {failure['company']:<25} | Deal ID: {failure['deal_id']} | Error: {failure['error']}")

print("\n🏁 HubSpot deal consistency check complete!\n")