From ae7a7e1bd5bb82b0cebc6fa2479810775cc2a5c4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 23 May 2025 14:07:11 +0000 Subject: [PATCH] submit this to main --- .devcontainer/Dockerfile | 20 ++++++++++++++++-- etl/surveyPrice/surveyPrice.py | 38 ++++++++++++++-------------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index ab88141..14de393 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -5,7 +5,7 @@ ARG DEBIAN_FRONTEND=noninteractive # Install system dependencies in a single layer RUN apt update && apt install -y --no-install-recommends \ - sudo jq vim \ + sudo jq vim curl\ && apt autoremove -y \ && rm -rf /var/lib/apt/lists/* @@ -17,6 +17,22 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ # Install Poetry RUN pip install --no-cache-dir poetry +# Download and install nvm: +# RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash + +# # in lieu of restarting the shell +# RUN \. "$HOME/.nvm/nvm.sh" + +# # Download and install Node.js: +# RUN nvm install 22 + +# # Verify the Node.js version: +# RUN node -v # Should print "v22.16.0". +# RUN nvm current # Should print "v22.16.0". + +# # Verify npm version: +# RUN npm -v # Should print "10.9.2". + # Install aws RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" @@ -37,4 +53,4 @@ RUN terraform -install-autocomplete # Set the working directory -WORKDIR /workspaces/survey-extractor \ No newline at end of file +WORKDIR /workspaces/survey-extractor:q \ No newline at end of file diff --git a/etl/surveyPrice/surveyPrice.py b/etl/surveyPrice/surveyPrice.py index ae51422..2f3521b 100644 --- a/etl/surveyPrice/surveyPrice.py +++ b/etl/surveyPrice/surveyPrice.py @@ -254,22 +254,22 @@ class SurveyPrice(): def merge_hub_spot_and_survey_information_from_sharepoint_url(self, hubspot_data, survey_data): # Standardise address - def extract_start_and_postcode(addr): - if not isinstance(addr, str) or addr.strip() == "": - return "", "" - parts = addr.lower().replace(",", "").strip().split() - start = ' '.join(parts[:2]) # Number + street - postcode = ' '.join(parts[-2:]) # Postcode - return start, postcode + # def extract_start_and_postcode(addr): + # if not isinstance(addr, str) or addr.strip() == "": + # return "", "" + # parts = addr.lower().replace(",", "").strip().split() + # start = ' '.join(parts[:2]) # Number + street + # postcode = ' '.join(parts[-2:]) # Postcode + # return start, postcode # Extract start + postcode from both datasets - survey_data[['address_start', 'postcode']] = survey_data['SHAREPOINT ADDRESS'].apply( - lambda x: pd.Series(extract_start_and_postcode(x)) - ) - - hubspot_data[['address_start', 'postcode']] = hubspot_data['HUBSPOT_DEAL_ADDRESS'].apply( - lambda x: pd.Series(extract_start_and_postcode(x)) - ) + # survey_data[['address_start', 'postcode']] = survey_data['SHAREPOINT ADDRESS'].apply( + # lambda x: pd.Series(extract_start_and_postcode(x)) + # ) +# + # hubspot_data[['address_start', 'postcode']] = hubspot_data['HUBSPOT_DEAL_ADDRESS'].apply( + # lambda x: pd.Series(extract_start_and_postcode(x)) + # ) # re-name to installer @@ -286,16 +286,8 @@ class SurveyPrice(): } ) - merged_df = pd.merge( - survey_data, - hubspot_data, - on=['address_start', 'postcode'], - how='inner' - ) + merged_df = pd.concat([hubspot_data, survey_data], axis=1) - # if hubspot detects - - merged_df.drop(columns=['address_start', 'postcode'], inplace=True) def compute_energy_grant(row): pre_band_letter = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"][-1] post_band_letter = surveyedDataProcessor.get_band(row["HUBSPOT_POST_INSTALL_SAP_SCORE"])[-1]