# Model/model_data/utils.py

import boto3
import pandas as pd
from io import BytesIO
import re
from textblob import TextBlob

# Compiled once at import time so repeated per-word checks reuse the same
# pattern object. Matches one or more digits optionally followed by '%'.
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')


def is_percentage_or_number(s: str) -> bool:
    """Return True if *s* is a whole number or a whole-number percentage.

    Accepts strings made only of digits, optionally followed by a single
    ``%`` (for example ``"42"`` or ``"42%"``). Decimals, signs, the empty
    string and any surrounding whitespace are rejected.

    :param s: The string to test.
    :return: ``True`` when the entire string matches, otherwise ``False``.
    """
    # fullmatch (rather than match) is required here: the ``$`` anchor also
    # matches just before a trailing newline, so ``match`` would wrongly
    # accept inputs such as "42%\n".
    return PERCENTAGE_PATTERN.fullmatch(s) is not None


def correct_spelling(text):
    """Spell-correct every whitespace-separated token of *text*.

    Tokens accepted by ``is_percentage_or_number`` are kept verbatim; every
    other token is passed through ``TextBlob(...).correct()`` and converted
    back to ``str``. The tokens are re-joined with single spaces, so the
    original whitespace between words is not preserved.

    :param text: The input string to correct.
    :return: The corrected tokens joined by single spaces.
    """
    return ' '.join(
        token if is_percentage_or_number(token) else str(TextBlob(token).correct())
        for token in text.split()
    )


def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
    """
    Serialize a pandas DataFrame to Parquet in memory and upload it to S3.

    The DataFrame is written into an in-memory buffer and the resulting
    bytes are sent with a single ``put_object`` call.

    :param df: The pandas DataFrame to upload.
    :param bucket_name: Name of the destination S3 bucket.
    :param file_key: Object key within the bucket (including any directory path).
    """
    # Serialize before touching S3, so a failed conversion creates no client.
    in_memory_file = BytesIO()
    df.to_parquet(in_memory_file)

    s3 = boto3.client('s3')
    s3.put_object(
        Bucket=bucket_name,
        Key=file_key,
        Body=in_memory_file.getvalue(),
    )