mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
import boto3
|
|
import pandas as pd
|
|
from io import BytesIO
|
|
import re
|
|
from textblob import TextBlob
|
|
|
|
# Pattern for a token made up solely of one or more digits with an optional
# trailing percent sign (e.g. "42" or "42%"). Compiled once at import time so
# callers in loops reuse the same compiled object.
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')


def is_percentage_or_number(s: str) -> bool:
    """
    Return True if *s* is a plain integer or an integer percentage.

    A value qualifies only when the entire string consists of one or more
    digits optionally followed by a single ``%``. Empty strings, decimals
    such as ``"4.2%"``, and strings with surrounding whitespace or a
    trailing newline do not qualify.

    :param s: The string to test.
    :return: True when the whole string matches, False otherwise.
    """
    # fullmatch (not match) is deliberate: with match(), the `$` anchor also
    # succeeds just before a trailing newline, so "50%\n" would be accepted.
    # fullmatch returns None unless the pattern consumes the whole string.
    return PERCENTAGE_PATTERN.fullmatch(s) is not None
|
|
|
|
|
|
def correct_spelling(text):
    """
    Spell-correct a piece of text one whitespace-separated token at a time.

    Tokens for which ``is_percentage_or_number`` returns True are kept
    verbatim; every other token is wrapped in a ``TextBlob`` and replaced by
    the string form of its ``correct()`` result. Because the text is split on
    whitespace and re-joined with single spaces, runs of whitespace in the
    input collapse to one space in the output.

    :param text: The text to correct.
    :return: The corrected tokens joined by single spaces.
    """
    fixed_tokens = [
        # Numeric / percentage tokens bypass the spell checker entirely.
        token if is_percentage_or_number(token) else str(TextBlob(token).correct())
        for token in text.split()
    ]
    return ' '.join(fixed_tokens)
|
|
|
|
|
|
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
    """
    Serialize a pandas DataFrame to Parquet in memory and upload it to S3.

    The DataFrame is written into an in-memory bytes buffer (nothing is
    written to local disk by this function) and the resulting bytes are sent
    with a single ``put_object`` call.

    :param df: The pandas DataFrame to store.
    :param bucket_name: Name of the destination S3 bucket.
    :param file_key: Object key within the bucket (including any directory
        path prefix).
    """
    # Render the DataFrame as Parquet bytes held entirely in memory.
    buffer = BytesIO()
    df.to_parquet(buffer)
    payload = buffer.getvalue()

    # Upload the serialized bytes as one S3 object.
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket_name, Key=file_key, Body=payload)
|