import re
from io import BytesIO

import boto3
from textblob import TextBlob

# Pre-compile the regular expression: one or more digits, optionally
# followed by a single percent sign (e.g. "42" or "42%").
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')


def is_percentage_or_number(s: str) -> bool:
    """
    Tell whether a string is a plain integer or an integer percentage.

    :param s: The string to test.
    :return: True if the whole string is digits with an optional trailing
        ``%``; False otherwise.
    """
    # fullmatch (rather than match) is used because the ``$`` anchor alone
    # also matches just before a trailing newline, which would let a value
    # such as "50%\n" through.
    return PERCENTAGE_PATTERN.fullmatch(s) is not None


def correct_spelling(text: str) -> str:
    """
    Spell-correct a text word by word, leaving numbers and percentages as-is.

    The text is split on whitespace and re-joined with single spaces, so any
    run of whitespace in the input (including leading/trailing whitespace)
    is not preserved in the result.

    :param text: The text to correct.
    :return: The corrected words joined by single spaces.
    """
    corrected_words = [
        # Numeric tokens are passed through unchanged so the spell checker
        # never gets a chance to rewrite them; everything else goes through
        # TextBlob.correct(), whose result is converted back to a string.
        word if is_percentage_or_number(word) else str(TextBlob(word).correct())
        for word in text.split()
    ]
    return ' '.join(corrected_words)


def save_dataframe_to_s3_parquet(df, bucket_name: str, file_key: str) -> None:
    """
    Save a pandas DataFrame to S3 as a Parquet file.

    :param df: The pandas DataFrame.
    :param bucket_name: Name of the S3 bucket.
    :param file_key: Key of the file (including directory path within the bucket).
    """
    # Convert the DataFrame to a Parquet format in memory
    parquet_buffer = BytesIO()
    df.to_parquet(parquet_buffer)

    # Create the boto3 client
    client = boto3.client('s3')

    # Upload the Parquet file to S3
    client.put_object(
        Bucket=bucket_name,
        Key=file_key,
        Body=parquet_buffer.getvalue(),
    )