Created
August 7, 2023 11:10
-
-
Save zafercavdar/d7294ce815aef81bb9e61cda5dcd7a67 to your computer and use it in GitHub Desktop.
Excel content reformatting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import re | |
import pandas as pd | |
parser = argparse.ArgumentParser(description="Reformatter app") | |
parser.add_argument("path", type=str, help="Path to an Excel file") | |
def uppercase_first_letter(text: str) -> str: | |
return text[0].upper() + text[1:] if len(text) > 0 else text | |
def normalize_not_available_info(text: str) -> str: | |
phrases = [ | |
r"no available information", | |
r"no information available", | |
r"no information provided", | |
r"no.*mention of", | |
r"is not disclosed", | |
r"no.*involved", | |
r"does not provide", | |
r"doesn't provide", | |
r"not mentioned", | |
r"no.*signs", | |
] | |
search_text = str(text).lower() | |
for phrase in phrases: | |
if re.search(phrase, search_text) is not None: | |
return "N/A" | |
return uppercase_first_letter(str(text).strip()) | |
def export_as_excel(df, filename, sheet_name="Sheet 1"): | |
writer = pd.ExcelWriter(filename, engine='xlsxwriter') | |
df.to_excel(writer, sheet_name=sheet_name, index=True) # send df to writer | |
worksheet = writer.sheets[sheet_name] # pull worksheet object | |
worksheet.set_column(0, 0, 60) | |
for column in df: # loop through all columns | |
series = df[column] | |
col_idx = df.columns.get_loc(column) | |
max_len = max(( | |
series.astype(str).map(len).max(), # len of largest item | |
len(str(series.name)), # len of column name/header | |
)) | |
worksheet.set_column(col_idx + 1, col_idx + 1, | |
max_len // 1.1) # set column width | |
writer.save() | |
def reformat(filename): | |
df = pd.read_excel(filename, parse_dates=False) | |
column_order = sorted(df["Dimensions"].unique()) | |
df["Results_fixed"] = df["Results"].apply(normalize_not_available_info) | |
formatted_df = df.pivot_table(values='Results_fixed', | |
index=df["Company name"], | |
columns='Dimensions', | |
aggfunc='first')[column_order].fillna("N/A") | |
new_filename = filename.rsplit(".", maxsplit=1)[0] + "-reformatted.xlsx" | |
export_as_excel(formatted_df, new_filename) | |
print(f"The new file is saved as {new_filename}") | |
if __name__ == "__main__": | |
args = parser.parse_args() | |
reformat(args.path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment