Created
January 13, 2022 17:16
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Only columns backed by a plain NumPy bool, integer or float dtype are
    touched; every other column (strings, dates, categoricals, extension
    dtypes, ...) is left exactly as it is.

    For each processed column:

    * Missing values in float columns are replaced by ``min - 1`` so the
      column can be stored as an integer. This happens for every float
      column that contains NaN, including those that end up as float32.
      For magnitudes where ``min - 1 == min`` in floating point the fill
      value coincides with the real minimum.
    * If every value is a finite whole number, the column is cast to the
      narrowest unsigned (when the minimum is >= 0) or signed integer
      dtype whose inclusive range holds all values.
    * Otherwise a float column is cast to float32, which reduces precision
      and turns magnitudes beyond the float32 range into ``inf``.
    * A cast is skipped when the target dtype would be wider than the
      current one, so a column never grows.

    Progress and the before/after memory footprint are printed to stdout.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Local import so the function does not rely on a module-level ``np``.
    import numpy as np

    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        dtype = df[col].dtype
        # Guard clause: min/max arithmetic and np.isfinite are only
        # meaningful for plain NumPy bool/int/uint/float columns.
        if not (isinstance(dtype, np.dtype) and dtype.kind in "biuf"):
            continue

        print("******************************")
        print("Column: ",col)
        print("dtype before: ",dtype)

        values = df[col]

        # Integers cannot represent NA, so fill missing values with a
        # sentinel just below the current minimum. Assign the result back
        # instead of using a chained ``inplace=True`` call, which does not
        # reach ``df`` when pandas Copy-on-Write is active.
        if dtype.kind == "f" and values.isna().any():
            current_min = values.min()
            if not np.isnan(current_min):  # all-NaN column: nothing to anchor on
                values = values.fillna(current_min - 1)
                df[col] = values

        raw = values.to_numpy()

        # A column is integer-valued only if *every* element is a finite
        # whole number. Summing the differences would let +0.5 and -0.5
        # cancel each other out.
        if raw.size == 0:
            is_integral = False
        elif dtype.kind == "f":
            is_integral = bool(np.isfinite(raw).all()) and bool(
                (raw == np.floor(raw)).all()
            )
        else:
            is_integral = True

        target = None
        if is_integral:
            # Bounds are taken *after* the fill so the sentinel is covered,
            # and converted to Python scalars so comparisons against the
            # iinfo limits are exact.
            lo = raw.min().item()
            hi = raw.max().item()
            for candidate in (unsigned_types if lo >= 0 else signed_types):
                limits = np.iinfo(candidate)
                if limits.min <= lo and hi <= limits.max:
                    target = np.dtype(candidate)
                    break

        # Non-integral floats (or whole numbers too large for any integer
        # dtype) fall back to 32-bit floats.
        if target is None and dtype.kind == "f":
            target = np.dtype(np.float32)

        # Never widen a column: this function exists to save memory.
        if (
            target is not None
            and target != dtype
            and target.itemsize <= dtype.itemsize
        ):
            df[col] = values.astype(target)

        print("dtype after: ",df[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment