Last active
November 19, 2018 04:12
-
-
Save Dan-Patterson/d01837284aa6b3b77511c2f92a476c0b to your computer and use it in GitHub Desktop.
This can be used to convert excel files to numpy structured or record arrays.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See https://community.esri.com/blogs/dan_patterson/2018/11/18/excel-arrays-tables-in-arcgis-pro
# for more information and descriptions.
import numpy as np | |
def excel_np(path, sheet_num=0):
    """Read one sheet of an Excel workbook into a numpy structured array.

    Your spreadsheet must adhere to simple rules:

    - the first row must contain the field names for the output array
    - no blank rows or columns, basically, no fluff or formatting
    - if you have nodata values, put them in, since blank cells will be
      'corrected' as best as possible
    - text and numbers in a column results in a text column

    Parameters
    ----------
    path : text
        Full path to the xls, xlsx file.
    sheet_num : integer
        Sheets are numbered from 0.

    Returns
    -------
    A numpy structured array with one field per spreadsheet column and one
    record per row below the header row.

    Field names are the header cells converted to text, stripped of
    leading/trailing whitespace, with punctuation and spaces replaced
    by '_'.

    Excel only uses float or string data, so the data type of each column
    is decided from the value found in the first data row:

    - float : every cell is passed through ``float``; cells that cannot be
      converted (blanks, stray text) become ``np.nan``.  If all remaining
      values are whole numbers, the column is coerced to integer and the
      ``np.nan`` cells are replaced by -999.  A tad of a kludge, but it
      works.
    - text : surrounding whitespace is stripped and cells that end up
      empty ('', "" and that ever so ugly invisible space) are replaced by
      the text 'None'.
    - anything else : the column is converted with ``np.asarray`` as is.

    Notes
    -----
    ``xlrd`` is imported inside the function, so it is only required when
    the function is actually called.
    NOTE(review): xlrd 2.0 and later only read the legacy .xls format;
    confirm the installed version if .xlsx input is expected.

    A workbook can also be opened from bytes and inspected by hand::

        aString = open('c:/temp/test.xlsx', 'rb').read()
        book_ = xlrd.open_workbook(file_contents=aString)
        dir(book_)  # get_sheet, nsheets, sheet_by_index, sheet_by_name ...
        sheet = book_.sheet_by_index(0)   # first sheet
        sheet.cell_value(1, 0)            # a single cell, see also cell_type

    References
    ----------
    `<https://media.readthedocs.org/pdf/xlrd/latest/xlrd.pdf>`_.
    """
    import xlrd

    # Every punctuation character and the space map to '_'.  The underscore
    # itself is deliberately absent from the list so it is left untouched.
    punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~ '
    punc_table = str.maketrans(punc, '_' * len(punc))

    def isfloat(a):
        """Return `a` as a float, or np.nan when it cannot be converted."""
        try:
            return float(a)
        except (TypeError, ValueError):
            return np.nan

    def punc_space(name):
        """Replace punctuation and spaces in `name` with '_'."""
        return name.translate(punc_table)

    w = xlrd.open_workbook(path)            # xlrd.book.Book class
    sheet = w.sheet_by_index(sheet_num)     # sheet by number
    names = sheet.row_values(0)             # header row, cleaned up below
    cols = sheet.ncols
    rows = sheet.nrows
    # Column-wise data, skipping the header row.
    col_data = [sheet.col_values(i, 1, rows) for i in range(cols)]
    # The first data row is the 'guess' at what each column should contain.
    row_dts = [np.asarray(v).dtype.kind for v in sheet.row_values(1)]

    clean = []
    for kind, values in zip(row_dts, col_data):
        if kind == 'f':                     # float? substitute np.nan
            ar = np.array([isfloat(v) for v in values])
            is_nan = np.isnan(ar)
            not_nan = ar[~is_nan]
            # Whole numbers only?  Then store the column as integers and
            # flag the cells that could not be read with -999.
            if np.all(np.equal(not_nan, not_nan.astype('int'))):
                ar[is_nan] = -999
                ar = ar.astype('int')
        elif kind in ('U', 'S'):            # unicode/string
            # Build the array here rather than relying on an earlier
            # assignment, so a previous column's data can never leak in.
            ar = np.char.strip(np.asarray(values))
            ar = np.where(np.char.str_len(ar) == 0, 'None', ar)
        else:
            ar = np.asarray(values)
        clean.append(ar)

    # ---- assemble the columns for the array ----
    # Header cells may be numeric (xlrd returns floats), hence str().
    names = [punc_space(str(n).strip()) for n in names]
    dts_name = [(n, a.dtype.str) for n, a in zip(names, clean)]
    arr = np.empty((rows - 1,), dtype=dts_name)
    for n, a in zip(names, clean):
        arr[n] = a
    return arr
# | |
# ---------------------------------------------------------------------- | |
# __main__ .... code section | |
if __name__ == "__main__":
    """Optionally...
    : provide a spreadsheet for testing
    """
    # Manual check only: uncomment the two lines below and point `path` at a
    # local workbook to run excel_np when this file is executed as a script.
    #path = "c:/test/text.xlsx"
    #arr = excel_np(path, 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment