Last active
May 3, 2018 02:25
-
-
Save Maecenas/4df452bdf626a3ae4c43e616a6fb87d3 to your computer and use it in GitHub Desktop.
Data Processing Using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 时间索引 | |
>>> import pandas as pd | |
>>> index = pd.date_range('20160503', periods=5) | |
>>> index | |
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06', | |
'2016-05-07'], | |
dtype='datetime64[ns]', freq='D') | |
# 数据显示 | |
>>> import numpy as np | |
>>> dates = pd.DataFrame(np.random.randn(5,3),index=index,columns=list('ABC')) | |
>>> dates | |
A B C | |
2016-05-03 0.320522 -0.854196 -0.477794 | |
2016-05-04 -0.736698 0.220931 0.878340 | |
2016-05-05 1.002439 -0.422472 -1.027093 | |
2016-05-06 1.792256 0.322563 0.285416 | |
2016-05-07 -2.291344 0.633183 1.120919 | |
>>> dates.index # 获取索引 | |
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06', | |
'2016-05-07'], | |
dtype='datetime64[ns]', freq='D') | |
>>> dates.columns # 列名 | |
Index([u'A', u'B', u'C'], dtype='object') | |
>>> dates.values # 返回数组,从这里可以看出来DataFrame与Series都是以ndarray为基础的 | |
array([[ 0.32052247, -0.85419619, -0.47779368], | |
[-0.73669815, 0.22093113, 0.87834037], | |
[ 1.00243859, -0.42247204, -1.02709299], | |
[ 1.79225566, 0.32256317, 0.28541609], | |
[-2.2913443 , 0.63318336, 1.12091912]]) | |
>>> dates.describe # 描述 | |
<bound method DataFrame.describe of A B C | |
2016-05-03 0.320522 -0.854196 -0.477794 | |
2016-05-04 -0.736698 0.220931 0.878340 | |
2016-05-05 1.002439 -0.422472 -1.027093 | |
2016-05-06 1.792256 0.322563 0.285416 | |
2016-05-07 -2.291344 0.633183 1.120919> | |
# 数据筛选 | |
>>> dates.head(1) # 切片,前一行 | |
A B C | |
2016-05-03 0.320522 -0.854196 -0.477794 | |
>>> dates.tail(1) # 最后一行 | |
A B C | |
2016-05-07 -2.291344 0.633183 1.120919 | |
>>> dates['2016-05-03':'2016-05-05'] # 通过索引切片 | |
A B C | |
2016-05-03 0.320522 -0.854196 -0.477794 | |
2016-05-04 -0.736698 0.220931 0.878340 | |
2016-05-05 1.002439 -0.422472 -1.027093 | |
>>> dates.loc['2016-05-03':'2016-05-05', ['A', 'B']] # 区域选择,定义索引,字段名 | |
A B | |
2016-05-03 0.320522 -0.854196 | |
2016-05-04 -0.736698 0.220931 | |
2016-05-05 1.002439 -0.422472 | |
>>> dates = pd.DataFrame(np.random.randn(5,3)) | |
>>> dates | |
0 1 2 | |
0 -0.814773 0.828948 1.279713 | |
1 1.010236 -1.063719 -1.010884 | |
2 -0.206223 -1.756636 -0.777366 | |
3 -0.253049 0.955689 -1.735237 | |
4 -1.269495 -0.321743 0.929069 | |
>>> dates.at[0,0] # 定位到单元格 | |
-0.81477290050615703 | |
>>> dates.iloc[1:4,[1,]] # 通过数字切片loc | |
B | |
2016-05-04 -1.002059 | |
2016-05-05 -0.298642 | |
2016-05-06 -1.042664 | |
>>> dates.iat[1,1] # 数字索引定位到单元格 | |
-1.0020589880335706 | |
>>> dates[dates.index > '2016-05-05'] # 条件筛选 | |
A B C | |
2016-05-06 -0.334060 -1.042664 0.999093 | |
2016-05-07 -2.051357 0.713025 -0.792198 | |
>>> dates[(dates.index > '2016-05-05') & (dates.A < 0)] # 组合筛选 | |
A B C | |
2016-05-06 -0.334060 -1.042664 0.999093 | |
2016-05-07 -2.051357 0.713025 -0.792198 | |
>>> dates[dates.A > 0].B # 返回Series | |
2016-05-06 -0.139405 | |
2016-05-07 -1.437701 | |
Freq: D, Name: B, dtype: float64 | |
>>> dates[dates.A>0].B.mean() # 计算平均值 | |
-0.78855298332911494 | |
>>> len(dates) # 行数 | |
5 | |
>>> import numpy as np | |
>>> np.sign(np.diff(dates.A)) | |
array([ 1., -1., 1., -1.]) | |
>>> status = np.sign(np.diff(dates.A)) | |
>>> status[np.where(status == 1.)].size # 算变更类型未1的数量 | |
2 | |
>>> dates.sort(columns='A') # 排序 | |
__main__:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) | |
A B C | |
2016-05-03 -0.695635 0.077077 0.694779 | |
2016-05-05 -0.641996 1.420200 0.086330 | |
2016-05-04 -0.594039 -0.842575 -0.046909 | |
2016-05-07 0.667998 -1.437701 -0.196523 | |
2016-05-06 1.176916 -0.139405 -0.592849 | |
>>> dates.groupby('A').count() # 分组,分组的字段作为索引 | |
B C | |
A | |
-0.695635 1 1 | |
-0.641996 1 1 | |
-0.594039 1 1 | |
0.667998 1 1 | |
1.176916 1 1 | |
>>> dates.groupby('A').sum() # 分组计算和 | |
B C | |
A | |
-0.695635 0.077077 0.694779 | |
-0.641996 1.420200 0.086330 | |
-0.594039 -0.842575 -0.046909 | |
0.667998 -1.437701 -0.196523 | |
1.176916 -0.139405 -0.592849 | |
sum | |
# 数据合并 | |
>>> bdates = pd.DataFrame(np.random.randn(2,3),columns=list('ABC')) | |
>>> bdates | |
A B C | |
0 -0.043208 0.332842 -0.390034 | |
1 -0.318745 0.013284 0.147836 | |
>>> dates | |
A B C | |
2016-05-03 -0.695635 0.077077 0.694779 | |
2016-05-04 -0.594039 -0.842575 -0.046909 | |
2016-05-05 -0.641996 1.420200 0.086330 | |
2016-05-06 1.176916 -0.139405 -0.592849 | |
2016-05-07 0.667998 -1.437701 -0.196523 | |
>>> dates.append(bdates) # 追加 | |
A B C | |
2016-05-03 00:00:00 -0.695635 0.077077 0.694779 | |
2016-05-04 00:00:00 -0.594039 -0.842575 -0.046909 | |
2016-05-05 00:00:00 -0.641996 1.420200 0.086330 | |
2016-05-06 00:00:00 1.176916 -0.139405 -0.592849 | |
2016-05-07 00:00:00 0.667998 -1.437701 -0.196523 | |
0 -0.043208 0.332842 -0.390034 | |
1 -0.318745 0.013284 0.147836 | |
>>> pieces = [dates[:2], dates[len(dates)-1:] | |
>>> pd.concat(pieces) # 合并片段 | |
A B C | |
2016-05-03 -0.695635 0.077077 0.694779 | |
2016-05-04 -0.594039 -0.842575 -0.046909 | |
2016-05-07 0.667998 -1.437701 -0.196523 | |
>>> pd.concat(pieces, ignore_index=True) # 忽略不想同的字段,用NaN填充 | |
A B C | |
0 -0.695635 0.077077 0.694779 | |
1 -0.594039 -0.842575 -0.046909 | |
2 0.667998 -1.437701 -0.196523 | |
>>> pd.merge(dates, bdates, on='A') # join合并,类比sql语句 | |
Empty DataFrame | |
Columns: [A, B_x, C_x, B_y, C_y] | |
Index: [] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ndarray 多维数组 | |
>>> import numpy as np | |
>>> np.ones((3,4)) # 自动生成3*4的二维数组,全为1 | |
array([[ 1., 1., 1., 1.], | |
[ 1., 1., 1., 1.], | |
[ 1., 1., 1., 1.]]) | |
>>> np.array([[1,2], [3,4]]) | |
array([[1, 2], | |
[3, 4]]) | |
>>> np.zeros((1,2)) # 生成全为0 | |
array([[ 0., 0.]]) | |
>>> np.arange(1,10,1) # 类似于range生成列表 | |
array([1, 2, 3, 4, 5, 6, 7, 8, 9]) | |
# ndarray 运算 | |
>>> aarr = np.array([[2,2], [3,3]]) | |
>>> barr = np.array([[3,4], [5,6]]) | |
>>> aarr * barr # 直接对同位置的进行运算,+ - * / % +=等 | |
array([[ 6, 8], | |
[15, 18]]) | |
>>> barr > 5 # bool运算 | |
array([[False, False], | |
[False, True]], dtype=bool) | |
>>> barr.shape # 获取数组行列数结构 | |
(2, 2) | |
>>> barr.reshape(1,4) # 重排数据结构 | |
array([[3, 4, 5, 6]]) | |
>>> barr.sum() # 算和 | |
18 | |
>>> barr.sum(axis=0) # 算列的和 | |
array([ 8, 10]) | |
>>> barr | |
array([[3, 4], | |
[5, 6]]) | |
>>> barr.sum(axis=1) # 算行的和 axis 0 纵轴 1 横轴 | |
array([ 7, 11]) | |
>>> c = np.array([[3,4], [5,6]]) | |
>>> np.where(c > 5, aarr, barr) # 筛选组合,判断c的同位置元素如果大于5则选aarr同位置的,否则选barr的 | |
array([[3, 4], | |
[5, 3]]) | |
>>> def func(i, j): | |
... return (i+1) * (j+1) | |
... | |
>>> np.fromfunction(func, (9,9)) # 通过1个函数来创建数组,第二个参数为数组的行与列数,传给func的参数行列的索引 | |
array([[ 1., 2., 3., 4., 5., 6., 7., 8., 9.], | |
[ 2., 4., 6., 8., 10., 12., 14., 16., 18.], | |
[ 3., 6., 9., 12., 15., 18., 21., 24., 27.], | |
[ 4., 8., 12., 16., 20., 24., 28., 32., 36.], | |
[ 5., 10., 15., 20., 25., 30., 35., 40., 45.], | |
[ 6., 12., 18., 24., 30., 36., 42., 48., 54.], | |
[ 7., 14., 21., 28., 35., 42., 49., 56., 63.], | |
[ 8., 16., 24., 32., 40., 48., 56., 64., 72.], | |
[ 9., 18., 27., 36., 45., 54., 63., 72., 81.]]) | |
>>> np.add(aarr, barr) # ufunc的数组+ | |
array([[5, 6], | |
[8, 9]]) | |
>>> np.add.accumulate([1,2,3]) # reduce的方式对数组求和,并返回求和后的新的数组 | |
array([1, 3, 6]) | |
# Series 列 有序字段 | |
>>> import pandas as pd | |
>>> pd.Series([1,2.0,'a']) # 自动增加索引,类似与字典的key-value结构 | |
0 1 | |
1 2 | |
2 a | |
dtype: object | |
>>> aser = pd.Series([1,2,3], index=['a', 'b', 'c']) # 自定义索引名称 | |
>>> aser | |
a 1 | |
b 2 | |
c 3 | |
dtype: int64 | |
>>> aser.index | |
Index([u'a', u'b', u'c'], dtype='object') # 获取Series的索引 | |
>>> aser.values | |
array([1, 2, 3]) # 获取Series的值,返回数组对象 | |
>>> aser['b'] # 通过索引取值 | |
2 | |
>>> aser * 2 # 运算,可类比ndarray的运算 | |
a 2 | |
b 4 | |
c 6 | |
dtype: int64 | |
>>> import numpy as np | |
>>> np.exp(aser) # 计算自然对数的n次方 | |
a 2.718282 | |
b 7.389056 | |
c 20.085537 | |
dtype: float64 | |
>>> data = {'a':1, 'b':2} | |
>>> index = ['a', 'b', 'c'] # 数据对齐,如果没有对应索引的数据,自动填充NaN | |
>>> bser = pd.Series(data, index=index) | |
>>> bser | |
a 1.0 | |
b 2.0 | |
c NaN | |
dtype: float64 | |
>>> pd.isnull(bser) # 判断是否有NaN元素,返回一个新的结果Series | |
a False | |
b False | |
c True | |
dtype: bool | |
>>> cser = pd.Series({'a':2,'b':5,'d':10}) | |
>>> bser + cser # 运算时如果有不相称的索引也会对齐 | |
a 3.0 | |
b 7.0 | |
c NaN | |
d NaN | |
dtype: float64 | |
>>> aser.name # 名称属性,分别对应Series的名称与索引名称 | |
>>> aser.index.name | |
>>> aser.name = 'names' | |
>>> aser.index.name = 'col' | |
>>> aser | |
col | |
a 1 | |
b 2 | |
c 3 | |
Name: names, dtype: int64 | |
# DataFrame 数据帧 表 | |
>>> data = {'col1': [1,2,3], 'col2': [3,4,5]} | |
>>> adf = pd.DataFrame(data) | |
>>> adf | |
col1 col2 | |
0 1 3 | |
1 2 4 | |
2 3 5 | |
>>> adf['col1'] # 通过列名获取单列数据,返回Series对象 | |
0 1 | |
1 2 | |
2 3 | |
Name: col1, dtype: int64 | |
>>> adf.col1 # . 也可以用 | |
0 1 | |
1 2 | |
2 3 | |
Name: col1, dtype: int64 | |
>>> adf.ix[0] # 通过索引获取一行数据,返回Series | |
col1 1 | |
col2 3 | |
Name: 0, dtype: int64 | |
>>> adf.col1 = 'admin' # 重写一列的值 | |
>>> adf | |
col1 col2 | |
0 admin 3 | |
1 admin 4 | |
2 admin 5 | |
>>> adf['col1'] = [10,9,8] | |
>>> adf | |
col1 col2 | |
0 10 3 | |
1 9 4 | |
2 8 5 | |
>>> del adf['col1'] # 删除一列 | |
>>> adf | |
col2 | |
0 3 | |
1 4 | |
2 5 | |
>>> adf.index.name = 'index' # 索引名 | |
>>> adf | |
col2 | |
index | |
0 3 | |
1 4 | |
2 5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment