pandas 基础命令速查表

pandas 基础命令复习

此notebook为kesci.com的pandas基础命令速查表项目。
网址链接：
pandas基础命令速查表

python

1	import pandas as pd

数据导入

python

pd.read_csv(filename) # 导入csv格式文件中的数据
pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据
pd.read_excel(filename) # 导入Excel格式文件中的数据
pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据
pd.read_json(json_string) # 导入JSON格式的字符，URL地址或者文件中的数据
pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据
pd.read_clipmboard() # 导入系统粘贴板里面的数据
pd.DataFrame(dict)  # 导入Python字典 (dict) 里面的数据，其中key是数据框的表头，value是数据框的内容。


parse_dates = [[0,1,2]]可以合并读入数据的前三列变成一个datetime的序列

数据导出

python

df.to_csv(filename) # 将数据框 (DataFrame)中的数据导入csv格式的文件中
df.to_excel(filename) # 将数据框 (DataFrame)中的数据导入Excel格式的文件中
df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据导入SQL数据表/数据库中
df.to_json(filename) # 将数据框 (DataFrame)中的数据导入JSON格式的文件中

创建测试对象

numpy array 创建

python

1	pd.DataFrame(np.random.rand(10, 5))

	0	1	2	3	4
0	0.585777	0.433112	0.338235	0.291173	0.031913
1	0.624464	0.895368	0.292560	0.080429	0.495787
2	0.816629	0.563991	0.189032	0.114238	0.524798
3	0.065594	0.794394	0.699740	0.413209	0.630886
4	0.001313	0.100108	0.882507	0.498680	0.108653
5	0.777563	0.892090	0.950747	0.869066	0.405635
6	0.597492	0.794607	0.345172	0.685253	0.434808
7	0.263838	0.649635	0.828545	0.859189	0.090903
8	0.074650	0.448147	0.905926	0.283079	0.618817
9	0.117861	0.120424	0.063917	0.605063	0.954940

list 创建

python

1 2	my_list = ['Kesci',100,'欢迎来到科赛网'] pd.Series(my_list)

0      Kesci
1        100
2    欢迎来到科赛网
dtype: object

添加一个日期索引

python

1
2
3

df = pd.DataFrame(np.random.rand(10,5))
df.index = pd.date_range('2017/1/1', periods=df.shape[0])
df

	0	1	2	3	4
2017-01-01	0.828249	0.792735	0.512458	0.961731	0.917472
2017-01-02	0.566537	0.754457	0.235026	0.795147	0.245506
2017-01-03	0.468922	0.852834	0.549959	0.124779	0.219038
2017-01-04	0.467536	0.482505	0.238425	0.604942	0.921739
2017-01-05	0.599552	0.123519	0.814959	0.329789	0.696340
2017-01-06	0.946407	0.780112	0.965793	0.743042	0.613386
2017-01-07	0.791554	0.015501	0.500903	0.797965	0.856116
2017-01-08	0.052724	0.335682	0.253801	0.277599	0.525408
2017-01-09	0.492461	0.140432	0.316593	0.727536	0.958459
2017-01-10	0.562764	0.952519	0.256441	0.942915	0.152228

用普通字典创建

注意此法不能保证顺序

python

new_dict = {}
new_dict['foo'] = [1, 2, 3] 
new_dict['bar'] = [4, 5, 6]
df = pd.DataFrame(new_dict)
df

	foo	bar
0	1	4
1	2	5
2	3	6

用orderdict创建

此法一定能够按照顺序创建

python

from collections import OrderedDict

order_dict = OrderedDict()
order_dict['foo'] = [1, 2, 3]
order_dict['bar'] = [4, 5, 6]

order_dict

df = pd.DataFrame(order_dict)
df

	foo	bar
0	1	4
1	2	5
2	3	6

数据的查看与检查

python

1	df.head(3)

	0	1	2	3	4
2017-01-01	0.933522	0.245400	0.293748	0.723707	0.844672
2017-01-02	0.583137	0.717747	0.409480	0.489166	0.788739
2017-01-03	0.234310	0.287521	0.574982	0.845111	0.817525

python

1	df.tail(3)

	0	1	2	3	4
2017-01-08	0.551200	0.934601	0.933677	0.792482	0.532230
2017-01-09	0.126025	0.033953	0.284972	0.715794	0.083033
2017-01-10	0.138878	0.834870	0.065170	0.289762	0.456829

python

df.shape

(10, 5)

python

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 2017-01-01 to 2017-01-10
Freq: D
Data columns (total 5 columns):
0    10 non-null float64
1    10 non-null float64
2    10 non-null float64
3    10 non-null float64
4    10 non-null float64
dtypes: float64(5)
memory usage: 800.0 bytes

数据描述

python

1	df.describe()

	0	1	2	3	4
count	10.000000	10.000000	10.000000	10.000000	10.000000
mean	0.497133	0.390297	0.444135	0.700152	0.542396
std	0.290880	0.333661	0.281356	0.193923	0.272418
min	0.126025	0.033116	0.065170	0.289762	0.083033
25%	0.277898	0.142655	0.287166	0.649304	0.376368
50%	0.492851	0.266461	0.378556	0.719751	0.516397
75%	0.608873	0.659798	0.637238	0.831953	0.810329
max	0.943647	0.934601	0.933677	0.921472	0.844672

单列数值次数统计

python

1	df.loc[:, 0].value_counts(dropna=False)

0.943647    1
0.617452    1
0.583137    1
0.551200    1
0.234310    1
0.933522    1
0.138878    1
0.408662    1
0.126025    1
0.434502    1
Name: 0, dtype: int64

查询数据框 (Data Frame) 中每个列的独特数据值出现次数统计

python

1	df.apply(pd.Series.value_counts)

	0	1	2	3	4
0.033116	NaN	1.0	NaN	NaN	NaN
0.033953	NaN	1.0	NaN	NaN	NaN
0.065170	NaN	NaN	1.0	NaN	NaN
0.083033	NaN	NaN	NaN	NaN	1.0
0.114544	NaN	NaN	1.0	NaN	NaN
0.120403	NaN	1.0	NaN	NaN	NaN
0.126025	1.0	NaN	NaN	NaN	NaN
0.138878	1.0	NaN	NaN	NaN	NaN
0.209412	NaN	1.0	NaN	NaN	NaN
0.223586	NaN	NaN	NaN	NaN	1.0
0.234310	1.0	NaN	NaN	NaN	NaN
0.245400	NaN	1.0	NaN	NaN	NaN
0.284972	NaN	NaN	1.0	NaN	NaN
0.287521	NaN	1.0	NaN	NaN	NaN
0.289762	NaN	NaN	NaN	1.0	NaN
0.293748	NaN	NaN	1.0	NaN	NaN
0.347632	NaN	NaN	1.0	NaN	NaN
0.349548	NaN	NaN	NaN	NaN	1.0
0.408662	1.0	NaN	NaN	NaN	NaN
0.409480	NaN	NaN	1.0	NaN	NaN
0.434502	1.0	NaN	NaN	NaN	NaN
0.456829	NaN	NaN	NaN	NaN	1.0
0.485952	NaN	1.0	NaN	NaN	NaN
0.489166	NaN	NaN	NaN	1.0	NaN
0.500563	NaN	NaN	NaN	NaN	1.0
0.532230	NaN	NaN	NaN	NaN	1.0
0.551200	1.0	NaN	NaN	NaN	NaN
0.574982	NaN	NaN	1.0	NaN	NaN
0.583137	1.0	NaN	NaN	NaN	NaN
0.617452	1.0	NaN	NaN	NaN	NaN
0.639218	NaN	NaN	NaN	1.0	NaN
0.657990	NaN	NaN	1.0	NaN	NaN
0.679562	NaN	NaN	NaN	1.0	NaN
0.715794	NaN	NaN	NaN	1.0	NaN
0.717747	NaN	1.0	NaN	NaN	NaN
0.723707	NaN	NaN	NaN	1.0	NaN
0.759157	NaN	NaN	1.0	NaN	NaN
0.788739	NaN	NaN	NaN	NaN	1.0
0.792482	NaN	NaN	NaN	1.0	NaN
0.817525	NaN	NaN	NaN	NaN	1.0
0.827238	NaN	NaN	NaN	NaN	1.0
0.834870	NaN	1.0	NaN	NaN	NaN
0.844672	NaN	NaN	NaN	NaN	1.0
0.845111	NaN	NaN	NaN	1.0	NaN
0.905241	NaN	NaN	NaN	1.0	NaN
0.921472	NaN	NaN	NaN	1.0	NaN
0.933522	1.0	NaN	NaN	NaN	NaN
0.933677	NaN	NaN	1.0	NaN	NaN
0.934601	NaN	1.0	NaN	NaN	NaN
0.943647	1.0	NaN	NaN	NaN	NaN

分类计数

python

from collections import OrderedDict
order_dict = OrderedDict()
order_dict['foo'] = [1, 2, 3, 3, 1]
order_dict['bar'] = [4, 5, 6, 5, 5]
df = pd.DataFrame(order_dict)
df

	foo	bar
0	1	4
1	2	5
2	3	6
3	3	5
4	1	5

样本分类统计个数

python

1	df.loc[:, 'foo'].value_counts().head(20)

3    2
1    2
2    1
Name: foo, dtype: int64

样本类型个数

python

1	df.loc[:, 'foo'].nunique()

python

1	df.loc[:, 'foo'].value_counts().shape[0]

python

1	len(df.loc[:, 'foo'].unique())

数据的选取

python

df

	0	1	2	3	4
2017-01-01	0.933522	0.245400	0.293748	0.723707	0.844672
2017-01-02	0.583137	0.717747	0.409480	0.489166	0.788739
2017-01-03	0.234310	0.287521	0.574982	0.845111	0.817525
2017-01-04	0.434502	0.120403	0.347632	0.639218	0.223586
2017-01-05	0.408662	0.209412	0.657990	0.679562	0.349548
2017-01-06	0.617452	0.033116	0.759157	0.921472	0.827238
2017-01-07	0.943647	0.485952	0.114544	0.905241	0.500563
2017-01-08	0.551200	0.934601	0.933677	0.792482	0.532230
2017-01-09	0.126025	0.033953	0.284972	0.715794	0.083033
2017-01-10	0.138878	0.834870	0.065170	0.289762	0.456829

python

1	df.loc['2017-01-01', 1]

0.24540013304048203

python

1	df.ix['2017-01-01', 1]

C:\Users\yxion\Anaconda3\envs\py37\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.





0.24540013304048203

python

1	df.iloc[0, 1]

0.24540013304048203

数据的清洗

数据列的重命名

python

df

	0	1	2	3	4
2017-01-01	0.828249	0.792735	0.512458	0.961731	0.917472
2017-01-02	0.566537	0.754457	0.235026	0.795147	0.245506
2017-01-03	0.468922	0.852834	0.549959	0.124779	0.219038
2017-01-04	0.467536	0.482505	0.238425	0.604942	0.921739
2017-01-05	0.599552	0.123519	0.814959	0.329789	0.696340
2017-01-06	0.946407	0.780112	0.965793	0.743042	0.613386
2017-01-07	0.791554	0.015501	0.500903	0.797965	0.856116
2017-01-08	0.052724	0.335682	0.253801	0.277599	0.525408
2017-01-09	0.492461	0.140432	0.316593	0.727536	0.958459
2017-01-10	0.562764	0.952519	0.256441	0.942915	0.152228

python

1	df.columns = ['a', 'b', 'c', 'd', 'e']

python

1
2

python

df

	a	b	c	d	e
2017-01-01	0.828249	0.792735	0.512458	0.961731	0.917472
2017-01-02	0.566537	0.754457	0.235026	0.795147	0.245506
2017-01-03	0.468922	0.852834	0.549959	0.124779	0.219038
2017-01-04	0.467536	0.482505	0.238425	0.604942	0.921739
2017-01-05	0.599552	0.123519	0.814959	0.329789	0.696340
2017-01-06	0.946407	0.780112	0.965793	0.743042	0.613386
2017-01-07	0.791554	0.015501	0.500903	0.797965	0.856116
2017-01-08	0.052724	0.335682	0.253801	0.277599	0.525408
2017-01-09	0.492461	0.140432	0.316593	0.727536	0.958459
2017-01-10	0.562764	0.952519	0.256441	0.942915	0.152228

df.rename不会替换原dataframe

python

1	df.rename(columns=lambda x: x+'changed')

	achanged	bchanged	cchanged	dchanged	echanged
2017-01-01	0.828249	0.792735	0.512458	0.961731	0.917472
2017-01-02	0.566537	0.754457	0.235026	0.795147	0.245506
2017-01-03	0.468922	0.852834	0.549959	0.124779	0.219038
2017-01-04	0.467536	0.482505	0.238425	0.604942	0.921739
2017-01-05	0.599552	0.123519	0.814959	0.329789	0.696340
2017-01-06	0.946407	0.780112	0.965793	0.743042	0.613386
2017-01-07	0.791554	0.015501	0.500903	0.797965	0.856116
2017-01-08	0.052724	0.335682	0.253801	0.277599	0.525408
2017-01-09	0.492461	0.140432	0.316593	0.727536	0.958459
2017-01-10	0.562764	0.952519	0.256441	0.942915	0.152228

python

1	df.rename(columns={'a':'newA','c':'newC'})

	newA	b	newC	d	e
2017-01-01	0.828249	0.792735	0.512458	0.961731	0.917472
2017-01-02	0.566537	0.754457	0.235026	0.795147	0.245506
2017-01-03	0.468922	0.852834	0.549959	0.124779	0.219038
2017-01-04	0.467536	0.482505	0.238425	0.604942	0.921739
2017-01-05	0.599552	0.123519	0.814959	0.329789	0.696340
2017-01-06	0.946407	0.780112	0.965793	0.743042	0.613386
2017-01-07	0.791554	0.015501	0.500903	0.797965	0.856116
2017-01-08	0.052724	0.335682	0.253801	0.277599	0.525408
2017-01-09	0.492461	0.140432	0.316593	0.727536	0.958459
2017-01-10	0.562764	0.952519	0.256441	0.942915	0.152228

数据行的重命名

python

1	df.index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

python

df

	a	b	c	d	e
a	0.828249	0.792735	0.512458	0.961731	0.917472
b	0.566537	0.754457	0.235026	0.795147	0.245506
c	0.468922	0.852834	0.549959	0.124779	0.219038
d	0.467536	0.482505	0.238425	0.604942	0.921739
e	0.599552	0.123519	0.814959	0.329789	0.696340
f	0.946407	0.780112	0.965793	0.743042	0.613386
g	0.791554	0.015501	0.500903	0.797965	0.856116
h	0.052724	0.335682	0.253801	0.277599	0.525408
i	0.492461	0.140432	0.316593	0.727536	0.958459
j	0.562764	0.952519	0.256441	0.942915	0.152228

使用特定列作为行索引

python

df

	a	b	c	d	e
a	0.828249	0.792735	0.512458	0.961731	0.917472
b	0.566537	0.754457	0.235026	0.795147	0.245506
c	0.468922	0.852834	0.549959	0.124779	0.219038
d	0.467536	0.482505	0.238425	0.604942	0.921739
e	0.599552	0.123519	0.814959	0.329789	0.696340
f	0.946407	0.780112	0.965793	0.743042	0.613386
g	0.791554	0.015501	0.500903	0.797965	0.856116
h	0.052724	0.335682	0.253801	0.277599	0.525408
i	0.492461	0.140432	0.316593	0.727536	0.958459
j	0.562764	0.952519	0.256441	0.942915	0.152228

例如把a这列变成index

python

1 2	df1 = df.set_index('a') df1

	b	c	d	e
a
0.828249	0.792735	0.512458	0.961731	0.917472
0.566537	0.754457	0.235026	0.795147	0.245506
0.468922	0.852834	0.549959	0.124779	0.219038
0.467536	0.482505	0.238425	0.604942	0.921739
0.599552	0.123519	0.814959	0.329789	0.696340
0.946407	0.780112	0.965793	0.743042	0.613386
0.791554	0.015501	0.500903	0.797965	0.856116
0.052724	0.335682	0.253801	0.277599	0.525408
0.492461	0.140432	0.316593	0.727536	0.958459
0.562764	0.952519	0.256441	0.942915	0.152228

还原index, 把index列变回数据

python

1 2	df2 = df1.reset_index() df2

	a	b	c	d	e
0	0.828249	0.792735	0.512458	0.961731	0.917472
1	0.566537	0.754457	0.235026	0.795147	0.245506
2	0.468922	0.852834	0.549959	0.124779	0.219038
3	0.467536	0.482505	0.238425	0.604942	0.921739
4	0.599552	0.123519	0.814959	0.329789	0.696340
5	0.946407	0.780112	0.965793	0.743042	0.613386
6	0.791554	0.015501	0.500903	0.797965	0.856116
7	0.052724	0.335682	0.253801	0.277599	0.525408
8	0.492461	0.140432	0.316593	0.727536	0.958459
9	0.562764	0.952519	0.256441	0.942915	0.152228

检查空值

python

1
2
3

df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
                 'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
                  'C':'foo'})

python

df

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

python

1	pd.isnull(df)

	A	B	C
0	False	True	False
1	True	False	False
2	False	True	False
3	False	False	False
4	False	False	False
5	True	True	False

python

1	pd.notnull(df)

	A	B	C
0	True	False	True
1	False	True	True
2	True	False	True
3	True	True	True
4	True	True	True
5	False	False	True

移除空值行

python

1	df.dropna()

	A	B	C
3	3.0	5.0	foo
4	6.0	9.0	foo

移除空值列

python

1	df.dropna(axis=1)

	C
0	foo
1	foo
2	foo
3	foo
4	foo
5	foo

移除超过n个空值的列

python

1	df.dropna(axis=1,thresh=1)

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

填充空值为常数

python

1	df.fillna('Test')

	A	B	C
0	1	Test	foo
1	Test	4	foo
2	2	Test	foo
3	3	5	foo
4	6	9	foo
5	Test	Test	foo

填充空值为均值

python

1	df.fillna(df.mean())

	A	B	C
0	1.0	6.0	foo
1	3.0	4.0	foo
2	2.0	6.0	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	3.0	6.0	foo

转换数据类型

python

1	df.loc[:, 'A'].astype(np.float64)

0    1.0
1    NaN
2    2.0
3    3.0
4    6.0
5    NaN
Name: A, dtype: float64

替换特定数据

python

1 2	s = pd.Series([1,3,5,np.nan,7,9,9]) s.replace(1, 'one')

0    one
1      3
2      5
3    NaN
4      7
5      9
6      9
dtype: object

python

0    1.0
1    3.0
2    5.0
3    NaN
4    7.0
5    9.0
6    9.0
dtype: float64

python

1	s.replace([1,3],['one','three'])

0      one
1    three
2        5
3      NaN
4        7
5        9
6        9
dtype: object

删除列或者行

python

from collections import OrderedDict

order_dict = OrderedDict()
order_dict['foo'] = [1, 2, 3]
order_dict['bar'] = [4, 5, 6]

order_dict

df = pd.DataFrame(order_dict)
df

	foo	bar
0	1	4
1	2	5
2	3	6

python

1	df.drop('foo', axis=1)

	bar
0	4
1	5
2	6

python

1	df.drop(0, axis=0)

	foo	bar
1	2	5
2	3	6

数据过滤,排序,分组

过滤

python

1
2
3

df = pd.DataFrame(np.random.rand(10,5))
df.index = pd.date_range('2017/1/1', periods=df.shape[0])
df

	0	1	2	3	4
2017-01-01	0.462320	0.641626	0.783622	0.405599	0.086237
2017-01-02	0.860784	0.463871	0.671531	0.276455	0.033289
2017-01-03	0.980626	0.996484	0.772686	0.711433	0.129981
2017-01-04	0.863321	0.884043	0.182961	0.889572	0.916571
2017-01-05	0.111071	0.927398	0.636951	0.709890	0.589606
2017-01-06	0.115526	0.935115	0.506834	0.305816	0.235401
2017-01-07	0.498320	0.330181	0.905941	0.135436	0.360116
2017-01-08	0.730144	0.045433	0.698341	0.929830	0.060545
2017-01-09	0.889181	0.189442	0.832306	0.776265	0.129116
2017-01-10	0.709422	0.344869	0.889001	0.007639	0.354979

python

1	df.loc[df[0]>0.5]

	0	1	2	3	4
2017-01-02	0.860784	0.463871	0.671531	0.276455	0.033289
2017-01-03	0.980626	0.996484	0.772686	0.711433	0.129981
2017-01-04	0.863321	0.884043	0.182961	0.889572	0.916571
2017-01-08	0.730144	0.045433	0.698341	0.929830	0.060545
2017-01-09	0.889181	0.189442	0.832306	0.776265	0.129116
2017-01-10	0.709422	0.344869	0.889001	0.007639	0.354979

python

1	df[df[0]>0.5]

	0	1	2	3	4
2017-01-02	0.860784	0.463871	0.671531	0.276455	0.033289
2017-01-03	0.980626	0.996484	0.772686	0.711433	0.129981
2017-01-04	0.863321	0.884043	0.182961	0.889572	0.916571
2017-01-08	0.730144	0.045433	0.698341	0.929830	0.060545
2017-01-09	0.889181	0.189442	0.832306	0.776265	0.129116
2017-01-10	0.709422	0.344869	0.889001	0.007639	0.354979

python

1	df[(df[0]>0.5) & (df[1]>0.5)]

	0	1	2	3	4
2017-01-03	0.980626	0.996484	0.772686	0.711433	0.129981
2017-01-04	0.863321	0.884043	0.182961	0.889572	0.916571

isin 的用法

python

from collections import OrderedDict

order_dict = OrderedDict()
order_dict['foo'] = [1, 2, 3]
order_dict['bar'] = [4, 5, 6]

order_dict

df = pd.DataFrame(order_dict)
df

	foo	bar
0	1	4
1	2	5
2	3	6

python

1	%timeit df.loc[df.loc[:,'foo'].isin([1,2]),'foo']

407 µs ± 7.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

python

1	%timeit df.loc[df.foo.isin([1,2]),'foo']

375 µs ± 2.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

python

1	%timeit df[df.foo.isin([1,2])].foo

523 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

排序

单列

python

1	df.sort_values('a',ascending=False, na_position='first')

	a	b	c	d	e
f	0.946407	0.780112	0.965793	0.743042	0.613386
a	0.828249	0.792735	0.512458	0.961731	0.917472
g	0.791554	0.015501	0.500903	0.797965	0.856116
e	0.599552	0.123519	0.814959	0.329789	0.696340
b	0.566537	0.754457	0.235026	0.795147	0.245506
j	0.562764	0.952519	0.256441	0.942915	0.152228
i	0.492461	0.140432	0.316593	0.727536	0.958459
c	0.468922	0.852834	0.549959	0.124779	0.219038
d	0.467536	0.482505	0.238425	0.604942	0.921739
h	0.052724	0.335682	0.253801	0.277599	0.525408

同时排序两列, 第一列先排

python

1	df.sort_values(['a', 'b'],ascending=[False,False])

	a	b	c	d	e
f	0.946407	0.780112	0.965793	0.743042	0.613386
a	0.828249	0.792735	0.512458	0.961731	0.917472
g	0.791554	0.015501	0.500903	0.797965	0.856116
e	0.599552	0.123519	0.814959	0.329789	0.696340
b	0.566537	0.754457	0.235026	0.795147	0.245506
j	0.562764	0.952519	0.256441	0.942915	0.152228
i	0.492461	0.140432	0.316593	0.727536	0.958459
c	0.468922	0.852834	0.549959	0.124779	0.219038
d	0.467536	0.482505	0.238425	0.604942	0.921739
h	0.052724	0.335682	0.253801	0.277599	0.525408

分组

类似于excel pivot table的一些功能

单列

python

df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})

python

df

	A	B	C	D
0	foo	one	small	1
1	foo	one	medium	2
2	foo	two	large	2
3	foo	two	large	3
4	bar	three	small	3
5	bar	three	small	5

python

1	df.groupby('C').count()

	A	B	D
C
large	2	2	2
medium	1	1	1
small	3	3	3

python

1	df.groupby(['B','C']).mean()

		D
B	C
one	medium	2.0
one	small	1.0
three	small	4.0
two	large	2.5

按照列B对数据框df做分组处理后，返回对应的D的平均值

python

1	df.groupby('B')['D'].mean()

B
one      1.5
three    4.0
two      2.5
Name: D, dtype: float64

自定义函数

python

1	df.groupby('C').agg(np.mean)

	D
C
large	2.5
medium	2.0
small	3.0

官方相同效果函数如下

python

1	df.groupby('C').mean()

	D
C
large	2.5
medium	2.0
small	3.0

多组同时搞事

python

1	df.groupby('C').agg({'A':'describe', 'B':'sum'})

	A				B
	count	unique	top	freq	B
C
large	2	1	foo	2	twotwo
medium	1	1	foo	1	one
small	3	2	bar	2	onethreethree

透视表

python

df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})
df

	A	B	C	D
0	foo	one	small	1
1	foo	one	medium	2
2	foo	two	large	2
3	foo	two	large	3
4	bar	three	small	3
5	bar	three	small	5

python

1 2	df.pivot_table(df,index=['A','B'], columns=['C'],aggfunc=np.sum)

		D
	C	large	medium	small
A	B
bar	three	NaN	NaN	8.0
foo	one	NaN	2.0	1.0
foo	two	5.0	NaN	NaN

对数据使用函数

python

1 2	df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE')) df

	A	B	C	D	E
0	0.411040	0.192597	0.498276	0.629785	0.619616
1	0.845691	0.510586	0.299651	0.998108	0.304747
2	0.623651	0.208694	0.756371	0.460918	0.648548
3	0.345734	0.498705	0.809781	0.584406	0.679397
4	0.131337	0.700264	0.274539	0.606103	0.509637
5	0.849646	0.230177	0.805004	0.407902	0.625412
6	0.754866	0.470509	0.944374	0.740570	0.605332
7	0.179676	0.956882	0.886308	0.038653	0.694814
8	0.197512	0.464496	0.476800	0.682217	0.345192
9	0.215948	0.092079	0.023335	0.246493	0.971298

对每一列使用函数

python

1	df.apply(np.mean)

A    0.455510
B    0.432499
C    0.577444
D    0.539515
E    0.600399
dtype: float64

对每一行使用函数

python

1	df.apply(np.mean, axis=1)

0    0.470263
1    0.591757
2    0.539636
3    0.583605
4    0.444376
5    0.583628
6    0.703130
7    0.551267
8    0.433243
9    0.309831
dtype: float64

数据的连接, 组合, 插入

末尾拼接两个dataframe

python

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']},
                   index=[0, 1, 2, 3])

python

df1

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3

python

df2

	A	B	C	D
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

append只能拼接行

python

1	df1.append(df2)

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

concat可以拼接列以及行,要注意index和column的重复

python

1	pd.concat([df1,df2],axis=0)

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

python

1	pd.concat([df1,df2],axis=1)

	A	B	C	D	A	B	C	D
0	A0	B0	C0	D0	A4	B4	C4	D4
1	A1	B1	C1	D1	A5	B5	C5	D5
2	A2	B2	C2	D2	A6	B6	C6	D6
3	A3	B3	C3	D3	A7	B7	C7	D7

按loc末尾直接加入行或列

按loc大法只能拼一列或者一行到末尾

python

df1

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3

python

1 2	df1.loc[5] = list('EFGH') df1

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
5	E	F	G	H

python

1 2	df1.loc[:,'E'] = list('EFGHI') df1

	A	B	C	D	E
0	A0	B0	C0	D0	E
1	A1	B1	C1	D1	F
2	A2	B2	C2	D2	G
3	A3	B3	C3	D3	H
5	E	F	G	H	I

两个dafaframe组合

df1.join(df2)

python

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],           
                     'B': ['B0', 'B1', 'B2', 'B3']},
                     index=['K0', 'K1', 'K0', 'K1'])
   

df2 = pd.DataFrame({'C': ['C0', 'C1'],
                      'D': ['D0', 'D1']},
                     index=['K0', 'K1'])

python

1	df1.join(df2)

	A	B	C	D
K0	A0	B0	C0	D0
K0	A2	B2	C0	D0
K1	A1	B1	C1	D1
K1	A3	B3	C1	D1

插入特定行

特定的行插入一个dataframe

python

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],           
                     'B': ['B0', 'B1', 'B2', 'B3']})
   

df2 = pd.DataFrame({'A': ['C0', 'C1'],
                      'B': ['D0', 'D1']})

python

df1

	A	B
0	A0	B0
1	A1	B1
2	A2	B2
3	A3	B3

python

df2

	A	B
0	C0	D0
1	C1	D1

定义一个在第n行后插入dataframe的函数

python

def insert_row(idx, df, df_insert):
    dfA = df.iloc[:idx, ]
    dfB = df.iloc[idx:, ]

    df = dfA.append(df_insert).append(dfB)

    return df

python

1 2	df1_inserted = insert_row(2, df1, df2) df1_inserted

	A	B
0	A0	B0
1	A1	B1
0	C0	D0
1	C1	D1
2	A2	B2
3	A3	B3

插入特定列

自带函数df.insert可在第n列插入一列新的数据

python

df1

	C	A	B
0	A	A0	B0
1	B	A1	B1
2	C	A2	B2
3	D	A3	B3

python

1	df1.insert(1,'D', list('ABCD'))

python

df1

	C	D	A	B
0	A	A	A0	B0
1	B	B	A1	B1
2	C	C	A2	B2
3	D	D	A3	B3

如果要在特定位置插入多列数据只能用这个

python

def insert_column(idx, df, df_insert):
    dfA = df.iloc[:,:idx]
    dfB = df.iloc[:,idx:]

    df = pd.concat([pd.concat([dfA, df_insert], axis=1),dfB], axis=1)

    return df

python

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],           
                     'B': ['B0', 'B1', 'B2', 'B3'],
                      'C': ['B0', 'B1', 'B2', 'B3']})
   
df2 = pd.DataFrame({'D': ['B0', 'B1', 'B2', 'B3'],
                    'E': ['B0', 'B1', 'B2', 'B3']})

python

1	insert_column(1, df1, df2)

	A	D	E	B	C
0	A0	B0	B0	B0	B0
1	A1	B1	B1	B1	B1
2	A2	B2	B2	B2	B2
3	A3	B3	B3	B3	B3

利用groupby和条件增加列

python

df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})
df

	A	B	C	D
0	foo	one	small	1
1	foo	one	medium	2
2	foo	two	large	2
3	foo	two	large	3
4	bar	three	small	3
5	bar	three	small	5

python

1
2
3

def func(df):
    df['D_sum'] = df['D'].sum()
    return df

python

1	df.groupby('D').apply(func)

	A	B	C	D	D_sum
0	foo	one	small	1	1
1	foo	one	medium	2	4
2	foo	two	large	2	4
3	foo	two	large	3	6
4	bar	three	small	3	6
5	bar	three	small	5	5

利用apply函数增加列

python

df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})
df

	A	B	C	D
0	foo	one	small	1
1	foo	one	medium	2
2	foo	two	large	2
3	foo	two	large	3
4	bar	three	small	3
5	bar	three	small	5

python

1 2	def func(a, b): return 1 if a=='foo' else 0

python

1	df.apply(lambda column: func(column['A'], column['B']), axis=1)

0    1
1    1
2    1
3    1
4    0
5    0
dtype: int64

python

1
2
3

def func(df):
    df['A_equal_foo'] = 1 if df['A']=='foo' else 0
    return df

python

1	df.apply(func, axis=1)

	A	B	C	D	A_equal_foo
0	foo	one	small	1	1
1	foo	one	medium	2	1
2	foo	two	large	2	1
3	foo	two	large	3	1
4	bar	three	small	3	0
5	bar	three	small	5	0

database式merge数据

python

>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})

python

1	df1.merge(df2, on='value',how='inner')

	lkey	value	rkey
0	foo	5	foo

python

1	df1.merge(df2, on='value',how='outer')

	lkey	value	rkey
0	foo	1	NaN
1	bar	2	NaN
2	baz	3	NaN
3	foo	5	foo
4	NaN	6	bar
5	NaN	7	baz
6	NaN	8	foo

python

1	df1.merge(df2, left_on='value', right_on='value',how='inner')

	lkey	value	rkey
0	foo	5	foo

python

1	df1.merge(df2, left_on='value', right_on='value',how='outer')

	lkey	value	rkey
0	foo	1	NaN
1	bar	2	NaN
2	baz	3	NaN
3	foo	5	foo
4	NaN	6	bar
5	NaN	7	baz
6	NaN	8	foo

python

1	df1.merge(df2, left_on='lkey', right_on='rkey')

	lkey	value_x	rkey	value_y
0	foo	1	foo	5
1	foo	1	foo	8
2	foo	5	foo	5
3	foo	5	foo	8
4	bar	2	bar	6
5	baz	3	baz	7

python

1	df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=('_left', '_right'))

	lkey	value_left	rkey	value_right
0	foo	1	foo	5
1	foo	1	foo	8
2	foo	5	foo	5
3	foo	5	foo	8
4	bar	2	bar	6
5	baz	3	baz	7

数据统计

基本统计

python

1 2	df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE')) df.describe()

	A	B	C	D	E
count	10.000000	10.000000	10.000000	10.000000	10.000000
mean	0.548176	0.581951	0.492730	0.362844	0.464692
std	0.269487	0.296894	0.344442	0.314646	0.205078
min	0.139214	0.152646	0.032955	0.003880	0.142789
25%	0.328505	0.284380	0.224993	0.093924	0.339452
50%	0.597528	0.747586	0.538701	0.325856	0.453823
75%	0.715508	0.819579	0.799554	0.563304	0.653605
max	0.964156	0.871106	0.920916	0.841107	0.703043

python

1	df.mean() # 得到数据框df中每一列的平均值

A    0.548176
B    0.581951
C    0.492730
D    0.362844
E    0.464692
dtype: float64

python

1	df.corr() # 得到数据框df中每一列与其他列的相关系数

	A	B	C	D	E
A	1.000000	0.097622	-0.571543	0.364524	-0.083053
B	0.097622	1.000000	-0.058487	-0.347005	-0.603136
C	-0.571543	-0.058487	1.000000	-0.369241	0.222586
D	0.364524	-0.347005	-0.369241	1.000000	0.232339
E	-0.083053	-0.603136	0.222586	0.232339	1.000000

python

1	df.count() # 得到数据框df中每一列的非空值个数

A    10
B    10
C    10
D    10
E    10
dtype: int64

python

1	df.max() # 得到数据框df中每一列的最大值

A    0.964156
B    0.871106
C    0.920916
D    0.841107
E    0.703043
dtype: float64

python

1	df.min() # 得到数据框df中每一列的最小值

A    0.139214
B    0.152646
C    0.032955
D    0.003880
E    0.142789
dtype: float64

python

1	df.median() # 得到数据框df中每一列的中位数

A    0.597528
B    0.747586
C    0.538701
D    0.325856
E    0.453823
dtype: float64

python

1	df.std() # 得到数据框df中每一列的标准差

A    0.269487
B    0.296894
C    0.344442
D    0.314646
E    0.205078
dtype: float64

对所有值进行描述统计

python

1	df.stack().std(ddof=1)

0.2882311219915606

numpy求法, ddof是degree of freedom

python

1	df.values.std(ddof=1)

0.2882311219915606

最大值索引

每一列的最大值的索引

python

1	df.idxmax()

A    8
B    3
C    2
D    1
E    9
dtype: int64

时间序列

构建时间索引

python

1 2	i = pd.date_range('20000101',periods=100) i[0:5]

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05'],
              dtype='datetime64[ns]', freq='D')

python

1	i.year[0:5]

Int64Index([2000, 2000, 2000, 2000, 2000], dtype='int64')

python

1	i.day[0:5]

Int64Index([1, 2, 3, 4, 5], dtype='int64')

python

1	i.month[0:5]

Int64Index([1, 1, 1, 1, 1], dtype='int64')

转换时间格式

通过整数序列来转换时间序列

python

import pandas as pd
>>> i = pd.date_range('20000101',periods=100)
>>> df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
time_series = df.year*10000 + df.month*100 + df.day
time_series

0     20000101
1     20000102
2     20000103
3     20000104
4     20000105
        ...   
95    20000405
96    20000406
97    20000407
98    20000408
99    20000409
Length: 100, dtype: int64

python

1	pd.to_datetime(time_series, format='%Y%m%d')

0    2000-01-01
1    2000-01-02
2    2000-01-03
3    2000-01-04
4    2000-01-05
        ...    
95   2000-04-05
96   2000-04-06
97   2000-04-07
98   2000-04-08
99   2000-04-09
Length: 100, dtype: datetime64[ns]

通过String来转换时间序列

python

1
2
3

df = df.astype(str)
time_string = df.day + df.month + df.year
time_string

0     112000
1     212000
2     312000
3     412000
4     512000
       ...  
95    542000
96    642000
97    742000
98    842000
99    942000
Length: 100, dtype: object

python

1	pd.to_datetime(time_string, format="%d%m%Y")

0    2000-01-01
1    2000-01-02
2    2000-01-03
3    2000-01-04
4    2000-01-05
        ...    
95   2000-04-05
96   2000-04-06
97   2000-04-07
98   2000-04-08
99   2000-04-09
Length: 100, dtype: datetime64[ns]

时间格式采样

时间频率如下

B business day frequency
C custom business day frequency (experimental)
D calendar day frequency
W weekly frequency
M month end frequency
BM business month end frequency
CBM custom business month end frequency
MS month start frequency
BMS business month start frequency
CBMS custom business month start frequency
Q quarter end frequency
BQ business quarter endfrequency
QS quarter start frequency
BQS business quarter start frequency
A year end frequency
BA business year end frequency
AS year start frequency
BAS business year start frequency
BH business hour frequency
H hourly frequency
T minutely frequency
S secondly frequency
L milliseonds
U microseconds
N nanoseconds

原文链接：https://blog.csdn.net/qq_41996090/article/details/87555618

python

1 2	index = pd.date_range('1/1/2000', periods=9, freq='T') series = pd.Series(range(9), index=index)

python

series

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

python

1	series.resample('3T').sum()

2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3T, dtype: int64

python

1	series.resample('3T', label='right', closed='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:03:00     6
2000-01-01 00:06:00    15
2000-01-01 00:09:00    15
Freq: 3T, dtype: int64

采样的频率

python

1	series.resample('10S').asfreq().head()

2000-01-01 00:00:00    0.0
2000-01-01 00:00:10    NaN
2000-01-01 00:00:20    NaN
2000-01-01 00:00:30    NaN
2000-01-01 00:00:40    NaN
Freq: 10S, dtype: float64

自定义函数

python

1	import numpy as np

python

1 2	def custom_resampler(array_like): return np.sum(array_like) + 5

python

1	series.resample('3T').apply(custom_resampler)

2000-01-01 00:00:00     8
2000-01-01 00:03:00    17
2000-01-01 00:06:00    26
Freq: 3T, dtype: int64

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	6.0	foo
1	3.0	4.0	foo
2	2.0	6.0	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	3.0	6.0	foo

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D	A	B	C	D
0	A0	B0	C0	D0	A4	B4	C4	D4
1	A1	B1	C1	D1	A5	B5	C5	D5
2	A2	B2	C2	D2	A6	B6	C6	D6
3	A3	B3	C3	D3	A7	B7	C7	D7

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	6.0	foo
1	3.0	4.0	foo
2	2.0	6.0	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	3.0	6.0	foo

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D	A	B	C	D
0	A0	B0	C0	D0	A4	B4	C4	D4
1	A1	B1	C1	D1	A5	B5	C5	D5
2	A2	B2	C2	D2	A6	B6	C6	D6
3	A3	B3	C3	D3	A7	B7	C7	D7

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	NaN	foo
1	NaN	4.0	foo
2	2.0	NaN	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	NaN	NaN	foo

	A	B	C
0	1.0	6.0	foo
1	3.0	4.0	foo
2	2.0	6.0	foo
3	3.0	5.0	foo
4	6.0	9.0	foo
5	3.0	6.0	foo

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1
2	A2	B2	C2	D2
3	A3	B3	C3	D3
0	A4	B4	C4	D4
1	A5	B5	C5	D5
2	A6	B6	C6	D6
3	A7	B7	C7	D7

	A	B	C	D	A	B	C	D
0	A0	B0	C0	D0	A4	B4	C4	D4
1	A1	B1	C1	D1	A5	B5	C5	D5
2	A2	B2	C2	D2	A6	B6	C6	D6
3	A3	B3	C3	D3	A7	B7	C7	D7