一、基本索引

import pandas as pd
import numpy as np

Series 索引

ser_obj1 = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print (ser_obj1.head())
a    0
b 1
c 2
d 3
e 4
dtype: int64
'''1. 行索引'''
print('行索引====================================================================')
print (ser_obj1['b'])
print (ser_obj1[0])
print('切片索引====================================================================')
'''2. 切片索引'''
print (ser_obj1[1:3])
print (ser_obj1['b':'d'])
print('不连续索引索引====================================================================')
#注意会不会包含尾巴。。
'''3. 不连续索引'''
print (ser_obj1[[0, 2, 4]])
print (ser_obj1[['a', 'e']])
print('布尔索引====================================================================')
'''4. 布尔索引'''
#是对里面的值进行判断,不是对索引
ser_bool = ser_obj1 > 2
print (ser_bool)
print (ser_obj1[ser_bool])#运用的方式
print (ser_obj1[ser_obj1 > 2])
行索引====================================================================
1
0
切片索引====================================================================
b 1
c 2
dtype: int64
b 1
c 2
d 3
dtype: int64
不连续索引索引====================================================================
a 0
c 2
e 4
dtype: int64
a 0
e 4
dtype: int64
布尔索引====================================================================
a False
b False
c False
d True
e True
dtype: bool
d 3
e 4
dtype: int64
d 3
e 4
dtype: int64

Dataframe 索引

df_obj1 = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
print (df_obj1.head())
          a         b         c         d
0 -0.720611 -0.002543 -0.084255 -0.583719
1 0.174715 0.288090 -0.254462 -1.669668
2 -1.872654 -0.466842 -0.481040 -1.735558
3 1.147668 1.205686 0.625582 -2.111863
4 1.177882 0.049581 -0.404098 -0.497125
'''0. 转换成行索引的话===方式一:transpose或者T'''
bb=df_obj1.transpose()
print(df_obj1.T)
print(bb)
'''0. 转换成行索引的话===方式二:重构'''
          0         1         2         3         4
a -0.720611 0.174715 -1.872654 1.147668 1.177882
b -0.002543 0.288090 -0.466842 1.205686 0.049581
c -0.084255 -0.254462 -0.481040 0.625582 -0.404098
d -0.583719 -1.669668 -1.735558 -2.111863 -0.497125
0 1 2 3 4
a -0.720611 0.174715 -1.872654 1.147668 1.177882
b -0.002543 0.288090 -0.466842 1.205686 0.049581
c -0.084255 -0.254462 -0.481040 0.625582 -0.404098
d -0.583719 -1.669668 -1.735558 -2.111863 -0.497125





'0. 转换成行索引的话===方式二:重构/重塑===还没有完全明白'
'''1. 列索引=====默认为列索引来操作'''
print ('列索引')
print(df_obj1.b)#当成属性来获取值
print('===================')
print (df_obj1['a']) # 返回Series类型
print (type(df_obj1['a']))
print (df_obj1[['a']]) # 返回DataFrame类型
print (type(df_obj1[['a']]))
'''2. 不连续索引'''
print ('不连续索引')
print (df_obj1[['a','c']])
print (df_obj1[['a','c']])
列索引
0 -0.002543
1 0.288090
2 -0.466842
3 1.205686
4 0.049581
Name: b, dtype: float64
===================
0 -0.720611
1 0.174715
2 -1.872654
3 1.147668
4 1.177882
Name: a, dtype: float64
<class 'pandas.core.series.Series'>
a
0 -0.720611
1 0.174715
2 -1.872654
3 1.147668
4 1.177882
<class 'pandas.core.frame.DataFrame'>
不连续索引
a c
0 -0.720611 -0.084255
1 0.174715 -0.254462
2 -1.872654 -0.481040
3 1.147668 0.625582
4 1.177882 -0.404098
a c
0 -0.720611 -0.084255
1 0.174715 -0.254462
2 -1.872654 -0.481040
3 1.147668 0.625582
4 1.177882 -0.404098

三种索引方式

print(ser_obj1)
print('============================')
print(df_obj1)
a    0
b 1
c 2
d 3
e 4
dtype: int64
============================
a b c d
0 -0.720611 -0.002543 -0.084255 -0.583719
1 0.174715 0.288090 -0.254462 -1.669668
2 -1.872654 -0.466842 -0.481040 -1.735558
3 1.147668 1.205686 0.625582 -2.111863
4 1.177882 0.049581 -0.404098 -0.497125
'''1. 标签索引 loc===用的是index和column的值索引【标签索引】'''
# Series
print (ser_obj1['b':'d'])
print (ser_obj1.loc['b':'d'])

# DataFrame
print (df_obj1[['a']])
print (df_obj1.loc[0:2,'a':'c'])
b    1
c 2
d 3
dtype: int64
b 1
c 2
d 3
dtype: int64
a
0 -0.720611
1 0.174715
2 -1.872654
3 1.147668
4 1.177882
a b c
0 -0.720611 -0.002543 -0.084255
1 0.174715 0.288090 -0.254462
2 -1.872654 -0.466842 -0.481040
'''2. 位置索引 iloc====用的是index和column的默认0,1,2,3···的值索引【位置索引】'''
print (ser_obj1[1:3])
print (ser_obj1.iloc[1:3])

# DataFrame
print (df_obj1.iloc[0:2, 0:2])
# 注意和df_obj.loc[0:2, 'a']的区别 = = = 行和列:都不包括尾巴
b    1
c 2
dtype: int64
b 1
c 2
dtype: int64
a b
0 -0.720611 -0.002543
1 0.174715 0.288090
'''3. 混合索引 ix===先按标签索引loc尝试操作,然后再按位置索引iloc尝试操作'''
print (ser_obj1.ix[1:3])
print (ser_obj1.ix['b':'c'])

# DataFrame
print (df_obj1.ix[0:2, 0:3])
b    1
c 2
dtype: int64
b 1
c 2
dtype: int64
a b c
0 -0.720611 -0.002543 -0.084255
1 0.174715 0.288090 -0.254462
2 -1.872654 -0.466842 -0.481040

二、Pandas 层级索引

构造层级索引

ser_obj = pd.Series(np.random.randn(12),
index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
#列表里面的列表
print (ser_obj)
a  0   -0.373441
1 0.615976
2 0.959092
b 0 1.743670
1 -0.791517
2 -0.774013
c 0 1.271094
1 -0.723264
2 0.253038
d 0 -0.767791
1 0.419253
2 -0.691644
dtype: float64
  • MultiIndex 索引对象
print (type(ser_obj.index))
print (ser_obj.index)
<class 'pandas.core.indexes.multi.MultiIndex'>
MultiIndex([('a', 0),
('a', 1),
('a', 2),
('b', 0),
('b', 1),
('b', 2),
('c', 0),
('c', 1),
('c', 2),
('d', 0),
('d', 1),
('d', 2)],
)

选取子集

# 外层选取
print (ser_obj['c'])
0    1.271094
1 -0.723264
2 0.253038
dtype: float64
# 内层选取
print (ser_obj[:, 2])
a    0.959092
b -0.774013
c 0.253038
d -0.691644
dtype: float64

交换分层顺序

df_obj2 = pd.DataFrame(np.random.randn(6,4), columns = ['a', 'b', 'c', 'd'],index=[['a', 'a', 'a', 'b', 'b','b'],[1, 2, 3, 1, 2,3]])
#print (df_obj1.swaplevel())
print (df_obj2)
            a         b         c         d
a 1 0.273776 -0.018659 1.512727 -0.088091
2 -0.410883 -1.488943 0.917268 1.179941
3 -0.237532 -0.823717 0.189495 1.060476
b 1 0.727872 0.323352 0.443786 0.780510
2 -1.407645 -0.059689 1.439843 -1.700740
3 -0.377628 -0.137348 -0.739980 0.122528

交换并排序分层

print (df_obj2.swaplevel().sortvalues(by='a'))
#Series没有这个内置函数sortlevel===dataframe有

三、透视表和交叉表

dataset_path = './data/starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek',
'TotalHours', 'APM'])
print(df_data.head())
   LeagueIndex   Age  HoursPerWeek  TotalHours       APM
0 5 27.0 10.0 3000.0 143.7180
1 5 23.0 10.0 5000.0 129.2322
2 4 30.0 10.0 200.0 69.9612
3 3 19.0 20.0 400.0 107.6016
4 3 32.0 10.0 500.0 122.8908

透视表

pd.pivot_table(df_data,
index=['Age'],
columns=['LeagueIndex'],
aggfunc=np.sum)
APM HoursPerWeek TotalHours
LeagueIndex 1 2 3 4 5 6 7 1 2 3 5 6 7 1 2 3 4 5 6 7
Age
16.0 1062.44754 2919.70434 4851.9222 5149.7310 7787.37780 9042.1722 386.7774 324.0 720.0 778.0 1220.0 1280.0 56.0 4307.0 13143.0 29211.0 23581.0 49233.0 51320.0 3000.0
17.0 655.67280 1661.01540 4181.8920 5525.3586 10052.72100 8310.0858 573.8286 184.0 378.0 664.0 1460.0 1116.0 104.0 2044.0 7423.0 16602.0 24005.0 53375.0 45421.0 12700.0
18.0 704.47680 3300.41040 4847.2152 8763.0783 10988.66100 9134.7240 618.5790 204.0 548.0 886.0 1644.0 1194.0 164.0 3570.0 11471.0 21037.0 46034.0 1056486.0 50378.0 3200.0
19.0 734.55600 2216.81880 5183.7888 8030.1960 9271.09260 11955.6030 696.7770 126.0 458.0 950.0 962.0 1642.0 168.0 2355.0 8467.0 31861.0 39705.0 44697.0 70331.0 4166.0
20.0 1624.89660 2147.23200 4211.5686 10596.2070 10871.65440 14291.8692 NaN 328.0 288.0 654.0 1290.0 1816.0 NaN 7212.0 6325.0 20174.0 55083.0 64170.0 105131.0 NaN
21.0 780.67950 1578.02880 3949.3062 8689.8804 11954.91660 13165.7649 867.3474 162.0 270.0 580.0 1446.0 1858.0 62.0 3377.0 7673.0 19095.0 42296.0 68739.0 82061.0 3180.0
22.0 674.59860 2147.50980 4379.3424 7818.7302 10473.28380 10165.8672 493.1586 146.0 372.0 680.0 1304.0 1448.0 112.0 4225.0 10861.0 23030.0 57996.0 84330.0 67069.0 6950.0
23.0 359.65980 1575.06120 4602.7416 7616.9298 8292.86160 6131.1936 1799.6520 46.0 320.0 598.0 998.0 726.0 296.0 896.0 12350.0 23081.0 40025.0 56097.0 43176.0 14290.0
24.0 439.43040 1717.55340 2876.8572 5503.7736 7292.32740 7240.4076 428.6538 116.0 344.0 406.0 668.0 1048.0 36.0 2070.0 9543.0 25421.0 35384.0 36147.0 43114.0 2250.0
25.0 572.61420 1178.02440 2201.6388 4710.9924 6168.19260 2200.6362 361.4550 124.0 166.0 268.0 682.0 256.0 52.0 2440.0 5846.0 11270.0 26610.0 40681.0 14890.0 3300.0
26.0 418.70874 1165.96680 1794.1890 3139.2852 4016.67060 3301.8498 408.2202 96.0 148.0 272.0 418.0 354.0 60.0 1608.0 3417.0 10548.0 16839.0 20100.0 17663.0 2300.0
27.0 359.17320 1164.15960 1426.4550 2850.1320 3498.30300 2040.8454 NaN 40.0 152.0 226.0 340.0 164.0 NaN 1100.0 3615.0 7525.0 15935.0 19770.0 11796.0 NaN
28.0 333.84240 479.34000 1152.5958 2205.8778 1992.60540 521.7666 NaN 28.0 90.0 220.0 186.0 44.0 NaN 466.0 1860.0 7901.0 15370.0 10872.0 2500.0 NaN
29.0 236.74020 695.88480 568.2594 1447.5906 1398.78540 715.9404 NaN 54.0 56.0 80.0 180.0 106.0 NaN 2490.0 2000.0 3816.0 8220.0 10292.0 5950.0 NaN
30.0 125.53740 441.14160 733.6416 743.4468 578.32020 123.3774 NaN 14.0 54.0 76.0 90.0 28.0 NaN 210.0 2440.0 4370.0 6310.0 3440.0 1500.0 NaN
31.0 41.58600 314.92980 659.2626 1166.7606 315.53460 200.1708 NaN 12.0 36.0 68.0 12.0 16.0 NaN 200.0 1300.0 3500.0 8710.0 1050.0 1500.0 NaN
32.0 179.14380 315.54180 457.5174 541.8996 66.19740 NaN NaN 40.0 56.0 68.0 10.0 NaN NaN 1600.0 860.0 2300.0 6040.0 800.0 NaN NaN
33.0 198.77880 153.34680 284.8218 116.7516 245.45166 NaN NaN 32.0 12.0 42.0 60.0 NaN NaN 1200.0 220.0 2065.0 1130.0 2050.0 NaN NaN
34.0 49.11360 276.88260 268.4100 340.7124 174.54540 NaN NaN 12.0 56.0 28.0 14.0 NaN NaN 150.0 3380.0 1150.0 2400.0 2764.0 NaN NaN
35.0 229.31280 54.04680 170.4930 634.7688 115.06440 NaN NaN 54.0 8.0 26.0 12.0 NaN NaN 1350.0 500.0 1950.0 3800.0 800.0 NaN NaN
36.0 NaN 150.13140 333.6474 73.6980 NaN NaN NaN NaN 16.0 40.0 NaN NaN NaN NaN 500.0 1950.0 400.0 NaN NaN NaN
37.0 22.05960 49.89600 44.9682 NaN 451.13160 NaN NaN 12.0 30.0 6.0 32.0 NaN NaN 300.0 125.0 600.0 NaN 1800.0 NaN NaN
38.0 71.59500 NaN 334.6878 NaN NaN NaN NaN 16.0 NaN 46.0 NaN NaN NaN 300.0 NaN 2280.0 NaN NaN NaN NaN
39.0 29.87640 NaN 53.7690 86.7246 NaN NaN NaN 10.0 NaN 12.0 NaN NaN NaN 500.0 NaN 450.0 500.0 NaN NaN NaN
40.0 38.55900 51.83580 107.4684 NaN NaN NaN NaN 12.0 14.0 26.0 NaN NaN NaN 150.0 500.0 1080.0 NaN NaN NaN NaN
41.0 NaN 108.45720 77.6472 NaN NaN NaN NaN NaN 12.0 14.0 NaN NaN NaN NaN 450.0 800.0 NaN NaN NaN NaN
43.0 NaN 86.05860 NaN NaN NaN NaN NaN NaN 10.0 NaN NaN NaN NaN NaN 730.0 NaN NaN NaN NaN NaN
44.0 NaN NaN NaN 89.5266 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 500.0 NaN NaN NaN

28 rows × 21 columns

pd.pivot_table(df_data,
index=['Age'],
columns=['LeagueIndex'],
aggfunc=np.sum,
fill_value=-100.)
APM HoursPerWeek TotalHours
LeagueIndex 1 2 3 4 5 6 7 1 2 3 5 6 7 1 2 3 4 5 6 7
Age
16.0 1062.44754 2919.70434 4851.9222 5149.7310 7787.37780 9042.1722 386.7774 324 720 778 1220 1280 56 4307 13143 29211 23581 49233 51320 3000
17.0 655.67280 1661.01540 4181.8920 5525.3586 10052.72100 8310.0858 573.8286 184 378 664 1460 1116 104 2044 7423 16602 24005 53375 45421 12700
18.0 704.47680 3300.41040 4847.2152 8763.0783 10988.66100 9134.7240 618.5790 204 548 886 1644 1194 164 3570 11471 21037 46034 1056486 50378 3200
19.0 734.55600 2216.81880 5183.7888 8030.1960 9271.09260 11955.6030 696.7770 126 458 950 962 1642 168 2355 8467 31861 39705 44697 70331 4166
20.0 1624.89660 2147.23200 4211.5686 10596.2070 10871.65440 14291.8692 -100.0000 328 288 654 1290 1816 -100 7212 6325 20174 55083 64170 105131 -100
21.0 780.67950 1578.02880 3949.3062 8689.8804 11954.91660 13165.7649 867.3474 162 270 580 1446 1858 62 3377 7673 19095 42296 68739 82061 3180
22.0 674.59860 2147.50980 4379.3424 7818.7302 10473.28380 10165.8672 493.1586 146 372 680 1304 1448 112 4225 10861 23030 57996 84330 67069 6950
23.0 359.65980 1575.06120 4602.7416 7616.9298 8292.86160 6131.1936 1799.6520 46 320 598 998 726 296 896 12350 23081 40025 56097 43176 14290
24.0 439.43040 1717.55340 2876.8572 5503.7736 7292.32740 7240.4076 428.6538 116 344 406 668 1048 36 2070 9543 25421 35384 36147 43114 2250
25.0 572.61420 1178.02440 2201.6388 4710.9924 6168.19260 2200.6362 361.4550 124 166 268 682 256 52 2440 5846 11270 26610 40681 14890 3300
26.0 418.70874 1165.96680 1794.1890 3139.2852 4016.67060 3301.8498 408.2202 96 148 272 418 354 60 1608 3417 10548 16839 20100 17663 2300
27.0 359.17320 1164.15960 1426.4550 2850.1320 3498.30300 2040.8454 -100.0000 40 152 226 340 164 -100 1100 3615 7525 15935 19770 11796 -100
28.0 333.84240 479.34000 1152.5958 2205.8778 1992.60540 521.7666 -100.0000 28 90 220 186 44 -100 466 1860 7901 15370 10872 2500 -100
29.0 236.74020 695.88480 568.2594 1447.5906 1398.78540 715.9404 -100.0000 54 56 80 180 106 -100 2490 2000 3816 8220 10292 5950 -100
30.0 125.53740 441.14160 733.6416 743.4468 578.32020 123.3774 -100.0000 14 54 76 90 28 -100 210 2440 4370 6310 3440 1500 -100
31.0 41.58600 314.92980 659.2626 1166.7606 315.53460 200.1708 -100.0000 12 36 68 12 16 -100 200 1300 3500 8710 1050 1500 -100
32.0 179.14380 315.54180 457.5174 541.8996 66.19740 -100.0000 -100.0000 40 56 68 10 -100 -100 1600 860 2300 6040 800 -100 -100
33.0 198.77880 153.34680 284.8218 116.7516 245.45166 -100.0000 -100.0000 32 12 42 60 -100 -100 1200 220 2065 1130 2050 -100 -100
34.0 49.11360 276.88260 268.4100 340.7124 174.54540 -100.0000 -100.0000 12 56 28 14 -100 -100 150 3380 1150 2400 2764 -100 -100
35.0 229.31280 54.04680 170.4930 634.7688 115.06440 -100.0000 -100.0000 54 8 26 12 -100 -100 1350 500 1950 3800 800 -100 -100
36.0 -100.00000 150.13140 333.6474 73.6980 -100.00000 -100.0000 -100.0000 -100 16 40 -100 -100 -100 -100 500 1950 400 -100 -100 -100
37.0 22.05960 49.89600 44.9682 -100.0000 451.13160 -100.0000 -100.0000 12 30 6 32 -100 -100 300 125 600 -100 1800 -100 -100
38.0 71.59500 -100.00000 334.6878 -100.0000 -100.00000 -100.0000 -100.0000 16 -100 46 -100 -100 -100 300 -100 2280 -100 -100 -100 -100
39.0 29.87640 -100.00000 53.7690 86.7246 -100.00000 -100.0000 -100.0000 10 -100 12 -100 -100 -100 500 -100 450 500 -100 -100 -100
40.0 38.55900 51.83580 107.4684 -100.0000 -100.00000 -100.0000 -100.0000 12 14 26 -100 -100 -100 150 500 1080 -100 -100 -100 -100
41.0 -100.00000 108.45720 77.6472 -100.0000 -100.00000 -100.0000 -100.0000 -100 12 14 -100 -100 -100 -100 450 800 -100 -100 -100 -100
43.0 -100.00000 86.05860 -100.0000 -100.0000 -100.00000 -100.0000 -100.0000 -100 10 -100 -100 -100 -100 -100 730 -100 -100 -100 -100 -100
44.0 -100.00000 -100.00000 -100.0000 89.5266 -100.00000 -100.0000 -100.0000 -100 -100 -100 -100 -100 -100 -100 -100 -100 500 -100 -100 -100

28 rows × 21 columns

交叉表

pd.crosstab(df_data['LeagueIndex'],
df_data['Age'])
Age 16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 24.0 25.0 34.0 35.0 36.0 37.0 38.0 39.0 40.0 41.0 43.0 44.0
LeagueIndex
1 18 9 13 12 22 12 12 6 9 11 1 5 0 1 1 1 1 0 0 0
2 38 22 38 32 25 23 28 24 24 18 5 1 2 1 0 0 1 2 1 0
3 48 43 56 53 47 44 47 47 32 26 3 3 5 1 4 1 2 1 0 0
4 45 49 78 71 97 81 75 72 52 45 4 7 1 0 0 1 0 0 0 1
5 55 71 81 68 80 96 81 59 58 51 2 1 0 2 0 0 0 0 0 0
6 50 51 56 73 86 83 68 42 48 15 0 0 0 0 0 0 0 0 0 0
7 2 3 3 4 0 5 3 9 2 2 0 0 0 0 0 0 0 0 0 0

7 rows × 28 columns

pd.crosstab(df_data['LeagueIndex'],
[df_data['Age'], df_data['HoursPerWeek']])
Age 16.0 39.0 40.0 41.0 43.0 44.0
HoursPerWeek 0.0 2.0 4.0 6.0 8.0 10.0 12.0 14.0 16.0 20.0 12.0 10.0 12.0 14.0 16.0 4.0 8.0 14.0 10.0 6.0
LeagueIndex
1 0 0 0 1 3 1 1 2 3 3 0 0 1 0 0 0 0 0 0 0
2 0 0 2 1 0 9 4 4 3 3 0 0 0 1 0 1 1 0 1 0
3 0 0 1 6 7 6 6 7 2 1 1 1 0 0 1 0 0 1 0 0
4 0 2 5 1 6 4 3 6 3 1 0 0 0 0 0 0 0 0 0 1
5 0 1 2 1 3 8 3 6 3 6 0 0 0 0 0 0 0 0 0 0
6 1 1 4 2 1 1 4 3 1 7 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

7 rows × 325 columns