尼采般地抒情

尼采般地抒情

尼采般地抒情

音乐盒

站点信息

文章总数目: 313
已运行时间: 1541

规整数据:连接、合并、重构、转换etc

三、数据规整-连接-含索引


  • 数据连接 merge


import pandas as pd
import numpy as np


df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : ['sfd','fdsf','we',24,3253,234,23]})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

print (df_obj1)
print (df_obj2)


  key data1
0   b   sfd
1   b  fdsf
2   a    we
3   c    24
4   a  3253
5   a   234
6   b    23
  key  data2
0   a      0
1   b      6
2   d      6


merge和on


默认将重叠列的列名作为“外键”进行连接


pd.merge(df_obj1, df_obj2)


key

data1

data2

0

b

sfd

6

1

b

fdsf

6

2

b

23

6

3

a

we

0

4

a

3253

0

5

a

234

0


# on显示指定“外键”
pd.merge(df_obj1, df_obj2, on='key')


key

data1

data2

0

b

sfd

6

1

b

fdsf

6

2

b

23

6

3

a

we

0

4

a

3253

0

5

a

234

0


left-on和right-on


# left_on,right_on分别指定左侧数据和右侧数据的“外键”

更改列名

df_obj1 = df_obj1.rename(columns={‘key’:’key1’})
df_obj2 = df_obj2.rename(columns={‘key’:’key2’})


print(df_obj1)
print(df_obj2)


  key1 data1
0    b   sfd
1    b  fdsf
2    a    we
3    c    24
4    a  3253
5    a   234
6    b    23
  key2  data2
0    a      0
1    b      6
2    d      6


pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')


key1

data1

key2

data2

0

b

sfd

b

6

1

b

fdsf

b

6

2

b

23

b

6

3

a

we

a

0

4

a

3253

a

0

5

a

234

a

0


how


# “外连接”
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')


key1

data1

key2

data2

0

b

sfd

b

6.0

1

b

fdsf

b

6.0

2

b

23

b

6.0

3

a

we

a

0.0

4

a

3253

a

0.0

5

a

234

a

0.0

6

c

24

NaN

NaN

7

NaN

NaN

d

6.0


# 左连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')


key1

data1

key2

data2

0

b

sfd

b

6.0

1

b

fdsf

b

6.0

2

a

we

a

0.0

3

c

24

NaN

NaN

4

a

3253

a

0.0

5

a

234

a

0.0

6

b

23

b

6.0


# 右连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')


key1

data1

key2

data2

0

b

sfd

b

6

1

b

fdsf

b

6

2

b

23

b

6

3

a

we

a

0

4

a

3253

a

0

5

a

234

a

0

6

NaN

NaN

d

6


处理重复列名suffixes


# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

pd.merge(df_obj1, df_obj2, on=’key’, suffixes=(‘_left’, ‘_right’))


key

data_left

data_right

0

b

9

1

1

b

1

1

2

b

6

1

3

a

7

1

4

a

3

1

5

a

4

1


# 按索引连接
df_obj3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])


print(df_obj3)
print(df_obj4)


  key  data1
0   b      7
1   b      4
2   a      1
3   c      9
4   a      2
5   a      9
6   b      7
   data2
a      9
b      4
d      0


pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)


key

data1

data2

0

b

7

4

1

b

4

4

6

b

7

4

2

a

1

9

4

a

2

9

5

a

9

9


按索引连接right_index


# 按索引连接
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])


print(df_obj1)
print(df_obj2)


  key  data1
0   b      0
1   b      2
2   a      7
3   c      3
4   a      1
5   a      1
6   b      6
   data2
a      2
b      1
d      1


pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)


key

data1

data2

0

b

0

1

1

b

2

1

6

b

6

1

2

a

7

2

4

a

1

2

5

a

1

2


四、数据合并


  • 数据合并 concat
  • 按索引连接===right_index


import numpy as np
import pandas as pd


numpy的concat


arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))

print (arr1)
print (arr2)


[[6 0 3 2]
 [5 7 9 8]
 [5 8 0 3]]
[[6 5 7 9]
 [0 1 0 0]
 [1 1 1 7]]


np.concatenate([arr1, arr2])


array([[6, 0, 3, 2],
       [5, 7, 9, 8],
       [5, 8, 0, 3],
       [6, 5, 7, 9],
       [0, 1, 0, 0],
       [1, 1, 1, 7]])


np.concatenate([arr1, arr2], axis=1)


array([[6, 0, 3, 2, 6, 5, 7, 9],
       [5, 7, 9, 8, 0, 1, 0, 0],
       [5, 8, 0, 3, 1, 1, 1, 7]])


series上的concat


# index 没有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))


pd.concat([ser_obj1, ser_obj2, ser_obj3])


0     0
1     4
2     5
3     1
4     9
5     7
6     8
7     5
8     0
9     5
10    9
11    0
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)


0

1

2

0

0.0

NaN

NaN

1

4.0

NaN

NaN

2

5.0

NaN

NaN

3

1.0

NaN

NaN

4

9.0

NaN

NaN

5

NaN

7.0

NaN

6

NaN

8.0

NaN

7

NaN

5.0

NaN

8

NaN

0.0

NaN

9

NaN

NaN

5.0

10

NaN

NaN

9.0

11

NaN

NaN

0.0


# index 有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))

print (ser_obj1)
print (ser_obj2)
print (ser_obj3)


0    5
1    3
2    0
3    8
4    3
dtype: int32
0    5
1    3
2    2
3    1
dtype: int32
0    5
1    8
2    6
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3])


0    5
1    3
2    0
3    8
4    3
0    5
1    3
2    2
3    1
0    5
1    8
2    6
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')


0

1

2

0

5

5

5

1

3

3

8

2

0

2

6


dataframe上的concat


df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
                       columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
                       columns=['C', 'D'])
print (df_obj1)
print (df_obj2)


   A  B
a  4  3
b  8  1
c  6  3
   C  D
a  1  3
b  8  2


pd.concat([df_obj1, df_obj2])


C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass ‘sort=False’.

To retain the current behavior and silence the warning, pass ‘sort=True’.

"""Entry point for launching an IPython kernel.


A

B

C

D

a

4.0

3.0

NaN

NaN

b

8.0

1.0

NaN

NaN

c

6.0

3.0

NaN

NaN

a

NaN

NaN

1.0

3.0

b

NaN

NaN

8.0

2.0


pd.concat([df_obj1, df_obj2], axis=1)


C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass ‘sort=False’.

To retain the current behavior and silence the warning, pass ‘sort=True’.

"""Entry point for launching an IPython kernel.


A

B

C

D

a

4

3

1.0

3.0

b

8

1

8.0

2.0

c

6

3

NaN

NaN


五、数据重构


import numpy as np
import pandas as pd


stack


df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
df_obj


data1

data2

0

0

4

1

6

2

2

9

8

3

7

0

4

3

1


stacked = df_obj.stack()
print (stacked)


0  data1    0
   data2    4
1  data1    6
   data2    2
2  data1    9
   data2    8
3  data1    7
   data2    0
4  data1    3
   data2    1
dtype: int32


print (type(stacked))
print (type(stacked.index))


<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.multi.MultiIndex'>


unstack


# 默认操作内层索引
stacked.unstack()


data1

data2

0

0

4

1

6

2

2

9

8

3

7

0

4

3

1


# 通过level指定操作索引的级别
stacked.unstack(level=0)


0

1

2

3

4

data1

0

6

9

7

3

data2

4

2

8

0

1


六、数据转换


import numpy as np
import pandas as pd


重复数据duplicates函数


df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
                       'data2' : np.random.randint(0, 4, 8)})
df_obj


data1

data2

0

a

3

1

a

2

2

a

2

3

a

1

4

b

0

5

b

2

6

b

2

7

b

1


df_obj.duplicated()


0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool


df_obj.drop_duplicates()


data1

data2

0

a

3

1

a

2

3

a

1

4

b

0

5

b

2

7

b

1


df_obj.drop_duplicates('data2')


data1

data2

0

a

3

1

a

2

3

a

1

4

b

0


map函数


ser_obj = pd.Series(np.random.randint(0,10,10))
ser_obj


0    1
1    9
2    1
3    2
4    7
5    2
6    4
7    5
8    4
9    6
dtype: int32


ser_obj.map(lambda x : x ** 2)


0     1
1    81
2     1
3     4
4    49
5     4
6    16
7    25
8    16
9    36
dtype: int64


数据替换repalce


# 替换单个值
ser_obj.replace(0, -100)


0    1
1    9
2    1
3    2
4    7
5    2
6    4
7    5
8    4
9    6
dtype: int32


# 替换多个值
ser_obj.replace([0, 2], -100)


0      1
1      9
2      1
3   -100
4      7
5   -100
6      4
7      5
8      4
9      6
dtype: int32


# 替换多个值
ser_obj.replace([0, 2], [-100, -200])


0      1
1      9
2      1
3   -200
4      7
5   -200
6      4
7      5
8      4
9      6
dtype: int64


ser_obj.map(lambda x : x ** 2)

3. 数据替换repalce

替换单个值

ser_obj.replace(0, -100)

替换多个值

ser_obj.replace([0, 2], -100)

替换多个值

ser_obj.replace([0, 2], [-100, -200])


0      1
1      9
2      1
3   -200
4      7
5   -200
6      4
7      5
8      4
9      6
dtype: int64

评论区