尼采般地抒情

尼采般地抒情

尼采般地抒情

公告栏

此网站主题为本人手写主题, 主题待开源···

音乐盒

站点信息

文章总数目: 307
已运行时间: 1101
目录
  1. 三、数据规整-连接-含索引
    1. merge 和 on
    2. left-on 和 right-on
    3. how
    4. 处理重复列名 suffixes
    5. 按索引连接 right_index
  2. 四、数据合并
    1. numpy 的 concat
    2. series 上的 concat
    3. dataframe 上的 concat
  3. 五、数据重构
    1. stack
    2. unstack
  4. 六、数据转换
    1. 重复数据 duplicates 函数
    2. map 函数
    3. 数据替换 repalce
尼采般地抒情

尼采般地抒情

尼采般地抒情

公告栏

此网站主题为本人手写主题, 主题待开源···

站点信息

文章总数目: 307
已运行时间: 1101

规整数据:连接、合并、重构、转换etc

三、数据规整-连接-含索引


  • 数据连接 merge


import pandas as pd

import numpy as np


df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : ['sfd','fdsf','we',24,3253,234,23]})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2' : np.random.randint(0,10,3)})

print (df_obj1)
print (df_obj2)


 key data1
0 b sfd
1 b fdsf
2 a we
3 c 24
4 a 3253
5 a 234
6 b 23
key data2
0 a 0
1 b 6
2 d 6


merge 和 on


默认将重叠列的列名作为“外键”进行连接


pd.merge(df_obj1, df_obj2)


key

data1

data2

0

b

sfd

6

1

b

fdsf

6

2

b

23

6

3

a

we

0

4

a

3253

0

5

a

234

0


# on 显示指定“外键”
pd.merge(df_obj1, df_obj2, on='key')


key

data1

data2

0

b

sfd

6

1

b

fdsf

6

2

b

23

6

3

a

we

0

4

a

3253

0

5

a

234

0


left-on 和 right-on


# left_on,right_on 分别指定左侧数据和右侧数据的“外键”

# 更改列名

df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})


print(df_obj1)
print(df_obj2)


 key1 data1
0 b sfd
1 b fdsf
2 a we
3 c 24
4 a 3253
5 a 234
6 b 23
key2 data2
0 a 0
1 b 6
2 d 6


pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')


key1

data1

key2

data2

0

b

sfd

b

6

1

b

fdsf

b

6

2

b

23

b

6

3

a

we

a

0

4

a

3253

a

0

5

a

234

a

0


how


# “外连接”
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')


key1

data1

key2

data2

0

b

sfd

b

6.0

1

b

fdsf

b

6.0

2

b

23

b

6.0

3

a

we

a

0.0

4

a

3253

a

0.0

5

a

234

a

0.0

6

c

24

NaN

NaN

7

NaN

NaN

d

6.0


# 左连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')


key1

data1

key2

data2

0

b

sfd

b

6.0

1

b

fdsf

b

6.0

2

a

we

a

0.0

3

c

24

NaN

NaN

4

a

3253

a

0.0

5

a

234

a

0.0

6

b

23

b

6.0


# 右连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')


key1

data1

key2

data2

0

b

sfd

b

6

1

b

fdsf

b

6

2

b

23

b

6

3

a

we

a

0

4

a

3253

a

0

5

a

234

a

0

6

NaN

NaN

d

6


处理重复列名 suffixes


# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data' : np.random.randint(0,10,3)})

pd.merge(df_obj1, df_obj2, on='key', suffixes=('\_left', '\_right'))


key

data_left

data_right

0

b

9

1

1

b

1

1

2

b

6

1

3

a

7

1

4

a

3

1

5

a

4

1


# 按索引连接
df_obj3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])


print(df_obj3)
print(df_obj4)


 key data1
0 b 7
1 b 4
2 a 1
3 c 9
4 a 2
5 a 9
6 b 7
data2
a 9
b 4
d 0


pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)


key

data1

data2

0

b

7

4

1

b

4

4

6

b

7

4

2

a

1

9

4

a

2

9

5

a

9

9


按索引连接 right_index


# 按索引连接
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])


print(df_obj1)
print(df_obj2)


 key data1
0 b 0
1 b 2
2 a 7
3 c 3
4 a 1
5 a 1
6 b 6
data2
a 2
b 1
d 1


pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)


key

data1

data2

0

b

0

1

1

b

2

1

6

b

6

1

2

a

7

2

4

a

1

2

5

a

1

2


四、数据合并


  • 数据合并 concat
  • 按索引连接===right_index


import numpy as np
import pandas as pd


numpy 的 concat


arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))

print (arr1)
print (arr2)


[[6 0 3 2]
 [5 7 9 8]
 [5 8 0 3]]
[[6 5 7 9]
 [0 1 0 0]
 [1 1 1 7]]


np.concatenate([arr1, arr2])


array([[6, 0, 3, 2],
       [5, 7, 9, 8],
       [5, 8, 0, 3],
       [6, 5, 7, 9],
       [0, 1, 0, 0],
       [1, 1, 1, 7]])


np.concatenate([arr1, arr2], axis=1)


array([[6, 0, 3, 2, 6, 5, 7, 9],
       [5, 7, 9, 8, 0, 1, 0, 0],
       [5, 8, 0, 3, 1, 1, 1, 7]])


series 上的 concat


# index 没有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))


pd.concat([ser_obj1, ser_obj2, ser_obj3])


0 0
1 4
2 5
3 1
4 9
5 7
6 8
7 5
8 0
9 5
10 9
11 0
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)


0

1

2

0

0.0

NaN

NaN

1

4.0

NaN

NaN

2

5.0

NaN

NaN

3

1.0

NaN

NaN

4

9.0

NaN

NaN

5

NaN

7.0

NaN

6

NaN

8.0

NaN

7

NaN

5.0

NaN

8

NaN

0.0

NaN

9

NaN

NaN

5.0

10

NaN

NaN

9.0

11

NaN

NaN

0.0


# index 有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))

print (ser_obj1)
print (ser_obj2)
print (ser_obj3)


0 5
1 3
2 0
3 8
4 3
dtype: int32
0 5
1 3
2 2
3 1
dtype: int32
0 5
1 8
2 6
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3])


0 5
1 3
2 0
3 8
4 3
0 5
1 3
2 2
3 1
0 5
1 8
2 6
dtype: int32


pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')


0

1

2

0

5

5

5

1

3

3

8

2

0

2

6


dataframe 上的 concat


df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
columns=['C', 'D'])
print (df_obj1)
print (df_obj2)


 A B
a 4 3
b 8 1
c 6 3
C D
a 1 3
b 8 2


pd.concat([df_obj1, df_obj2])


C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

"""Entry point for launching an IPython kernel.


A

B

C

D

a

4.0

3.0

NaN

NaN

b

8.0

1.0

NaN

NaN

c

6.0

3.0

NaN

NaN

a

NaN

NaN

1.0

3.0

b

NaN

NaN

8.0

2.0


pd.concat([df_obj1, df_obj2], axis=1)


C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

"""Entry point for launching an IPython kernel.


A

B

C

D

a

4

3

1.0

3.0

b

8

1

8.0

2.0

c

6

3

NaN

NaN


五、数据重构


import numpy as np
import pandas as pd


stack


df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
df_obj


data1

data2

0

0

4

1

6

2

2

9

8

3

7

0

4

3

1


stacked = df_obj.stack()
print (stacked)


0 data1 0
data2 4
1 data1 6
data2 2
2 data1 9
data2 8
3 data1 7
data2 0
4 data1 3
data2 1
dtype: int32


print (type(stacked))
print (type(stacked.index))


<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.multi.MultiIndex'>


unstack


# 默认操作内层索引
stacked.unstack()


data1

data2

0

0

4

1

6

2

2

9

8

3

7

0

4

3

1


# 通过 level 指定操作索引的级别
stacked.unstack(level=0)


0

1

2

3

4

data1

0

6

9

7

3

data2

4

2

8

0

1


六、数据转换


import numpy as np
import pandas as pd


重复数据 duplicates 函数


df_obj = pd.DataFrame({'data1' : ['a'] _ 4 + ['b'] _ 4,
'data2' : np.random.randint(0, 4, 8)})
df_obj


data1

data2

0

a

3

1

a

2

2

a

2

3

a

1

4

b

0

5

b

2

6

b

2

7

b

1


df_obj.duplicated()


0 False
1 False
2 True
3 False
4 False
5 False
6 True
7 False
dtype: bool


df_obj.drop_duplicates()


data1

data2

0

a

3

1

a

2

3

a

1

4

b

0

5

b

2

7

b

1


df_obj.drop_duplicates('data2')


data1

data2

0

a

3

1

a

2

3

a

1

4

b

0


map 函数


ser_obj = pd.Series(np.random.randint(0,10,10))
ser_obj


0 1
1 9
2 1
3 2
4 7
5 2
6 4
7 5
8 4
9 6
dtype: int32


ser_obj.map(lambda x : x ** 2)


0 1
1 81
2 1
3 4
4 49
5 4
6 16
7 25
8 16
9 36
dtype: int64


数据替换 repalce


# 替换单个值
ser_obj.replace(0, -100)


0 1
1 9
2 1
3 2
4 7
5 2
6 4
7 5
8 4
9 6
dtype: int32


# 替换多个值
ser_obj.replace([0, 2], -100)


0 1
1 9
2 1
3 -100
4 7
5 -100
6 4
7 5
8 4
9 6
dtype: int32


# 替换多个值
ser_obj.replace([0, 2], [-100, -200])


0 1
1 9
2 1
3 -200
4 7
5 -200
6 4
7 5
8 4
9 6
dtype: int64


ser_obj.map(lambda x : x ** 2)

#### 3. 数据替换 repalce

# 替换单个值

ser_obj.replace(0, -100)

# 替换多个值

ser_obj.replace([0, 2], -100)

# 替换多个值

ser_obj.replace([0, 2], [-100, -200])


0 1
1 9
2 1
3 -200
4 7
5 -200
6 4
7 5
8 4
9 6
dtype: int64

评论区

Twikoo giscus