pandas在处理一维度数据和二维数据很是在行,在实际生产环境中应用十分广泛。我们直奔主题,本文主要讲解它的两个核心数据结构:Series 和
DataFrame。

一,Series (一维,带有标签的数组)

ta是个定长的字典序列。说是定长是因为在存储的时候,相当于两个
ndarray,一个数组构成对象的键(index),另一个构成对象的值(values)这也是和字典结构最大的不同。因为在字典的结构里,元素的个数是不固定的。

ta有两个基本属性:index 和 values。在 Series 结构中,index 默认是 0,1,2,……递增的整数序列。

1,创建
from pandas import Series, DataFrame x1 = Series([1, 2, 3, 4]) x2 =
Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd']) d = {'a': 1, 'b': 2, 'c':
3, 'd': 4} x3 = Series(d) print x3 ''' 如下是x1的输出结果: 0 1 1 2 2 3 3 4 dtype: int64
''' print x1 ''' 如下是x2的输出结果: a 1 b 2 c 3 d 4 dtype: int64 ''' print x2 '''
如下是x3的输出结果: a 1 b 2 c 3 d 4 dtype: int64 ''' print x3

2,切片和索引
t = pd.Series(np.arange(10), index=list(string.ascii_uppercase[:10])) '''
输出结果如下: A 0 B 1 C 2 D 3 E 4 F 5 G 6 H 7 I 8 J 9 dtype: int64 ''' print t #
1,切片,传入的参数此次为: start,end,step se1 = t[2:10:2] ''' C 2 E 4 G 6 I 8 dtype: int64
''' print se1 # 2,索引,获取序号为1的值,如下的获取方式和这个:se2=t['B'] 是等价的 se2 = t[1] ''' 1 '''
print se2 # 3,索引,获取多个序号对应的值的时候,和这个se3 = t[['C', 'D', 'G']] 是等价的 se3 = t[[2, 3,
6]] ''' C 2 D 3 G 6 dtype: int64 ''' print se3 # 4,索引,获取序号(就是默认的索引)大于5的数据 se4 =
t[t > 5] ''' G 6 H 7 I 8 J 9 dtype: int64 ''' print se4
二,DataFrame(二维Series容器)

DataFrame 类型数据结构类似二维表。它包括了行索引和列索引,我们可以将 DataFrame 看成是由相同索引的 Series 组成的字典类型。

*
行索引:表明不同行,横向索引,叫做index,0轴,axis=0

*
列索引:表明不同列,纵向索引,叫columns,1轴,axis=1

1,创建
from pandas import Series, DataFrame data = {'Chinese': [66, 95, 93, 90, 80],
'English': [65, 85, 92, 88, 90], 'Math': [30, 98, 96, 77, 90]} df1 =
DataFrame(data) df2 = DataFrame(data, index=['ZhangSan', 'LiSi', 'WangWu',
'XiaoMing', 'LiHua'], columns=['English', 'Math', 'Chinese']) ''' 如下是df1的返回结果:
Chinese English Math 0 66 65 30 1 95 85 98 2 93 92 96 3 90 88 77 4 80 90 90 '''
print df1 print('------------------') ''' 如下是df2的返回结果: English Math Chinese
ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96 93 XiaoMing 88 77 90 LiHua 90 90
80 ''' print df2

2,DataFrame的基本属性

*
df.shape # 行数,列数

*
df.dtypes # 列数据类型

*
df.ndim # 数据维度

*
df.index # 行索引

*
df.columns # 列索引

*
df.values # 对象值,二维ndarray数组

*
df.head(3) # 显示前几行,默认是5行

*
df.tail(3) # 显示末尾几行,默认是5行

*
df.info() # 相关信息概览

*
df.describe() # 快速综合统计结果

三,数据处理

1,数据导入和输出

Pandas 允许直接从 json,txt,xlsx,csv 等文件中导入数据,也可以输出到  json,txt,xlsx,csv 等文件,非常方便。
import pandas as pd from pandas import Series, DataFrame # 读写 xlsx格式的文件 score
= DataFrame(pd.read_excel('data.xlsx')) score.to_excel('data1.xlsx') # 读写 json
格式的文件 data=DataFrame(pd.read_json('data.json'))
data.to_json('another_data.json') # 读写csv格式的文件
data2=DataFrame(pd.read_csv('data.csv')) data2.to_csv('another_data.csv') #
读写txt 格式的文件 data3=DataFrame(pd.read_hdf('./data.txt'))
data3.to_hdf('./another_data.txt')

2,删除 DataFrame 中的不必要的列或行

  2.1,删除某一列
data = {'Chinese': [66, 95, 93, 90, 80], 'English': [65, 85, 92, 88, 90],
'Math': [30, 98, 96, 77, 90]} df2 = DataFrame(data=data, index=['ZhangSan',
'LiSi', 'WangWu', 'XiaoMing', 'LiHua'], columns=['English', 'Math', 'Chinese'])
''' df2的内容如下: English Math Chinese ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96
93 XiaoMing 88 77 90 LiHua 90 90 80 ''' # 删除'Chinese'这一列 df2 =
df2.drop(columns=['Chinese']) ''' 运行结果如下; English Math ZhangSan 65 30 LiSi 85
98 WangWu 92 96 XiaoMing 88 77 LiHua 90 90 ''' print df2
 

2.2,删除某一行
data = {'Chinese': [66, 95, 93, 90, 80], 'English': [65, 85, 92, 88, 90],
'Math': [30, 98, 96, 77, 90]} df2 = DataFrame(data=data, index=['ZhangSan',
'LiSi', 'WangWu', 'XiaoMing', 'LiHua'], columns=['English', 'Math', 'Chinese'])
''' df2的内容如下: English Math Chinese ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96
93 XiaoMing 88 77 90 LiHua 90 90 80 ''' #删除 'ZhangSan' 这一行
df2=df2.drop(index=['ZhangSan']) ''' 运行结果如下; English Math ZhangSan 65 30 LiSi
85 98 WangWu 92 96 XiaoMing 88 77 LiHua 90 90 ''' print df2

3,重命名列名 columns或者 行索引index,让列表名或者行名更容易识别
data = {'Chinese': [66, 95, 93, 90, 80], 'English': [65, 85, 92, 88, 90],
'Math': [30, 98, 96, 77, 90]} df2 = DataFrame(data=data, index=['ZhangSan',
'LiSi', 'WangWu', 'XiaoMing', 'LiHua'], columns=['English', 'Math', 'Chinese'])
''' df2的内容如下: English Math Chinese ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96
93 XiaoMing 88 77 90 LiHua 90 90 80 ''' df2.rename(columns={'Chinese': '语文',
'English': '英语','Math':'数学'}, inplace=True) df2.rename(index={'ZhangSan':
'张三'},inplace=True) ''' 运行结果如下; 英语 数学 语文 张三 65 30 66 LiSi 85 98 95 WangWu 92 96
93 XiaoMing 88 77 90 LiHua 90 90 80 ''' print df2

4, 去重复的值
data = {'Chinese': [66, 95, 93, 90, 80,80], 'English': [65, 85, 92, 88,
90,90], 'Math': [30, 98, 96, 77, 90,90]} df2 = DataFrame(data=data,
index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing', 'LiHua','LiHua'],
columns=['English', 'Math', 'Chinese']) ''' df2的内容如下: English Math Chinese
ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96 93 XiaoMing 88 77 90 LiHua 90 90
80 LiHua 90 90 80 ''' # 去除重复行 df2 = df2.drop_duplicates() ''' 运行结果如下; English
Math Chinese ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96 93 XiaoMing 88 77 90
LiHua 90 90 80 ''' print df2

5,更改数据格式

可以使用astype 函数来规范数据格式,即转换数据类型
data = {'Chinese': [66, 95, 93, 90, 80,80], 'English': [65, 85, 92, 88,
90,90], 'Math': [30, 98, 96, 77, 90,90]} df2 = DataFrame(data=data,
index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing', 'LiHua','LiHua'],
columns=['English', 'Math', 'Chinese']) ''' df2的内容如下: English Math Chinese
ZhangSan 65 30 66 LiSi 85 98 95 WangWu 92 96 93 XiaoMing 88 77 90 LiHua 90 90
80 LiHua 90 90 80 ''' df2[['Math']].astype('str')
df2[['Math']].astype(np.float) ''' 运行结果如下; English Math Chinese ZhangSan 65 30
66 LiSi 85 98 95 WangWu 92 96 93 XiaoMing 88 77 90 LiHua 90 90 80 ''' print df2

6,数据间的空格
data = {'Chinese': [66, 95, 93, 90, 80,80], 'English': [65, 85, 92, 88,
90,90], 'Math': [30, 98, 96, 77,
90,90],'address':['Beijing','Chendu','Xian','Shanghai','Beijing','Beijing']}
df2 = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing',
'LiHua','LiHua'], columns=['English', 'Math', 'Chinese','address']) '''
df2的内容如下: English Math Chinese address ZhangSan 65 30 66 Beijing LiSi 85 98 95
Chendu WangWu 92 96 93 Xian XiaoMing 88 77 90 Shanghai LiHua 90 90 80 Beijing
LiHua 90 90 80 Beijing ''' # 删除左右两边空格 df2['address'] =
df2['address'].map(str.strip) # 删除左边空格 df2['address'] =
df2['address'].map(str.lstrip) # 删除右边空格 df2['address'] =
df2['address'].map(str.rstrip) ''' 运行结果如下; English Math Chinese address
ZhangSan 65 30 66 Beijing LiSi 85 98 95 Chendu WangWu 92 96 93 Xian XiaoMing 88
77 90 Shanghai LiHua 90 90 80 Beijing LiHua 90 90 80 Beijing ''' print df2

7,去除特殊字符
data = {'Chinese': [66, 95, 93, 90, 80,80], 'English': [65, 85, 92, 88,
90,90], 'Math': [30, 98, 96, 77,
90,90],'address':['Beijing','Chendu#','Xian','Shanghai#','Beijing','Beijing']}
df2 = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing',
'LiHua','LiHua'], columns=['English', 'Math', 'Chinese','address']) '''
df2的内容如下: English Math Chinese address ZhangSan 65 30 66 Beijing LiSi 85 98 95
Chendu# WangWu 92 96 93 Xian XiaoMing 88 77 90 Shanghai# LiHua 90 90 80 Beijing
LiHua 90 90 80 Beijing ''' print df2 # 删除 '#' df2['address'] =
df2['address'].str.strip('#') ''' 运行结果如下; English Math Chinese address ZhangSan
65 30 66 Beijing LiSi 85 98 95 Chendu WangWu 92 96 93 Xian XiaoMing 88 77 90
Shanghai LiHua 90 90 80 Beijing LiHua 90 90 80 Beijing ''' print df2

8,某些列转换为大写,小写或者首字母大写
data = {'Chinese': [66, 95, 93, 90, 80,80], 'English': [65, 85, 92, 88,
90,90], 'Math': [30, 98, 96, 77,
90,90],'address':['Beijing','Chendu#','Xian','Shanghai#','Beijing','Beijing']}
df2 = DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing',
'LiHua','LiHua'], columns=['English', 'Math', 'Chinese','address']) '''
df2的内容如下: English Math Chinese address ZhangSan 65 30 66 Beijing LiSi 85 98 95
Chendu# WangWu 92 96 93 Xian XiaoMing 88 77 90 Shanghai# LiHua 90 90 80 Beijing
LiHua 90 90 80 Beijing ''' print df2 # 列 'address' 转换为大写 df2['address'] =
df2['address'].str.upper() # 列 'address' 转换为小写 # df2['address'] =
df2['address'].str.lower() # 列 'address' 转换为首字母大写 # df2['address'] =
df2['address'].str.title() ''' 运行结果如下; English Math Chinese address ZhangSan 65
30 66 BEIJING LiSi 85 98 95 CHENDU# WangWu 92 96 93 XIAN XiaoMing 88 77 90
SHANGHAI# LiHua 90 90 80 BEIJING LiHua 90 90 80 BEIJING ''' print df2

9,查找空值,填充空值,删除空值:
data = {'Chinese': [66, 95, 93, 90, 80, np.nan], 'English': [65, 85, 92, 88,
90, np.nan], 'Math': [30, 98, 96, 77, 90, 90], 'address': ['Beijing',
'Chendu#', 'Xian', 'Shanghai#', 'Beijing', 'Beijing']} df2 =
DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing', 'LiHua',
'LiHua'], columns=['English', 'Math', 'Chinese', 'address']) ''' df2的内容如下:
English Math Chinese address ZhangSan 65.0 30 66.0 Beijing LiSi 85.0 98 95.0
Chendu# WangWu 92.0 96 93.0 Xian XiaoMing 88.0 77 90.0 Shanghai# LiHua 90.0 90
80.0 Beijing LiHua NaN 90 NaN Beijing ''' print df2 #
1,判断数据中是否存在NANN的情况,可以用这种方式:df2.isna() 或者 pd.isna(df2); df2.notna() 或者
pd.notna(df2) tips:isnull是isna的别名 ''' 运行结果如下: English Math Chinese address
ZhangSan False False False False LiSi False False False False WangWu False
False False False XiaoMing False False False False LiHua False False False
False LiHua False False False False ''' print df2.isna() # 2,判断数据中哪列存在空值 '''
运行结果如下: English False Math False Chinese False address False dtype: bool '''
print df2.isna().any() # 3,在整个数据中碰到 nap就把其替换为 0 df2 = df2.fillna(0) ''' 运行结果如下:
English Math Chinese address ZhangSan 65.0 30 66.0 Beijing LiSi 85.0 98 95.0
Chendu# WangWu 92.0 96 93.0 Xian XiaoMing 88.0 77 90.0 Shanghai# LiHua 90.0 90
80.0 Beijing LiHua 0.0 90 0.0 Beijing ''' print df2 # 4,使用平均值进行填充空值
df2.fillna(df2.mean()) # 5, 使用中位数填充空值 df2.fillna(df2.median()) #
6,丢弃‘English’和‘Chinese’这两列中有缺失值的行 df2.dropna(axis=0, subset=["English",
"Chinese"]) # 7,丢弃全为空值的那些列 data.dropna(axis=1, how="all") #
8,丢弃有缺失值的列(一般不会这么做,这样会删掉一个特征) data.dropna(axis=1) # 9,只丢弃全为空值的那些行
data.dropna(how='all')

10,使用 apply 函数对数据进行处理:
data = {'Chinese': [66, 95, 93, 90, 80, np.nan], 'English': [65, 85, 92, 88,
90, np.nan], 'Math': [30, 98, 96, 77, 90, 90], 'address': ['Beijing',
'Chendu#', 'Xian', 'Shanghai#', 'Beijing', 'Beijing']} df2 =
DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing', 'LiHua',
'LiHua'], columns=['English', 'Math', 'Chinese', 'address']) ''' df2的内容如下:
English Math Chinese address ZhangSan 65.0 30 66.0 Beijing LiSi 85.0 98 95.0
Chendu# WangWu 92.0 96 93.0 Xian XiaoMing 88.0 77 90.0 Shanghai# LiHua 90.0 90
80.0 Beijing LiHua NaN 90 NaN Beijing ''' print df2 # 对 address 列进行大写转换
df2['address'] = df2['address'].apply(str.upper) ''' 如下是运行结果: English Math
Chinese address ZhangSan 65.0 30 66.0 BEIJING LiSi 85.0 98 95.0 CHENDU# WangWu
92.0 96 93.0 XIAN XiaoMing 88.0 77 90.0 SHANGHAI# LiHua 90.0 90 80.0 BEIJING
LiHua NaN 90 NaN BEIJING ''' print df2

11,使用 apply 函数对数据进行复杂处理:
def plus(df, n, m): df['another1'] = (df[u'Chinese'] + df[u'English']) * m
df['another2'] = (df[u'Chinese'] + df[u'English']) * n return df def method3():
data = {'Chinese': [66, 95, 93, 90, 80, np.nan], 'English': [65, 85, 92, 88,
90, np.nan], 'Math': [30, 98, 96, 77, 90, 90], 'address': ['Beijing',
'Chendu#', 'Xian', 'Shanghai#', 'Beijing', 'Beijing']} df2 =
DataFrame(data=data, index=['ZhangSan', 'LiSi', 'WangWu', 'XiaoMing', 'LiHua',
'LiHua'], columns=['English', 'Math', 'Chinese', 'address']) ''' df2的内容如下:
English Math Chinese address ZhangSan 65.0 30 66.0 Beijing LiSi 85.0 98 95.0
Chendu# WangWu 92.0 96 93.0 Xian XiaoMing 88.0 77 90.0 Shanghai# LiHua 90.0 90
80.0 Beijing LiHua NaN 90 NaN Beijing ''' print df2 #
apply调用一个函数plus,按照列为轴进行操作(axis=1),args传递plus需要的两个参数. df2 = df2.apply(plus,
axis=1, args=(2, 3,)) ''' 如下是运行结果: English Math Chinese address another1
another2 ZhangSan 65.0 30 66.0 Beijing 393.0 262.0 LiSi 85.0 98 95.0 Chendu#
540.0 360.0 WangWu 92.0 96 93.0 Xian 555.0 370.0 XiaoMing 88.0 77 90.0
Shanghai# 534.0 356.0 LiHua 90.0 90 80.0 Beijing 510.0 340.0 LiHua NaN 90 NaN
Beijing NaN NaN ''' print df2

12,数据统计相关的函数:

 

13,数据表合并
df1 = DataFrame({'name': ['joel', 'ziwen', 'a', 'b', 'c'], 'data1': range(5)})
df2 = DataFrame({'name': ['joel', 'ziwen', 'A', 'B', 'C'], 'data2': range(5)})
# 1. 基于指定列进行连接 df3 = pd.merge(df1, df2, on='name') ''' 运行结果如下: data1 name data2
0 0 joel 0 1 1 ziwen 1 ''' print df3 # 2,inner 内连接 df4 = pd.merge(df1, df2,
how='inner') ''' 运行结果如下: data1 name data2 0 0 joel 0 1 1 ziwen 1 ''' print df4
# 3,left 左连接 df5 = pd.merge(df1, df2, how='left') ''' 如下是运行结果: data1 name data2
0 0 joel 0.0 1 1 ziwen 1.0 2 2 a NaN 3 3 b NaN 4 4 c NaN ''' print df5 #
4,right 右连接 df6 = pd.merge(df1, df2, how='right') ''' data1 name data2 0 0.0
joel 0 1 1.0 ziwen 1 2 NaN A 2 3 NaN B 3 4 NaN C 4 ''' print df6 # 5,outer 外连接
df7 = pd.merge(df1, df2, how='outer') ''' data1 name data2 0 0.0 joel 0.0 1 1.0
ziwen 1.0 2 2.0 a NaN 3 3.0 b NaN 4 4.0 c NaN 5 NaN A 2.0 6 NaN B 3.0 7 NaN C
4.0 ''' print df7

14,df.loc 通过标签索引(行索引和列索引)获取行数据
t = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=list(string.ascii_uppercase[:3]),
columns=list(string.ascii_uppercase[-4:])) ''' t的输出结果为: W X Y Z A 0 1 2 3 B 4 5
6 7 C 8 9 10 11 ''' print t tmp = t[:-1][['W', 'Z']] ''' 1,简单粗暴的取值: W Z A 0 3 B
4 7 ''' print tmp # 2,通过列名获取行数据 print t.loc['A', 'W'] # 结果为:0 # 3,'A'行,多列数据 '''
结果如下: W 0 Z 3 ''' print t.loc['A', ['W', 'Z']] # 4,多行多列 ''' W X A 0 1 B 4 5 '''
print t.loc[['A', 'B'], ['W', 'X']] # 5,多行多列,注意 冒号连接的是个闭区间 ''' W X Y Z A 0 1 2
3 B 4 5 6 7 C 8 9 10 11 ''' print t.loc['A':'C', 'W':'Z']

15,df.iloc,通过序号获取行数据
t = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=list(string.ascii_uppercase[:3]),
columns=list(string.ascii_uppercase[-4:])) ''' t的输出结果为: W X Y Z A 0 1 2 3 B 4 5
6 7 C 8 9 10 11 ''' print t # 1,获取多行多列数据,1:3 注意这个是左闭右开的区间 ''' 结果如下: Y Z B 6 7 C
10 11 ''' print t.iloc[1:3, [2, 3]] # 2,多行多列,如下的两个 1:3 都是左闭右开的区间 ''' W X A 0 1
B 4 5 ''' print t.iloc[1:3, 1:3] # 3,修改数据 ''' W X Y Z A 0 1 100 3 B 4 5 6 7 C 8
9 10 11 ''' t.loc['A', 'Y'] = 100 print t

16,pandas布尔索引
data = {"Row_Labels": ['BELLS', 'CHARLIE', 'LUCKY', 'ROCKY', 'E', 'M'],
"Count_AnimalName": [1195, 856, 723, 823, 100, 20]} t = pd.DataFrame(data=data,
index=list([1156, 2660, 8552, 12368, 11, 12])) ''' t 的内容: Count_AnimalName
Row_Labels 1156 1195 BELLS 2660 856 CHARLIE 8552 723 LUCKY 12368 823 ROCKY 11
100 E 12 20 M ''' print t t = t[(t['Row_Labels'].str.len() > 2) &
(t['Count_AnimalName'] > 700)] ''' 返回的结果: Count_AnimalName Row_Labels 1156 1195
BELLS 2660 856 CHARLIE 8552 723 LUCKY 12368 823 ROCKY ''' print t

17,pandas的字符串

技术
©2019-2020 Toolsou All rights reserved,
CSS实现溢出显示省略号mysql导出数据sql语句_mysql 根据sql查询语句导出数据HTML+CSS+JavaScript实现模态框(可拖拽)关于串口单片机给上位机的发送数据七大排序算法Java学生信息管理系统控制台版react中解决css样式全局污染qt使用数据库sqlite2020第十一届蓝桥杯国赛Python组vue 限制input只能输入正数