数据分析
目录
1. Series
1.1 Series 创建和切片
1.2 Series 的索引和值
2. DataFrame
2.1 pandas 读取外部数据
2.2 DataFrame 的基础属性
2.3 pandas 的索引
2.3.1 df.loc 方法
2.3.2 df.iloc 方法
2.3.3 布尔索引
2.4 缺失数据的处理
1. Series
Series:一维,带标签的数组。
1.1 Series 创建和切片
# Demo: creating and slicing a pandas Series.
import pandas as pd

t = pd.Series([13, 34, 45, 2])
print(t)
print(t[t > 20])  # boolean mask: keep only the values greater than 20

# Build a Series from a dict; the dict keys become the index.
a = {"name": "老白", "age": 26, "career": "waiter", "company": "同福客栈"}
t1 = pd.Series(a)
print(t1)
print("*" * 50)
print(t1["career"])  # label-based lookup
print("*" * 50)
# Positional lookup goes through .iloc: a bare integer key on a
# string-labelled index (t1[0]) is deprecated in recent pandas releases.
print(t1.iloc[0])
print("*" * 50)
print(t1[["name", "company"]])  # a list of labels selects several entries

# Specify the index explicitly.
t2 = pd.Series([445, 234, 523, 56, 45], index=list("abcde"))
print(t2)
print("*" * 50)
print(t2["c"])
print("*" * 50)
print(t2.iloc[:2])  # first two entries by position
print("*" * 50)
print(t2.iloc[[1, 3]])  # entries at positions 1 and 3

'''
Output:
0 13
1 34
2 45
3 2
dtype: int64
1 34
2 45
dtype: int64
name 老白
age 26
career waiter
company 同福客栈
dtype: object
**************************************************
waiter
**************************************************
老白
**************************************************
name 老白
company 同福客栈
dtype: object
a 445
b 234
c 523
d 56
e 45
dtype: int64
**************************************************
523
**************************************************
a 445
b 234
dtype: int64
**************************************************
b 234
d 56
dtype: int64
'''
1.2 Series 的索引和值
Series对象本质上由两个数组构成:
一个数组构成对象的键(index,索引);一个数组构成对象的值(values)。
# Demo: the index and the values that make up a pandas Series.
import pandas as pd

a = {"name": "老白", "age": 26, "career": "waiter", "company": "同福客栈"}
t = pd.Series(a)

# The index is iterable; each element is one label.
for i in t.index:
    print(i)
print("*" * 50)
print(type(t.index))
print("*" * 50)
print(len(t.index))
print("*" * 50)
print(list(t.index)[:3])  # convert to a list to slice the first three labels
print("*" * 50)
print(t.values)
print("*" * 50)
print(type(t.values))

'''
Output:
name
age
career
company
**************************************************
<class 'pandas.core.indexes.base.Index'>
**************************************************
4
**************************************************
['name', 'age', 'career']
**************************************************
['老白' 26 'waiter' '同福客栈']
**************************************************
<class 'numpy.ndarray'>
'''
2. DataFrame
DataFrame:二维,Series容器。
2.1 pandas 读取外部数据
# Demo: reading external data with pandas.
import pandas as pd

df = pd.read_csv("./dogNames2.csv")
# DataFrame.info() prints its summary itself and returns None, which is why
# the recorded output below ends with a "None" line.
print(df.info())
# Separator added so the script matches the recorded output below.
print("*" * 50)

# Sorting a DataFrame: order rows by usage count, largest first.
df = df.sort_values(by="Count_AnimalName", ascending=False)
print(df.head())  # show the first rows (5 by default)

'''
Output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row_Labels 16217 non-null object
1 Count_AnimalName 16220 non-null int64
dtypes: int64(1), object(1)
memory usage: 253.6+ KB
None
**************************************************
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
'''
2.2 DataFrame 的基础属性
# Demo: creating a DataFrame and inspecting its basic attributes.
import numpy as np
import pandas as pd

# 3x4 frame built from a NumPy array with explicit row and column labels.
t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)
print(t)

# Frame built from a dict of lists: keys become column names.
d1 = {
    "name": ["老白", "赛貂蝉"],
    "age": [26, 20],
    "career": ["waiter", "CEO"],
    "company": ["同福客栈", "怡红楼"],
}
t1 = pd.DataFrame(d1)
print(t1)
print("*" * 50)
print(type(t1))
print("*" * 50)
print(t1.index)  # row index
print("*" * 50)
print(t1.columns)  # column index
print("*" * 50)
print(t1.values)  # underlying values as a 2-D ndarray
print("*" * 50)
print(t1.shape)  # (rows, columns)
print("*" * 50)
print(t1.dtypes)  # dtype of each column
print("*" * 50)
print(t1.ndim)  # number of dimensions
print("*" * 50)
print(t1.info())  # summary; info() prints itself and returns None
print("*" * 50)
print(t1.describe())  # quick summary statistics (numeric columns)

'''
Output:
W X Y Z
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
name age career company
0 老白 26 waiter 同福客栈
1 赛貂蝉 20 CEO 怡红楼
**************************************************
<class 'pandas.core.frame.DataFrame'>
**************************************************
RangeIndex(start=0, stop=2, step=1)
**************************************************
Index(['name', 'age', 'career', 'company'], dtype='object')
**************************************************
[['老白' 26 'waiter' '同福客栈']
['赛貂蝉' 20 'CEO' '怡红楼']]
**************************************************
(2, 4)
**************************************************
name object
age int64
career object
company object
dtype: object
**************************************************
2
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 2 non-null object
1 age 2 non-null int64
2 career 2 non-null object
3 company 2 non-null object
dtypes: int64(1), object(3)
memory usage: 192.0+ bytes
None
**************************************************
age
count 2.000000
mean 23.000000
std 4.242641
min 20.000000
25% 21.500000
50% 23.000000
75% 24.500000
max 26.000000
'''
2.3 pandas 的索引
- df[数字切片](如 df[:5])表示按行取数据,对行进行操作;注意必须是切片,单个数字会被当作列名查找;
- df[字符串](如 df["Row_Labels"])表示按列名取数据,对列进行操作。
# Demo: basic [] indexing on a DataFrame.
import pandas as pd

df = pd.read_csv("./dogNames2.csv")

# Sort in descending order of usage count.
df = df.sort_values(by="Count_AnimalName", ascending=False)
print(df[:5])  # a slice operates on rows
print("*" * 50)
print(df[:5]["Row_Labels"])  # first 5 rows, then the "Row_Labels" column
print("*" * 50)
print(df["Row_Labels"][:5])  # the "Row_Labels" column, then its first 5 rows
print("*" * 50)
print(type(df["Row_Labels"]))  # a single column is a Series
print("*" * 50)
print(df[["Row_Labels", "Count_AnimalName"]][:5])  # list of names -> DataFrame

'''
Output:
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
**************************************************
1156 BELLA
9140 MAX
2660 CHARLIE
3251 COCO
12368 ROCKY
Name: Row_Labels, dtype: object
**************************************************
1156 BELLA
9140 MAX
2660 CHARLIE
3251 COCO
12368 ROCKY
Name: Row_Labels, dtype: object
**************************************************
<class 'pandas.core.series.Series'>
**************************************************
Row_Labels Count_AnimalName
1156 BELLA 1195
9140 MAX 1153
2660 CHARLIE 856
3251 COCO 852
12368 ROCKY 823
'''
2.3.1 df.loc 方法
通过标签索引数据
# Demo: DataFrame.loc selects data by label.
import numpy as np  # needed for np.arange below
import pandas as pd

t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)

print(t.loc["a", "Z"])  # single cell: row "a", column "Z"
print("*" * 50)
print(t.loc["a"])  # whole row "a"; same as t.loc["a", :]
print("*" * 50)
print(t.loc[:, "Y"])  # whole column "Y"
print("*" * 50)
print(t.loc[["a", "c"], ["W", "Z"]])  # several rows and several columns
print("*" * 50)
# Label slices in .loc are closed intervals: the end label "c" is included.
print(t.loc["a":"c", ["W", "Z"]])

'''
Output (the integer dtype is platform-dependent; this run shows int32,
many platforms show int64):
3
**************************************************
W 0
X 1
Y 2
Z 3
Name: a, dtype: int32
**************************************************
a 2
b 6
c 10
Name: Y, dtype: int32
**************************************************
W Z
a 0 3
c 8 11
**************************************************
W Z
a 0 3
b 4 7
c 8 11
'''
2.3.2 df.iloc 方法
通过位置获取数据
# Demo: DataFrame.iloc selects data by integer position.
import numpy as np  # needed for np.arange / np.nan below
import pandas as pd

t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)

print(t.iloc[1])  # second row
print("*" * 50)
print(t.iloc[:, [2, 1]])  # all rows, columns at positions 2 and 1 (that order)
print("*" * 50)
print(t.iloc[[1, 0], [2, 1]])  # rows 1 and 0, columns 2 and 1
print("*" * 50)
print(t.iloc[1:, :2])  # rows from the second onward, first two columns
print("*" * 50)
# NaN is a float, so make the target columns float before assigning it.
# Relying on pandas to upcast integer columns silently during assignment is
# deprecated in recent releases.
t = t.astype({"W": float, "X": float})
t.iloc[1:, :2] = np.nan
print(t)

'''
Output (the integer dtype is platform-dependent; this run shows int32,
many platforms show int64):
W 4
X 5
Y 6
Z 7
Name: b, dtype: int32
**************************************************
Y X
a 2 1
b 6 5
c 10 9
**************************************************
Y X
b 6 5
a 2 1
**************************************************
W X
b 4 5
c 8 9
**************************************************
W X Y Z
a 0.0 1.0 2 3
b NaN NaN 6 7
c NaN NaN 10 11
'''
2.3.3 布尔索引
# Demo: boolean indexing on a DataFrame.
import pandas as pd

df = pd.read_csv("./dogNames2.csv")

# Find the dog names used more than 700 times whose name is longer than
# 4 characters. Each condition needs its own parentheses, and the element-wise
# operator & is used instead of the keyword `and`.
print(df[(df["Row_Labels"].str.len() > 4) & (df["Count_AnimalName"] > 700)])

'''
Output:
Row_Labels Count_AnimalName
1156 BELLA 1195
2660 CHARLIE 856
8552 LUCKY 723
12368 ROCKY 823
'''
2.4 缺失数据的处理
df.dropna(axis=0,how='any') #只要该行存在nan,就删除该行(dropna 是 DataFrame 的方法,而不是 pd 模块的函数)
df.dropna(axis=0,how='all',inplace=False) #只有该行全部为nan,才删除该行;inplace参数决定是否进行原地修改
# Demo: handling missing data (NaN) in pandas.
import numpy as np
import pandas as pd

t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)
# NaN is a float, so make the target columns float before assigning it.
# Relying on pandas to upcast integer columns silently during assignment is
# deprecated in recent releases.
t = t.astype({"W": float, "X": float})
t.iloc[1:, :2] = np.nan

print(pd.isnull(t))  # boolean frame: True where the value is NaN
print("*" * 50)
print(t[pd.notnull(t["W"])])  # rows of t whose "W" value is not NaN
print("*" * 50)

# Drop the rows that contain NaN (axis=0 -> rows, how="any" -> any NaN).
t1 = t.dropna(axis=0, how="any", inplace=False)
print(t1)
print("*" * 50)

# Fill in the NaN values.
t2 = t.fillna(0)  # replace NaN with 0
print(t2)
print("*" * 50)

t3 = t.fillna(t.mean())  # replace NaN with each column's mean
print(t3)
print("*" * 50)

t4 = t["X"].fillna(t["X"].median())  # fill column "X" with its own median
print(t4)

'''
Output:
W X Y Z
a False False False False
b True True False False
c True True False False
**************************************************
W X Y Z
a 0.0 1.0 2 3
**************************************************
W X Y Z
a 0.0 1.0 2 3
**************************************************
W X Y Z
a 0.0 1.0 2 3
b 0.0 0.0 6 7
c 0.0 0.0 10 11
**************************************************
W X Y Z
a 0.0 1.0 2 3
b 0.0 1.0 6 7
c 0.0 1.0 10 11
**************************************************
a 1.0
b 1.0
c 1.0
Name: X, dtype: float64
'''


发布评论