In [26]:

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import seaborn as sns

In [27]:

### 1.0.0 Exploratory data analysis (EDA)
# Explore every new dataset before anything else; the findings drive feature
# engineering and model selection.
#
# Load the iris dataset. Some feature values in this copy were deleted by hand,
# so the table contains missing entries.
iris_data = pd.read_csv(
    filepath_or_buffer="../_Datasets/iris_miss.data",
    sep=",",  # field separator, spelled out explicitly; a comma is also the default
)

## (1) Preview the first rows (head() shows five rows unless told otherwise).
iris_data.head(n=5)

Out[27]:

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	NaN	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	NaN	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

In [28]:

## (2) Check the dimensions of the data (rows and columns for a 2-D table).
iris_data.shape  # 150 rows, 5 columns

Out[28]:

(150, 5)

In [29]:

## (3) List the feature (column) names.
iris_data.columns

Out[29]:

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [30]:

## (4) Rename feature columns
#
# Columns can be selected by name, and they can also be renamed. Two options:
#
# Option 1 - assign a complete list to .columns. Every column has to be listed,
# otherwise an error is raised.
# iris_data.columns = ['feature1', 'feature2', 'feature3', 'feature4', 'species']
# iris_data.columns
#
# Option 2 - DataFrame.rename. Only the columns being renamed need to appear in
# the mapping. Renaming is optional; it is done here to get shorter names.
# The result is assigned back to iris_data instead of passing inplace=True, so
# the cell is a plain rebinding rather than a mutation of the existing object.
iris_data = iris_data.rename(
    columns={
        'sepal_length': 'SL',
        'sepal_width': 'SW',
        'petal_length': 'PL',
        'petal_width': 'PW',
    }
)
iris_data.columns

Out[30]:

Index(['SL', 'SW', 'PL', 'PW', 'species'], dtype='object')

In [31]:

## (5) Summary statistics with describe()
# describe() produces summary statistics for the numeric columns of a DataFrame.
# For each numeric column it reports the count, mean, standard deviation, minimum,
# the 25th, 50th (median) and 75th percentiles, and the maximum.
iris_data.describe()

Out[31]:

	SL	SW	PL	PW
count	147.000000	143.000000	148.000000	149.000000
mean	5.859184	3.042657	3.737162	1.205369
std	0.828413	0.432075	1.766055	0.761292
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.575000	0.300000
50%	5.800000	3.000000	4.300000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

In [32]:

## (6) Structural summary with info()
# info() prints a summary of the DataFrame, including each column's non-null
# count and data type.
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SL       147 non-null    float64
 1   SW       143 non-null    float64
 2   PL       148 non-null    float64
 3   PW       149 non-null    float64
 4   species  145 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

In [33]:

# Missing entries can also be counted directly: build the boolean null mask and
# sum it down each column. isna() is the same method as isnull() under its
# other name.
missing_values = iris_data.isna().sum(axis=0)
missing_values

Out[33]:

SL         3
SW         7
PL         2
PW         1
species    5
dtype: int64

In [34]:

## (7) Handle missing values
# The simplest treatment is to drop every sample (row) that has a missing value;
# imputing the gaps is the alternative (for feature imputation see section 5.4).
df_filtered = iris_data.dropna(axis=0, how="any")
df_filtered  # 18 samples contain a missing value, so 132 of the 150 remain

Out[34]:

	SL	SW	PL	PW	species
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
5	5.4	3.9	1.7	0.4	setosa
6	4.6	3.4	1.4	0.3	setosa
...	...	...	...	...	...
144	6.7	3.3	5.7	2.5	virginica
145	6.7	3.0	5.2	2.3	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

132 rows × 5 columns

In [35]:

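### Summary
# Once exploration is finished, use its findings to impute missing samples, engineer features, and so on, and only then train a model.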