import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
data = load_iris()
X = data.data
y = data.target
print(X.shape)
print(X[:5])
# 初始化PCA,设置主成分数目
pca = PCA(n_components=2)
# 对数据进行PCA降维
X_pca = pca.fit_transform(X)
# 输出降维后的数据形状
print(X_pca.shape)
(150, 4) [[5.1 3.5 1.4 0.2] [4.9 3. 1.4 0.2] [4.7 3.2 1.3 0.2] [4.6 3.1 1.5 0.2] [5. 3.6 1.4 0.2]] (150, 2)
ICA独立成分分析
from sklearn.decomposition import FastICA
print(X.shape)
# 初始化ICA,设置主成分数目
ica = FastICA(n_components=2)
ica.fit(X)
X_ica = ica.transform(X)
print(X_ica.shape)
(150, 4) (150, 2)
LDA线性判别分析
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
print(X.shape)
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
X_kpca = lda.transform(X)
print(X_kpca.shape)
(150, 4) (150, 2)
方差过滤
选取的数据已经被清洗过,所以方差过滤的效果不会太好
data = load_iris()
X = data.data
y = data.target
print(X.shape)
selector = VarianceThreshold(1*10**-40)
x_feature_selection = selector.fit_transform(X)
print(x_feature_selection.shape)
(150, 4) (150, 4)
相关性过滤
卡方过滤
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# 留下300个特征
X_fschi = SelectKBest(chi2, k=300).fit_transform(X, y)
print(X_fschi.shape)
(150, 4)
d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=300 is greater than n_features=4. All the features will be returned. warnings.warn(
使用学习曲线来获得一个最优的超参数K
score = []
for i in range(390, 200, -10):
x_fschi = SelectKBest(chi2, k=i).fit_transform(X, y)
once = cross_val_score(
RFC(n_estimators=10, random_state=0), x_fschi, y, cv=5).mean()
score.append(once)
plt.plot(range(390, 200, -10), score)
plt.show()
d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=390 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=380 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=370 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=360 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=350 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=340 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=330 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=320 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=310 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=300 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=290 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=280 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=270 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=260 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=250 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=240 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=230 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=220 is greater than n_features=4. All the features will be returned. warnings.warn( d:\Python\CondaEnvs\DataAnalysis\lib\site-packages\sklearn\feature_selection\_univariate_selection.py:779: UserWarning: k=210 is greater than n_features=4. All the features will be returned. warnings.warn(
chivalue, pvalues_chi = chi2(X, y)
print(chivalue)
print(pvalues_chi)
[ 10.81782088 3.7107283 116.31261309 67.0483602 ] [4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
print(k)
3
F检验
from sklearn.feature_selection import f_classif
F, pvalues_f = f_classif(X, y)
k = F.shape[0] - (pvalues_f > 0.05).sum()
k
np.int64(4)
互信息法
from sklearn.feature_selection import mutual_info_regression as MIC
result = MIC(X, y)
k = X.shape[1] - sum(result <= 0)
k
np.int64(4)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
rfc = RFC()
rfc.fit(X, y)
fi = rfc.feature_importances_
fi
array([3.68063403e-04, 1.55507755e-04, 3.40179337e-06, ..., 4.08596449e-06, 0.00000000e+00, 3.36483008e-06])
threshold = np.linspace(0, fi.max(), 20)
threshold
array([0. , 0.0002761 , 0.00055221, 0.00082831, 0.00110442, 0.00138052, 0.00165663, 0.00193273, 0.00220884, 0.00248494, 0.00276104, 0.00303715, 0.00331325, 0.00358936, 0.00386546, 0.00414157, 0.00441767, 0.00469378, 0.00496988, 0.00524598])
score = []
for i in threshold:
x_embedded = SelectFromModel(rfc, threshold=i).fit_transform(X, y)
once = cross_val_score(RFC(n_estimators=10, random_state=0),
x_embedded, y, cv=5).mean()
score.append(once)
plt.plot(threshold, score)
plt.show()
X_embedded = SelectFromModel(rfc, threshold=0.00067).fit_transform(X, y)
X_embedded.shape
print(cross_val_score(rfc, X_embedded, y, cv=5).mean())
0.96
import pandas as pd
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# 加载示例数据集
data = load_iris()
X = data.data
y = data.target
feature_names = data.feature_names
# 将数据转换为DataFrame
df = pd.DataFrame(X, columns=feature_names)
print(df.head())
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) 0 5.1 3.5 1.4 0.2 1 4.9 3.0 1.4 0.2 2 4.7 3.2 1.3 0.2 3 4.6 3.1 1.5 0.2 4 5.0 3.6 1.4 0.2
from sklearn.preprocessing import StandardScaler
# 标准化数据
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 计算协方差矩阵的特征值
cov_matrix = np.cov(X_scaled, rowvar=False)
eigenvalues, _ = np.linalg.eig(cov_matrix)
# 绘制碎石图
plt.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o')
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalues')
plt.show()
# 初始化因子分析模型,假设我们想要2个因子
fa = FactorAnalysis(n_components=2, random_state=42)
# 拟合模型并转换数据
X_fa = fa.fit_transform(X_scaled)
# 将结果转换为DataFrame
df_fa = pd.DataFrame(X_fa, columns=['Factor 1', 'Factor 2'])
print(df_fa.head())
# 获取因子载荷矩阵
loadings = pd.DataFrame(fa.components_.T, columns=[
'Factor 1', 'Factor 2'], index=feature_names)
print(loadings)
Factor 1 Factor 2 0 -1.328255 -0.559732 1 -1.337759 -0.000107 2 -1.402517 0.307654 3 -1.300187 0.719597 4 -1.333781 -0.363828 Factor 1 Factor 2 sepal length (cm) 0.880960 -0.447287 sepal width (cm) -0.416916 -0.553900 petal length (cm) 0.999189 0.019153 petal width (cm) 0.962289 0.058402
n_factors = 2
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0) # 标准化
X[:5]
array([[ 1.10252609, -0.92662384, -2.22574466, 0.19894046, -1.05119277], [-1.41317631, -0.09082591, 1.33010341, -0.83892082, 0.91055478], [ 1.17516225, 0.235568 , -0.17837558, 0.46713812, -1.23929943], [ 1.63974719, 0.75432769, 1.12727895, -1.82278903, 1.5553271 ], [ 1.28722509, -0.140075 , 0.59776196, -1.49282457, 0.41577788]])
R = np.cov(X, rowvar=False) # 计算协方差矩阵
R
array([[ 1.01010101, -0.03578495, -0.00420521, 0.16590033, -0.03816365], [-0.03578495, 1.01010101, -0.03823689, -0.09396191, 0.10202764], [-0.00420521, -0.03823689, 1.01010101, -0.04038821, -0.14398062], [ 0.16590033, -0.09396191, -0.04038821, 1.01010101, 0.02946688], [-0.03816365, 0.10202764, -0.14398062, 0.02946688, 1.01010101]])
eigenvalues, eigenvectors = np.linalg.eig(R) # 计算特征值和特征向量
eigenvalues
array([1.23575513, 1.18666926, 0.80051743, 0.95759901, 0.86996422])
选择最大的K个特征值对应的特征向量
idx = eigenvalues.argsort()[::-1] # 对特征值排序
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]
eigenvals = eigenvalues[:n_factors]
eigenvecs = eigenvectors[:, :n_factors]
loadings = eigenvecs * np.sqrt(eigenvals)
loadings
array([[-0.56019332, 0.38473066], [ 0.57375641, 0.10026824], [-0.28962755, -0.62764749], [-0.5377147 , 0.54816824], [ 0.468744 , 0.57807272]])
F = X @ loadings @ np.linalg.inv(loadings.T @ loadings)
F
array([[-8.93672723e-01, 1.03620674e+00], [ 9.97141351e-01, -1.11331623e+00], [-1.05490003e+00, 1.07328147e-01], [ 7.25810935e-01, -8.52306419e-02], [ 1.86241992e-02, -3.97720830e-01], [-1.63226641e+00, -5.25392421e-02], [-1.38984979e-02, 2.35260990e+00], [-9.51363586e-01, 1.24067084e+00], [ 8.18615714e-01, -9.57352641e-02], [ 7.52754346e-01, 1.62719775e+00], [ 5.74344185e-01, -5.92751160e-01], [-3.02722167e-01, -1.70621370e+00], [ 4.68333006e-04, -2.71870263e-01], [ 5.31842869e-01, 1.47632728e-01], [-1.95138815e-01, 7.58449711e-01], [ 5.52345186e-01, -4.67294798e-01], [-1.22297039e+00, 3.15066081e-01], [ 7.80695127e-01, -6.74109211e-01], [-1.02751564e+00, 5.97553845e-01], [ 1.31269371e+00, -4.97933850e-01], [-2.72819389e-01, 1.11725993e+00], [-2.93710916e-01, -1.19680312e+00], [ 8.50033433e-01, 4.65341156e-01], [ 3.24066720e-01, -1.65753521e-01], [-3.06656709e-01, -1.32057761e+00], [ 1.17653516e+00, -7.96624194e-01], [-1.28555765e+00, 1.28712099e+00], [-6.03963911e-01, 1.54517906e+00], [ 2.83020117e-01, -7.89237756e-01], [-3.23709733e-01, -7.30214797e-01], [ 1.14063477e+00, -2.22028628e+00], [ 9.68038954e-01, -1.05782929e+00], [ 1.50019802e+00, -2.85673418e-01], [-9.21343224e-01, 2.23288438e-01], [ 1.24907535e+00, 9.50658122e-01], [ 1.35939112e+00, 1.67495009e+00], [ 1.66748321e-01, 9.15424211e-02], [-2.85140804e-01, -8.11082327e-01], [ 3.99217070e-01, -1.01381388e+00], [-1.41742315e+00, 8.49382762e-01], [ 8.05223101e-01, 2.10656220e+00], [-2.52009816e+00, -6.25333192e-01], [ 8.94683435e-01, 4.93239114e-01], [ 5.12374470e-01, 3.09221863e-01], [ 1.05602138e+00, -2.53139289e-01], [ 2.75758732e-01, 1.59844718e+00], [-1.47075199e+00, -1.54445603e+00], [ 1.11459121e+00, 1.81063371e-01], [ 9.69387403e-01, 4.99449607e-01], [ 4.97758741e-01, 8.82457459e-01], [ 2.00622662e+00, -4.63414284e-02], [-1.36646548e+00, 2.51810110e-01], [ 1.30425986e+00, -1.49409865e-01], [-1.43666872e+00, -6.86471674e-01], [-6.71157496e-01, -1.34827901e+00], [-3.20384822e-01, 1.41801592e-01], [-1.29282990e+00, -1.45192299e+00], [-1.52617551e-01, -2.90664552e-01], [-9.03950447e-02, 1.34660946e+00], [-2.50055379e-01, -1.87806842e+00], [-3.96924614e-01, -7.64384538e-01], [ 6.17021169e-01, 1.21275406e+00], [ 6.82038478e-01, -1.42894594e+00], [ 7.98657949e-01, 1.49030922e+00], [ 6.29827960e-02, 9.93270200e-01], [ 1.79283264e-01, 9.65802957e-01], [ 7.69451717e-01, 3.77052864e-01], [-1.09712854e+00, -1.11093779e+00], [-1.14222320e+00, 5.19056931e-02], [ 2.91141448e-01, 2.58356237e-01], [-1.46956821e+00, -6.97928256e-01], [-1.76927687e+00, 8.65717500e-01], [ 2.37036254e-01, -5.93131597e-01], [ 1.29585399e+00, -4.83274429e-01], [-3.53961529e-01, 1.11778855e+00], [ 7.98554521e-03, -9.07973959e-02], [-5.47522972e-01, -3.81394418e-01], [-1.23246193e+00, 8.75478651e-01], [-1.20580061e+00, -2.25957045e-01], [-1.72918111e+00, 5.70980705e-01], [-7.65670877e-02, 1.02450393e+00], [ 3.17874827e-01, -7.72420483e-01], [-3.20325625e-01, -3.78949840e-01], [-2.92331315e-01, 5.23742287e-01], [-9.04401483e-02, -3.73625851e-01], [ 1.47471891e+00, -1.49899833e+00], [ 5.84408624e-01, -2.59492950e+00], [ 9.22530858e-01, 1.97120848e-01], [ 4.55303500e-01, 2.07026581e+00], [-2.42524297e+00, -2.81019055e-01], [ 9.08144167e-01, 2.07879276e-01], [ 1.28335616e+00, -7.70829240e-02], [-2.43240104e+00, 1.20950063e-01], [ 1.19014990e+00, -1.24287921e+00], [-6.83817262e-01, 6.00775009e-01], [-7.09405555e-01, -3.79417657e-01], [ 1.28116603e-01, 1.10592246e+00], [ 5.60044052e-01, -8.79909558e-01], [ 2.03580911e+00, 1.38991865e+00], [-1.69710363e-01, -1.34591399e+00]])