特征选择方法有多种,主要包括以下几种。
移除低方差特征法
假设某个特征对应的方差为0或者非常小,通常可以认为该特征的作用可以忽略,因而可以移除该特征。
示例如下:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0],
[1, 0, 0], [0, 1, 1],
[0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
array([[0, 1],
[1, 0],
[0, 0],
[1, 1],
[1, 0],
[1, 1]])
容易看出上述方法剔除了第一列,因为 超出了给定阈值,即第一列有超过 5/6 的变量取值为0。
2. 单变量特征选择法
这种方法通过单变量统计检验来选择最好的特征。
比如,通过卡方检验方法可以选择前两个最好的特征。
示例如下:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
X.shape
(150, 4)
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X_new.shape
(150, 2)
下面给出一个完整的示例,来说明特征选择对SVM的影响。
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif
###############################
# import some data to play with
# The iris dataset
iris = datasets.load_iris()
# Some noisy data not correlated
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
# Add the noisy data to the informative features
X = np.hstack((iris.data, E))
y = iris.target
#####################################
plt.figure(1)
plt.clf()
X_indices = np.arange(X.shape[-1])
####################################
# Univariate feature selection with F-test for feature scoring
# We use the default selection function:
# the 10% most significant features
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
label=r'Univariate score ($-Log(p_{value})$)', color='g')
########################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
plt.bar(X_indices - .25, svm_weights, width=.2,
label='SVM weight', color='r')
clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(selector.transform(X), y)
svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected /= svm_weights_selected.max()
plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
width=.2, label='SVM weights after selection', color='b')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
实验结果如下
3 递归特征淘汰法
这种方法通过在训练所得模型中逐步去除不那么重要的特征,每次都会去掉权值绝对值最小的特征,直至特征个数达到预设数目。
示例如下:
print(__doc__)
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
# Load the digits dataset
digits = load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)
# Plot pixel ranking
plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
实验结果如下:
含有交叉验证方法的示例如下:
print(__doc__)
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
n_redundant=2, n_repeated=0, n_classes=8,
n_clusters_per_class=1, random_state=0)
# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),
scoring='accuracy')
rfecv.fit(X, y)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
4 基于 SelectFromModel 的特征选择
这种方法中,训练好的模型中特征对应权重小于设定阈值时则舍弃。
示例如下:
# Author: Manoj Kumar <mks542@nyu.edu>
# License: BSD 3 clause
print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
# Load the boston dataset.
boston = load_boston()
X, y = boston['data'], boston['target']
# We use the base estimator LassoCV since
# the L1 norm promotes sparsity of features.
clf = LassoCV()
# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]
# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
sfm.threshold += 0.1
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]
# Plot the selected two features from X.
plt.title(
"Features selected from Boston using SelectFromModel with "
"threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
基于稀疏恢复的特征选择
这种方法跟压缩感知和L1正则的关联性比较大。
print(__doc__)
# Author: Alexandre Gramfort and Gael Varoquaux
# License: BSD 3 clause
import warnings
import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg
from sklearn.linear_model import (RandomizedLasso,
lasso_stability_path,
LassoLarsCV)
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.utils.extmath import pinvh
from sklearn.utils import ConvergenceWarning
def mutual_incoherence(X_relevant, X_irelevant):
"""Mutual incoherence, as defined by
formula (26a) of [Wainwright2006]. """
projector = np.dot(np.dot(X_irelevant.T, X_relevant),
pinvh(np.dot(X_relevant.T, X_relevant)))
return np.max(np.abs(projector).sum(axis=1))
for conditioning in (1, 1e-4):
#####################################
# Simulate regression data with a correlated design
n_features = 501
n_relevant_features = 3
noise_level = .2
coef_min = .2
# The Donoho-Tanner phase transition is
# around n_samples=25: below we
# will completely fail to recover in the well-conditioned case
n_samples = 25
block_size = n_relevant_features
rng = np.random.RandomState(42)
# The coefficients of our model
coef = np.zeros(n_features)
coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)
# The correlation of our design: variables correlated by blocs of 3
corr = np.zeros((n_features, n_features))
for i in range(0, n_features, block_size):
corr[i:i + block_size, i:i + block_size] = 1 - conditioning
corr.flat[::n_features + 1] = 1
corr = linalg.cholesky(corr)
# Our design
X = rng.normal(size=(n_samples, n_features))
X = np.dot(X, corr)
# Keep [Wainwright2006] (26c) constant
X[:n_relevant_features] /= np.abs(
linalg.svdvals(X[:n_relevant_features])).max()
X = StandardScaler().fit_transform(X.copy())
# The output variable
y = np.dot(X, coef)
y /= np.std(y)
# We scale the added noise as a function of the average correlation
# between the design and the output variable
y += noise_level * rng.normal(size=n_samples)
mi = mutual_incoherence(X[:, :n_relevant_features],
X[:, n_relevant_features:])
####################################
# Plot stability selection path,
# using a high eps for early stopping
# of the path, to save computation time
alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
eps=0.05)
plt.figure()
# We plot the path as a function of alpha/alpha_max
# to the power 1/3: the
# power 1/3 scales the path less brutally than
# the log, and enables to
# see the progression along the path
hg = plt.plot(alpha_grid[1:] ** .333,
scores_path[coef != 0].T[1:], 'r')
hb = plt.plot(alpha_grid[1:] ** .333,
scores_path[coef == 0].T[1:], 'k')
ymin, ymax = plt.ylim()
plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
plt.ylabel('Stability score: proportion of times selected')
plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
plt.axis('tight')
plt.legend((hg[0], hb[0]),
('relevant features', 'irrelevant features'),
loc='best')
####################################
# Plot the estimated stability scores for a given alpha
# Use 6-fold cross-validation rather than
# the default 3-fold: it leads to
# a better choice of alpha:
# Stop the user warnings outputs- they are
# not necessary for the example
# as it is specifically set up to be challenging.
with warnings.catch_warnings():
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', ConvergenceWarning)
lars_cv = LassoLarsCV(cv=6).fit(X, y)
# Run the RandomizedLasso: we use a paths
# going down to .1*alpha_max
# to avoid exploring the regime in which
# very noisy variables enter
# the model
alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
trees = ExtraTreesRegressor(100).fit(X, y)
# Compare with F-score
F, _ = f_regression(X, y)
plt.figure()
for name, score in [('F-test', F),
('Stability selection', clf.scores_),
('Lasso coefs', np.abs(lars_cv.coef_)),
('Trees', trees.feature_importances_),
]:
precision, recall, thresholds = precision_recall_curve(coef != 0,
score)
plt.semilogy(np.maximum(score / np.max(score), 1e-4),
label="%s. AUC: %.3f" % (name, auc(recall, precision)))
plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
label="Ground truth")
plt.xlabel("Features")
plt.ylabel("Score")
# Plot only the 100 first coefficients
plt.xlim(0, 100)
plt.legend(loc='best')
plt.title('Feature selection scores - Mutual incoherence: %.1f'
% mi)
plt.show()
实验结果如下
基于树(随机森林)的特征选择方法
示例如下:
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=0,
shuffle=False)
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d (%f)" %
(f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
实验结果如下
下面是通过森林法得到人脸中每个像素特征的重要性。
print(__doc__)
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.ensemble import ExtraTreesClassifier
# Number of cores to use to perform parallel
# fitting of the forest model
n_jobs = 1
# Load the faces dataset
data = fetch_olivetti_faces()
X = data.images.reshape((len(data.images), -1))
y = data.targetmask = y < 5
# Limit to 5 classes
X = X[mask]y = y[mask]
# Build a forest and compute the pixel importances
print("Fitting ExtraTreesClassifier on faces data with %d cores..."
% n_jobs)
t0 = time()
forest = ExtraTreesClassifier(n_estimators=1000,
max_features=128,
n_jobs=n_jobs,
random_state=0)
forest.fit(X, y)
print("done in %0.3fs" % (time() - t0))
importances = forest.feature_importances_
importances = importances.reshape(data.images[0].shape)
# Plot pixel importances
plt.matshow(importances, cmap=plt.cm.hot)
plt.title("Pixel importances with forests of trees")
plt.show()
实验结果如下:
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。