A Discussion of Feature Selection, with scikit-learn Examples


There are many feature selection methods; the main ones are described below.

1. Removing features with low variance

If the variance of a feature is zero or very small, the feature can usually be considered uninformative and removed.

An example:

from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0],
    [1, 0, 0], [0, 1, 1],
    [0, 1, 0], [0, 1, 1]]

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

It is easy to see that this removed the first column. For a Boolean feature the variance is p(1 - p), where p is the fraction of ones; 5 of the 6 samples in the first column are 0, so its variance is (1/6)(5/6) ≈ 0.14, which falls below the given threshold of 0.8 × (1 - 0.8) = 0.16.
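As a quick sanity check (a sketch using NumPy, not part of the original example), the per-column variances can be computed directly and compared with the threshold:

import numpy as np

X = np.array([[0, 0, 1], [0, 1, 0],
              [1, 0, 0], [0, 1, 1],
              [0, 1, 0], [0, 1, 1]])

# Population variance of each column; for a 0/1 feature this equals p * (1 - p).
print(X.var(axis=0))    # approximately [0.139, 0.222, 0.25]
print(0.8 * (1 - 0.8))  # 0.16 -- only the first column falls below this threshold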

2. Univariate feature selection

This method selects the best features based on univariate statistical tests.

For example, a chi-squared test can be used to pick the two best features.

An example:

from sklearn.datasets import load_iris

from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import chi2

iris = load_iris()

X, y = iris.data, iris.target

X.shape

(150, 4)

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)

X_new.shape

(150, 2)
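To see which of the four features were kept (a small follow-up sketch, not part of the original snippet), keep the fitted selector around and inspect its scores and support mask:

selector = SelectKBest(chi2, k=2).fit(X, y)

print(selector.scores_)        # chi-squared score of each original feature
print(selector.get_support())  # boolean mask marking the two selected features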

A complete example follows, illustrating the effect of feature selection on an SVM.

print(__doc__)

import numpy as np

import matplotlib.pyplot as plt

from sklearn import datasets, svm

from sklearn.feature_selection import SelectPercentile, f_classif

###############################
# import some data to play with

# The iris dataset

iris = datasets.load_iris()

# Some noisy data not correlated

E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))

# Add the noisy data to the informative features

X = np.hstack((iris.data, E))

y = iris.target

#####################################

plt.figure(1)

plt.clf()

X_indices = np.arange(X.shape[-1])

####################################

# Univariate feature selection with F-test for feature scoring

# We use the default selection function:
# the 10% most significant features

selector = SelectPercentile(f_classif, percentile=10)

selector.fit(X, y)

scores = -np.log10(selector.pvalues_)

scores /= scores.max()

plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')

########################################

# Compare to the weights of an SVM

clf = svm.SVC(kernel='linear')

clf.fit(X, y)

svm_weights = (clf.coef_ ** 2).sum(axis=0)

svm_weights /= svm_weights.max()

plt.bar(X_indices - .25, svm_weights, width=.2,
        label='SVM weight', color='r')

clf_selected = svm.SVC(kernel='linear')

clf_selected.fit(selector.transform(X), y)

svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)

svm_weights_selected /= svm_weights_selected.max()

plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM weights after selection', color='b')

plt.title("Comparing feature selection")

plt.xlabel('Feature number')

plt.yticks(())

plt.axis('tight')

plt.legend(loc='upper right')

plt.show()

The resulting plot compares the univariate scores with the SVM weights before and after selection (figure not reproduced here).


3. Recursive feature elimination (RFE)

Starting from a model trained on all features, this method removes the less important features step by step: at each iteration the feature with the smallest absolute weight is dropped, until the preset number of features remains.
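As a minimal sketch of the API (the estimator and the number of features kept here are arbitrary choices, not taken from the original text), the fitted RFE object exposes the selected features through support_ and the elimination order through ranking_:

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Keep the 2 best features, dropping one feature per iteration.
rfe = RFE(estimator=SVC(kernel="linear"), n_features_to_select=2, step=1)
rfe.fit(X, y)

print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # rank 1 marks a selected feature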

A complete example, which ranks every pixel of the digits images, follows:

print(__doc__)

from sklearn.svm import SVC

from sklearn.datasets import load_digits

from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt

# Load the digits dataset

digits = load_digits()

X = digits.images.reshape((len(digits.images), -1))

y = digits.target

# Create the RFE object and rank each pixel

svc = SVC(kernel="linear", C=1)

rfe = RFE(estimator=svc, n_features_to_select=1, step=1)

rfe.fit(X, y)

ranking = rfe.ranking_.reshape(digits.images[0].shape)

# Plot pixel ranking

plt.matshow(ranking)

plt.colorbar()

plt.title("Ranking of pixels with RFE")

plt.show()

The resulting plot shows the RFE ranking of each pixel (figure not reproduced here).


An example that adds cross-validation (RFECV) follows:

print(__doc__)

import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.cross_validation import StratifiedKFold

from sklearn.feature_selection import RFECV

from sklearn.datasets import make_classification

# Build a classification task using 3 informative features

X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.

svc = SVC(kernel="linear")

# The "accuracy" scoring is proportional to the number of correct

# classifications

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),
              scoring='accuracy')

rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores

plt.figure()

plt.xlabel("Number of features selected")

plt.ylabel("Cross validation score (nb of correct classifications)")

plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

plt.show()



4. Feature selection with SelectFromModel

With this method, a model is trained first, and any feature whose fitted weight (or importance) falls below a chosen threshold is discarded.

An example:

# Author: Manoj Kumar <mks542@nyu.edu>

# License: BSD 3 clause

print(__doc__)

import matplotlib.pyplot as plt

import numpy as np

from sklearn.datasets import load_boston

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LassoCV

# Load the boston dataset.

boston = load_boston()

X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since
# the L1 norm promotes sparsity of features.

clf = LassoCV()

# Set a minimum threshold of 0.25

sfm = SelectFromModel(clf, threshold=0.25)

sfm.fit(X, y)

n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.

# Note that the attribute can be set directly instead of repeatedly

# fitting the metatransformer.

while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.

plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)

feature1 = X_transform[:, 0]

feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')

plt.xlabel("Feature number 1")

plt.ylabel("Feature number 2")

plt.ylim([np.min(feature2), np.max(feature2)])

plt.show()
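Instead of raising a numeric threshold in a loop, SelectFromModel also accepts string thresholds such as "mean" or "median", computed from the fitted weights. A small sketch reusing the clf, X and y defined above (the number of surviving features depends on the fitted coefficients):

# Keep only the features whose |coefficient| exceeds the mean absolute coefficient.
sfm_mean = SelectFromModel(clf, threshold="mean")
sfm_mean.fit(X, y)

print(sfm_mean.transform(X).shape)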


5. Sparse-recovery-based feature selection

This approach is closely related to compressed sensing and L1 regularization: an L1 penalty drives many coefficients exactly to zero, so the features that keep nonzero coefficients are the ones selected. The example below compares several feature-scoring methods (F-test, stability selection, Lasso coefficients, and tree importances) on simulated recovery problems.

print(__doc__)

# Author: Alexandre Gramfort and Gael Varoquaux

# License: BSD 3 clause

import warnings

import matplotlib.pyplot as plt

import numpy as np

from scipy import linalg

from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
                                  LassoLarsCV)

from sklearn.feature_selection import f_regression

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import auc, precision_recall_curve

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.utils.extmath import pinvh

from sklearn.utils import ConvergenceWarning

def mutual_incoherence(X_relevant, X_irelevant):
    """Mutual incoherence, as defined by formula (26a) of [Wainwright2006]."""
    projector = np.dot(np.dot(X_irelevant.T, X_relevant),
                       pinvh(np.dot(X_relevant.T, X_relevant)))
    return np.max(np.abs(projector).sum(axis=1))

for conditioning in (1, 1e-4):
    #####################################
    # Simulate regression data with a correlated design
    n_features = 501
    n_relevant_features = 3
    noise_level = .2
    coef_min = .2
    # The Donoho-Tanner phase transition is around n_samples=25: below we
    # will completely fail to recover in the well-conditioned case
    n_samples = 25
    block_size = n_relevant_features

    rng = np.random.RandomState(42)

    # The coefficients of our model
    coef = np.zeros(n_features)
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # The correlation of our design: variables correlated by blocs of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditioning
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # Our design
    X = rng.normal(size=(n_samples, n_features))
    X = np.dot(X, corr)
    # Keep [Wainwright2006] (26c) constant
    X[:n_relevant_features] /= np.abs(
        linalg.svdvals(X[:n_relevant_features])).max()
    X = StandardScaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    #####################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)

    plt.figure()
    # We plot the path as a function of alpha/alpha_max to the power 1/3: the
    # power 1/3 scales the path less brutally than the log, and enables to
    # see the progression along the path
    hg = plt.plot(alpha_grid[1:] ** .333,
                  scores_path[coef != 0].T[1:], 'r')
    hb = plt.plot(alpha_grid[1:] ** .333,
                  scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = plt.ylim()
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    plt.axis('tight')
    plt.legend((hg[0], hb[0]),
               ('relevant features', 'irrelevant features'),
               loc='best')

    #####################################
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than the default 3-fold: it leads to
    # a better choice of alpha.
    # Stop the user warnings outputs - they are not necessary for the example
    # as it is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use a path going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    plt.figure()
    for name, score in [('F-test', F),
                        ('Stability selection', clf.scores_),
                        ('Lasso coefs', np.abs(lars_cv.coef_)),
                        ('Trees', trees.feature_importances_),
                        ]:
        precision, recall, thresholds = precision_recall_curve(coef != 0,
                                                               score)
        plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                     label="%s. AUC: %.3f" % (name, auc(recall, precision)))

    plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
             label="Ground truth")
    plt.xlabel("Features")
    plt.ylabel("Score")
    # Plot only the 100 first coefficients
    plt.xlim(0, 100)
    plt.legend(loc='best')
    plt.title('Feature selection scores - Mutual incoherence: %.1f' % mi)

plt.show()

The resulting stability-path and feature-score plots (one pair for each conditioning value) are not reproduced here.





6. Tree-based (random forest) feature selection

Ensembles of trees expose a feature_importances_ attribute, which can be used to rank and discard irrelevant features. An example:

print(__doc__)

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification

from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features

X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances

forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)

importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)

indices = np.argsort(importances)[::-1]

# Print the feature ranking

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" %
          (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest

plt.figure()

plt.title("Feature importances")

plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")

plt.xticks(range(X.shape[1]), indices)

plt.xlim([-1, X.shape[1]])

plt.show()

The resulting bar chart of feature importances, with the inter-tree standard deviation as error bars, is not reproduced here.
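The importances computed above can also be fed back into SelectFromModel to actually reduce the data set (a sketch reusing the forest fitted above; prefit=True avoids refitting, and the "mean" threshold is an arbitrary choice):

from sklearn.feature_selection import SelectFromModel

# Select the features whose importance exceeds the mean importance.
sfm = SelectFromModel(forest, threshold="mean", prefit=True)
X_selected = sfm.transform(X)

print(X_selected.shape)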


The next example uses a forest to estimate the importance of every pixel in face images.

print(__doc__)

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces

from sklearn.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel
# fitting of the forest model

n_jobs = 1

# Load the faces dataset

data = fetch_olivetti_faces()

X = data.images.reshape((len(data.images), -1))

y = data.target

# Limit to 5 classes
mask = y < 5

X = X[mask]
y = y[mask]

# Build a forest and compute the pixel importances

print("Fitting ExtraTreesClassifier on faces data with %d cores..."
   
% n_jobs)

t0 = time()

forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)

print("done in %0.3fs" % (time() - t0))

importances = forest.feature_importances_

importances = importances.reshape(data.images[0].shape)

# Plot pixel importances

plt.matshow(importances, cmap=plt.cm.hot)

plt.title("Pixel importances with forests of trees")

plt.show()

The resulting map of pixel importances is not reproduced here.

