A Discussion of Feature Selection, with scikit-learn Examples


There are many feature selection methods; the main ones are described below.

1. Removing features with low variance

If the variance of a feature is zero or very small, the feature can usually be considered uninformative and removed.

An example:

from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0],
    [1, 0, 0], [0, 1, 1],
    [0, 1, 0], [0, 1, 1]]

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

It is easy to see that this removed the first column. For a Boolean feature the variance is p(1 - p), where p is the fraction of ones; 5 of the 6 samples in the first column are 0, so its variance is (1/6)(5/6) ≈ 0.14, which falls below the given threshold of 0.8 × (1 - 0.8) = 0.16.
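As a quick sanity check (a sketch using NumPy, not part of the original example), the per-column variances can be computed directly and compared with the threshold:

import numpy as np

X = np.array([[0, 0, 1], [0, 1, 0],
              [1, 0, 0], [0, 1, 1],
              [0, 1, 0], [0, 1, 1]])

# Population variance of each column; for a 0/1 feature this equals p * (1 - p).
print(X.var(axis=0))    # approximately [0.139, 0.222, 0.25]
print(0.8 * (1 - 0.8))  # 0.16 -- only the first column falls below this threshold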

2. Univariate feature selection

This method selects the best features based on univariate statistical tests.

For example, a chi-squared test can be used to pick the two best features.

An example:

from sklearn.datasets import load_iris

from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import chi2

iris = load_iris()

X, y = iris.data, iris.target

X.shape

(150, 4)

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)

X_new.shape

(150, 2)
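To see which of the four features were kept (a small follow-up sketch, not part of the original snippet), keep the fitted selector around and inspect its scores and support mask:

selector = SelectKBest(chi2, k=2).fit(X, y)

print(selector.scores_)        # chi-squared score of each original feature
print(selector.get_support())  # boolean mask marking the two selected features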

A complete example follows, illustrating the effect of feature selection on an SVM.

print(__doc__)

import numpy as np

import matplotlib.pyplot as plt

from sklearn import datasets, svm

from sklearn.feature_selection import SelectPercentile, f_classif

###############################
# import some data to play with

# The iris dataset

iris = datasets.load_iris()

# Some noisy data not correlated

E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))

# Add the noisy data to the informative features

X = np.hstack((iris.data, E))

y = iris.target

#####################################

plt.figure(1)

plt.clf()

X_indices = np.arange(X.shape[-1])

####################################

# Univariate feature selection with F-test for feature scoring

# We use the default selection function:
# the 10% most significant features

selector = SelectPercentile(f_classif, percentile=10)

selector.fit(X, y)

scores = -np.log10(selector.pvalues_)

scores /= scores.max()

plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')

########################################

# Compare to the weights of an SVM

clf = svm.SVC(kernel='linear')

clf.fit(X, y)

svm_weights = (clf.coef_ ** 2).sum(axis=0)

svm_weights /= svm_weights.max()

plt.bar(X_indices - .25, svm_weights, width=.2,
        label='SVM weight', color='r')

clf_selected = svm.SVC(kernel='linear')

clf_selected.fit(selector.transform(X), y)

svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)

svm_weights_selected /= svm_weights_selected.max()

plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM weights after selection', color='b')

plt.title("Comparing feature selection")

plt.xlabel('Feature number')

plt.yticks(())

plt.axis('tight')

plt.legend(loc='upper right')

plt.show()

The resulting plot compares the univariate scores with the SVM weights before and after selection (figure not reproduced here).


3. Recursive feature elimination (RFE)

Starting from a model trained on all features, this method removes the less important features step by step: at each iteration the feature with the smallest absolute weight is dropped, until the preset number of features remains.
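As a minimal sketch of the API (the estimator and the number of features kept here are arbitrary choices, not taken from the original text), the fitted RFE object exposes the selected features through support_ and the elimination order through ranking_:

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# Keep the 2 best features, dropping one feature per iteration.
rfe = RFE(estimator=SVC(kernel="linear"), n_features_to_select=2, step=1)
rfe.fit(X, y)

print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # rank 1 marks a selected feature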

A complete example, which ranks every pixel of the digits images, follows:

print(__doc__)

from sklearn.svm import SVC

from sklearn.datasets import load_digits

from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt

# Load the digits dataset

digits = load_digits()

X = digits.images.reshape((len(digits.images), -1))

y = digits.target

# Create the RFE object and rank each pixel

svc = SVC(kernel="linear", C=1)

rfe = RFE(estimator=svc, n_features_to_select=1, step=1)

rfe.fit(X, y)

ranking = rfe.ranking_.reshape(digits.images[0].shape)

# Plot pixel ranking

plt.matshow(ranking)

plt.colorbar()

plt.title("Ranking of pixels with RFE")

plt.show()

The resulting plot shows the RFE ranking of each pixel (figure not reproduced here).


An example that adds cross-validation (RFECV) follows:

print(__doc__)

import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.cross_validation import StratifiedKFold

from sklearn.feature_selection import RFECV

from sklearn.datasets import make_classification

# Build a classification task using 3 informative features

X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.

svc = SVC(kernel="linear")

# The "accuracy" scoring is proportional to the number of correct

# classifications

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),
              scoring='accuracy')

rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores

plt.figure()

plt.xlabel("Number of features selected")

plt.ylabel("Cross validation score (nb of correct classifications)")

plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

plt.show()



4. Feature selection with SelectFromModel

With this method, a model is trained first, and any feature whose fitted weight (or importance) falls below a chosen threshold is discarded.

An example:

# Author: Manoj Kumar <mks542@nyu.edu>

# License: BSD 3 clause

print(__doc__)

import matplotlib.pyplot as plt

import numpy as np

from sklearn.datasets import load_boston

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LassoCV

# Load the boston dataset.

boston = load_boston()

X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since
# the L1 norm promotes sparsity of features.

clf = LassoCV()

# Set a minimum threshold of 0.25

sfm = SelectFromModel(clf, threshold=0.25)

sfm.fit(X, y)

n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.

# Note that the attribute can be set directly instead of repeatedly

# fitting the metatransformer.

while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.

plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)

feature1 = X_transform[:, 0]

feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')

plt.xlabel("Feature number 1")

plt.ylabel("Feature number 2")

plt.ylim([np.min(feature2), np.max(feature2)])

plt.show()
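Instead of raising a numeric threshold in a loop, SelectFromModel also accepts string thresholds such as "mean" or "median", computed from the fitted weights. A small sketch reusing the clf, X and y defined above (the number of surviving features depends on the fitted coefficients):

# Keep only the features whose |coefficient| exceeds the mean absolute coefficient.
sfm_mean = SelectFromModel(clf, threshold="mean")
sfm_mean.fit(X, y)

print(sfm_mean.transform(X).shape)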


5. Sparse-recovery-based feature selection

This approach is closely related to compressed sensing and L1 regularization: an L1 penalty drives many coefficients exactly to zero, so the features that keep nonzero coefficients are the ones selected. The example below compares several feature-scoring methods (F-test, stability selection, Lasso coefficients, and tree importances) on simulated recovery problems.

print(__doc__)

# Author: Alexandre Gramfort and Gael Varoquaux

# License: BSD 3 clause

import warnings

import matplotlib.pyplot as plt

import numpy as np

from scipy import linalg

from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
                                  LassoLarsCV)

from sklearn.feature_selection import f_regression

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import auc, precision_recall_curve

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.utils.extmath import pinvh

from sklearn.utils import ConvergenceWarning

def mutual_incoherence(X_relevant, X_irelevant):
    """Mutual incoherence, as defined by formula (26a) of [Wainwright2006]."""
    projector = np.dot(np.dot(X_irelevant.T, X_relevant),
                       pinvh(np.dot(X_relevant.T, X_relevant)))
    return np.max(np.abs(projector).sum(axis=1))

for conditioning in (1, 1e-4):
    #####################################
    # Simulate regression data with a correlated design
    n_features = 501
    n_relevant_features = 3
    noise_level = .2
    coef_min = .2
    # The Donoho-Tanner phase transition is around n_samples=25: below we
    # will completely fail to recover in the well-conditioned case
    n_samples = 25
    block_size = n_relevant_features

    rng = np.random.RandomState(42)

    # The coefficients of our model
    coef = np.zeros(n_features)
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # The correlation of our design: variables correlated by blocs of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditioning
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # Our design
    X = rng.normal(size=(n_samples, n_features))
    X = np.dot(X, corr)
    # Keep [Wainwright2006] (26c) constant
    X[:n_relevant_features] /= np.abs(
        linalg.svdvals(X[:n_relevant_features])).max()
    X = StandardScaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    #####################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)

    plt.figure()
    # We plot the path as a function of alpha/alpha_max to the power 1/3: the
    # power 1/3 scales the path less brutally than the log, and enables to
    # see the progression along the path
    hg = plt.plot(alpha_grid[1:] ** .333,
                  scores_path[coef != 0].T[1:], 'r')
    hb = plt.plot(alpha_grid[1:] ** .333,
                  scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = plt.ylim()
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    plt.axis('tight')
    plt.legend((hg[0], hb[0]),
               ('relevant features', 'irrelevant features'),
               loc='best')

    #####################################
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than the default 3-fold: it leads to
    # a better choice of alpha.
    # Stop the user warnings outputs - they are not necessary for the example
    # as it is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use a path going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    plt.figure()
    for name, score in [('F-test', F),
                        ('Stability selection', clf.scores_),
                        ('Lasso coefs', np.abs(lars_cv.coef_)),
                        ('Trees', trees.feature_importances_),
                        ]:
        precision, recall, thresholds = precision_recall_curve(coef != 0,
                                                               score)
        plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                     label="%s. AUC: %.3f" % (name, auc(recall, precision)))

    plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
             label="Ground truth")
    plt.xlabel("Features")
    plt.ylabel("Score")
    # Plot only the 100 first coefficients
    plt.xlim(0, 100)
    plt.legend(loc='best')
    plt.title('Feature selection scores - Mutual incoherence: %.1f' % mi)

plt.show()

The resulting stability-path and feature-score plots (one pair for each conditioning value) are not reproduced here.





6. Tree-based (random forest) feature selection

Ensembles of trees expose a feature_importances_ attribute, which can be used to rank and discard irrelevant features. An example:

print(__doc__)

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification

from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features

X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances

forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)

importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)

indices = np.argsort(importances)[::-1]

# Print the feature ranking

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" %
          (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest

plt.figure()

plt.title("Feature importances")

plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")

plt.xticks(range(X.shape[1]), indices)

plt.xlim([-1, X.shape[1]])

plt.show()

The resulting bar chart of feature importances, with the inter-tree standard deviation as error bars, is not reproduced here.
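The importances computed above can also be fed back into SelectFromModel to actually reduce the data set (a sketch reusing the forest fitted above; prefit=True avoids refitting, and the "mean" threshold is an arbitrary choice):

from sklearn.feature_selection import SelectFromModel

# Select the features whose importance exceeds the mean importance.
sfm = SelectFromModel(forest, threshold="mean", prefit=True)
X_selected = sfm.transform(X)

print(X_selected.shape)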


The next example uses a forest to estimate the importance of every pixel in face images.

print(__doc__)

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces

from sklearn.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel
# fitting of the forest model

n_jobs = 1

# Load the faces dataset

data = fetch_olivetti_faces()

X = data.images.reshape((len(data.images), -1))

y = data.target

# Limit to 5 classes
mask = y < 5

X = X[mask]
y = y[mask]

# Build a forest and compute the pixel importances

print("Fitting ExtraTreesClassifier on faces data with %d cores..."
   
% n_jobs)

t0 = time()

forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)

print("done in %0.3fs" % (time() - t0))

importances = forest.feature_importances_

importances = importances.reshape(data.images[0].shape)

# Plot pixel importances

plt.matshow(importances, cmap=plt.cm.hot)

plt.title("Pixel importances with forests of trees")

plt.show()

The resulting map of pixel importances is not reproduced here.

