展示如何用R处理稀疏矩阵和利用GLMNET包

发表: 2016-07-04 浏览: 1844

# 建立简单的some_data.frame

# Load the tab-separated example data (first row holds the column names) from
# the user's home directory. A forward slash is used in the path because "\s"
# is not a valid escape sequence inside an R string literal, so the original
# backslash path failed to parse.
some_dataframe <- read.table(
  "~/some_data.frame.txt",
  sep = "\t",
  header = TRUE
)

# Echo the loaded data frame.
some_dataframe

## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome

## 1 2 7 0 0 0 0 0 0 0 0 0

## 2 0 0 3 0 0 0 0 0 0 0 0

## 3 0 0 0 6 1 0 0 0 0 0 0

## 4 0 0 0 2 0 0 0 0 0 0 0

## 5 0 0 0 0 0 0 0 0 12 0 1

## 6 0 0 0 0 0 25 0 0 0 0 1

## 7 1 0 0 0 2 0 0 0 0 0 0

## 8 0 0 0 2 0 0 0 0 0 0 0

## 9 0 0 0 0 0 0 0 0 14 0 1

## 10 0 0 0 0 0 21 0 0 0 0 1

## 11 0 0 0 0 0 0 28 0 0 0 1

## 12 0 0 0 0 0 0 0 35 0 0 1

## 13 0 0 0 0 0 0 0 0 42 0 1

## 14 0 0 0 0 0 0 0 0 0 49 1

# Coerce the data frame to a plain (dense) numeric matrix, then echo it.
some_matrix <- data.matrix(frame = some_dataframe)

some_matrix

## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome

## [1,] 2 7 0 0 0 0 0 0 0 0 0

## [2,] 0 0 3 0 0 0 0 0 0 0 0

## [3,] 0 0 0 6 1 0 0 0 0 0 0

## [4,] 0 0 0 2 0 0 0 0 0 0 0

## [5,] 0 0 0 0 0 0 0 0 12 0 1

## [6,] 0 0 0 0 0 25 0 0 0 0 1

## [7,] 1 0 0 0 2 0 0 0 0 0 0

## [8,] 0 0 0 2 0 0 0 0 0 0 0

## [9,] 0 0 0 0 0 0 0 0 14 0 1

## [10,] 0 0 0 0 0 21 0 0 0 0 1

## [11,] 0 0 0 0 0 0 28 0 0 0 1

## [12,] 0 0 0 0 0 0 0 35 0 0 1

## [13,] 0 0 0 0 0 0 0 0 42 0 1

## [14,] 0 0 0 0 0 0 0 0 0 49 1

# 显然，这跟data.frame格式差不多。为了将它转成稀疏矩阵，我们加载Matrix包，利用Matrix函数，将sparse参数设置为TRUE。

library(Matrix)

# Convert the dense matrix to a sparse representation and print it.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
print(Matrix(some_matrix, sparse = TRUE))

## 14 x 11 sparse Matrix of class "dgCMatrix"

## [[ suppressing 11 column names 'c1', 'c2', 'c3' ... ]]

##

## [1,] 2 7 . . . . . . . . .

## [2,] . . 3 . . . . . . . .

## [3,] . . . 6 1 . . . . . .

## [4,] . . . 2 . . . . . . .

## [5,] . . . . . . . . 12 . 1

## [6,] . . . . . 25 . . . . 1

## [7,] 1 . . . 2 . . . . . .

## [8,] . . . 2 . . . . . . .

## [9,] . . . . . . . . 14 . 1

## [10,] . . . . . 21 . . . . 1

## [11,] . . . . . . 28 . . . 1

## [12,] . . . . . . . 35 . . 1

## [13,] . . . . . . . . 42 . 1

## [14,] . . . . . . . . . 49 1

# 在这里，它只保留了非零值。

# 接下来，让我们将data.frame数据分成两份：约70%（14行中的9行）作为训练集，其余约30%（5行）作为测试集。

# Fix the RNG seed so the random partition below is reproducible.
set.seed(2)

# Sample floor(70%) of the row indices for training; every row that was not
# sampled goes to the test set.
n_obs <- nrow(some_dataframe)
split <- sample(n_obs, floor(0.7 * n_obs))

train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]

# Echo the training rows.
train

## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome

## 3 0 0 0 6 1 0 0 0 0 0 0

## 10 0 0 0 0 0 21 0 0 0 0 1

## 7 1 0 0 0 2 0 0 0 0 0 0

## 2 0 0 3 0 0 0 0 0 0 0 0

## 13 0 0 0 0 0 0 0 0 42 0 1

## 9 0 0 0 0 0 0 0 0 14 0 1

## 11 0 0 0 0 0 0 28 0 0 0 1

## 6 0 0 0 0 0 25 0 0 0 0 1

## 14 0 0 0 0 0 0 0 0 0 49 1

# Echo the held-out rows (those whose indices were not sampled into `split`).
test

## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome

## 1 2 7 0 0 0 0 0 0 0 0 0

## 4 0 0 0 2 0 0 0 0 0 0 0

## 5 0 0 0 0 0 0 0 0 12 0 1

## 8 0 0 0 2 0 0 0 0 0 0 0

## 12 0 0 0 0 0 0 0 35 0 0 1

# 然后，我们用sparse.model.matrix函数创建稀疏矩阵

# Build sparse design matrices from the first ten columns (the predictors
# c1..c10) of each partition. The `~ .` formula uses every supplied column and
# also adds an "(Intercept)" column.
train_sparse <- sparse.model.matrix(~ ., data = train[, 1:10])
test_sparse <- sparse.model.matrix(~ ., data = test[, 1:10])

# Echo the sparse training matrix.
train_sparse

## 9 x 11 sparse Matrix of class "dgCMatrix"

## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]

##

## 3 1 . . . 6 1 . . . . .

## 10 1 . . . . . 21 . . . .

## 7 1 1 . . . 2 . . . . .

## 2 1 . . 3 . . . . . . .

## 13 1 . . . . . . . . 42 .

## 9 1 . . . . . . . . 14 .

## 11 1 . . . . . . 28 . . .

## 6 1 . . . . . 25 . . . .

## 14 1 . . . . . . . . . 49

# Echo the sparse design matrix built from the test rows.
test_sparse

## 5 x 11 sparse Matrix of class "dgCMatrix"

## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]

##

## 1 1 2 7 . . . . . . . .

## 4 1 . . . 2 . . . . . .

## 5 1 . . . . . . . . 12 .

## 8 1 . . . 2 . . . . . .

## 12 1 . . . . . . . 35 . .

library(glmnet)

## Loaded glmnet 1.9-8

# Fit a glmnet model with default settings on the sparse training predictors,
# using column 11 (`outcome`) as the response.
fit <- glmnet(train_sparse, train[, 11])

# Predict on the test matrix for every lambda on the fitted path.
# The previous call passed `test[, 11]` as the third positional argument, which
# predict() for glmnet objects interprets as `s` (the penalty values), so the
# test labels were being used as lambdas. Labels are not an input to
# prediction, so that argument is dropped. `type = "response"` replaces
# `type = "class"`, which only applies to binomial/multinomial fits.
# NOTE(review): the sample output listed after this call in the article was
# produced by the old call; after this fix the printed columns correspond to
# the first five lambdas of the path and the listing should be regenerated.
pred <- predict(fit, newx = test_sparse, type = "response")

print(head(pred[, 1:5]))

## 1 2 3 4 5

## 1 0.9898 0.9898 0.6667 0.9898 0.6667

## 4 0.8306 0.8306 0.6667 0.8306 0.6667

## 5 0.9898 0.9898 0.6667 0.9898 0.6667

## 8 0.8306 0.8306 0.6667 0.8306 0.6667

## 12 0.9898 0.9898 0.6667 0.9898 0.6667

# 利用cv.glmnet找出最好的lambda/penalty

# Run 3-fold cross-validation over the lambda path on the training data.
cv <- cv.glmnet(train_sparse, train[, 11], nfolds = 3)

# Re-predict on the test matrix using the previously fitted model, evaluated
# at the cross-validated `lambda.min`.
pred <- predict(
  fit,
  newx = test_sparse,
  s = cv$lambda.min,
  type = "response"
)

# List the components stored in the cross-validation result.
print(names(cv))

## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"

## [6] "nzero" "name" "glmnet.fit" "lambda.min" "lambda.1se"

# Show the predictions for the test rows.
print(pred)

## 1

## 1 0.9898

## 4 0.8306

## 5 0.9898

## 8 0.8306

## 12 0.9898

# receiver operating characteristic (ROC curves)

library(pROC)

## Type 'citation("pROC")' for a citation.

##

## Attaching package: 'pROC'

##

## 下列对象被屏蔽了from 'package:glmnet':

##

## auc

##

## 下列对象被屏蔽了from 'package:stats':

##

## cov, smooth, var

# Build the ROC curve of the predictions against the true test outcomes
# (column 11). predict() returns `pred` as a one-column matrix; roc() expects
# a plain numeric vector as predictor, so the matrix is flattened first.
auc <- roc(test[, 11], as.numeric(pred))

# Report the area under the ROC curve.
print(auc$auc)

## Area under the curve: 0.833

# how does sparse deal with categorical data (adding mood feature with two levels)?

# Add a two-level categorical feature `mood` to the example data.
# It is created as an explicit factor: since R 4.0, data.frame() no longer
# converts strings to factors by default, so without factor() a later
# levels(cat_dataframe$mood) would return NULL instead of "happy" "sad".
cat_dataframe <- data.frame(
  some_dataframe,
  mood = factor(c(
    "happy", "happy", "happy", "happy", "sad", "sad", "happy", "happy",
    "sad", "sad", "sad", "sad", "sad", "sad"
  ))
)

# Reorder the columns so the predictors (c1..c10, mood) precede `outcome`.
column_order <- c(colnames(cat_dataframe)[1:10], "mood", "outcome")
cat_dataframe <- cat_dataframe[, column_order]

# Show the sparse model matrix built from every column of the data frame.
sparse.model.matrix(~ ., cat_dataframe)

## 14 x 13 sparse Matrix of class "dgCMatrix"

## [[ suppressing 13 column names '(Intercept)', 'c1', 'c2' ... ]]

##

## 1 1 2 7 . . . . . . . . . .

## 2 1 . . 3 . . . . . . . . .

## 3 1 . . . 6 1 . . . . . . .

## 4 1 . . . 2 . . . . . . . .

## 5 1 . . . . . . . . 12 . 1 1

## 6 1 . . . . . 25 . . . . 1 1

## 7 1 1 . . . 2 . . . . . . .

## 8 1 . . . 2 . . . . . . . .

## 9 1 . . . . . . . . 14 . 1 1

## 10 1 . . . . . 21 . . . . 1 1

## 11 1 . . . . . . 28 . . . 1 1

## 12 1 . . . . . . . 35 . . 1 1

## 13 1 . . . . . . . . 42 . 1 1

## 14 1 . . . . . . . . . 49 1 1

# Inspect the levels of the categorical column.
print(levels(cat_dataframe[["mood"]]))

## [1] "happy" "sad"

# Dimensions of the data frame itself ...
dim(cat_dataframe)

## [1] 14 12

# ... compared with the dimensions of its sparse model matrix.
dim(sparse.model.matrix(~ ., data = cat_dataframe))

## [1] 14 13

顶0

0 个评论

要回复文章请先登录或注册