本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。
文章目录
- # 给数据集插入离群值 cars1 <- cars[1:30, ] # 原始数据 cars_outliers <- data.frame(speed = c(19, 19, 20, 20, 20), dist = c(190, 186, 210, 220, 218)) # 引入离群值 cars2 <- rbind(cars1, cars_outliers) # 包含李全职的数据 # 绘制包含离群值的数据建模结果 par(mfrow = c(1, 2)) plot(cars2$speed, cars2$dist, xlim = c(0, 28), ylim=c(0, 230), main = "With Outliers", xlab = "speed", ylab = "dist", pch = "*", col = "red", cex = 2) abline(lm(dist ~ speed, data = cars2), col = "blue", lwd = 3, lty = 2) # 绘制原始数据建模加过,留意回归线斜率的变化 plot(cars1$speed, cars1$dist, xlim = c(0, 28), ylim = c(0, 230), main = "Outliers removed \n A much better fit!", xlab = "speed", ylab = "dist", pch = "*", col = "red", cex = 2) abline(lm(dist ~ speed, data = cars1), col = "blue", lwd = 3, lty = 2)
- url <- "http://rstatistics.net/wp-content/uploads/2015/09/ozone.csv" # 备用数据源: https://raw.githubusercontent.com/selva86/datasets/master/ozone.csv inputData <- read.csv(url) # 导入数据 outlier_values <- boxplot.stats(inputData$pressure_height)$out # outlier values. boxplot(inputData$pressure_height, main="Pressure Height", boxwex=0.1) mtext(paste("Outliers: ", paste(outlier_values, collapse=", ")), cex=0.6)
- url <- "http://rstatistics.net/wp-content/uploads/2015/09/ozone.csv" ozone <- read.csv(url) # Month和Day_of_Week是分类变量 boxplot(ozone_reading ~ Month, data=ozone, main="Ozone reading across months") # 有明确的模式 boxplot(ozone_reading ~ Day_of_week, data=ozone, main="Ozone reading for days of week") # this may not be significant, as day of week variable is a subset of the month var.
- boxplot(ozone_reading ~ pressure_height, data=ozone, main="Boxplot for Pressure height (continuos var) vs Ozone") boxplot(ozone_reading ~ cut(pressure_height, pretty(inputData$pressure_height)), data=ozone, main="Boxplot for Pressure height (categorial) vs Ozone", cex.axis=0.5)
- mod <- lm(ozone_reading ~ ., data=ozone) cooksd <- cooks.distance(mod)
- plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance") # 绘制Cook距离 abline(h = 4*mean(cooksd, na.rm=T), col="red") # 添加决策线 text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # 添加标签
- influential 4*mean(cooksd, na.rm=T))]) # 有影响力的观测值行标 head(ozone[influential, ]) # 列出这些观测 #> Month Day_of_month Day_of_week ozone_reading pressure_height Wind_speed Humidity #> 19 1 19 1 4.07 5680 5 73 #> 23 1 23 5 4.90 5700 5 59 #> 58 2 27 5 22.89 5740 3 47 #> 133 5 12 3 33.04 5880 3 80 #> 135 5 14 5 31.15 5850 4 76 #> 149 5 28 5 4.82 5750 3 76 #> Temperature_Sandburg Temperature_ElMonte Inversion_base_height Pressure_gradient #> 19 52 56.48 393 -68 #> 23 69 51.08 3044 18 #> 58 53 58.82 885 -4 #> 133 80 73.04 436 0 #> 135 78 71.24 1181 50 #> 149 65 51.08 3644 86 #> Inversion_temperature Visibility #> 19 69.80 10 #> 23 52.88 150 #> 58 67.10 80 #> 133 86.36 40 #> 135 79.88 17 #> 149 59.36 70
- 第58, 133, 135行的ozone_reading值非常大第23, 135, 149行的Inversion_bzase_height值非常大第19行有非常低的Pressure_gradient
- car::outlierTest(mod) #> No Studentized residuals with Bonferonni p Largest |rstudent|: #> rstudent unadjusted p-value Bonferonni p #> 243 3.045756 0.0026525 0.53845
- set.seed(1234) y=rnorm(100) outlier(y) #> [1] 2.548991 outlier(y,opposite=TRUE) #> [1] -2.345698 dim(y) <- c(20,5) # convert it to a matrix outlier(y) #> [1] 2.415835 1.102298 1.647817 2.548991 2.121117 outlier(y,opposite=TRUE) #> [1] -2.345698 -2.180040 -1.806031 -1.390701 -1.372302
- set.seed(1234) x = rnorm(10) scores(x) # z得分 => (x-mean)/sd scores(x, type="chisq") # chisq得分 => (x - mean(x))^2/var(x) #> [1] 0.68458034 0.44007451 2.17210689 3.88421971 0.66539631 . . . scores(x, type="t") # t得分 scores(x, type="chisq", prob=0.9) # 是否超过chisq分布的0.9分位数 #> [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE scores(x, type="chisq", prob=0.95) # 0.95分位数 scores(x, type="z", prob=0.95) # 基于z得分判定 scores(x, type="t", prob=0.95) # 大家都懂,我懒得翻译了
- x <- ozone$pressure_height qnt <- quantile(x, probs=c(.25, .75), na.rm = T) caps <- quantile(x, probs=c(.05, .95), na.rm = T) H <- 1.5 * IQR(x, na.rm = T) x[x < (qnt[1] - H)] (qnt[2] + H)] <- caps[2]
- 离群值检测的重要性
- 检测离群值
- 结果如下
- 离散化处理后,你会发现被判定为离群值的点更少,并且ozone_reading随着pressure_height的增加而变化的趋势愈发明确了。
- 3. 多元模型检测法
- 仅凭一个特征就判定一个观测值是离群点可能并不科学。利用多个特征的信息来判断个体是否是离群值会更好,这就需要使用Cook距离。
- Cook距离可以衡量一个给定的回归模型是否只受单个变量X的影响。Cook距离会极端每一个数据点对预测结果的影响。对于每个观测i,Cook距离会衡量包含i与不包含i时,Y的拟合值的变化,这样我们就知道了i对拟合结果的影响了。
- 观测i的Cook距离计算公式如下:
- 其中:
- 是使用所有观测计算的第j个y的拟合值
- 是使用除观测i外所有观测计算的第j个y的拟合值
- 是均方误差
- 是回归模型的系数个数
- 离群值检验
- 0utliners包
- 离群值处理