dplyr：一套数据处理语法

发表: 2017-06-19 浏览: 1958

数据分析 R语言

可能更多的人知道Hadley大神是因为他那大名鼎鼎的ggplot2，但dplyr包也不遑多让。作为一套数据处理与清洗的语法，dplyr包都是你玩转数据的必备武器。R在基础配置中进行数据处理无疑也是okay的，但总有懒人会想到要偷更多懒，于是Hadley大神又写了dplyr包来拯救这些懒人。本文主要参考R自带的帮助系统dplyr导论（introduction to dplyr）以及dplyr-package文档。

dplyr包提供了几个核心“动词”函数来快速方便的处理数据，包括filter、arrange、select、distinct、mutate以及summarise等函数。下面以nycflights包中的flights数据集为例演示dplyr包处理数据的一系列操作。flights数据集包含了2013年从纽约起飞的336776架航班的飞行数据。

数据概览

library(dplyr)
library(nycflights13)
dim(flights)
[1] 336776 19
#将数据转换为更好处理的tibble表格形式
flights<-tbl_df(flights)
head(flights)
# A tibble: 6 × 19
 year month day dep_time sched_dep_time dep_delay arr_time
 <int> <int> <int> <int>  <int> <dbl> <int>
1 2013 1 1 517  515  2 830
2 2013 1 1 533  529  4 850
3 2013 1 1 542  540  2 923
4 2013 1 1 544  545 -1 1004
5 2013 1 1 554  600 -6 812
6 2013 1 1 554  558 -4 740
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>

数据筛选-filter/slice

 filter(flights,month==1,day==1)
# A tibble: 842 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   1   1   517      515     2   830
2  2013   1   1   533      529     4   850
3  2013   1   1   542      540     2   923
4  2013   1   1   544      545    -1   1004
5  2013   1   1   554      600    -6   812
6  2013   1   1   554      558    -4   740
7  2013   1   1   555      600    -5   913
8  2013   1   1   557      600    -3   709
9  2013   1   1   557      600    -3   838
10 2013   1   1   558      600    -2   753
# ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
#  arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#  origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#  minute <dbl>, time_hour <dttm>
#选取10行观测
slice(flights,1:10)
# A tibble: 10 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   1   1   517      515     2   830
2  2013   1   1   533      529     4   850
3  2013   1   1   542      540     2   923
4  2013   1   1   544      545    -1   1004
5  2013   1   1   554      600    -6   812
6  2013   1   1   554      558    -4   740
7  2013   1   1   555      600    -5   913
8  2013   1   1   557      600    -3   709
9  2013   1   1   557      600    -3   838
10 2013   1   1   558      600    -2   753
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
#  carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#  air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#  time_hour <dttm>

同样功能R基础用法为：

flights[which(flights$month==1&flights$day==1),]
flights[1:10,]

数据重排-arrange

arrange(flights,year,month,day)
# A tibble: 336,776 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   1   1   517      515     2   830
2  2013   1   1   533      529     4   850
3  2013   1   1   542      540     2   923
4  2013   1   1   544      545    -1   1004
5  2013   1   1   554      600    -6   812
6  2013   1   1   554      558    -4   740
7  2013   1   1   555      600    -5   913
8  2013   1   1   557      600    -3   709
9  2013   1   1   557      600    -3   838
10 2013   1   1   558      600    -2   753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
#  arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#  origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#  minute <dbl>, time_hour <dttm>
#按照某个变量降序排列
 arrange(flights,desc(arr_delay))
# A tibble: 336,776 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   1   9   641      900   1301   1242
2  2013   6  15   1432      1935   1137   1607
3  2013   1  10   1121      1635   1126   1239
4  2013   9  20   1139      1845   1014   1457
5  2013   7  22   845      1600   1005   1044
6  2013   4  10   1100      1900    960   1342
7  2013   3  17   2321      810    911   135
8  2013   7  22   2257      759    898   121
9  2013  12   5   756      1700    896   1058
10 2013   5   3   1133      2055    878   1250
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
#  arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#  origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#  minute <dbl>, time_hour <dttm>

相应功能的R基础用法：

flights[order(flights$year,flights$month,flights$day),]
flights[order(flights$arr_delay,decreasing=TRUE),]

数据选择-select

 select(flights,year,month,day)
# A tibble: 336,776 × 3
  year month  day
  <int> <int> <int>
1  2013   1   1
2  2013   1   1
3  2013   1   1
4  2013   1   1
5  2013   1   1
6  2013   1   1
7  2013   1   1
8  2013   1   1
9  2013   1   1
10 2013   1   1
# ... with 336,766 more rows
#数据选择的R基础用法
subset(flights,select=year:day)
# A tibble: 336,776 × 3
  year month  day
  <int> <int> <int>
1  2013   1   1
2  2013   1   1
3  2013   1   1
4  2013   1   1
5  2013   1   1
6  2013   1   1
7  2013   1   1
8  2013   1   1
9  2013   1   1
10 2013   1   1
# ... with 336,766 more rows

数据变量重命名-rename

rename(flights,tail_num=tailnum)
#变量重命名的R基础用法
tail_num<-flights$tailnum

抽取变量中单一观测值-distinct

distinct(flights,tailnum)
# A tibble: 4,044 × 1
  tailnum
   <chr>
1  N14228
2  N24211
3  N619AA
4  N804JB
5  N668DN
6  N39463
7  N516JB
8  N829AS
9  N593JB
10 N3ALAA
# ... with 4,034 more rows
#相应功能的R基础用法
unique(flights$tailnum)

数据变形-mutate

mutate(flights,gain=arr_delay-dep_delay,speed=distance/air_time*60)
# A tibble: 336,776 × 21
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   1   1   517      515     2   830
2  2013   1   1   533      529     4   850
3  2013   1   1   542      540     2   923
4  2013   1   1   544      545    -1   1004
5  2013   1   1   554      600    -6   812
6  2013   1   1   554      558    -4   740
7  2013   1   1   555      600    -5   913
8  2013   1   1   557      600    -3   709
9  2013   1   1   557      600    -3   838
10 2013   1   1   558      600    -2   753
# ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
#  arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#  origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#  minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
#相应功能的R基础用法
transform(flights,gain=arr_delay-dep_delay,speed=distance/air_time*60)

mutate与transform的区别在于mutate可以即时调用刚刚生成的变量名，但transform函数则不行。不过也可以通过transmute函数来仅显示创建的变量数据。

transmute(flights,gain=arr_delay-dep_delay,gain_per_hour=gain/(air_time/60))
# A tibble: 336,776 × 2
  gain gain_per_hour
  <dbl>     <dbl>
1   9   2.378855
2   16   4.229075
3   31   11.625000
4  -17   -5.573770
5  -19   -9.827586
6   16   6.400000
7   24   9.113924
8  -11  -12.452830
9   -5   -2.142857
10  10   4.347826
# ... with 336,766 more rows

数据归总summarise与随机抽样sample_n/sample_frac

summarise(flights,delay=mean(dep_delay,na.rm=TRUE))
# A tibble: 1 × 1
   delay
   <dbl>
1 12.63907
#按个数抽样
 sample_n(flights,10)
# A tibble: 10 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   7  31   1945      1930    15   2320
2  2013   8   2   1856      1820    36   2121
3  2013   1  22   817      825    -8   941
4  2013  11  27   514      515    -1   743
5  2013   9   1   1705      1705     0   1926
6  2013  10  13   2026      1940    46   2216
7  2013  12   4   1419      1425    -6   1641
8  2013  11   8   713      719    -6   920
9  2013   3   8   1510      1455    15   1645
10 2013   3   7   2227      2230    -3   312
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
#  carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#  air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#  time_hour <dttm>
#按比例抽样
 sample_frac(flights,0.01)
# A tibble: 3,368 × 19
  year month  day dep_time sched_dep_time dep_delay arr_time
  <int> <int> <int>  <int>     <int>   <dbl>  <int>
1  2013   6   3   625      630    -5   932
2  2013   9  26   649      655    -6   1041
3  2013   3  10   1212      1220    -8   1359
4  2013   8  21   1123      1130    -7   1320
5  2013   8   6   1314      1155    79   1424
6  2013   5  21   656      601    55   811
7  2013   1  26   1727      1734    -7   1848
8  2013  12   7   919      920    -1   1321
9  2013   9   5   1728      1725     3   2022
10 2013   8  16   1450      1440    10   1708
# ... with 3,358 more rows, and 12 more variables: sched_arr_time <int>,
#  arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#  origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#  minute <dbl>, time_hour <dttm>

或许你也注意到了dplyr进行数据处理的几个主要“动词”函数的规律了：

● 第一个参数一定是需要处理的数据集

● 第二个参数则形容了我们将要对数据进行怎样的处理，相较于基础的数据操作而言，dplyr函数无需使用$符号。

● dplyr函数的结果以一个新的数据框形式呈现。

数据的分组处理-group_by

destinations<-group_by(flights,dest)
summarise(destinations,planes=n_distinct(tailnum),
     flights=n())
# A tibble: 105 × 3
  dest planes flights
  <chr> <int>  <int>
1  ABQ  108   254
2  ACK   58   265
3  ALB  172   439
4  ANC   6    8
5  ATL  1180  17215
6  AUS  993  2439
7  AVL  159   275
8  BDL  186   443
9  BGR   46   375
10  BHM   45   297
# ... with 95 more rows

分组处理结合汇总函数可以轻松按照变量对数据进行上卷处理：

daily<-group_by(flights,year,month,day)
#按天上卷
(per_day<-summarise(daily,flights=n()))
Source: local data frame [365 x 4]
Groups: year, month [?]
  year month  day flights
  <int> <int> <int>  <int>
1  2013   1   1   842
2  2013   1   2   943
3  2013   1   3   914
4  2013   1   4   915
5  2013   1   5   720
6  2013   1   6   832
7  2013   1   7   933
8  2013   1   8   899
9  2013   1   9   902
10 2013   1  10   932
# ... with 355 more rows
#按月上卷
(per_month<-summarise(per_day,flights=sum(flights)))
Source: local data frame [12 x 3]
Groups: year [?]
  year month flights
  <int> <int>  <int>
1  2013   1  27004
2  2013   2  24951
3  2013   3  28834
4  2013   4  28330
5  2013   5  28796
6  2013   6  28243
7  2013   7  29425
8  2013   8  29327
9  2013   9  27574
10 2013  10  28889
11 2013  11  27268
12 2013  12  28135
#按年上卷
(per_year<-summarise(per_month,flights=sum(flights)))
# A tibble: 1 × 2
  year flights
 <int>  <int>
1 2013 336776

管道函数%>%

dplyr在处理过程必须及时对结果进行保存，所以在进行数据处理时需一步步赋值处理：

a1<-group_by(flights,year,month,day)
a2<-select(a1,arr_delay,dep_delay)
a3<-summarise(a2,
   arr=mean(arr_delay,na.rm=TRUE),
   dep=mean(dep_delay,na.rm=TRUE))
(a4<-filter(a3,arr>30|dep>30))
Source: local data frame [49 x 5]
Groups: year, month [11]
  year month  day   arr   dep
  <int> <int> <int>  <dbl>  <dbl>
1  2013   1  16 34.24736 24.61287
2  2013   1  31 32.60285 28.65836
3  2013   2  11 36.29009 39.07360
4  2013   2  27 31.25249 37.76327
5  2013   3   8 85.86216 83.53692
6  2013   3  18 41.29189 30.11796
7  2013   4  10 38.41231 33.02368
8  2013   4  12 36.04814 34.83843
9  2013   4  18 36.02848 34.91536
10 2013   4  19 47.91170 46.12783
# ... with 39 more rows

当然，若果你不想进行即时保存，也可以通过对函数的嵌套处理：

filter(
  summarise(
    select(
      group_by(flights,year,month,day),
      arr_delay,dep_delay
     ),
     arr=mean(arr_delay,na.rm=TRUE),
     dep=mean(dep_delay,na.rm=TRUE)
 ),
 arr>30|dep>30
)
Adding missing grouping variables: `year`, `month`, `day`
Source: local data frame [49 x 5]
Groups: year, month [11]
    year month   day      arr      dep
   <int> <int> <int>    <dbl>    <dbl>
1   2013     1    16 34.24736 24.61287
2   2013     1    31 32.60285 28.65836
3   2013     2    11 36.29009 39.07360
4   2013     2    27 31.25249 37.76327
5   2013     3     8 85.86216 83.53692
6   2013     3    18 41.29189 30.11796
7   2013     4    10 38.41231 33.02368
8   2013     4    12 36.04814 34.83843
9   2013     4    18 36.02848 34.91536
10  2013     4    19 47.91170 46.12783
# ... with 39 more rows

上面的嵌套读起来太费劲，dplyr为大家提供了管道操作符号%>%来传递每一层的运行结果：

flights %>%
  group_by(year,month,day) %>%
  select(arr_delay,dep_delay) %>%
  summarise(
   arr=mean(arr_delay,na.rm=TRUE),
   dep=mean(dep_delay,na.rm=TRUE)
  ) %>%
  filter(arr>30|dep>30)
Adding missing grouping variables: `year`, `month`, `day`
Source: local data frame [49 x 5]
Groups: year, month [11]
  year month  day   arr   dep
  <int> <int> <int>  <dbl>  <dbl>
1  2013   1  16 34.24736 24.61287
2  2013   1  31 32.60285 28.65836
3  2013   2  11 36.29009 39.07360
4  2013   2  27 31.25249 37.76327
5  2013   3   8 85.86216 83.53692
6  2013   3  18 41.29189 30.11796
7  2013   4  10 38.41231 33.02368
8  2013   4  12 36.04814 34.83843
9  2013   4  18 36.02848 34.91536
10 2013   4  19 47.91170 46.12783
# ... with 39 more rows

长按二维码.关注数据科学家养成记

1 个评论

鸣人吃土豆

这管道函数，666

要回复文章请先登录或注册