可能更多的人知道Hadley大神是因为他那大名鼎鼎的ggplot2,但dplyr包也不遑多让。作为一套数据处理与清洗的语法,dplyr包都是你玩转数据的必备武器。R在基础配置中进行数据处理无疑也是okay的,但总有懒人会想到要偷更多懒,于是Hadley大神又写了dplyr包来拯救这些懒人。本文主要参考R自带的帮助系统dplyr导论(introduction to dplyr)以及dplyr-package文档。
dplyr包提供了几个核心“动词”函数来快速方便的处理数据,包括filter、arrange、select、distinct、mutate以及summarise等函数。下面以nycflights包中的flights数据集为例演示dplyr包处理数据的一系列操作。flights数据集包含了2013年从纽约起飞的336776架航班的飞行数据。
数据概览
library(dplyr)
library(nycflights13)
dim(flights)
[1] 336776 19
#将数据转换为更好处理的tibble表格形式
flights<-tbl_df(flights)
head(flights)
# A tibble: 6 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
数据筛选-filter/slice
filter(flights,month==1,day==1)
# A tibble: 842 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#选取10行观测
slice(flights,1:10)
# A tibble: 10 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
同样功能R基础用法为:
flights[which(flights$month==1&flights$day==1),]
flights[1:10,]
数据重排-arrange
arrange(flights,year,month,day)
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#按照某个变量降序排列
arrange(flights,desc(arr_delay))
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 9 641 900 1301 1242
2 2013 6 15 1432 1935 1137 1607
3 2013 1 10 1121 1635 1126 1239
4 2013 9 20 1139 1845 1014 1457
5 2013 7 22 845 1600 1005 1044
6 2013 4 10 1100 1900 960 1342
7 2013 3 17 2321 810 911 135
8 2013 7 22 2257 759 898 121
9 2013 12 5 756 1700 896 1058
10 2013 5 3 1133 2055 878 1250
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
相应功能的R基础用法:
flights[order(flights$year,flights$month,flights$day),]
flights[order(flights$arr_delay,decreasing=TRUE),]
数据选择-select
select(flights,year,month,day)
# A tibble: 336,776 × 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
#数据选择的R基础用法
subset(flights,select=year:day)
# A tibble: 336,776 × 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
数据变量重命名-rename
rename(flights,tail_num=tailnum)
#变量重命名的R基础用法
tail_num<-flights$tailnum
抽取变量中单一观测值-distinct
distinct(flights,tailnum)
# A tibble: 4,044 × 1
tailnum
<chr>
1 N14228
2 N24211
3 N619AA
4 N804JB
5 N668DN
6 N39463
7 N516JB
8 N829AS
9 N593JB
10 N3ALAA
# ... with 4,034 more rows
#相应功能的R基础用法
unique(flights$tailnum)
数据变形-mutate
mutate(flights,gain=arr_delay-dep_delay,speed=distance/air_time*60)
# A tibble: 336,776 × 21
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
#相应功能的R基础用法
transform(flights,gain=arr_delay-dep_delay,speed=distance/air_time*60)
mutate与transform的区别在于mutate可以即时调用刚刚生成的变量名,但transform函数则不行。不过也可以通过transmute函数来仅显示创建的变量数据。
transmute(flights,gain=arr_delay-dep_delay,gain_per_hour=gain/(air_time/60))
# A tibble: 336,776 × 2
gain gain_per_hour
<dbl> <dbl>
1 9 2.378855
2 16 4.229075
3 31 11.625000
4 -17 -5.573770
5 -19 -9.827586
6 16 6.400000
7 24 9.113924
8 -11 -12.452830
9 -5 -2.142857
10 10 4.347826
# ... with 336,766 more rows
数据归总summarise与随机抽样sample_n/sample_frac
summarise(flights,delay=mean(dep_delay,na.rm=TRUE))
# A tibble: 1 × 1
delay
<dbl>
1 12.63907
#按个数抽样
sample_n(flights,10)
# A tibble: 10 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 7 31 1945 1930 15 2320
2 2013 8 2 1856 1820 36 2121
3 2013 1 22 817 825 -8 941
4 2013 11 27 514 515 -1 743
5 2013 9 1 1705 1705 0 1926
6 2013 10 13 2026 1940 46 2216
7 2013 12 4 1419 1425 -6 1641
8 2013 11 8 713 719 -6 920
9 2013 3 8 1510 1455 15 1645
10 2013 3 7 2227 2230 -3 312
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
#按比例抽样
sample_frac(flights,0.01)
# A tibble: 3,368 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 6 3 625 630 -5 932
2 2013 9 26 649 655 -6 1041
3 2013 3 10 1212 1220 -8 1359
4 2013 8 21 1123 1130 -7 1320
5 2013 8 6 1314 1155 79 1424
6 2013 5 21 656 601 55 811
7 2013 1 26 1727 1734 -7 1848
8 2013 12 7 919 920 -1 1321
9 2013 9 5 1728 1725 3 2022
10 2013 8 16 1450 1440 10 1708
# ... with 3,358 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
或许你也注意到了dplyr进行数据处理的几个主要“动词”函数的规律了:
● 第一个参数一定是需要处理的数据集
● 第二个参数则形容了我们将要对数据进行怎样的处理,相较于基础的数据操作而言,dplyr函数无需使用$符号。
● dplyr函数的结果以一个新的数据框形式呈现。
数据的分组处理-group_by
destinations<-group_by(flights,dest)
summarise(destinations,planes=n_distinct(tailnum),
flights=n())
# A tibble: 105 × 3
dest planes flights
<chr> <int> <int>
1 ABQ 108 254
2 ACK 58 265
3 ALB 172 439
4 ANC 6 8
5 ATL 1180 17215
6 AUS 993 2439
7 AVL 159 275
8 BDL 186 443
9 BGR 46 375
10 BHM 45 297
# ... with 95 more rows
分组处理结合汇总函数可以轻松按照变量对数据进行上卷处理:
daily<-group_by(flights,year,month,day)
#按天上卷
(per_day<-summarise(daily,flights=n()))
Source: local data frame [365 x 4]
Groups: year, month [?]
year month day flights
<int> <int> <int> <int>
1 2013 1 1 842
2 2013 1 2 943
3 2013 1 3 914
4 2013 1 4 915
5 2013 1 5 720
6 2013 1 6 832
7 2013 1 7 933
8 2013 1 8 899
9 2013 1 9 902
10 2013 1 10 932
# ... with 355 more rows
#按月上卷
(per_month<-summarise(per_day,flights=sum(flights)))
Source: local data frame [12 x 3]
Groups: year [?]
year month flights
<int> <int> <int>
1 2013 1 27004
2 2013 2 24951
3 2013 3 28834
4 2013 4 28330
5 2013 5 28796
6 2013 6 28243
7 2013 7 29425
8 2013 8 29327
9 2013 9 27574
10 2013 10 28889
11 2013 11 27268
12 2013 12 28135
#按年上卷
(per_year<-summarise(per_month,flights=sum(flights)))
# A tibble: 1 × 2
year flights
<int> <int>
1 2013 336776
管道函数%>%
dplyr在处理过程必须及时对结果进行保存,所以在进行数据处理时需一步步赋值处理:
a1<-group_by(flights,year,month,day)
a2<-select(a1,arr_delay,dep_delay)
a3<-summarise(a2,
arr=mean(arr_delay,na.rm=TRUE),
dep=mean(dep_delay,na.rm=TRUE))
(a4<-filter(a3,arr>30|dep>30))
Source: local data frame [49 x 5]
Groups: year, month [11]
year month day arr dep
<int> <int> <int> <dbl> <dbl>
1 2013 1 16 34.24736 24.61287
2 2013 1 31 32.60285 28.65836
3 2013 2 11 36.29009 39.07360
4 2013 2 27 31.25249 37.76327
5 2013 3 8 85.86216 83.53692
6 2013 3 18 41.29189 30.11796
7 2013 4 10 38.41231 33.02368
8 2013 4 12 36.04814 34.83843
9 2013 4 18 36.02848 34.91536
10 2013 4 19 47.91170 46.12783
# ... with 39 more rows
当然,若果你不想进行即时保存,也可以通过对函数的嵌套处理:
filter(
summarise(
select(
group_by(flights,year,month,day),
arr_delay,dep_delay
),
arr=mean(arr_delay,na.rm=TRUE),
dep=mean(dep_delay,na.rm=TRUE)
),
arr>30|dep>30
)
Adding missing grouping variables: `year`, `month`, `day`
Source: local data frame [49 x 5]
Groups: year, month [11]
year month day arr dep
<int> <int> <int> <dbl> <dbl>
1 2013 1 16 34.24736 24.61287
2 2013 1 31 32.60285 28.65836
3 2013 2 11 36.29009 39.07360
4 2013 2 27 31.25249 37.76327
5 2013 3 8 85.86216 83.53692
6 2013 3 18 41.29189 30.11796
7 2013 4 10 38.41231 33.02368
8 2013 4 12 36.04814 34.83843
9 2013 4 18 36.02848 34.91536
10 2013 4 19 47.91170 46.12783
# ... with 39 more rows
上面的嵌套读起来太费劲,dplyr为大家提供了管道操作符号%>%来传递每一层的运行结果:
flights %>%
group_by(year,month,day) %>%
select(arr_delay,dep_delay) %>%
summarise(
arr=mean(arr_delay,na.rm=TRUE),
dep=mean(dep_delay,na.rm=TRUE)
) %>%
filter(arr>30|dep>30)
Adding missing grouping variables: `year`, `month`, `day`
Source: local data frame [49 x 5]
Groups: year, month [11]
year month day arr dep
<int> <int> <int> <dbl> <dbl>
1 2013 1 16 34.24736 24.61287
2 2013 1 31 32.60285 28.65836
3 2013 2 11 36.29009 39.07360
4 2013 2 27 31.25249 37.76327
5 2013 3 8 85.86216 83.53692
6 2013 3 18 41.29189 30.11796
7 2013 4 10 38.41231 33.02368
8 2013 4 12 36.04814 34.83843
9 2013 4 18 36.02848 34.91536
10 2013 4 19 47.91170 46.12783
# ... with 39 more rows
长按二维码.关注数据科学家养成记