挖掘成龙的朋友圈

浏览: 1610

今天给大家分享如何使用R语言从百度百科深层抓取成龙的朋友圈,以及如何绘制成龙的社交图.

加载R包

library(rvest)
library(stringr)

一级关系网抓取

url='https://baike.baidu.com/item/%E6%88%90%E9%BE%99/71648?fr=aladdin'
web=read_html(url)
name=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_attr('title')
relation=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_text()
relation=(str_split(relation,name)%>%unlist)[(str_split(relation,name)%>%unlist
                                             %>%str_length()) >0]
href=web%>%html_nodes('#slider_relations>ul>li> a')%>%html_attr('href')
re=data.frame(sorce='成龙',name=name,
             relation=relation,href=href,stringsAsFactors =F)

构建爬虫及清理函数

为减少代码冗余,构建两个函数.

  • 数据爬取函数

fun = function(x){
 web=read_html(x)
 name=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_attr('title')
 relation=web%>%html_nodes('#slider_relations>ul>li>a>.name')%>%html_text()
 relation=(str_split(relation,name)%>%unlist)[(str_split(relation,name)%>%unlist
                                               %>%str_length()) >0]
 href=web%>%html_nodes('#slider_relations>ul>li> a')%>%html_attr('href')
 re=data.frame(name=name,relation=relation,href=href,stringsAsFactors =F)
 return(re)
}

数据清理函数

fun1=function(x){
 re=x
 res=NULL
 for(i in 1:length(re$href)){
   if(nrow(fun(re$href[i]))>0){
     res=rbind(res,data.frame(sorce=re$name[i],fun(re$href[i]),stringsAsFactors =F))
   }
 }
 re1=res[(res$name %in% unique(re$sorce))==0,]
 return(re1)
}

数据读取及保存

#二级关系网
re1=fun1(re)
#三级关系网
re2=fun1(re1)
#四级关系网
re3=fun1(re2)
#五级关系网
re4=fun1(re3)
##数据保存
saveRDS(list(re=re,re1=re1,re2=re2,re3=re3,re4=re4),'D:\\chenglong.rds')

数据可视化

  • 绘制一级网络图

#加载数据
ldt=readRDS('D:\\chenglong.rds')
str(ldt)
str(ldt$re)
#一级关系网络图
library(GGally)
library(network)
library(ggnetwork)
library(geomnet)
##点
nodes=data.frame(label=unique(c(ldt$re$sorce,ldt$re$name)),
                grade=c(1,rep(2,9)),stringsAsFactors =F)
##边
links=ldt$re[,1:2]
mm.net <- network(links, directed = FALSE)
rownames(nodes) <- nodes$label
mm.net%v%'grade'= ifelse(network.vertex.names(mm.net)=="周杰伦" ,1,2)
set.edge.attribute(mm.net,'labb',ldt$re$relation)
set.seed(10052016)
ggplot(data = ggnetwork(mm.net, layout = "kamadakawai"),
      aes(x, y, xend = xend, yend = yend)) +
 geom_edges(color = "grey50",linetype=3) +
 geom_nodes(aes(colour = as.factor(grade)), size = 5) +
 geom_nodetext_repel(aes(colour = as.factor(grade),label = vertex.names),
               size = 4, vjust = -0.6) +
 geom_edgetext(aes(label = labb),color = "white", fill = "grey25")+
 scale_colour_manual(values =  c("pink", "gold")) +
 xlim(c(-0.05, 1.05)) +
 theme_blank() +
 theme(legend.position = "n")+
 theme(axis.text = element_blank(),
       axis.title = element_blank(),
       panel.background = element_rect(fill = "grey25"),
       panel.grid = element_blank())

image.png

绘制五级网络图

ldt$re$grade=2
ldt$re1$grade=3
ldt$re2$grade=4
ldt$re3$grade=5
ldt$re4$grade=6
dt4=rbind(ldt$re[,-4],ldt$re1[,-4],ldt$re2[,-4],ldt$re3[,-4],ldt$re4[,-4])
#点
nodes4=data.frame(label=c('成龙',dt4[!duplicated(dt4$name),]$name) ,
                 grade=c(1,dt4[!duplicated(dt4$name),]$grade),stringsAsFactors =F)
nodes4=nodes4[-27,]
#边
links4=dt4[,1:2]
mm.net4 <- network(links4, directed = FALSE)
rownames(nodes4) <- nodes4$label
mm.net4%v%'grade'= nodes4[network.vertex.names(mm.net4) ,'grade']
set.edge.attribute(mm.net4,'labb',dt4$relation)
set.seed(10052016)
p=ggplot(data = ggnetwork(mm.net4, layout = "kamadakawai"),
      aes(x, y, xend = xend, yend = yend)) +
 geom_edges(color = "grey80",linetype=3) +
 geom_nodes(aes(colour = as.factor(grade)), size = 1) +
 geom_nodetext_repel(aes(colour = as.factor(grade),label = vertex.names),
                     size = 3, vjust = -0.6) +
 #geom_edgetext(aes(label = labb),color = "white", fill = "grey25")+
 scale_colour_brewer(palette = "Oranges") +
 xlim(c(-0.05, 1.05)) +
 theme_blank() +
 theme(legend.position = "n")+
 theme(axis.text = element_blank(),
       axis.title = element_blank(),
       panel.background = element_rect(fill = "grey25"),
       panel.grid = element_blank())
p
windows(height = 15,width = 20)
p+scale_colour_brewer(palette = "Oranges")

image.png


推荐 1
本文由 EasyCharts 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册