爬取糗事百科-使用BeautifulSoup解析页面获取文本内容

发表: 2017-10-24 浏览: 1828

Python

思路：

首先爬取整个页面源码，这个比较简单
保存到本地供测试用，省得每次测试都要重新爬取一遍
分析源码，使用 BeautifulSoup 的 find_all 找出需要的 tag 即可。

代码：

# -*- coding:utf-8 -*-



import urllib.request

import re

from bs4 import BeautifulSoup



import sys

import io

# Re-wrap stdout's underlying byte buffer so that printed text is encoded as
# UTF-8 rather than with the stream's default encoding.
utf8_stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
sys.stdout = utf8_stdout

def save_hot_page(
        url='https://www.qiushibaike.com/hot/',
        save_path=r"D:\python学习笔记\爬虫\保存记录\7_糗事百科HTML.txt"):
    """Download one page and cache its HTML on disk for offline testing.

    This step used to be disabled by wrapping it in a string literal. It is
    now a function that is NOT called at import time, so the script still
    works from the cached copy only. Call ``save_hot_page()`` once by hand
    whenever the cached file needs to be (re)created.

    Args:
        url: Address of the page to fetch.
        save_path: File that receives the decoded HTML (overwritten).

    Returns:
        The page source decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        OSError: If ``save_path`` cannot be written.
    """
    # Send a browser-like User-Agent with the request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                            ' Chrome/59.0.3071.109 Safari/537.36'}
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(html)
    return html

# Load the locally cached page source and parse it with the lxml parser.
# The with-statement guarantees the handle is closed even if read() raises.
with open(r"D:\python学习笔记\爬虫\保存记录\7_糗事百科HTML.txt", 'r', encoding='utf-8') as file:
    data = file.read()

soup = BeautifulSoup(data, 'lxml')



# Gather the joke text, author info, vote counts, etc. separately.
# Joke bodies: the text of every <div class="content"> in document order.
content = [block.text for block in soup.find_all(name='div', class_='content')]



# For every <div class="author clearfix"> collect the <h2> text (poster name)
# and the text of the first nested <div> (the value the output step labels
# "年龄").
author = []
birth = []
for info in soup.find_all(name='div', class_='author clearfix'):
    # Attribute-style navigation returns None when the child tag is absent.
    # Store '' in that case instead of crashing with AttributeError, so both
    # lists always gain exactly one entry per block and stay index-aligned.
    name_tag = info.h2
    detail_tag = info.div
    author.append(name_tag.text if name_tag is not None else '')
    birth.append(detail_tag.text if detail_tag is not None else '')

	

# Vote counts: the <i> text inside every <span class="stats-vote">.
agree = [vote.i.text for vote in soup.find_all(name='span', class_='stats-vote')]



# Comment counts: the <i> text inside every <a> whose onclick attribute
# matches the comment-tracking handler string exactly.
comment = [
    link.i.text
    for link in soup.find_all(
        name='a',
        onclick="_hmt.push(['_trackEvent','web-list-comment','chick'])",
    )
]



# Append one formatted record per joke to the results file.
# The file is opened once for the whole run instead of once per record; it is
# still opened in append mode, so repeated runs keep adding to it.
with open(r"D:\python学习笔记\爬虫\保存记录\7_03糗事百科HTML.txt", 'a', encoding='utf-8') as f:
    # zip() walks the parallel lists in lockstep and stops at the shortest,
    # so a list that came up short no longer raises IndexError part-way
    # through the output.
    rows = zip(author, birth, agree, comment, content)
    for i, (name, age, votes, replies, text) in enumerate(rows, start=1):
        str_n = '==============================第%s个笑话============================\n' % i
        # A parenthesized expression spans several lines without fragile
        # backslash continuations. Embedded newlines are stripped from each
        # field so every field stays on its own output line.
        str1 = (
            '作者：' + name.replace('\n', '') + '\n'
            + '年龄：' + age.replace('\n', '') + '\t'
            + '点赞：' + votes.replace('\n', '') + '\t'
            + '评论：' + replies.replace('\n', '') + '\n'
            + '内容：' + text.replace('\n', '') + '\n' + '\n'
        )
        f.write(str_n + str1)

效果图：

总结：

因为刚开始学习使用 BeautifulSoup，整体感觉比使用正则表达式简单很多，代码量也少了，不用去匹配那么多东西！不过这次爬取的内容比较简单，希望大家多多交流，给予指正。

0 个评论

要回复文章请先登录或注册