Toutiao (今日头条) Street-Photography Scraping Exercise


This exercise makes use of the following:

* The Requests library

* The BeautifulSoup library

* Regular expressions

* The pymongo library

1. Project Workflow Analysis

[Figure: project workflow diagram]

2. Central Dispatch

# Central dispatch
def main(offset):
    # Fetch the list page
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        print('offset: {} request failed'.format(offset))
        return
    # Parse the list page to get every detail-page URL
    for url in parse_page_index(index_data):
        # Fetch the detail page
        detail_data = get_page_detail(url)
        if detail_data is None:
            print('url: {} request failed'.format(url))
            continue
        # Parse the detail page
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
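
The dispatcher, and the functions that follow, rely on several constants imported from a separate setting module (MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORDS, GROUP_START, GROUP_END) that the post never shows. A minimal sketch of what that module might contain, with placeholder values chosen purely for illustration:

# setting.py -- illustrative values only, adjust to your own environment
MONGO_URL = 'localhost'      # MongoDB connection address (assumed)
MONGO_DB = 'toutiao'         # database name (assumed)
MONGO_TABLE = 'jiepai'       # collection name (assumed)
KEYWORDS = '街拍'            # search keyword ("street photography")
GROUP_START = 0              # first page group; offset = group index * 20
GROUP_END = 20               # last page group (assumed)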

3. Requesting and Parsing the List Page

# Request the list page and return the response body
def get_page_index(offset, keywords):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/', params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the list page and yield every detail-page URL
def parse_page_index(text):
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        print('Failed to decode the list-page JSON')
        return
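
The search_content endpoint answers with JSON whose data field is a list of article entries, and parse_page_index simply yields each entry's article_url. A quick illustration of how the two pieces fit together, using a hand-made (not real) response body:

# Hypothetical, abbreviated response body for demonstration only
sample = '{"data": [{"article_url": "http://toutiao.com/a1"}, {"article_url": "http://toutiao.com/a2"}]}'
for url in parse_page_index(sample):
    print(url)
# http://toutiao.com/a1
# http://toutiao.com/a2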

4. Requesting and Parsing the Detail Page

# Request the detail page and return the response body
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the detail page
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    # Page title
    title = soup.title.string
    # The image URLs live in an inline JS variable: var gallery = {...};
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result:
        try:
            data = json.loads(result.group(1))
            if data and 'sub_images' in data.keys():
                # Collect all image URLs
                images = [item.get('url') for item in data.get('sub_images')]
                for image in images:
                    # Download each image
                    download_image(image)
                return {'title': title, 'url': url, 'images': images}
        except JSONDecodeError:
            return None
    return None
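
parse_page_detail relies on the gallery pages embedding their image list in an inline script as var gallery = {...};, which is why a regular expression is used instead of BeautifulSoup alone. A small self-contained illustration of that extraction step on a made-up HTML fragment (the URL is invented):

import json
import re

html = '<script>var gallery = {"sub_images": [{"url": "http://p3.pstatp.com/origin/abc.jpg"}]};</script>'
match = re.compile('var gallery = (.*?);', re.S).search(html)
if match:
    gallery = json.loads(match.group(1))
    print([item.get('url') for item in gallery['sub_images']])
# ['http://p3.pstatp.com/origin/abc.jpg']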

5. Downloading Images and Saving to MongoDB

# Fetch an image as a binary stream
def download_image(url):
    try:
        print('Downloading image: ' + url)
        response = requests.get(url)
        if response.status_code == 200:
            # Save the image to disk
            save_image(response.content)
    except RequestException:
        print('Failed to download image: ' + url)


# Write the binary stream to a file named by its MD5 hash
def save_image(content):
    image_dir = os.path.join(os.getcwd(), 'images')
    os.makedirs(image_dir, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(image_dir, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    if db[MONGO_TABLE].insert_one(data):
        print('Saved to MongoDB: {}'.format(data['title']))
        return True
    return False
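
Images are deduplicated on disk by their MD5 hash, but save_to_mongo inserts every record it receives, so re-running the crawl will store duplicate documents. One optional safeguard, not part of the original code, is a unique index on the url field; if you add it, catch pymongo.errors.DuplicateKeyError in save_to_mongo:

# Optional sketch: assumes the same MONGO_* settings used by the rest of the script
import pymongo
from setting import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Enforce at most one document per detail-page URL
db[MONGO_TABLE].create_index([('url', pymongo.ASCENDING)], unique=True)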

6. Complete Code

#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
import requests
import pymongo
import json
from hashlib import md5
from bs4 import BeautifulSoup
from setting import *
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from multiprocessing import Pool

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


# Request the list page and return the response body
def get_page_index(offset, keywords):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/', params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the list page and yield every detail-page URL
def parse_page_index(text):
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        print('Failed to decode the list-page JSON')
        return


# Request the detail page and return the response body
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Parse the detail page
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    # Page title
    title = soup.title.string
    # The image URLs live in an inline JS variable: var gallery = {...};
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result:
        try:
            data = json.loads(result.group(1))
            if data and 'sub_images' in data.keys():
                # Collect all image URLs
                images = [item.get('url') for item in data.get('sub_images')]
                for image in images:
                    # Download each image
                    download_image(image)
                return {'title': title, 'url': url, 'images': images}
        except JSONDecodeError:
            return None
    return None


# Fetch an image as a binary stream
def download_image(url):
    try:
        print('Downloading image: ' + url)
        response = requests.get(url)
        if response.status_code == 200:
            # Save the image to disk
            save_image(response.content)
    except RequestException:
        print('Failed to download image: ' + url)


# Write the binary stream to a file named by its MD5 hash
def save_image(content):
    image_dir = os.path.join(os.getcwd(), 'images')
    os.makedirs(image_dir, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(image_dir, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    if db[MONGO_TABLE].insert_one(data):
        print('Saved to MongoDB: {}'.format(data['title']))
        return True
    return False


# Central dispatch
def main(offset):
    # Fetch the list page
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        print('offset: {} request failed'.format(offset))
        return
    # Parse the list page to get every detail-page URL
    for url in parse_page_index(index_data):
        # Fetch the detail page
        detail_data = get_page_detail(url)
        if detail_data is None:
            print('url: {} request failed'.format(url))
            continue
        # Parse the detail page
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
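
Each request covers 20 results, so the offsets handed to main() step by 20. With the assumed GROUP_START = 0 and GROUP_END = 20 from the setting sketch above, Pool.map spreads the following offsets across the worker processes:

groups = [x * 20 for x in range(0, 20 + 1)]   # assumed GROUP_START = 0, GROUP_END = 20
print(groups[:3], groups[-1])                 # [0, 20, 40] 400

For a quick single-page test without the process pool, calling main(0) directly works as well.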

7. Run Results

[Screenshots of the run results]

This article was written by 蒋蜀黍 and is licensed under the Creative Commons Attribution-ShareAlike 3.0 China Mainland license. Contact the author before reposting or quoting, credit the author, and note the original source.