# -*- coding: utf-8 -*-
"""
作业:爬取CSDN博客http://blog.csdn.net/首页显示的所有文章,每个文章 内容单独生成一个本地网页存到本地中
"""
from urllib.error import URLError
import urllib.request
import urllib.parse
import re
# ---- Global definitions: start --------------------------------------------
url = 'http://blog.csdn.net'  # homepage URL to crawl
# Browser User-Agent string so the server does not reject the requests.
url_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0'
# Pattern matching a whole article anchor tag on the homepage, e.g.
# <a href="http://blog.csdn.net/namelessml/article/details/52654102" target="_blank">Hive安装、配置,HQL,实践</a>
# (raw strings so regex escapes like \s are passed through literally)
pat_href = r'<a href="http://blog.csdn.net/.*?/article/details/.*?"[^>]*>\s*[^<]+</a>'
# BUG FIX: the original pattern demanded an "https://ask.hellobi.com/" prefix
# (a copy/paste artifact) that never occurs inside the tags matched by
# pat_href, so link extraction always failed. Capture the CSDN URL directly.
pat_link = r'href="(http://blog.csdn.net/.*?/article/details/.*?)"'  # captures the article URL
pat_link_title = r'>(\s*[^<]+)</a>'  # captures the anchor's link text
# ---- Global definitions: end ----------------------------------------------
# 函数定义开始------------------------------------------
# 分析网页,获取所有链接标签及标题文本,返回列表形式
def get_href_list(html_content, pattern):
    """Decode *html_content* (bytes) and return every substring matching *pattern*.

    Returns a list of matched tag strings; returns an empty list (never None)
    when decoding or matching fails, so callers can iterate unconditionally.
    """
    try:
        # Decode the page bytes; "ignore" silently drops undecodable sequences.
        content = html_content.decode("utf-8", "ignore")
        # findall returns each full matching <a ...>...</a> tag string.
        return re.compile(pattern).findall(content)
    except Exception as er:
        print(er)
        # BUG FIX: the original fell through and returned None here, which made
        # the caller's len(href_list) crash with TypeError. An empty list keeps
        # the caller's loop a harmless no-op instead.
        return []
# 函数定义结束-----------------------------------------
# ---- Script body: crawl the homepage and save each article page locally ----
# Build an opener that sends a browser User-Agent with every request.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', url_agent)]
# Fetch the homepage once (the original opened it twice and discarded the
# first response — a redundant network round-trip).
main_content = opener.open(url).read()
# All anchor tags on the homepage that point at article detail pages.
href_list = get_href_list(main_content, pat_href)
for i, href_tag in enumerate(href_list, start=1):
    try:
        print("第" + str(i) + "次爬取")  # progress: crawl attempt #i
        links = re.findall(pat_link, href_tag)
        titles = re.findall(pat_link_title, href_tag)
        if not links or not titles:
            # Tag does not have the expected shape; skip it rather than
            # crashing on an unguarded [0] index.
            continue
        this_link = links[0]
        # Strip '/' so the anchor text is usable as a filename.
        this_link_title = titles[0].replace('/', '')
        # BUG FIX: download the article page itself. The original called
        # opener.open(url) here, re-fetching the homepage for every article,
        # so every saved file was a copy of the homepage.
        sub_content = opener.open(this_link).read()
        # 'with' guarantees the file is closed even if write() raises.
        with open(this_link_title + ".html", "wb") as fh:
            fh.write(sub_content)
        print("-------成功-------")
    except URLError as e:
        # Report the HTTP status code and/or failure reason, then move on
        # to the next article instead of aborting the whole crawl.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)