# -*- coding: utf-8 -*-
"""
作业:爬取CSDN博客http://blog.csdn.net/首页显示的所有文章,每个文章 内容单独生成一个本地网页存到本地中
"""
from urllib.error import URLError
import urllib.request
import urllib.parse
import re
# ---- Global definitions: start --------------------------------------------
url = 'http://blog.csdn.net'  # homepage URL to crawl
# Browser User-Agent string so the server does not reject the requests.
url_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0'
# Pattern matching a whole article anchor tag on the homepage, e.g.
# <a href="http://blog.csdn.net/namelessml/article/details/52654102" target="_blank">Hive安装、配置,HQL,实践</a>
# (raw strings so regex escapes like \s are passed through literally)
pat_href = r'<a href="http://blog.csdn.net/.*?/article/details/.*?"[^>]*>\s*[^<]+</a>'
# BUG FIX: the original pattern demanded an "https://ask.hellobi.com/" prefix
# (a copy/paste artifact) that never occurs inside the tags matched by
# pat_href, so link extraction always failed. Capture the CSDN URL directly.
pat_link = r'href="(http://blog.csdn.net/.*?/article/details/.*?)"'  # captures the article URL
pat_link_title = r'>(\s*[^<]+)</a>'  # captures the anchor's link text
# ---- Global definitions: end ----------------------------------------------
# 函数定义开始------------------------------------------
# 分析网页,获取所有链接标签及标题文本,返回列表形式
def get_href_list(html_content, pattern):
    """Decode *html_content* (bytes) and return every substring matching *pattern*.

    Returns a list of matched tag strings; returns an empty list (never None)
    when decoding or matching fails, so callers can iterate unconditionally.
    """
    try:
        # Decode the page bytes; "ignore" silently drops undecodable sequences.
        content = html_content.decode("utf-8", "ignore")
        # findall returns each full matching <a ...>...</a> tag string.
        return re.compile(pattern).findall(content)
    except Exception as er:
        print(er)
        # BUG FIX: the original fell through and returned None here, which made
        # the caller's len(href_list) crash with TypeError. An empty list keeps
        # the caller's loop a harmless no-op instead.
        return []
# 函数定义结束-----------------------------------------
# ---- Script body: crawl the homepage and save each article page locally ----
# Build an opener that sends a browser User-Agent with every request.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', url_agent)]
# Fetch the homepage once (the original opened it twice and discarded the
# first response — a redundant network round-trip).
main_content = opener.open(url).read()
# All anchor tags on the homepage that point at article detail pages.
href_list = get_href_list(main_content, pat_href)
for i, href_tag in enumerate(href_list, start=1):
    try:
        print("第" + str(i) + "次爬取")  # progress: crawl attempt #i
        links = re.findall(pat_link, href_tag)
        titles = re.findall(pat_link_title, href_tag)
        if not links or not titles:
            # Tag does not have the expected shape; skip it rather than
            # crashing on an unguarded [0] index.
            continue
        this_link = links[0]
        # Strip '/' so the anchor text is usable as a filename.
        this_link_title = titles[0].replace('/', '')
        # BUG FIX: download the article page itself. The original called
        # opener.open(url) here, re-fetching the homepage for every article,
        # so every saved file was a copy of the homepage.
        sub_content = opener.open(this_link).read()
        # 'with' guarantees the file is closed even if write() raises.
        with open(this_link_title + ".html", "wb") as fh:
            fh.write(sub_content)
        print("-------成功-------")
    except URLError as e:
        # Report the HTTP status code and/or failure reason, then move on
        # to the next article instead of aborting the whole crawl.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)