Python Web Crawler (7) - Deep crawling with CrawlSpider


Before getting into deep crawling, a quick recommendation: the simple and practical fake-useragent library, which generates spoofed User-Agent values for the request headers.

# install
pip install fake-useragent
# usage
import requests
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'URL of the page to crawl'
resp = requests.get(url, headers=headers)
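Besides ua.random, fake-useragent can also hand back a string for a specific browser family; a quick sketch, assuming a fake-useragent version that exposes the usual browser attributes:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)   # a random Chrome User-Agent string
print(ua.firefox)  # a random Firefox User-Agent string
print(ua.random)   # a random User-Agent from any browser family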

1. Deep crawling with CrawlSpider

scrapy.spiders.CrawlSpider

Create a project: scrapy startproject <project_name>

Create a spider: scrapy genspider -t crawl <spider_name> <domains>

Core rule handling: from scrapy.spiders import CrawlSpider, Rule
Core link extraction: from scrapy.linkextractors import LinkExtractor

  • rules: a collection of Rule objects that tells the spider which links to follow; each Rule wraps a LinkExtractor whose regular expressions select the links.

  • Each Rule also takes a callback used to parse the downloaded response; the parse_item() method generated by the template is an example of pulling data out of a response.

  • Use the shell command to experiment with extraction: scrapy shell http://baidu.com
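Putting those pieces together, a minimal CrawlSpider looks roughly like this (a sketch only; example.com, the regex, and parse_item are placeholders):

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DemoSpider(CrawlSpider):
    name = 'demo'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    # follow every link whose URL matches the regex and parse the response
    # with parse_item(); the callback must not be named parse, because
    # CrawlSpider uses parse() internally
    rules = (
        Rule(LinkExtractor(allow=r'/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {'url': response.url,
               'title': response.xpath('//title/text()').extract_first()}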

2. Link extraction: LinkExtractor

(The old scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor is deprecated; the current class is scrapy.linkextractors.LinkExtractor, which takes the same kind of arguments.)

class scrapy.linkextractors.LinkExtractor(
    allow = (),              # extract only links whose URLs match these regular expressions
    deny = (),               # never extract links whose URLs match these regular expressions
    allow_domains = (),      # only extract links that belong to these domains
    deny_domains = (),       # never extract links from these domains
    deny_extensions = None,  # file extensions to skip
    restrict_xpaths = (),    # only look for links inside the regions selected by these XPaths (works together with allow)
    tags = ('a', 'area'),    # tag names to scan for links
    attrs = ('href',),       # tag attributes to read links from
    canonicalize = False,    # whether to canonicalize extracted URLs
    unique = True,           # drop duplicate link requests
    process_value = None     # optional callable applied to each extracted value
)
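To see what a given extractor would actually pick up, you can build one inside scrapy shell and run it against the fetched response; a small sketch (the regex is only an illustration):

# inside: scrapy shell "http://example.com/"
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=r'/page/\d+', unique=True)
for link in le.extract_links(response):  # returns a list of Link objects
    print(link.url, link.text)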

3. Crawling rules: rules

rules = [
    Rule(
        link_extractor,       # a LinkExtractor object
        callback=None,        # called with each downloaded response; must not be named parse
        cb_kwargs=None,       # dict of extra keyword arguments passed to the callback
        follow=None,          # boolean: whether to keep following links found in these responses
        process_links=None,   # callable that filters the link list; called for every list the LinkExtractor returns
        process_request=None  # callable that filters/modifies every request extracted by this rule
    )
]
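For instance, process_links lets you drop or rewrite links after extraction but before the requests are scheduled; a rough sketch of a rules definition that filters out logout links (the helper name filter_links is arbitrary):

from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


def filter_links(links):
    # receives the list of Link objects found by the LinkExtractor;
    # return only the ones that should actually be followed
    return [link for link in links if 'logout' not in link.url]


rules = (
    Rule(LinkExtractor(allow=r'/list/'),
         callback='parse_item',
         process_links=filter_links,   # filter the extracted links
         follow=True),
)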

4. How to run a spider directly from PyCharm

1. Create a start.py file in the project root

# -*- coding:utf-8 -*-
from scrapy import cmdline  # command-line helper
cmdline.execute('scrapy crawl dang'.split())

2. Then, in PyCharm (screenshots omitted): click Edit Configurations, add a Python run configuration pointing at start.py, click OK, and click Run.

After going through all that configuration, it turns out you can simply run start.py directly; none of the extra setup is actually needed.
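Incidentally, cmdline.execute() just re-enters the scrapy command line; if you prefer a plain script, Scrapy also offers CrawlerProcess (a sketch, assuming the script sits in the project root so the project settings can be located):

# -*- coding:utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project's settings.py and run the spider named "dang"
process = CrawlerProcess(get_project_settings())
process.crawl('dang')   # same spider name as in `scrapy crawl dang`
process.start()         # blocks until the crawl finishes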


5. Using CrawlSpider to crawl Python job listings from Liepin

  • Create the project
scrapy startproject liep
  • Generate the spider file under spiders/
scrapy genspider lp liepin.com
  • items.py
# -*- coding: utf-8 -*-

import scrapy


class LiepItem(scrapy.Item):

    name = scrapy.Field()        # job title
    company = scrapy.Field()     # company name
    salary = scrapy.Field()      # salary range
    address = scrapy.Field()     # work location
    experience = scrapy.Field()  # work experience requirement
  • pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class LiepPipeline(object):
    def __init__(self):
        self.file = open('liepin.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text)
        print('QAQ ----> writing item')
        return item

    def close_spider(self, spider):
        self.file.close()
  • lp.py
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from liep.items import LiepItem
import re


class LpSpider(CrawlSpider):
    reg = re.compile(r'\s*')  # used to strip whitespace from the job title
    name = 'lp'
    allowed_domains = ['www.liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin/?pubTime=&ckid=6f6956c5d999c17e&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=020&industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=040&salary=0%240&compscale=&key=python&clean_condition=&headckid=7a006343bdb04f47&curPage=0',]

    # extraction rule for the pagination links to follow
    page_link = LinkExtractor(allow=(r'&curPage=\d+'))
    # crawling rules built on that extractor
    rules = (
        Rule(page_link, callback='parse_content', follow=True),
    )

    # callback that parses each listing page
    def parse_content(self, response):
        # select the region that holds the data we need
        job_list = response.xpath('//div[@class="job-info"]')
        for job in job_list:
            # one Item per job posting
            item = LiepItem()
            name = job.xpath('.//h3/a')
            item['name'] = self.reg.sub('', name.xpath('string(.)').extract()[0])
            item['company'] = job.xpath('..//p[@class="company-name"]/a/text()').extract()
            item['salary'] = job.xpath('.//span[@class="text-warning"]/text()').extract()
            item['address'] = job.xpath('.//p[@class="condition clearfix"]//a/text()').extract()
            item['experience'] = job.xpath('.//p[@class="condition clearfix"]//span[3]/text()').extract()

            yield item
  • settings.py
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}

# uncomment ITEM_PIPELINES and register the pipeline defined above
ITEM_PIPELINES = {
    'liep.pipelines.LiepPipeline': 300,
}
  • The crawled results in liepin.json (the objects are written back to back; see the note after this listing)
{
"salary": "12-24万",
"company": "嗨皮(上海)网络科技股份有限公司",
"name": "python开发工程师",
"experience": "3年工作经验",
"address": "上海"
}{
"salary": "14-28万",
"company": "第一弹",
"name": "Python后端开发",
"experience": "3年工作经验",
"address": "上海"
}{
"salary": "12-18万",
"company": "易路软件",
"name": "Python中级开发工程师",
"experience": "3年工作经验",
"address": "上海-闵行区"
}{
"salary": "11-21万",
"company": "信用飞/首付游",
"name": "Python开发工程师(风控方向)",
"experience": "1年工作经验",
"address": "上海-徐汇区"
}{
"salary": "13-24万",
"company": "联车科技",
"name": "python开发",
"experience": "3年工作经验",
"address": "上海"
}{
"salary": "12-24万",
"company": "寻仟信息",
"name": "Python开发工程师",
"experience": "1年工作经验",
"address": "上海"
}{
"salary": "12-22万",
"company": "ifuwo",
"name": "Python开发工程师",
"experience": "1年工作经验",
"address": "上海-浦东新区"
}{
"salary": "12-24万",
"company": "小葫芦",
"name": "python开发工程师",
"experience": "1年工作经验",
"address": "上海"
}{
"salary": "14-24万",
"company": "ifuwo",
"name": "python后台工程师",
"experience": "2年工作经验",
"address": "上海-浦东新区"
}{
"salary": "面议",
"company": "森浦资讯",
"name": "Python开发工程师",
"experience": "2年工作经验",
"address": "上海"
}{
"salary": "14-24万",
"company": "优刻得",
"name": "OPL-python运维开发",
"experience": "2年工作经验",
"address": "上海"
}{
"salary": "面议",
"company": "上海聪牛金融信息服务有限公司",
"name": "python开发工程师",
"experience": "2年工作经验",
"address": "上海"
}{
"salary": "12-30万",
"company": "进馨网络",
"name": "python开发工程师",
"experience": "3年工作经验",
"address": "上海"
}{
"salary": "12-18万",
"company": "载信软件",
"name": "Python工程师",
"experience": "1年工作经验",
"address": "上海"
}{
"salary": "14-24万",
"company": "优刻得",
"name": "OPL-python运维开发J10605",
"experience": "1年工作经验",
"address": "上海"
}{
"salary": "10-24万",
"company": "上海霄骋信息科技有限公司",
"name": "Python爬虫开发工程师",
"experience": "2年工作经验",
"address": "上海"
}{
"salary": "面议",
"company": "五五海淘",
"name": "Python",
"experience": "1年工作经验",
"address": "上海"
}
.................
.................

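As noted above, the pipeline simply appends one JSON object after another, so liepin.json is not a single valid JSON document. If one object per line is preferable, here is a sketch of an alternative pipeline built on Scrapy's JsonLinesItemExporter (the class name JsonLinesPipeline and the liepin.jl filename are made up for the example); register it in ITEM_PIPELINES the same way as LiepPipeline.

# -*- coding: utf-8 -*-
from scrapy.exporters import JsonLinesItemExporter


class JsonLinesPipeline(object):
    """Write each item as one JSON object per line (JSON Lines)."""

    def open_spider(self, spider):
        self.file = open('liepin.jl', 'wb')  # the exporter expects a binary file
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()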
6. Using middlewares to set request headers and proxies

The Scrapy API documentation describes how downloader middlewares work (screenshot omitted).

  • settings.py

# -*- coding: utf-8 -*-



BOT_NAME = 'tea'

SPIDER_MODULES = ['tea.spiders']
NEWSPIDER_MODULE = 'tea.spiders'

# Save the program's run log to the specified file
LOG_FILE = 's.log'
# Log level: DEBUG is the most verbose and records everything;
# the levels run DEBUG (detailed) -> INFO (summary) -> WARNING -> ERROR ...
LOG_LEVEL = 'INFO'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tea (+http://www.yourdomain.com)'
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) "
]




# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'tea.middlewares.MyCustomDownloaderMiddleware': 543,
    'tea.middlewares.UseragentMiddleware': 543,
    'tea.middlewares.ProxyMiddleware': 600,
}

PROXY = [
    {"ip_port": "178.62.47.236:80"},
    {"ip_port": "125.77.25.116:80"},
    {"ip_port": "13.58.249.76:8080"},
    {"ip_port": "37.204.253.2:8081"},
    {"ip_port": "78.47.174.243:3128"},
    {"ip_port": "139.59.235.243:3128", "user_password": "admin:123123"},
]

  • middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import base64

from tea.settings import USER_AGENTS, PROXY


# A custom downloader middleware -- it only takes effect once it is
# registered in DOWNLOADER_MIDDLEWARES in settings.py
class UseragentMiddleware(object):
    # process_request() receives the request being handled and the spider.
    # It must return either None or a Request: None means processing is done
    # and the following middlewares continue; a Request is handed back to the
    # engine, re-queued and issued again.
    def process_request(self, request, spider):
        print('----QAQ-----')
        # pick a random User-Agent
        useragent = random.choice(USER_AGENTS)
        # add it to the request headers
        request.headers.setdefault('User-Agent', useragent)
        print('---->headers successful')
        return None


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        print('------->-_-')
        proxy = random.choice(PROXY)
        # attach the proxy to the request (Scrapy expects a full proxy URL)
        print(proxy['ip_port'], proxy.get('user_password', None))
        request.meta['proxy'] = 'http://' + proxy['ip_port']

        # proxy authentication, if credentials were configured
        if proxy.get('user_password', None):
            b64 = base64.b64encode(proxy['user_password'].encode('utf-8'))
            print(b64)
            request.headers['Proxy-Authorization'] = b'Basic ' + b64
            print('======proxy======')

In the log output you can see that the spoofed request headers and the proxy IP have been applied (screenshot omitted).
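A quick way to confirm that the two middlewares really modify outgoing requests is to point a throwaway spider at httpbin.org, which echoes back the headers and origin IP it received (a sketch; the spider name check is arbitrary, and the proxies listed above may well be dead by now):

# -*- coding: utf-8 -*-
import scrapy


class CheckSpider(scrapy.Spider):
    name = 'check'
    # httpbin echoes the request headers and the caller's IP, so the
    # response shows which User-Agent and proxy were actually used
    start_urls = ['http://httpbin.org/headers', 'http://httpbin.org/ip']

    def parse(self, response):
        self.logger.info(response.text)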

