Python Web Scraping with Multiprocessing (Using the 58.com Second-Hand Market as an Example)

Today, using the 58.com second-hand market (i.e., Zhuanzhuan) as an example, I'll show you how to scrape structured data at scale.

Analysis

First, take a look at Zhuanzhuan's page structure and the data I want to scrape:

(Screenshots: category page, listing page, detail page)


My approach is to first extract the links to the major categories, then crawl each category's listing pages for item links, and finally scrape the data from each detail page. I created three Python files in total: channel_extract.py, page_spider.py, and main.py.

channel_extract.py

import requests
from lxml import etree

start_url = 'http://cs.58.com/sale.shtml'
url_host = 'http://cs.58.com'

def get_channel_urls(url):
    # Fetch the second-hand market home page and print every category link
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@class="lbsear"]/div/ul/li')

    for info in infos:
        class_urls = info.xpath('ul/li/b/a/@href')
        for class_url in class_urls:
            print(url_host + class_url)


# get_channel_urls(start_url)

channel_list = '''
http://cs.58.com/shouji/
http://cs.58.com/tongxunyw/
http://cs.58.com/danche/
http://cs.58.com/fzixingche/
http://cs.58.com/diandongche/
http://cs.58.com/sanlunche/
http://cs.58.com/peijianzhuangbei/
http://cs.58.com/diannao/
http://cs.58.com/bijiben/
http://cs.58.com/pbdn/
http://cs.58.com/diannaopeijian/
http://cs.58.com/zhoubianshebei/
http://cs.58.com/shuma/
http://cs.58.com/shumaxiangji/
http://cs.58.com/mpsanmpsi/
http://cs.58.com/youxiji/
http://cs.58.com/jiadian/
http://cs.58.com/dianshiji/
http://cs.58.com/ershoukongtiao/
http://cs.58.com/xiyiji/
http://cs.58.com/bingxiang/
http://cs.58.com/binggui/
http://cs.58.com/chuang/
http://cs.58.com/ershoujiaju/
http://cs.58.com/bangongshebei/
http://cs.58.com/diannaohaocai/
http://cs.58.com/bangongjiaju/
http://cs.58.com/ershoushebei/
http://cs.58.com/yingyou/
http://cs.58.com/yingeryongpin/
http://cs.58.com/muyingweiyang/
http://cs.58.com/muyingtongchuang/
http://cs.58.com/yunfuyongpin/
http://cs.58.com/fushi/
http://cs.58.com/nanzhuang/
http://cs.58.com/fsxiemao/
http://cs.58.com/xiangbao/
http://cs.58.com/meirong/
http://cs.58.com/yishu/
http://cs.58.com/shufahuihua/
http://cs.58.com/zhubaoshipin/
http://cs.58.com/yuqi/
http://cs.58.com/tushu/
http://cs.58.com/tushubook/
http://cs.58.com/wenti/
http://cs.58.com/yundongfushi/
http://cs.58.com/jianshenqixie/
http://cs.58.com/huju/
http://cs.58.com/qiulei/
http://cs.58.com/yueqi/
http://cs.58.com/chengren/
http://cs.58.com/nvyongpin/
http://cs.58.com/qinglvqingqu/
http://cs.58.com/qingquneiyi/
http://cs.58.com/chengren/
http://cs.58.com/xiaoyuan/
http://cs.58.com/ershouqiugou/
http://cs.58.com/tiaozao/
'''

Scraping the category links is fairly simple, so I won't go into it here; I then assigned the scraped category links to the channel_list variable (the reason for this is explained below).
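
If you would rather not hard-code the list, get_channel_urls could instead collect and return the links. Here is a minimal sketch of that variant (the name get_channel_list is my own, not part of the original code):

def get_channel_list(url):
    # Same XPath as get_channel_urls, but collect the links into a list instead of printing them
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@class="lbsear"]/div/ul/li')
    channels = []
    for info in infos:
        for class_url in info.xpath('ul/li/b/a/@href'):
            channels.append(url_host + class_url)
    return channels

# channel_list = '\n'.join(get_channel_list(start_url))  # would replace the hard-coded string above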

page_spider.py

import requests
from lxml import etree
import time
import pymongo
# import random

client = pymongo.MongoClient('localhost', 27017)
test = client['test']            # database
tongcheng = test['tongcheng']    # collection that will hold the scraped items

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection': 'keep-alive'
}

# proxy_list = [
# 'http://218.56.132.158',
# 'http://115.47.44.102',
# 'http://118.144.149.200',
# 'http://222.223.239.135',
# 'http://123.234.219.133'
# ]
# proxy_ip = random.choice(proxy_list)
# proxies = {'http':proxy_ip}

def get_links_from(channel, pages):
    # Build the listing-page URL, e.g. http://cs.58.com/shouji/pn2/
    list_view = '{}pn{}/'.format(channel, str(pages))
    try:
        html = requests.get(list_view, headers=headers)
        time.sleep(2)
        selector = etree.HTML(html.text)
        if selector.xpath('//tr'):
            infos = selector.xpath('//tr')
            for info in infos:
                if info.xpath('td[2]/a/@href'):
                    url = info.xpath('td[2]/a/@href')[0]
                    get_info(url)
                else:
                    pass
        else:
            pass
    except requests.exceptions.ConnectionError:
        pass

def get_info(url):
    # Scrape one detail page and save the fields to MongoDB
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        title = selector.xpath('//h1/text()')[0]
        if selector.xpath('//span[@class="price_now"]/i/text()'):
            price = selector.xpath('//span[@class="price_now"]/i/text()')[0]
        else:
            price = "无"
        if selector.xpath('//div[@class="palce_li"]/span/i/text()'):
            area = selector.xpath('//div[@class="palce_li"]/span/i/text()')[0]
        else:
            area = "无"
        view = selector.xpath('//p/span[1]/text()')[0]
        if selector.xpath('//p/span[2]/text()'):
            want = selector.xpath('//p/span[2]/text()')[0]
        else:
            want = "无"
        info = {
            'title': title,
            'price': price,
            'area': area,
            'view': view,
            'want': want,
            'url': url
        }
        tongcheng.insert_one(info)

    except IndexError:
        pass

1. Use try/except liberally so that a single error doesn't kill the whole crawl; I added these after several rounds of debugging.
2. For large-scale scraping, add request headers, a proxy pool, request delays, and resumable crawling. Headers need little explanation: always add them so that Python impersonates a browser. When you scrape a lot of data over a long period, your IP is very likely to get banned; the best countermeasure is a proxy pool, and the commented-out code above shows how to set one up (the free IPs I found didn't work, so I eventually gave up on it). Request delays simply reduce how often you hit the site. Resumable crawling means that if the crawl stops halfway for whatever reason, it should pick up where it left off rather than start over (I'll cover that properly another time; a rough sketch follows this list).
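
As an illustration of the resume idea only (my own sketch, not the author's implementation): since every saved record already contains its url, a restarted crawl can skip URLs that are already in MongoDB before calling get_info. The helper name get_info_if_new is hypothetical.

def get_info_if_new(url):
    # Skip detail pages whose URL is already stored in the tongcheng collection
    if tongcheng.find_one({'url': url}) is None:
        get_info(url)

# Calling get_info_if_new(url) instead of get_info(url) inside get_links_from would make the
# crawl resumable; an index on 'url' (tongcheng.create_index('url')) keeps the lookup fast.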

main.py

import sys
sys.path.append("..")
from multiprocessing import Pool
from channel_extract import channel_list
from page_spider import get_links_from

def get_all_links_from(channel):
    # Crawl listing pages 1-100 for a single category
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':

    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

And that's the multiprocessing part! It's simple to use: Pool.map splits channel_list across the worker processes, and each process runs get_all_links_from on its share of the categories.
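
One detail worth spelling out: Pool() with no arguments starts one worker process per CPU core. If that hammers the site (or your machine) too hard, you can cap it explicitly; the value 4 below is just an example, not something from the original script:

if __name__ == '__main__':

    pool = Pool(processes=4)                            # e.g. cap the pool at 4 worker processes
    pool.map(get_all_links_from, channel_list.split())
    pool.close()                                        # no more tasks will be submitted
    pool.join()                                         # wait for all workers to finish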

Results

To make it easy to monitor how the crawl is going, I also created a counts.py:

import sys
sys.path.append("..")
import time
from page_spider import tongcheng

while True:
    print(tongcheng.find().count())
    time.sleep(5)

Run it from the command line; it has scraped 100,000+ records so far and is still going...
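
A side note if you run this on a newer pymongo (3.7+): Cursor.count() is deprecated there, and count_documents on the collection is the replacement, so the loop would become:

while True:
    print(tongcheng.count_documents({}))   # total number of documents in the collection
    time.sleep(5)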


This article was written by 罗罗攀 and is licensed under the Creative Commons Attribution-ShareAlike 3.0 China Mainland license. Please contact the author before reposting or quoting, credit the author, and cite the original source.
