Scraping Target Dealer Data with Python 3

Web scraping is a hot topic at the moment. This article is written for Python 3; a few functions differ from their Python 2 counterparts, but the differences are small and easy to look up. A friend of mine works in the automotive business and needed web data to support it, and collecting it record by record would have been very tedious, so I put together some crawlers a while back. Here I share the code for one of them.

First, define a few constants:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# by Zason_Zhang
import urllib.request

from pyquery import PyQuery as pq

class Constant():
    """Keys for the scraped dealer records."""
    NAME = 'name'
    DIANMIAN = 'dianmian'  # storefront (店面)
    ADDRESS = 'address'
    TELEPHONE = 'tel'
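As a quick aside for readers new to pyquery: it wraps lxml and accepts CSS selectors, and iterating over a selection yields lxml elements whose .text and .get() return the tag text and attribute values. A minimal sketch against a made-up HTML snippet (the real page's markup is an assumption here):

from pyquery import PyQuery as pq

# Made-up fragment mimicking the province <select> the crawler targets
html = '''
<form>
  <select id="ddlCity">
    <option value="">--请选择--</option>
    <option value="11">北京</option>
    <option value="31">上海</option>
  </select>
</form>
'''
d = pq(html)
# Each element exposes .text (label) and .get('value') (attribute)
print([(o.text, o.get('value')) for o in d('select#ddlCity option')])
# [('--请选择--', ''), ('北京', '11'), ('上海', '31')]

Note the placeholder option with an empty value; the crawler below skips exactly that case when it walks the province list.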


Now for the crawler itself:

# Purpose: crawl the target dealers' contact details
class AutoDealerCrawler():

    def __init__(self):
        self.saver = Saver()  # Excel writer, defined below
        self.index_url = ''   # base URL of the target site (left blank by the author)
        self.entry_url = self.index_url + '/dmzs.aspx?classid=38'
        self.province_url = self.entry_url + '&city='

    def get_province_list(self):
        """Return (name, id) pairs for every province in the drop-down."""
        page = urllib.request.urlopen(self.entry_url).read()  # fetch the entry page
        d = pq(page)
        return [(province_option.text, province_option.get('value'))
                for province_option in d('select#ddlCity option')]

    def get_name_list(self, province_id):
        """Return (name, url) pairs for every shop in one province."""
        css_selector = ('html body form#form1 div#body div.body div#conter '
                        'div.right div.col1 ul.a01 li span a')
        page = urllib.request.urlopen(self.province_url + province_id).read()
        d = pq(page)
        return [(shop.text, shop.get('href'))
                for shop in d(css_selector)]

    def get_dealer_info_one_province(self, province, shop_url):
        """Scrape one shop page and hand the record to the saver."""
        css_selector = ('html body div#body div.body div#conter '
                        'div.right div.col9a')
        css_selectorname = ('html body div#body div.body div#conter '
                            'div.right div.col9wz2')
        page = urllib.request.urlopen(self.index_url + '/' + shop_url).read()
        d = pq(page)
        dealer_info_list = []

        # Company name block
        for dealername in d(css_selectorname):
            print(dealername.text)
            dealer_infoname = {}
            dealer_infoname[Constant.NAME] = dealername.text
            dealer_info_list.append(dealer_infoname)

        # Detail block: three <p> tags holding storefront, phone and address;
        # each line starts with a three-character label that is sliced off
        for dealer_dt in d(css_selector):
            p = dealer_dt.findall('p')  # the <p> tags inside the block
            if len(p) >= 3:
                dealer_info = {}
                print(p[0].text[3:].strip())
                print(p[1].text[3:].strip())
                print(p[2].text[3:].strip())
                dealer_info[Constant.DIANMIAN] = p[0].text[3:].strip()
                dealer_info[Constant.TELEPHONE] = p[1].text[3:].strip()
                dealer_info[Constant.ADDRESS] = p[2].text[3:].strip()
                dealer_info_list.append(dealer_info)
            else:
                print('Bad shop link: %s' % shop_url)

        # Merge the name dict and the detail dict into a single record
        temp_a = dealer_info_list[0]
        temp_b = dealer_info_list[1]
        temp_a.update(temp_b)
        dealer_info_list_new = [temp_a]
        self.saver.add(province, dealer_info_list_new)

    def get_all_dealer_info(self):
        """Walk every province, then every shop in it, and save the results."""
        for province, province_id in self.get_province_list():
            if province_id == '':  # skip the placeholder option
                continue
            for shop, shop_url in self.get_name_list(province_id):
                self.get_dealer_info_one_province(province, shop_url)
        self.saver.commit()
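One caveat: urlopen().read() returns raw bytes, which are handed to pyquery as-is; that usually works when the page declares its encoding, but decoding explicitly is safer. Below is a minimal sketch of a hypothetical fetch helper (not part of the original script), assuming the target pages are UTF-8 (gb2312 is equally common on older Chinese sites) and that the server accepts a generic browser User-Agent:

import urllib.request

def fetch(url, encoding='utf-8'):
    """Fetch url and return decoded text (the encoding is an assumption)."""
    # Some sites reject Python's default User-Agent, so send a generic one
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode(encoding, errors='replace')

With this helper, each urllib.request.urlopen(...).read() call above could become page = fetch(...).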

Data storage:

class Saver():
    """Write the scraped records to an Excel file."""

    def __init__(self):
        import xlwt  # imported here so the rest of the script works without it
        self.count = 1
        self.book = xlwt.Workbook()
        self.book_name = '信息'
        self.sheet = self.book.add_sheet(self.book_name)
        self.write_header()

    def write_header(self):
        self.sheet.write(0, 0, '编号')
        self.sheet.write(0, 1, '省份')
        self.sheet.write(0, 2, '店面')
        self.sheet.write(0, 3, '公司名称')
        self.sheet.write(0, 4, '联系电话')
        self.sheet.write(0, 5, '地址')

    def add(self, province, dealer_info_list):
        """Append one row per dealer record."""
        for dealer in dealer_info_list:
            self.sheet.write(self.count, 0, self.count)
            self.sheet.write(self.count, 1, province)
            print(self.count)
            print(province)
            self.sheet.write(self.count, 2, dealer[Constant.DIANMIAN])
            self.sheet.write(self.count, 3, dealer[Constant.NAME])
            self.sheet.write(self.count, 4, dealer[Constant.TELEPHONE])
            self.sheet.write(self.count, 5, dealer[Constant.ADDRESS])
            self.count += 1

    def commit(self):
        self.book.save(self.book_name + '.xls')
        print(self.book_name + ' saved')

if __name__ == '__main__':
    crawler = AutoDealerCrawler()
    crawler.get_all_dealer_info()

The result is a 信息.xls spreadsheet with one numbered row per dealer: province, storefront, company name, phone number and address.
This article was written by 张聪 and is licensed under the Creative Commons Attribution-ShareAlike 3.0 China Mainland license. Please contact the author before reposting or quoting, and credit the author and the original source.