抓取苏宁易购的老板(ROBAM)和方太(FOTILE)烟机灶具套餐数据
一、抓取的页面网址
import requests
from bs4 import BeautifulSoup
# Search-result URLs on Suning: FOTILE (urlf) and ROBAM (urll) range-hood +
# hob bundle searches. The query string pre-filters brand, category (ci=337026)
# and fuel type (natural gas).
urlf='http://search.suning.com/%E6%B2%B9%E7%83%9F%E6%9C%BA/&iy=0&sc=0&hf=solr_54435_attrId:%E7%83%9F%E7%81%B6%E4%B8%A4%E4%BB%B6%E5%A5%97,solr_54437_attrId:%E5%A4%A9%E7%84%B6%E6%B0%94&ci=337026&cf=brand_Name_FacetAll:%E6%96%B9%E5%A4%AA(FOTILE)&st=0'
urll='http://search.suning.com/%E6%B2%B9%E7%83%9F%E6%9C%BA/&iy=0&sc=0&hf=solr_54435_attrId:%E7%83%9F%E7%81%B6%E4%B8%A4%E4%BB%B6%E5%A5%97,solr_54437_attrId:%E5%A4%A9%E7%84%B6%E6%B0%94&ci=337026&cf=brand_Name_FacetAll:%E8%80%81%E6%9D%BF(ROBAM)&st=0'
url=[urlf,urll]
# NOTE: this step could be optimized -- only after scraping did I notice the
# search page allows selecting several brands in a single query.
二、抓取的具体商品网址
# Collect the individual product-page URLs from the search-result pages.
def Geturl(url):
    """Scrape product detail-page links from each search-result page.

    Parameters
    ----------
    url : list[str]
        Search-result page URLs to fetch.

    Returns
    -------
    list[str]
        De-duplicated product URLs. Order is NOT guaranteed because a
        set is used to drop duplicates (the original comment said
        "tuple", but it is a set).
    """
    dataurl = []
    for page_url in url:
        # timeout so one stalled request cannot hang the whole scrape
        res = requests.get(page_url, timeout=10)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # '.sellPoint' anchors carry the product detail-page links
        dataurl.extend(a['href'] for a in soup.select('.sellPoint'))
    # the scraped URLs contain duplicates; a set removes them
    return list(set(dataurl))
三、获取商品的具体参数
#价格是单独获取的 在network的js中
import json
def Price(url):
    """Fetch one product's sale price from Suning's separate price endpoint.

    The price is not embedded in the product HTML; it comes from a JSONP
    request (discovered in the browser's network tab).

    Parameters
    ----------
    url : str
        Product detail-page URL ending in '<product-id>.html'.

    Returns
    -------
    The 'netPrice' value reported by the endpoint (a string in the JSON).
    """
    # Extract the product id. The original used strip('.html'), which strips
    # a *character set* (any of '.', 'h', 't', 'm', 'l') from both ends and
    # would corrupt ids containing those characters; remove the exact suffix
    # instead.
    pid = url.split('/')[-1]
    if pid.endswith('.html'):
        pid = pid[:-len('.html')]
    price_url = ('http://pas.suning.com/nspcsale_0_000000000' + pid
                 + '_000000000' + pid
                 + '_0000000000_110_551_5510101_20358_1000002_9001_10008_Z001__.html?')
    res2 = requests.get(price_url, timeout=10)
    res2.encoding = 'utf-8'
    # The response is JSONP: pcData({...}). lstrip("pcData(") had the same
    # character-set pitfall as above; remove the exact wrapper prefix.
    body = res2.text
    if body.startswith('pcData('):
        body = body[len('pcData('):]
    data = json.loads(body.rstrip(')\n'))
    return data['data']['price']['saleInfo'][0]['netPrice']
# Scrape the spec-table *keys* (parameter names) for one product.
def Getparam(url):
    """Return the parameter names of a product's spec table.

    Parameters
    ----------
    url : str
        Product detail-page URL.

    Returns
    -------
    list[str]
        ['价格'] followed by the spec-table keys, with the final entry
        dropped so the list lines up with the values from Getparam1.
    """
    res1 = requests.get(url, timeout=10)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    # The '#itemParameter' text alternates key / value, separated by
    # runs of blank lines.
    cells = soup1.select('#itemParameter')[0].text.split('\n\n\n')
    cells.remove('\n\n主体')  # drop the section-header row
    # Even indices are the keys; prepend a label for the price column.
    keys = ['价格'] + cells[::2]
    # Dropping the last element mirrors the original b[:-1] behaviour.
    return keys[:-1]
# Scrape the spec-table *values* for one product.
def Getparam1(url):
    """Return the parameter values of a product's spec table.

    Parameters
    ----------
    url : str
        Product detail-page URL.

    Returns
    -------
    list
        [price] followed by the spec-table values, aligned with the
        keys produced by Getparam.
    """
    res1 = requests.get(url, timeout=10)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    # Same alternating key/value layout as in Getparam.
    cells = soup1.select('#itemParameter')[0].text.split('\n\n\n')
    cells.remove('\n\n主体')  # drop the section-header row
    # Odd indices are the values; the price comes from its own endpoint.
    return [Price(url)] + cells[1::2]
# Build one product's {parameter name: value} dict -- the dict later feeds
# straight into a pandas DataFrame.
def Getgroup(url):
    """Pair up a product's parameter names and values as a dict."""
    keys = Getparam(url)
    values = Getparam1(url)
    return dict(zip(keys, values))
四、生成DataFrame并导出
def GetAll(url):
    """Scrape every product found on the given search pages.

    Parameters
    ----------
    url : list[str]
        Search-result page URLs.

    Returns
    -------
    dict[int, dict]
        Row index -> {parameter: value}, ready for pandas.DataFrame().
    """
    # NOTE: the original loop variable was named 'pd', shadowing the pandas
    # alias used at module level -- renamed, and the manual counter replaced
    # with enumerate via a dict comprehension.
    return {i: Getgroup(item_url) for i, item_url in enumerate(Geturl(url))}
import pandas as pd

# Each product dict becomes one column of the frame; transpose so every
# product is a row, then dump the whole table to CSV.
df = pd.DataFrame(GetAll(url))
details = df.T
details.to_csv('details.csv')
五、大功告成
但是感觉在生成 DataFrame 时用嵌套字典有点复杂,大家有什么好建议吗?
ps 最终选择方太的 EMD2T+FD21BE 大家觉得口碑怎么样?