Unofficial Jianshu Big Data (Part 3)


I went back to my hometown recently, so the Jianshu crawler was paused for a while. Once I was back in Changsha I resumed it, and was pleased to reach roughly 3,000,000 records. Then I exported the data and found piles of duplicates. I distinctly remembered setting up a duplicate check, so I went back to the code and, to my dismay:


The insert goes into the author_infos collection, but the duplicate check was done against author_url. My next plan was to deduplicate the stored records and then re-crawl each url for full author details, but I couldn't work out how to do deduplication in MongoDB, even after searching Baidu. On top of that, 向右 (a more experienced member) pointed out that I was crawling too few fields, so I decided to just rewrite the crawler (crying in the bathroom).
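For reference, one common way to handle deduplication in MongoDB is to clean out the existing duplicates once and then put a unique index on the url field, so that repeat inserts are rejected from then on. A minimal pymongo sketch, assuming the same jianshu database and author_infos collection used in the code below:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
author_infos = client['jianshu']['author_infos']

# One-off cleanup: group documents by url, keep the first _id, delete the rest.
groups = author_infos.aggregate([
    {'$group': {'_id': '$url', 'keep': {'$first': '$_id'}, 'ids': {'$push': '$_id'}}}
], allowDiskUse=True)
for group in groups:
    duplicate_ids = [doc_id for doc_id in group['ids'] if doc_id != group['keep']]
    if duplicate_ids:
        author_infos.delete_many({'_id': {'$in': duplicate_ids}})

# From here on, MongoDB itself rejects any insert whose url is already stored.
author_infos.create_index('url', unique=True)

With the unique index in place, insert_one raises pymongo.errors.DuplicateKeyError on a repeated url, so a crawler relying on it would need to catch that exception.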

Code

import requests
from lxml import etree
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
jianshu = client['jianshu']
author_infos = jianshu['author_infos']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection': 'keep-alive'
}

def get_article_url(url, page):
    # Crawl one page of a collection's article list, ordered by time added.
    link_view = '{}?order_by=added_at&page={}'.format(url, str(page))
    try:
        html = requests.get(link_view, headers=headers)
        selector = etree.HTML(html.text)
        infos = selector.xpath('//div[@class="name"]')
        for info in infos:
            author_name = info.xpath('a/text()')[0]
            authorurl = info.xpath('a/@href')[0]
            # Skip authors whose profile url is already stored.
            if author_infos.find_one({'url': 'http://www.jianshu.com' + authorurl}):
                pass
            else:
                # print('http://www.jianshu.com' + authorurl, author_name)
                # author_infos.insert_one({'author_name': author_name, 'author_url': 'http://www.jianshu.com' + authorurl})
                get_all_info('http://www.jianshu.com' + authorurl)
                get_reader_url(authorurl)
                time.sleep(2)
    except requests.exceptions.ConnectionError:
        pass

# get_article_url('http://www.jianshu.com/c/bDHhpK',2)
def get_reader_url(url):
    # Walk up to 99 pages of this author's followers list and crawl each follower.
    link_views = ['http://www.jianshu.com/users/{}/followers?page={}'.format(url.split('/')[-1], str(i)) for i in range(1, 100)]
    for link_view in link_views:
        try:
            html = requests.get(link_view, headers=headers)
            selector = etree.HTML(html.text)
            infos = selector.xpath('//li/div[@class="info"]')
            for info in infos:
                author_name = info.xpath('a/text()')[0]
                authorurl = info.xpath('a/@href')[0]
                # print(author_name, authorurl)
                # author_infos.insert_one({'author_name': author_name, 'author_url': 'http://www.jianshu.com' + authorurl})
                get_all_info('http://www.jianshu.com' + authorurl)
        except requests.exceptions.ConnectionError:
            pass
# get_reader_url('http://www.jianshu.com/u/7091a52ac9e5')

def get_all_info(url):
    # Scrape an author's profile page and store the stats in MongoDB.
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        author_name = selector.xpath('//a[@class="name"]/text()')[0]
        author_focus = selector.xpath('//div[@class="info"]/ul/li[1]/div/p/text()')[0]         # following
        author_fans = selector.xpath('//div[@class="info"]/ul/li[2]/div/p/text()')[0]          # followers
        author_article = selector.xpath('//div[@class="info"]/ul/li[3]/div/p/text()')[0]       # articles
        author_write_amount = selector.xpath('//div[@class="info"]/ul/li[4]/div/p/text()')[0]  # words written
        author_get_like = selector.xpath('//div[@class="info"]/ul/li[5]/div/p/text()')[0]      # likes received
        # Use the first matched bio text if present; fall back to '无' (none).
        author_intrus = selector.xpath('//div[1]/div/div[2]/div[2]/div/text()')
        author_intru = author_intrus[0] if len(author_intrus) != 0 else '无'
        if selector.xpath('//span[@class="author-tag"]'):
            author_title = '签约作者'  # signed author
        else:
            author_title = '普通作者'  # regular author
        infos = {
            'url': url,
            'name': author_name,
            'focus': author_focus,
            'fans': author_fans,
            'article': author_article,
            'write_amount': author_write_amount,
            'get_like': author_get_like,
            'intru': author_intru,
            'title': author_title
        }
        author_infos.insert_one(infos)
    except IndexError:
        pass
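For completeness, a minimal driver sketch: the collection URL comes from the commented-out example call above, and the range of ten pages is just an arbitrary choice for illustration.

if __name__ == '__main__':
    # Crawl the first ten pages of one collection's article list.
    for page in range(1, 11):
        get_article_url('http://www.jianshu.com/c/bDHhpK', page)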

That's all for today; this post is mainly a record of my own learning process.