Pullword 是一个中文在线分词系统，使用的是模糊分词的模式：输入待分词的句子，会返回词条及该词在句子中的成词概率。
Pullword 的 API 调用说明见：http://api.pullword.com/
# -*- coding: utf-8 -*-
import requests
def split_word(words, probability=0, mode=0):
    """Filter and rank ``term:probability`` entries by probability.

    Each entry is split on its LAST colon, so a term that itself contains a
    colon is kept intact. Entries without any colon (e.g. blank lines) are
    skipped instead of raising ``IndexError``. If a term occurs more than
    once, the last probability seen wins.

    :param words: iterable of strings shaped like ``"term:0.87"``
    :param probability: threshold compared against each entry's probability
    :param mode: 0 -> terms whose probability is >= ``probability``;
                 1 -> terms whose probability is < ``probability``;
                 2 -> every ``(term, probability)`` pair
    :return: a list sorted by probability from highest to lowest (ties keep
             their input order); ``None`` for any other ``mode``
    :raises ValueError: if the text after the last colon is not a number
    """
    scores = {}
    for entry in words:
        term, sep, score = entry.rpartition(':')
        if not sep:
            # No separator at all: blank or malformed line, nothing to record.
            continue
        scores[term] = float(score)

    # sorted() is stable, so equal probabilities keep their insertion order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    if mode == 0:
        return [term for term, score in ranked if score >= probability]
    if mode == 1:
        return [term for term, score in ranked if score < probability]
    if mode == 2:
        return ranked
    # Unknown mode: kept as None for backward compatibility with callers.
    return None
def pullword(word, probability=0, mode=0, timeout=10):
    """Segment Chinese text with the Pullword online service.

    The endpoints are tried in order. An endpoint that cannot be reached,
    times out, or answers with a non-200 status is skipped and the next one
    is tried. If no endpoint succeeds, or the service answers with a body
    whose first line starts with ``error``, the input text is returned
    unchanged as a one-element list.

    :param word: the Chinese text to segment (``str``)
    :param probability: probability threshold, forwarded to ``split_word``
    :param mode: result mode, forwarded to ``split_word``:
                 0 -> terms with probability >= ``probability``;
                 1 -> terms with probability < ``probability``;
                 2 -> every term together with its probability
    :param timeout: seconds to wait for each HTTP request
    :return: the list produced by ``split_word`` (sorted by probability from
             highest to lowest), or ``[word]`` when segmentation failed
    """
    # The service is addressed by IP; the second endpoint is the fallback.
    endpoints = (
        'http://43.241.223.121/get.php',  # IP1
        'http://120.26.6.172/get.php',  # IP2
    )
    # Send the text as UTF-8 bytes. param1/param2 are the service's default
    # arguments -- presumably threshold 0 and "include probabilities";
    # confirm against the Pullword API description.
    params = {'source': word.encode('utf-8'), 'param1': '0', 'param2': '1'}

    for url in endpoints:
        try:
            response = requests.get(url=url, params=params, timeout=timeout)
        except requests.RequestException:
            # Connection failure or timeout: fall through to the next endpoint.
            continue
        if response.status_code != 200:
            continue

        text = response.content.decode('utf-8').strip()
        # One entry per line; drop blank lines so they never reach the parser.
        lines = [line for line in text.splitlines() if line.strip()]
        if lines and lines[0].startswith('error'):
            # The service rejected the request: hand the input back untouched.
            return [word]
        return split_word(lines, probability, mode)

    # Every endpoint failed.
    return [word]
https://segmentfault.com/a/1190000007587330