文本分析（词云图、提取关键词）

发表: 2019-04-14 浏览: 1701
文本分析


import pandas as pd

import numpy

import jieba

import os



# Inspect the current working directory, then switch to the project folder so
# that relative paths used by the rest of the script resolve against it.
# NOTE: the bare os.getcwd() call only echoes its value in an interactive
# session; in a plain script run it displays nothing.
os.getcwd()
os.chdir(r"C:\Users\zcfemail\0.Python\4.python课程学习\2.文本分析")

# Load the raw corpus into a single 'content' column.
# quoting=3 (csv.QUOTE_NONE) makes double quotes ordinary characters, so a
# stray '"' in free text cannot swallow the following lines into one field.
df_txt = pd.read_table(
    r'C:\Users\zcfemail\Desktop\aaa.txt',
    names=['content'],
    encoding='utf-8',
    quoting=3,
)

# Drop rows parsed as missing, then renumber so that index labels and row
# positions agree (dropna() alone would leave gaps in the labels).
df_txt = df_txt.dropna().reset_index(drop=True)

# One list entry per remaining row of the 'content' column.
content = df_txt.content.values.tolist()



# First-pass tokenisation: cut every line with jieba and keep the token list
# only when it has more than one token and the line is not pure whitespace.
content_s = []
for line in content:
    current_segment = jieba.lcut(line)
    # Test the raw line for whitespace: comparing the token *list* with the
    # string '\t\r\n' is always unequal and therefore filters nothing.
    if len(current_segment) > 1 and line.strip():
        content_s.append(current_segment)

# Wrap the token lists in a one-column DataFrame.
df_content = pd.DataFrame({'content_s': content_s})

# Preview of the first rows (only displayed in an interactive session).
df_content.head()



# Load the stop-word file into a single 'stopword' column.
# quoting=3 (csv.QUOTE_NONE) turns quote handling off, so quote characters in
# the file are read as ordinary text.
stopwords = pd.read_csv(
    r".\stopwords.txt",
    sep='\t',
    index_col=False,
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)



#自定义清理停用词函数

def drop_stopwords(contents, stopwords):
    """Remove stop words from tokenised documents.

    Args:
        contents: Iterable of documents, each an iterable of tokens.
        stopwords: Iterable of tokens to discard.

    Returns:
        A ``(contents_clean, all_words)`` tuple. ``contents_clean`` holds one
        filtered token list per input document (an empty list when every
        token was a stop word). ``all_words`` is a flat list of every kept
        token, converted with ``str``, in document order.
    """
    # Hash the stop words once so each membership test is O(1) instead of a
    # scan over the whole stop-word list for every token.
    stopword_set = frozenset(stopwords)

    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(str(word) for word in line_clean)
    return contents_clean, all_words



# Turn the token column and the stop-word column into plain Python lists,
# then strip the stop words from every tokenised line.
contents = df_content["content_s"].tolist()
stopwords = stopwords["stopword"].tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Keep both results as DataFrames: the cleaned token lists (one per line) and
# the flat list of every remaining word.
df_content_clean = pd.DataFrame({'contents_clean': contents_clean})
df_all_words = pd.DataFrame({'all_words': all_words})



# Word frequencies for the word cloud: count the occurrences of each word and
# sort from most to least frequent. The result has columns 'all_words' and
# 'count'.
# groupby(...).size() is used because passing a renaming dict to a
# SeriesGroupBy.agg() call raises SpecificationError on pandas >= 1.0.
words_count = (
    df_all_words.groupby("all_words")
    .size()
    .reset_index(name="count")
    .sort_values(by=["count"], ascending=False)
)



# Word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib

# NOTE: the IPython magic "%matplotlib inline" is not valid Python syntax and
# breaks a plain .py run; run it manually in a notebook cell if it is needed.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# font_path points at a SimHei font file (presumably so that Chinese glyphs
# can be rendered -- confirm the file exists under ./data).
wordcloud = WordCloud(font_path="./data/simhei.ttf", background_color="white", max_font_size=80)

# Map word -> count for the 300 most frequent words (column 0 is the word,
# column 1 its count).
word_frequence = {x[0]: x[1] for x in words_count.head(300).values}
wordcloud = wordcloud.fit_words(word_frequence)

plt.imshow(wordcloud)
# Explicitly display the figure so that it also appears when run as a script.
plt.show()



# Extract the main keywords of each of the first lines
import jieba.analyse
import numpy as np

# Take the first (up to) six lines by position. Label-based lookups such as
# df_txt['content'][index] raise KeyError when dropna() has left gaps in the
# index or when fewer than six lines exist.
first_lines = df_txt['content'].head(6).tolist()

for text in first_lines:
    print(text)
    # Extract keywords from the printed text itself: content_s skips some
    # lines during tokenisation, so its positions need not line up with df_txt.
    print("  ".join(jieba.analyse.extract_tags(text, topK=5, withWeight=False)))
    print("\n")

# Keywords of the same lines taken together as one document
content_new = ''.join(first_lines)
print("  ".join(jieba.analyse.extract_tags(content_new, topK=10)))
print("\n")
文本分析.py
0 个评论

要回复文章请先登录或注册