文字雲問題

#python #文字雲

cashrain 2022-06-09 18:08:55 ‧ 651 瀏覽

分享至

請問為什麼產生文字雲時，每個字詞後面都會多出一個「'」（單引號）符號呢？

https://drive.google.com/file/d/1WZ3xl37KX-HM71z_LVi1HIuPtuzTVX2h/view?usp=sharing

# Data cleanup
comment = pd.read_csv('drink留言內容.csv')

# Markup residue, punctuation, box-drawing characters and filler words that are
# deleted from every comment before segmentation. Entries are applied in list
# order. 'bbs'/'html' and '作者'/'標題' are separate entries: without the comma
# between them Python silently concatenates adjacent literals into one string.
# NOTE(review): removal is substring-based, so short entries such as '店' or
# '說' are also cut out of longer words - confirm that is intended.
removeword = [
    'span', 'class', 'f3', 'https', 'imgur', 'h1', '_   blank', 'href', 'rel',
    'nofollow', 'target', 'cdn', 'cgi', 'b4', 'jpg', 'hl', 'b1', 'f5', 'f4',
    'goo.gl', 'f2', 'email', 'map', 'f1', 'f6', '__cf___', 'data',
    'bbs', 'html', 'cf', 'f0', 'b2', 'b3', 'b5', 'b6',
    '原文內容', '原文連結', '作者', '標題', '時間', '看板',
    '<', '>', '，', '。', '？', '—', '閒聊', '・', '/', '=', '"', '\n', '」', '「',
    '！', '[', ']', '：', '‧', '╦', '╔', '╗', '║', '╠', '╬', '╬', ':', '╰', '╩',
    '╯', '╭', '╮', '│', '╪', '─', '《', '》', '_', '.', '、', '（', '）', '　', '*',
    '※', '~', '○', '”', '“', '～', '@', '＋', '\r', '▁', ')', '(', '-', '═', '?',
    ',', '!', '…', '&', ';', '『', '』', '#', '＝', "'",
    # Backslash followed by "l"; escaped explicitly because '\l' is an invalid
    # escape sequence.
    '\\l',
    '的', '了', '也', '就', '在', '以', '會', '都', 'XD', '不是', '覺得', '沒',
    '喔', '知道', '店', '可能', '說', '看到', '感覺', '應該',
]


def _strip_removewords(text):
    """Return ``str(text)`` with every ``removeword`` entry deleted, in list order."""
    text = str(text)
    for token in removeword:
        text = text.replace(token, "")
    return text


# One pass over the column (all removals per cell) instead of re-scanning the
# whole column once per entry; the per-cell result is the same.
comment["content"] = comment["content"].apply(_strip_removewords)

# Chinese example: merge all cleaned comments into a single string.
sentence = ''.join(comment['content'])
# Printed once; printing inside the accumulation loop re-printed the growing
# text on every iteration.
print(sentence)

# lcut returns the tokens as a list.
seg_list = jieba.lcut(sentence, cut_all=False)

def remove_stop_words(file_name, seg_list):
    """Return the segments of ``seg_list`` that are not listed in a stop-word file.

    Args:
        file_name: Path of a UTF-8 text file with one stop word per line.
        seg_list: Iterable of segmented words (hashable values such as ``str``).

    Returns:
        A new list holding the kept segments in their original order.
    """
    # 'utf-8-sig' also reads plain UTF-8, but drops a leading byte-order mark
    # that would otherwise stay glued to the first stop word and stop it from
    # ever matching.
    with open(file_name, 'r', encoding='utf-8-sig') as f:
        # Trailing whitespace (including the newline) is stripped per line;
        # a set gives constant-time membership tests.
        stop_words = {line.rstrip() for line in f}
    return [seg for seg in seg_list if seg not in stop_words]


# Word-frequency statistics: first drop the stop words listed in stopwords.txt
file_name = 'stopwords.txt'
seg_list = remove_stop_words(file_name,seg_list)

def count_segment_freq(seg_list):
    """Count how often each segment occurs.

    Args:
        seg_list: Sequence of segmented words.

    Returns:
        A DataFrame indexed by segment (index name ``seg``) with a single
        ``count`` column, sorted by count in descending order.
    """
    # One row per occurrence, each weighted 1, so a grouped sum is the count.
    occurrences = pd.DataFrame(seg_list, columns=['seg']).assign(count=1)
    totals = occurrences.groupby('seg')['count'].sum()
    return totals.sort_values(ascending=False).to_frame()
sef_freq = count_segment_freq(seg_list)
# Bare expression: shows the top rows only when it is the last expression of
# a notebook cell; it prints nothing when run as a script.
sef_freq.head()


# Font file used to draw the words; it has to contain the Chinese glyphs.
font_path = r'msjh.ttc'
wc = WordCloud(background_color='black',font_path=font_path)
# Pass the words as space-separated text. str(seg_list) is the list's repr
# ("['a', 'b', ...]"), and WordCloud's tokenizer keeps apostrophes inside a
# token, so every word was drawn with a trailing ' character.
wc.generate(' '.join(seg_list))
plt.imshow(wc)
plt.axis("off")
plt.show()