Why does the loop always fail on the second iteration with the error "Length mismatch: Expected axis has 0 elements, new values have 2 elements", and why does print(data_slected) show an empty DataFrame?
I have spent a whole afternoon on this and still cannot find where the problem is.
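For context, the error message itself can be reproduced with a minimal example that has nothing to do with my real data, so I assume the DataFrame I build ends up with zero columns somewhere:

import pandas as pd

df = pd.DataFrame(zip([], []))  # an empty zip builds a DataFrame with 0 columns
df.columns = ["字詞", "聲量"]    # ValueError: Length mismatch: Expected axis has 0 elements, new values have 2 elements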
The code below is meant to process the file PTT_運動內衣_所有資料short.csv, count how many times each of the keywords ['Nike','Shock_Absorber','addidas','UA','Triumph','Mollifix','Uniqlo','Calvin_Klein','UT','AB'] appears in the CSV, and turn the counts into a chart. (A stripped-down check of just the keyword-matching step is included at the end of this post.)
import pandas as pd
import jieba
import jieba.analyse

a=['Nike','Shock_Absorber','addidas','UA','Triumph','Mollifix','Uniqlo','Calvin_Klein','UT','AB']
# Load the raw crawler data
data=pd.read_csv('PTT_運動內衣_所有資料short.csv')  # open the file
for i in a:
    # Remove duplicate and blank entries
    data.drop_duplicates()
    data.dropna(inplace=True)
    # Combine the text columns
    data["所有文"]=data["標題"]+data["內文"]
    data['所有文'].replace('Nike','nike')
    # Filter the rows that match the current keyword, then merge their text into one big string
    data_slected=data[data["所有文"].str.contains(i)]
    theSTR=str(data_slected["所有文"].sum())
    # Data cleaning: strip meaningless characters
    # Characters and strings to strip
    removeword = ['span','class','f3','https','imgur','h1','_ blank','href','rel','nofollow','target','cdn','cgi','b4','jpg','hl','b1','f5','f4','goo.gl','f2','email','map','f1','f6','__cf___','data','bbs','html','cf','f0','b2','b3','b5','b6','原文內容','原文連結','作者','標題','時間','看板','<','>',',','。','?','—','閒聊','・','/',
                  '','=','\"','\n','」','「','!','[',']',':','‧','╦','╔','╗','║','╠','╬','╬',':','╰','╩','╯','╭','╮','│','╪','─','《','》','_','.','、','(',')',' ','*','※','~','○','”','“','~','@','+','\r','▁',')','(','','═','?',',','!','…','&',';','『','』','#','=','\l']
    # Remove the meaningless characters
    for word in removeword:
        theSTR = theSTR.replace(word,'')
    #------------- Jieba automatic word segmentation -------------
    jieba.set_dictionary('dict.txt.big')
    words=jieba.lcut(theSTR, cut_all=False)
    # Count how many times each token appears
    words_voice=[]
    for j in words:
        words_voice.append(words.count(j))
    words_voice_df=pd.DataFrame(zip(words,words_voice))
    words_voice_df.columns=["字詞","聲量"]
    # Question: how do I apply the stopword list stopwords.txt here to drop meaningless tokens?
    with open('stopwords.txt','r',encoding='utf-8-sig') as f:
        stops=f.read().split('\n')
    wordslist=[]
    words_voice=[]
    for j in words:
        if j not in stops:
            wordslist.append(j)
            words_voice.append(words.count(j))
    # Aggregate the token counts
    words_voice_df=pd.DataFrame(zip(wordslist,words_voice))
    words_voice_df.columns=["字詞","聲量"]
    # Add my own common terms to the dictionary
    jieba.load_userdict('user_dict.txt')
    keywords_top=jieba.analyse.extract_tags(theSTR,topK=5, withWeight=True)  # keyword extraction based on the TF-IDF algorithm
    keywords_top
    keywords_top_DF = pd.DataFrame(keywords_top)
    keywords_top_DF.columns=["字詞","聲量"]
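For reference, a stripped-down check of just the keyword-matching step, assuming the same CSV and the same 標題/內文 columns as above, would look like this; it only prints how many rows each keyword actually matches:

import pandas as pd

keywords = ['Nike','Shock_Absorber','addidas','UA','Triumph','Mollifix','Uniqlo','Calvin_Klein','UT','AB']
data = pd.read_csv('PTT_運動內衣_所有資料short.csv')
data.dropna(inplace=True)
data["所有文"] = data["標題"] + data["內文"]

# Print how many rows contain each keyword, to see which selections come back empty
for kw in keywords:
    matches = data["所有文"].str.contains(kw)
    print(kw, int(matches.sum()))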