pip install gensim
from gensim import corpora, models
import gensim
documents = ["This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?"]
# Bag-of-words representation
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)
for topic in lda_model.print_topics():
    print(topic)
'''
(0, '0.163*"this" + 0.163*"the" + 0.163*"is" + 0.116*"first" + 0.116*"document." + 0.069*"second" + 0.069*"document?" + 0.069*"document" + 0.024*"third" + 0.024*"and"')
(1, '0.130*"one." + 0.130*"and" + 0.130*"third" + 0.129*"is" + 0.129*"the" + 0.129*"this" + 0.045*"document." + 0.045*"first" + 0.045*"document" + 0.044*"document?"')
Topic 0:
Keywords: "this," "the," "is," "first," "document," "second," "document?," "document," "third," "and"
The weight indicates how important each word is within the topic; the higher the weight, the stronger the word's influence on that topic.
Topic 1:
Keywords: "one," "and," "third," "is," "the," "this," "document," "first," "document," "document?"
Likewise, this is another topic, with a different set of keywords and weights.
'''
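To see how each document mixes these two topics (rather than just the top words per topic), gensim's get_document_topics can be queried with each bag-of-words vector. A minimal sketch, assuming the lda_model, corpus, and documents objects defined above:

# Per-document topic mixture: get_document_topics returns (topic_id, probability) pairs.
for doc_id, bow in enumerate(corpus):
    print(doc_id, documents[doc_id], lda_model.get_document_topics(bow))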
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Create sample documents
documents = ["這是第一個文件。",
             "這個文件是第二個文件。",
             "這是第三個文件。",
             "這是第一個文件嗎?"]
# TF-IDF vectorization of the documents
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
# LSA model
lsa_model = TruncatedSVD(n_components=2)
lsa_topic_matrix = lsa_model.fit_transform(X)
print(lsa_topic_matrix)
[[ 0.61176471  0.15294118]
 [ 0.          0.        ]
 [ 0.75294118  0.18823529]
 [-0.24253563  0.9701425 ]]
The first document's topic distribution is [0.61176471, 0.15294118], which means it loads mainly on topic 1.
The second document's topic distribution is [0, 0], which means it carries almost no weight on either topic.
The third document's topic distribution is [0.75294118, 0.18823529], which again loads mainly on topic 1.
The fourth document's topic distribution is [-0.24253563, 0.9701425], which means it loads mainly on topic 2 (0.97), with a small negative loading on topic 1.
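To relate the two LSA topics back to actual terms, the component loadings of the fitted TruncatedSVD can be inspected. A minimal sketch, assuming the vectorizer and lsa_model fitted above and scikit-learn >= 1.0 for get_feature_names_out:

# Each row of components_ expresses a topic as weights over the TF-IDF vocabulary.
terms = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lsa_model.components_):
    # Sort terms by absolute weight and show the strongest ones for this topic.
    top_terms = sorted(zip(terms, weights), key=lambda tw: abs(tw[1]), reverse=True)[:5]
    print(f"Topic {topic_idx + 1}:", top_terms)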
pip install -U nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
positive_text = "It's a food,I like this food,but I hate that"
# 初始化情感分析器
analyzer = SentimentIntensityAnalyzer()
# 情感分析分數
sentiment_score = analyzer.polarity_scores(positive_text)
print(sentiment_score)
{'neg': 0.33, 'neu': 0.446, 'pos': 0.223, 'compound': -0.296}
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Package vader_lexicon is already up-to-date!
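The compound value is a normalized score in [-1, 1]; a common convention (not part of NLTK itself) treats scores >= 0.05 as positive, <= -0.05 as negative, and everything in between as neutral. A minimal sketch, assuming the analyzer initialized above:

def vader_label(sentence, pos_threshold=0.05, neg_threshold=-0.05):
    # Classify a sentence by its VADER compound score using the common cutoffs.
    compound = analyzer.polarity_scores(sentence)["compound"]
    if compound >= pos_threshold:
        return "positive"
    if compound <= neg_threshold:
        return "negative"
    return "neutral"

print(vader_label("I like this food"))  # expected: positive
print(vader_label("I hate that"))       # expected: negative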
pip install snownlp
from snownlp import SnowNLP
# Create a sample Chinese text
chinese_text = "你好讚,他好爛,我很普通"
# Run sentiment analysis with SnowNLP
s = SnowNLP(chinese_text)
# Get the sentiment score
sentiment_score = s.sentiments
print(sentiment_score)
0.5514232923761263
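The sentiments attribute is a probability-like score in [0, 1], where values near 1 lean positive and values near 0 lean negative, so the 0.55 above mostly reflects the mixed sentence. A minimal sketch that scores each clause separately, with 0.5 as a common but arbitrary cutoff:

import re

# Score each clause on its own instead of the whole mixed sentence.
for clause in re.split(r"[,,]", chinese_text):
    score = SnowNLP(clause).sentiments
    print(clause, round(score, 3), "positive" if score >= 0.5 else "negative")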
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
text = """Hello, how are you doing today? The weather is great.The sky is pinkish-blue. You shouldn't eat cardboard"""
tokenized_text = sent_tokenize(text)
print(tokenized_text)
O:
['Hello, how are you doing today?', 'The weather is great.The sky is pinkish-blue.', "You shouldn't eat cardboard"]
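Note that the second and third sentences stay glued together because the source text has no space after "great."; the Punkt tokenizer relies on such cues. A quick check with the space restored, assuming the same import:

# With a space after "great." the tokenizer finds all four sentences.
fixed_text = ("Hello, how are you doing today? The weather is great. "
              "The sky is pinkish-blue. You shouldn't eat cardboard")
print(sent_tokenize(fixed_text))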
import nltk
sent = "I am come from German"
token = nltk.word_tokenize(sent)
print(token)
O:
['I', 'come', 'from', 'Germany']
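word_tokenize also separates punctuation into its own tokens and splits contractions into their parts; a small illustration, assuming the punkt data downloaded earlier:

# Contractions are split ("shouldn't" -> "should", "n't") and punctuation becomes a token.
print(nltk.word_tokenize("You shouldn't eat cardboard."))
# expected: ['You', 'should', "n't", 'eat', 'cardboard', '.']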
import string

if __name__ == '__main__':
    # Method 1: map every punctuation character to a space
    text_list = "Are you okay? Yes, I am fine!"
    text_list = text_list.translate(str.maketrans(
        string.punctuation, " " * len(string.punctuation)))
    print("s: ", text_list)
    # Method 2: filter punctuation characters out one by one
    # (iterating over a string yields single characters, as the output below shows)
    english_punctuations = [',', '.', ':', ';', '?',
                            '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    text_list = [
        word for word in text_list if word not in english_punctuations]
    print("text: ", text_list)
O:
s:  Are you okay  Yes  I am fine 
text: ['A', 'r', 'e', ' ', 'y', 'o', 'u', ' ', 'o', 'k', 'a', 'y', ' ', ' ', 'Y', 'e', 's', ' ', ' ', 'I', ' ', 'a', 'm', ' ', 'f', 'i', 'n', 'e', ' ']
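Method 2 walks over individual characters because text_list is still a plain string at that point. If the goal is to drop punctuation tokens from a tokenized sentence instead, a minimal sketch using NLTK's word_tokenize (with the punkt data used elsewhere in this article):

import string
import nltk

tokens = nltk.word_tokenize("Are you okay? Yes, I am fine!")
# Keep only tokens that are not pure punctuation.
words_only = [tok for tok in tokens if tok not in string.punctuation]
print(words_only)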
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
"""移除停用词"""
stop_words = stopwords.words("english")
if __name__ == '__main__':
text = "The food taste great. I love it."
word_tokens = nltk.tokenize.word_tokenize(text.strip())
filtered_word = [w for w in word_tokens if not w in stop_words]
print("word_tokens: ", word_tokens)
print("filtered_word: ", filtered_word)
O:
word_tokens: ['The', 'food', 'taste', 'great', '.', 'I', 'love', 'it', '.']
filtered_word: ['The', 'food', 'taste', 'great', '.', 'I', 'love', '.']
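'The' survives the filter because NLTK's stopword list is all lowercase; comparing lowercased tokens catches it. A minimal variation of the same loop:

# Lowercase each token before checking it against the stopword list.
filtered_word = [w for w in word_tokens if w.lower() not in stop_words]
print("filtered_word: ", filtered_word)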
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
X = lemmatizer.lemmatize('leaves')
print(X)
O:
leaf
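By default WordNetLemmatizer treats every word as a noun; passing a part-of-speech tag changes the result. A small illustration with the same lemmatizer:

# The default POS is noun, so a verb form may pass through unchanged.
print(lemmatizer.lemmatize('running'))       # treated as a noun -> 'running'
print(lemmatizer.lemmatize('running', 'v'))  # treated as a verb -> 'run'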
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
porter_stemmer = PorterStemmer()
x = porter_stemmer.stem('maximum')
print(x)
lem = WordNetLemmatizer()  # lemmatization: reduce a word to its dictionary form
stem = PorterStemmer()  # stemming: strip affixes by rule
word = "flying"
print("Lemmatized Word:", lem.lemmatize(word, "v"))
print("Stemmed Word:", stem.stem(word))
O:
maximum
Lemmatized Word: fly
Stemmed Word: fli
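The difference shows up across many words: the stemmer applies suffix-stripping rules and may return non-words, while the lemmatizer returns dictionary forms when given the right part of speech. A small comparison using the stem and lem objects above:

# Compare rule-based stemming with POS-aware lemmatization.
for w in ["studies", "running", "cries"]:
    print(w, "->", stem.stem(w), "|", lem.lemmatize(w, "v"))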
I:
import nltk
nltk.download('averaged_perceptron_tagger')
sent = "Albert Einstein was born in Ulm, Germany in 1879."
tokens = nltk.word_tokenize(sent)
tags = nltk.pos_tag(tokens)
print("sent: ", sent)
print("tokens: ", tokens)
print("tags: ", tags)
O:
sent: Albert Einstein was born in Ulm, Germany in 1879.
tokens: ['Albert', 'Einstein', 'was', 'born', 'in', 'Ulm', ',', 'Germany', 'in', '1879', '.']
tags: [('Albert', 'NNP'), ('Einstein', 'NNP'), ('was', 'VBD'), ('born', 'VBN'), ('in', 'IN'), ('Ulm', 'NNP'), (',', ','), ('Germany', 'NNP'), ('in', 'IN'), ('1879', 'CD'), ('.', '.')]
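The Penn Treebank tags can be used directly for simple filtering, for example pulling the proper nouns (NNP) out of the tagged sentence. A minimal sketch over the tags list above:

# Keep tokens tagged as proper nouns (NNP / NNPS).
proper_nouns = [word for word, tag in tags if tag.startswith('NNP')]
print(proper_nouns)  # from the tags above: ['Albert', 'Einstein', 'Ulm', 'Germany']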
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
word = wordnet.synsets('spectacular')
print(word)
print("1.", word[0].definition())
print("2.", word[1].definition())
print("3.", word[2].definition())
print("4.", word[3].definition())
O:
[Synset('spectacular.n.01'), Synset('dramatic.s.02'), Synset('spectacular.s.02'), Synset('outstanding.s.02')]
1. a lavishly produced performance
2. sensational in appearance or thrilling in effect
3. characteristic of spectacles or drama
4. having a quality that thrusts itself into attention
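Beyond definitions, each synset also exposes its synonymous lemmas and usage examples. A minimal sketch over the same synsets, assuming the wordnet data downloaded above:

# Synonymous word forms and example sentences for the first synset of "spectacular".
first = word[0]
print(first.lemma_names())
print(first.examples())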