[Day 26] 利用R語言跑詞嵌入模型 - cbow 與測試

15th鐵人賽 r語言 r 文字探勘 text mining

rlover

2023-10-11 23:01:58

422 瀏覽

分享至

詞嵌入模型

詞嵌入模型 - CBOW

CBOW（Continuous Bag-of-Words）是 word2vec 裡另一個演算法，CBOW 和 Skip-gram 其實很像，但他們的運作方式剛好相反。

在 Skip-gram 中，我們有一個詞，然後嘗試預測它周圍的上下文，對吧？但在 CBOW 中，我們有一堆「上下文詞」，然後嘗試預測這些詞周圍通常會出現什麼「中心詞」。想像一下，你有一句話「貓喜歡追老鼠」，如果我們選擇「喜歡」和「追」作為上下文，CBOW 模型會嘗試從這兩個詞預測出「老鼠」為中心詞。

這個模型會看過大量這種「上下文—中心詞」的配對，然後不斷調整自己，讓自己變得越來越擅長這種預測。最後的結果是，每個詞都會被轉換成一個數字向量，這個向量能捕捉詞的各種語義信息，例如它和哪些詞常常一起出現。

所以，用一句話來總結，CBOW 就是一個「給我一些上下文詞，我猜猜中間應該是什麼詞」的模型。

library(tidyverse)
library(jiebaR)
library(word2vec)

df_article <- read_rds("/Users/macuser/Documents/GitHub/text-mining/data/df_main_pts_daily.rds")
df_article_clean <- df_article %>% mutate(text = str_remove_all(text, " |\\n|\\r|\\t")) %>% select(id, text)

cutter <- worker("tag", stop_word = "data/停用詞-繁體中文.txt")
vector_word <- c("中華民國", "李登輝", "蔣中正", "蔣經國", "李登輝", "陳水扁", "馬英九")
new_user_word(cutter, words = vector_word)

## [1] TRUE

### text part
df_speech_seg <-
  df_article_clean %>% 
  mutate(text = str_replace_all(text, "台灣|臺灣", "臺灣")) %>%
  mutate(text = str_replace_all(text, "台北", "臺北")) %>%
  mutate(text = str_replace_all(text, "台南", "臺南")) %>%
  mutate(text = str_replace_all(text, "台東", "臺東")) %>%
  mutate(text = str_replace_all(text, "台中", "臺中")) %>%
  mutate(text = str_remove_all(text, "\\n|\\r|\\t|:| |　")) %>%
  mutate(text = str_remove_all(text, "[a-zA-Z0-9]+")) %>%
  mutate(text_segment = purrr::map(text, function(x)segment(x, cutter))) %>%
  mutate(text_POS = purrr::map(text_segment, function(x)names(x))) %>%
  unnest(c(text_segment, text_POS)) %>%
  select(-text, everything(), text)

df_speech_word <- df_speech_seg %>% 
  group_by(id) %>% summarise(text = str_c(text_segment, collapse = " ")) %>% ungroup()

model <- word2vec(x = df_speech_word$text, type = "cbow", dim = 30, iter = 20)
embedding <- as.matrix(model)

predict(model, c("冠軍"), type = "nearest", top_n = 5)

## $冠軍
##   term1  term2 similarity rank
## 1  冠軍 巡迴賽  0.9141591    1
## 2  冠軍   奪得  0.9118551    2
## 3  冠軍     姜  0.9114923    3
## 4  冠軍   女單  0.9004914    4
## 5  冠軍   男單  0.8944823    5

predict(model, c("金牌"), type = "nearest", top_n = 5)

## $金牌
##   term1 term2 similarity rank
## 1  金牌  亞運  0.9158620    1
## 2  金牌  銀牌  0.9115074    2
## 3  金牌  隊史  0.9049072    3
## 4  金牌  獎牌  0.9041975    4
## 5  金牌  第名  0.9028842    5

predict(model, c("臺北"), type = "nearest", top_n = 5)

## $臺北
##   term1  term2 similarity rank
## 1  臺北   臺中  0.8396075    1
## 2  臺北     蔣  0.8234465    2
## 3  臺北   彰化  0.8046396    3
## 4  臺北   南下  0.8033378    4
## 5  臺北 盧秀燕  0.7994725    5

predict(model, c("臺灣"), type = "nearest", top_n = 5)

## $臺灣
##   term1    term2 similarity rank
## 1  臺灣     發展  0.7762638    1
## 2  臺灣   影響力  0.7744966    2
## 3  臺灣 中華民國  0.7634369    3
## 4  臺灣     國際  0.7613040    4
## 5  臺灣     印尼  0.7597901    5

library(uwot)
library(ggplot2)
library(ggrepel)

viz <- umap(embedding, n_neighbors = 15, n_threads = 2)

df <- tibble(word = rownames(embedding), 
             x = viz[, 1], y = viz[, 2]) %>% head(20)

ggplot(df, aes(x = x, y = y, label = word)) + 
  geom_text_repel(family = "Noto Sans TC Medium") + theme_void() + 
  labs(title = "word2vec - adjectives in 2D using UMAP") +
  theme(text = element_text(family = "Noto Sans TC Medium"))

前面我們提過，詞嵌入模型企圖捕捉語言的意義，因此，除了尋找相似詞以外，還可以用同一個方法檢查，測試模型表現。

有幾種測試可以讓使用者檢驗：

類比測試：類比測試可能是最常用來測試詞嵌入模型的方法。在類比測試中，我們會給出一個詞彙組合，例如國王與皇后，模型需要生成一個相似的詞對，例如男人和女人。如果模型能夠生成正確的詞對，代表說模型能夠學習到詞彙之間的類比關係。

同義詞測試：同義詞檢測是另一種常用的測試詞嵌入模型的方法，在同義詞檢測中，我們會給出一個詞，模型需要生成一組跟那個詞有著相同意思義的詞彙。如果模型能夠生成正確的同義詞，代表說模型能夠學習到詞彙之間的同義關係，通常這種都有字典可以用，不過台灣大部分都會用對岸的詞典，哈。

反義詞測試：反義詞檢測是拿來對立關係的方法，就是前面同義詞的反面，相對比較少用。

wv <- predict(model, newdata = c("臺北", "臺灣", "日本"), type = "embedding")
wv <- wv["臺北", ] - wv["臺灣", ] + wv["日本", ]
predict(model, newdata = wv, type = "nearest", top_n = 3)

##   term similarity rank
## 1 超市  0.9179568    1
## 2 高雄  0.9140611    2
## 3 航班  0.9129902    3

# 正確答案應該是東京才對

wv <- predict(model, newdata = c("臺北", "臺灣", "中國"), type = "embedding")
wv <- wv["臺北", ] - wv["臺灣", ] + wv["中國", ]
predict(model, newdata = wv, type = "nearest", top_n = 3)

##   term similarity rank
## 1 臺北  0.9114347    1
## 2 取消  0.8787717    2
## 3 超市  0.8647789    3

# 正確答案應該是北京才對

wv <- predict(model, newdata = c("向上", "快樂", "悲傷"), type = "embedding")
wv <- wv["向上", ] - wv["快樂", ] + wv["悲傷", ]
predict(model, newdata = wv, type = "nearest", top_n = 3)

##       term similarity rank
## 1   王薇君  0.9923455    1
## 2     到場  0.9720988    2
## 3 心有餘悸  0.9651424    3

# 正確答案應該是向下才對

wv <- predict(model, newdata = c("臺灣", "政府"), type = "embedding")
predict(model, newdata = wv["臺灣", ] + wv["政府", ], type = "nearest", top_n = 2)

##   term similarity rank
## 1 和平  0.9820176    1
## 2 台美  0.9620961    2

predict(model, newdata = wv["臺灣", ] - wv["政府", ], type = "nearest", top_n = 2)

##   term similarity rank
## 1 彩繪  0.8653653    1
## 2   打  0.8398908    2

# 沒有正確與錯誤，單純看說最貼近的

# 底下附上英文測試的答案
# 類比測試：
# king : man :: queen : ?
#   正確答案：woman
# 
# 同義詞測：
# happy : glad :: sad : ?
#   正確答案：unhappy
# 
# 反義詞測
# good : bad :: up : ?
#   正確答案：down