iT邦幫忙

2023 iThome 鐵人賽

DAY 30
0
AI & Data

用R語言玩轉文字探勘系列 第 30

[Day 30] 用R語言玩轉文字探勘案例 - 視覺化篇

  • 分享至 

  • xImage
  •  

應用案例 - 視覺化篇

我們得到資料後,最後一步就是視覺化了!因為重點是程式碼,所以沒有每張圖片都上傳,可以一張張研究!

library(gt)

df_year_word_top_wide %>% select(-year) %>%
  rename(`年份` = year_full) %>%
  rename(`總統` = president) %>%
  gt::gt() %>%
  tab_header(
    title = "歷年總統國慶演說熱詞",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")
  

https://ithelp.ithome.com.tw/upload/images/20231014/20161981zlqBUhbOpg.png

df_year_word_tfidf_top_wide %>% 
  rename(`年份` = year_full) %>%
  rename(`總統` = president) %>%
  gt::gt() %>%
  tab_header(
    title = "歷年總統國慶演說TF-IDF",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  )  %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")
  

https://ithelp.ithome.com.tw/upload/images/20231014/201619818T4O1BPJtJ.png

df_president_word_top_wide %>% 
  mutate(president = fct_relevel(as_factor(president), "李登輝","陳水扁","馬英九", "蔡English")) %>%
  arrange(president) %>%
  rename(`總統` = president) %>%
  gt::gt() %>%
  tab_header(
    title = "總統任內國慶演說熱詞",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")

df_president_word_tfidf_top_wide %>% 
  mutate(president = fct_relevel(as_factor(president), "李登輝","陳水扁","馬英九", "蔡English")) %>%
  arrange(president) %>%
  rename(`總統` = president) %>%
  gt::gt() %>%
  tab_header(
    title = "總統任內國慶演說TF-IDF",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  )  %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")

tibble(`李登輝` = nearest_to(model_01,model_01[["中國"]]) %>% names() %>% `[`(2:4),
       `陳水扁` = nearest_to(model_02,model_02[["中國"]]) %>% names() %>% `[`(2:4),
       `馬英九` = nearest_to(model_03,model_03[["中國"]]) %>% names() %>% `[`(2:4),
       `English` = nearest_to(model_04,model_04[["中國"]]) %>% names() %>% `[`(2:4)) %>%
  gt::gt() %>%
  tab_header(
    title = "跟「中國」距離最近的詞彙",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  )  %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")

https://ithelp.ithome.com.tw/upload/images/20231014/20161981TmaSC6P05f.png

tibble(`李登輝` = nearest_to(model_01,model_01[["臺灣"]]) %>% names() %>% `[`(2:4),
       `陳水扁` = nearest_to(model_02,model_02[["臺灣"]]) %>% names() %>% `[`(2:4),
       `馬英九` = nearest_to(model_03,model_03[["臺灣"]]) %>% names() %>% `[`(2:4),
       `English` = nearest_to(model_04,model_04[["臺灣"]]) %>% names() %>% `[`(2:4)) %>%
  gt::gt() %>%
  tab_header(
    title = "跟「臺灣」距離最近的詞彙",
  ) %>%
  tab_options(
    heading.title.font.size = 24,
    heading.subtitle.font.size = 14,
    table.margin.left = 5,
    table.margin.right = 5
  )  %>%
  cols_align(align = "center", columns = everything()) %>%
  opt_align_table_header(align = "center")
  

https://ithelp.ithome.com.tw/upload/images/20231014/20161981Ilr1LPv9dN.png

### 圖表
source("theme_vt_mac2.R")

### 中華民國與台灣
p_president_year_roc <-
  df_president_year_roc %>%
  mutate(n = as.integer(n)) %>%
  mutate(year = as_date(str_c(year_full, "/1/1"))) %>%
  ggplot(aes(x = year, y = n, group = text_type)) +
  geom_col(position = "dodge", aes(fill = text_type)) +
  facet_wrap(text_type ~ ., nrow = 2) +
  scale_x_date(date_labels = "'%y", date_breaks = "2 years") +
  scale_y_continuous(limits = c(0,100), breaks = seq(0,100,20)) +
  scale_fill_manual(values = c("#000095", "#1B9431")) +
  labs(x= "",y= "",
       color = "學校類型",
       title = "「中華民國」有沒有變不見?",
       subtitle = "歷年總統國慶演說辭中,「臺灣」和「中華民國」出現次數, 1997-2023",
       caption = "資料來源:自行統計;中華民國包含中華民族、臺灣不包含臺灣海峽") +
  theme_vt +
  theme(legend.position = "none",
        strip.text = element_text(size = 72))

https://ithelp.ithome.com.tw/upload/images/20231014/20161981yhpQiirPBB.png

### 中美
p_president_year_foreign <-
  df_president_year_foreign %>%
  mutate(n = as.integer(n)) %>%
  mutate(year = as_date(str_c(year_full, "/1/1"))) %>%
  ggplot(aes(x = year, y = n, group = text_type)) +
  geom_col(position = "dodge", aes(fill = text_type)) +
  facet_wrap(text_type ~ ., nrow = 2) +
  scale_x_date(date_labels = "'%y", date_breaks = "2 years") +
  scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
  scale_fill_manual(values = c("#E9292B", "#3E346E")) +
  labs(x= "",y= "",
       color = "學校類型",
       title = "兩岸關係仍是焦點,盟友偶爾出現",
       subtitle = "歷年總統國慶演說辭中,「中國」和「美國」出現次數, 1997-2023",
       caption = "資料來源:自行統計;中國包含大陸、北京、兩岸、中共,美國包含華府") +
  theme_vt +
  theme(legend.position = "none",
        strip.text = element_text(size = 72))

word_cors_tsai %>% 
  filter(correlation > 0.3) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 10) +
  geom_node_text(aes(label = name, family = "Noto Sans CJK TC Medium"), size = 32, repel = TRUE) +
  labs(title = "宣示捍衛民主自由、維護區域和平是重點",
       subtitle = "蔡English國慶演說詞彙關係網絡, 2016-2023",
       caption = "資料來源:自行統計;相關係數達到0.3以上才列入") +
  theme(plot.margin = margin(2, 2, 2, 2, "cm"),
        plot.background = element_rect(fill = "white"), ##FFFAFA
        strip.background = element_rect(fill = "white"), ##FFFAFA
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        text = element_text(family = "Noto Sans CJK TC Medium"),
        plot.title = element_text(size = 108, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,12,0)),
        plot.subtitle = element_text(size = 84, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,15,0)),
        plot.caption = element_text(size = 60, family = "Noto Sans CJK TC Medium", margin = margin(40,0,0,0)),
        legend.title = element_text(size = 60, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.text = element_text(size = 42, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.background = element_blank(),
        legend.position = "none",
        legend.key = element_rect(fill = "transparent"),
        axis.line.x = element_blank(), axis.line.y = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.title.position = "plot", 
        plot.caption.position =  "plot")

word_cors_ma %>% 
  filter(correlation > 0.4) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 10) +
  geom_node_text(aes(label = name, family = "Noto Sans CJK TC Medium"), size = 32, repel = TRUE) +
  labs(title = "顧好兩岸關係、簽訂貿易協定並保障就業",
       subtitle = "馬英九國慶演說詞彙關係網絡, 2008-2015",
       caption = "資料來源:自行統計;相關係數達到0.4以上才列入") +
  theme(plot.margin = margin(2, 2, 2, 2, "cm"),
        plot.background = element_rect(fill = "white"), ##FFFAFA
        strip.background = element_rect(fill = "white"), ##FFFAFA
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        text = element_text(family = "Noto Sans CJK TC Medium"),
        plot.title = element_text(size = 108, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,12,0)),
        plot.subtitle = element_text(size = 84, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,15,0)),
        plot.caption = element_text(size = 60, family = "Noto Sans CJK TC Medium", margin = margin(40,0,0,0)),
        legend.title = element_text(size = 60, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.text = element_text(size = 42, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.background = element_blank(),
        legend.position = "none",
        legend.key = element_rect(fill = "transparent"),
        axis.line.x = element_blank(), axis.line.y = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.title.position = "plot", 
        plot.caption.position =  "plot")

word_cors_chen %>% 
  filter(correlation > 0.35) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 10) +
  geom_node_text(aes(label = name, family = "Noto Sans CJK TC Medium"), size = 32, repel = TRUE) +
  labs(title = "政黨輪替、公投入聯合國成關鍵",
       subtitle = "陳水扁國慶演說詞彙關係網絡, 2000-2007",
       caption = "資料來源:自行統計;相關係數達到0.35以上才列入") +
  theme(plot.margin = margin(2, 2, 2, 2, "cm"),
        plot.background = element_rect(fill = "white"), ##FFFAFA
        strip.background = element_rect(fill = "white"), ##FFFAFA
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        text = element_text(family = "Noto Sans CJK TC Medium"),
        plot.title = element_text(size = 108, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,12,0)),
        plot.subtitle = element_text(size = 84, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,15,0)),
        plot.caption = element_text(size = 60, family = "Noto Sans CJK TC Medium", margin = margin(40,0,0,0)),
        legend.title = element_text(size = 60, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.text = element_text(size = 42, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.background = element_blank(),
        legend.position = "none",
        legend.key = element_rect(fill = "transparent"),
        axis.line.x = element_blank(), axis.line.y = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.title.position = "plot", 
        plot.caption.position =  "plot")

https://ithelp.ithome.com.tw/upload/images/20231014/20161981aitDC17bF9.png

word_cors_lee %>% 
  filter(correlation > 0.15) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 10) +
  geom_node_text(aes(label = name, family = "Noto Sans CJK TC Medium"), size = 32, repel = TRUE) +
  labs(title = "政黨輪替、公投入聯合國成關鍵",
       subtitle = "李登輝國慶演說詞彙關係網絡, 2000-2007",
       caption = "資料來源:自行統計;相關係數達到0.35以上才列入") +
  theme(plot.margin = margin(2, 2, 2, 2, "cm"),
        plot.background = element_rect(fill = "white"), ##FFFAFA
        strip.background = element_rect(fill = "white"), ##FFFAFA
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        text = element_text(family = "Noto Sans CJK TC Medium"),
        plot.title = element_text(size = 108, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,12,0)),
        plot.subtitle = element_text(size = 84, family = "Noto Sans CJK TC Medium", hjust = 0, margin = margin(0,0,15,0)),
        plot.caption = element_text(size = 60, family = "Noto Sans CJK TC Medium", margin = margin(40,0,0,0)),
        legend.title = element_text(size = 60, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.text = element_text(size = 42, family = "Noto Sans CJK TC Medium", angle = 0, hjust = 0),
        legend.background = element_blank(),
        legend.position = "none",
        legend.key = element_rect(fill = "transparent"),
        axis.line.x = element_blank(), axis.line.y = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.title.position = "plot", 
        plot.caption.position =  "plot")

### 雙英對決
df_ying_seg_count <- df_speech_seg_unnest %>% 
  filter(president %in% c("馬英九", "蔡English")) %>%
  filter(!text_POS %in% c("Puncuation")) %>%
  filter(str_length(text_segment) > 1) %>%
  anti_join(df_stopwords) %>%
  count(president, text_segment)

df_ying_seg_diff <- df_ying_seg_count %>% 
  pivot_wider(names_from = president, values_from = n, values_fill = list(n = 0)) %>%
  mutate(diff_tsai = `蔡English` - `馬英九`, diff_ma = -diff_tsai)

df_ying_seg_diff %>% arrange(desc(diff_tsai)) %>% 
  slice(1:10) %>%
  select(text_segment, diff = diff_tsai) %>% mutate(president = "蔡English") %>%
  bind_rows(
    df_ying_seg_diff %>% arrange(desc(diff_ma)) %>% 
      slice(1:10) %>%
      select(text_segment, diff = diff_ma) %>% mutate(president = "馬英九")  
  ) %>%
  mutate(diff2 = if_else(president == "馬英九", -diff, diff)) %>%
  mutate(text_segment = reorder(text_segment, diff2)) %>%
  ggplot(aes(x = diff2, y = text_segment, fill = president)) + geom_col() +
  theme_bw() +
  scale_fill_manual(values = c("#1B9431", "#000095")) +
  labs(x= "使用次數的差異",y= "",
       title = "蔡English提到更多挑戰、馬英九談論更多經濟",
       subtitle = "雙英對決:馬英九與蔡English使用次數差異最大詞彙",
       caption = "資料來源:自行統計") +
  theme_vt +
  theme(legend.position = "none",
        strip.text = element_text(size = 72))

### 熱詞
df_word_count_top <- df_speech_seg_unnest %>% 
  filter(!text_POS %in% c("Puncuation")) %>%
  filter(str_length(text_segment) > 1) %>%
  anti_join(df_stopwords) %>%
  count(text_segment) %>% 
  arrange(desc(n)) %>% ungroup() %>% 
  slice(1:10) %>% select(text_segment)

df_year_word_count %>% 
  group_by(year_full, text_segment) %>%
  summarise(n = sum(n)) %>%
  mutate(year = as_date(str_c(year_full, "/1/1"))) %>%
  inner_join(df_word_count_top) %>%
  ggplot(aes(x = year, y = text_segment, fill = n)) + geom_tile() +
  theme_bw() +
  scale_x_date(date_labels = "'%y", date_breaks = "2 years") +
  scale_fill_gradient(low = "white", high = "red")+
  labs(x= "",y= "", title = "歷屆總統演說 - 總體使用熱詞", fill = "次數") +
  theme_vt +
  theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))

### 
df_president_seg_count <- df_year_word_count %>% 
  filter(!text_POS %in% c("Puncuation")) %>%
  anti_join(df_stopwords) %>%
  group_by(president, text_segment) %>%
  summarise(n = sum(n))

df_president_seg_count_top <- df_president_seg_count %>% group_by(president) %>%
  arrange(president, desc(n)) %>% mutate(rn = row_number()) %>%
  filter(rn <= 10) %>% ungroup() %>%
  group_by(president) %>% arrange(president, n) %>% ungroup() %>%
  mutate(president = fct_relevel(as.factor(president), "李登輝", "陳水扁", "馬英九", "蔡English")) %>%
  mutate(text_order = str_c(text_segment, president, rn) %>% forcats::fct_inorder()) %>%
  select(president, text_segment, text_order, rn, n)

df_president_seg_count_top %>%
  ggplot(aes(x = text_order, y = n)) + geom_col() +
  facet_wrap(president ~ ., scales = "free") +
  coord_flip() +
  theme_bw() +
  scale_linetype(guide = "none") +
  scale_x_discrete(labels = setNames(as.character(df_president_seg_count_top$text_segment), df_president_seg_count_top$text_order)) +
  scale_fill_gradient(low = "white", high = "red")+
  labs(x= "",y= "",
       # color = "學校類型",
       title = "總統之間講的熱門語彙中,都有臺灣",
       subtitle = "歷任總統國慶演說辭熱門詞彙, 1997-2023",
       caption = "資料來源:自行統計") +
  theme_vt

### tfidf
df_president_tfidf <- df_president_seg_count %>% filter(n > 4) %>%
  bind_tf_idf(text_segment, president, n) %>%
  group_by(president) %>% arrange(desc(tf_idf)) %>% 
  slice(1:10) %>% ungroup() %>%
  mutate(president = fct_relevel(as.factor(president), "李登輝", "陳水扁", "馬英九", "蔡English")) %>%
  mutate(text_segment = fct_reorder(text_segment, tf_idf))

df_president_tfidf %>%
  ggplot(aes(x = text_segment, y = tf_idf)) + geom_col() +
  facet_wrap(president ~ ., scales = "free") +
  coord_flip() +
  theme_bw() +
  labs(x= "年份",y= "tf-idf", title = "歷屆總統演說 - tfidf", fill = "次數") +
  labs(x= "",y= "",
       title = "政黨輪替、與大陸簽訂協議、再到疫情與韌性",
       subtitle = "歷任總統國慶演說辭重要詞彙, 1997-2023",
       caption = "資料來源:自行統計;以 tf-idf 計算重要詞彙") +
  theme_vt

上一篇
[Day 29] 用R語言玩轉文字探勘案例 - 分析篇
系列文
用R語言玩轉文字探勘30
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言