This is an attempt to extract data from the PTT KoreaDrama board. I focus on the [LIVE] articles for "A Korean Odyssey," whose title is translated as "花遊記" or "和遊記" in Chinese.
Load the needed packages.
library(tidyverse)
library(tidytext)
library(rvest)
library(lubridate)
rm(list = ls())
options(stringsAsFactors = F)
# locale names are platform-specific; the empty string returned below means
# the call failed, and a full name such as "en_US.UTF-8" may be needed
Sys.setlocale(locale = "UTF-8")
## [1] ""
options(encoding = "UTF-8")
assign("last.warning", NULL, envir = baseenv())
Step 1: Find the articles whose titles match "[LIVE] tvN 和遊記".
part1 = "https://www.ptt.cc/bbs/KoreaDrama/index"
part2 = ".html"
# crawl the index pages needed
abc <- paste0(part1, 1950:2025, part2) %>%
  map(read_html)
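This fires 76 requests back-to-back. If PTT ever starts refusing connections, a rate-limited crawl is an easy swap; a minimal sketch, assuming purrr 0.3+ for slowly() and rate_delay():
# drop-in replacement for the map() call above: at most one request per second
read_html_slowly <- slowly(read_html, rate = rate_delay(1))
abc <- paste0(part1, 1950:2025, part2) %>%
  map(read_html_slowly)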
# get all titles
abc_title <- abc %>%
  map(html_nodes, css = ".title a") %>%
  map(html_text)
# check which titles match; note the escaped "[" in the regex
check <- str_detect(unlist(abc_title), pattern = "\\[LIVE\\] tvN 和遊記")
sum(check)
## [1] 22
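To eyeball what matched before pulling links, index the titles with the same logical vector:
# sanity check: print the 22 matching titles
unlist(abc_title)[check]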
# get the needed URLs using the "check" index
abc_link <- abc %>%
  map(html_nodes, css = ".title a") %>%
  map(html_attr, "href") %>%
  unlist()
urls <- abc_link[check]
article_links <- paste0("https://www.ptt.cc", urls)
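A quick peek confirms the URLs point at full articles rather than index pages:
# inspect the first few article URLs
head(article_links)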
Step 2: Use a PTT crawler function to fetch each article.
ptt_article_crawler <- function(url) {
  # CSS selectors for the main article
  main_author = ".article-metaline:nth-child(1) .article-meta-value"
  main_title = ".article-metaline-right+ .article-metaline .article-meta-value"
  main_datetime = ".article-metaline+ .article-metaline .article-meta-value"
  main_content = "#main-content"
  main_link = ".f2 a"
  # CSS selectors for the pushes (comments)
  push_tag = ".push-tag"
  push_id = ".push-userid"
  push_text = ".push-content"
  push_datetime = ".push-ipdatetime"
  # css arguments
  main_column = c(main_author, main_title, main_datetime, main_content, main_link)
  push_column = c(push_tag, push_id, push_text, push_datetime)
  # main and push start as two empty lists
  main = list()
  push = list()
  raw_html <- url %>% read_html()
  # collect the main-article fields
  for(i in seq_along(main_column)){
    content <- raw_html %>% html_nodes(css = main_column[i]) %>% html_text()
    main[[i]] <- content
  }
  main <- as.data.frame(matrix(unlist(main), nrow = 1))
  colnames(main) <- c("main_author", "main_title", "main_datetime", "main_content", "main_link")
  # collect the push fields
  for(i in seq_along(push_column)){
    content <- raw_html %>% html_nodes(css = push_column[i]) %>% html_text()
    push[[i]] <- content
  }
  push <- as.data.frame(push)
  colnames(push) <- c("push_tag", "push_id", "push_text", "push_datetime")
  # clean the main content
  main$main_content <- main$main_content %>%
    str_replace_all("\n", replacement = "") %>% # drop line breaks
    str_replace_all("--※.+", replacement = "") %>% # trim the signature block
    str_replace_all("作者.+:[0-9]{2}\\s[0-9]{4}", replacement = "") %>% # drop quoted post headers (author/time lines)
    str_replace_all("-----Sent from.+", replacement = "") # drop "Sent from" footers
  main$weekday <- main$main_datetime %>% str_sub(1, 4)
  main$datetime <- main$main_datetime %>% str_sub(5, -1) %>% parse_datetime("%b %d %H:%M:%S %Y")
  # main$authorip <- main$authorip %>% str_sub(str_locate(., "來")[1] + 4 , -2)
  #######
  # strip the leading colon and all whitespace from the push texts
  push$push_text <- push$push_text %>% str_replace(pattern = ":", replacement = "") %>% str_replace_all(pattern = "\\s", replacement = "")
  # prepend the year from the main post, e.g. "04/01 00:42" becomes "2018/04/0100:42"
  push$push_datetime <- toString(year(main$datetime)) %>% str_c(push$push_datetime, sep = "/") %>% str_replace_all(" ", "")
  push$date <- push$push_datetime %>% str_sub(1, 10) %>% parse_date("%Y/%m/%d", locale = locale(tz = "Asia/Taipei"))
  push$time <- push$push_datetime %>% str_sub(11, -1) %>% parse_time("%H:%M")
  push$weekday <- wday(push$date, label = T)
  push$month <- month(push$date)
  push$day <- day(push$date)
  push$hour <- hour(push$time)
  push$minute <- minute(push$time)
  #######
  article <- list("main" = main, "push" = push)
  return(article)
}
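Before looping over all 22 links, it is worth smoke-testing the function on a single article; a quick sketch:
# crawl one article and inspect both pieces of the result
one_article <- ptt_article_crawler(article_links[1])
str(one_article$main)
head(one_article$push)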
# use the function
article_Korean_Odyssey = list()
for(i in seq_along(article_links)) {
  article_Korean_Odyssey[[i]] = ptt_article_crawler(article_links[i])
}
names(article_Korean_Odyssey) <- str_c("article", seq_along(article_links))
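For reference, the loop and the naming step collapse into one purrr pipeline; an equivalent sketch:
# map the crawler over the links and name the results in one go
article_Korean_Odyssey <- article_links %>%
  map(ptt_article_crawler) %>%
  set_names(str_c("article", seq_along(article_links)))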
Step 3: Extract the push texts and add an article ID.
article_Korean_Odyssey_push = tibble()
for(i in seq_along(article_Korean_Odyssey)) {
  article_Korean_Odyssey_push <- article_Korean_Odyssey[[i]]$push %>%
    mutate(article = i) %>% rbind(article_Korean_Odyssey_push)
}
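A loop-free equivalent with purrr, in case you prefer it (note that the loop above stacks the articles in reverse order, while this keeps them in 1-to-22 order):
# bind all push tables, tagging each row with its article number
article_Korean_Odyssey_push <- map_dfr(
  seq_along(article_Korean_Odyssey),
  ~ article_Korean_Odyssey[[.x]]$push %>% mutate(article = .x)
)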
# select two variables; the others will be used later in the project
article_tidytext <- select(article_Korean_Odyssey_push, push_text, article)
Step 4: Use the tidytext package to segment the words.
# load a Chinese stop-word list found on GitHub
stop_words_chinese <- read_csv("stopwords_chi_updated.csv", locale = locale(encoding = "UTF-8"), col_names = F)
names(stop_words_chinese) <- c("number", "word")
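If you do not have that CSV at hand, a tiny inline table with the same two columns keeps the rest of the pipeline runnable; this is a placeholder, not a serious stop-word list:
# hypothetical fallback: a minimal stop-word tibble with matching columns
stop_words_chinese <- tibble(
  number = 1:3,
  word = c("的", "了", "是")
)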
# tokenize the push texts with the unnest_tokens function
tidy_KO <- article_tidytext %>%
  unnest_tokens(word, push_text) %>%
  anti_join(stop_words_chinese, by = c("word"))
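unnest_tokens() falls back on ICU word boundaries, which segment Chinese reasonably well out of the box. A common alternative is dictionary-based segmentation with the jiebaR package; a sketch, assuming jiebaR is installed:
# segment the push texts with jieba instead of the default tokenizer
library(jiebaR)
cutter <- worker()
tidy_KO_jieba <- article_tidytext %>%
  mutate(word = map(push_text, ~ segment(.x, cutter))) %>%
  select(-push_text) %>%
  unnest(word) %>%
  anti_join(stop_words_chinese, by = "word")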
# count word occurrences per article
tidy_KO_count <- tidy_KO %>% group_by(article) %>%
  count(article, word, sort = T) %>%
  ungroup()
# calculate tf-idf with the bind_tf_idf function
tidy_KO_tfidf <- tidy_KO_count %>%
  bind_tf_idf(word, article, n)
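bind_tf_idf() computes tf as a word's share of its article and idf as the natural log of (number of articles / number of articles containing the word). Recomputing both by hand makes that concrete and doubles as a sanity check:
# recompute tf-idf manually and compare with bind_tf_idf's columns
n_articles <- n_distinct(tidy_KO_count$article)
tidy_KO_manual <- tidy_KO_count %>%
  group_by(article) %>%
  mutate(tf = n / sum(n)) %>% # term frequency within an article
  group_by(word) %>%
  mutate(idf = log(n_articles / n_distinct(article))) %>% # rarity across articles
  ungroup() %>%
  mutate(tf_idf = tf * idf)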
Step 5: Visualize the results.
# keep the five words with the highest tf-idf in each article
tidy_KO_tfidf_top5 <- tidy_KO_tfidf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(article) %>%
  top_n(5, tf_idf) %>%
  ungroup()
# facet by article with facet_wrap
tidy_KO_tfidf_top5 %>%
  ggplot(aes(word, tf_idf, fill = article)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~ article, ncol = 4, scales = "free") +
  coord_flip()
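If the Chinese labels render as blank boxes, the graphics device is probably missing a CJK font; pointing the theme at one usually fixes it. A sketch, where "PingFang TC" is a macOS font name and you should substitute one installed on your system:
# re-render with an explicit CJK-capable font, then save to disk
p <- last_plot() +
  theme(text = element_text(family = "PingFang TC"))
ggsave("korean_odyssey_tfidf.png", plot = p, width = 10, height = 8)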
