#
install.packages("tibble")
library(tibble)
install.packages("dplyr")
library(dplyr)
install.packages("tidyr")
library(tidyr)

#let's create some text data ... technically not made up
text <- c("Because I could not stop for Death ",
          "He kindly stopped for me ",
          "The Carriage held but just Ourselves ",
          "and Immortality")
text #look at it

#make it a tibble
#add a column for line number
text_df <- tibble(line = 1:4, 
                  text = text)
text_df

#text mining package
install.packages("tidytext")
library(tidytext)

#tidy the data into one token per row
#here use one word per row
text_df %>%
  unnest_tokens(word, text)

#for our examples
install.packages("janeaustenr")
library(janeaustenr)
install.packages("stringr")
library(stringr)

#Jane Austen's Works
austen_books()

#Do some cleaning of the books for this example
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                                ignore_case = TRUE)))) %>%
  ungroup()

original_books
View(original_books)
str(original_books)
head(original_books$text)

#tidy the text of all of Jane Austen's works
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
#notice that the other columns remain in the new tibble

tidy_books <- tidy_books %>%
  anti_join(stop_words)

#count number of times each word is used
tidy_books %>%
  count(word, sort = TRUE)

install.packages("ggplot2")
library(ggplot2)

#visualize the most used words
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n,fill=word)) +
  geom_col(show.legend = F) +
  xlab(NULL) +
  coord_flip() 

####Sentiment Analysis

#tibble of all sentiments in tidytext package
sentiments

#individual lexicons
get_sentiments("bing")
get_sentiments("nrc")
get_sentiments("afinn")

#play around with sentiments
nrcjoy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

get_sentiments("nrc") %>%
  filter(sentiment != "joy")

get_sentiments("nrc") %>%
  filter(sentiment == "joy" | sentiment == "fear")

get_sentiments("nrc") %>%
  filter(sentiment %in% c("joy","happy"))

nrcjoy

##########
#count the joyful words in Emma
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrcjoy) %>% #keep rows that appear intersection of tibbles
  count(word, sort = TRUE)

#plot it
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrcjoy) %>% #keep rows that appear intersection of tibbles
  count(word, sort = TRUE) %>%
  filter(n>50) %>%
  mutate(word=reorder(word,n)) %>%
  ggplot(aes(word,n,fill=word)) +
  geom_col(show.legend = F) +
  coord_flip()

#####
#sentiment scores per 80 word chunk
tidy_books %>%
  inner_join(get_sentiments("bing")) %>% #add on the sentiments
  count(book, index = linenumber %/% 80, sentiment) %>% 
  # %/% is integer division
  #positive and negative sentiments in each 80 line chunk
  spread(sentiment, n, fill = 0) %>%
  #spread positive,negative to new cols
  mutate(sentiment = positive - negative)

ja_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

library(ggthemes)

ja_sentiment %>%
  ggplot(aes(index,sentiment, fill = book)) +
  geom_col() +
  theme_fivethirtyeight() #explore the possible themes
#theme_economist() is fun too

ja_sentiment %>%
  filter(book == "Emma") %>%
  ggplot(aes(index,sentiment,fill=sentiment)) +
  geom_col() +
  theme_fivethirtyeight()

#count positive and negative words
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>% 
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

#plot them
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

#Create custom stop words
custom_stop_words <- bind_rows(data_frame(word = c("miss"),
                                          lexicon = c("custom")),
                               stop_words)
custom_stop_words
#now try
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>% 
  anti_join(custom_stop_words) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
#plot them
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()


install.packages("reshape2")
library(reshape2)
install.packages("wordcloud")
library(wordcloud)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 300)) #play around with max.words

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>% 
  comparison.cloud(colors = c("#F8766D", "#00BFC4"), 
                   max.words = 100)

###Topic Modeling
install.packages("topicmodels")
library(topicmodels)
data("AssociatedPress") #a document term matrix
AssociatedPress
ap_td <- tidy(AssociatedPress) #tidy text data
ap_td
ap_td %>%
  cast_dtm(document,term,count)

#Cast Jane Austen datset to DTM
austen_dtm <- tidy_books %>%
              count(book,word) %>% #get count of book and word
              cast_dtm(book,word,n)

#Perform LDA Topic Modeling on Associated Press Data
ap_LDA <- LDA(AssociatedPress, k = 2, 
              control = list(seed = 1234))
ap_LDA
#extract the beta probabilities
ap_topics <- tidy(ap_LDA, matrix = "beta")
ap_topics
#top 10 words per topic, by Beta value
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
ap_top_terms

ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

#LDA on Jane Austen Data
austen_LDA <- LDA(austen_dtm,k=6,
                  control = list(seed=1234))

austen_topics <- tidy(austen_LDA,matrix="beta")

austen_top_terms <- austen_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

austen_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

austen_docs <- tidy(austen_LDA,matrix="gamma")

austen_docs

austen_docs %>%
  ggplot(aes(document,gamma,fill=factor(topic))) +
  geom_col()