# Title: A Normalized Data Model for Natural Language Processing
# Author: Taylor Arnold <taylor.arnold@acm.org>
# Last Modified: 2017-05-20
# Purpose: Replication code from R-journal submission

#######################################################################
# SYSTEM REQUIREMENTS: In order to replicate all of the results you
# must have installed Python >= 2.7 and Java >= 1.8. You must also link
# rJava to the correct version of Java (can be tricky with the default
# macOS set-up; see <https://github.com/s-u/rJava/issues/78> if this
# proves difficult).

# The CoreNLP libraries can be installed within R. To get the spaCy
# library, you can run the following in a terminal:

#   pip install -U spacy
#   python -m spacy download en

# More detailed instructions are available in the spaCy docs:
# <https://spacy.io/docs/usage/>

#######################################################################
# SET UP: This code must be run in R prior to running anything else. It
# assumes that you have downloaded the file sotu_text.tar.bz2 and
# uncompressed it into your working directory.

# Install the cleanNLP R package from CRAN (grabs all dependencies) and
# the sotu data package. A package that is already installed is left
# alone, so re-running this script does not download and reinstall it
# (reinstalling a package that is currently loaded can also break the
# running session).
for (pkg in c("cleanNLP", "sotu")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg) # from CRAN
  }
}

# You can, alternatively, install the package from GitHub:
# devtools::install_github("statsmaths/cleanNLP")

# Now, load all of the required libraries:
library(tidyverse)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(topicmodels)
library(glmnet)
library(cleanNLP)
library(viridis)
library(sotu)

# Session-wide defaults: minimal ggplot2 theme plus console print widths
theme_set(theme_minimal())
options(
  width = 81L, # to match RJournal formatting
  dplyr.width = Inf
)

# Next, fetch the CoreNLP library files (Note: this downloads two large
# files totalling over 1.6 GB):
download_core_nlp()

# The annotated Obama dataset ships with the R package, so it can be
# loaded directly by name:
data("obama")
# To rebuild it yourself instead, initialize the coreNLP backend and
# annotate just the Obama State of the Union files:
# init_coreNLP(speed = 3L)
# obama <- run_annotators(sprintf("sotu_text/%03d.txt", 1L:8L))

#######################################################################
# SECTION 2: Basic usage

# Sample passage; it is written to a temporary file and that same file
# is annotated by each backend in turn.
text <- c("The regular early morning yell of horror was the sound of",
          "Arthur Dent waking up and suddenly remembering where he",
          "was. It wasn't just that the cave was cold, it wasn't just",
          "that it was damp and smelly. It was the fact that the cave",
          "was in the middle of Islington and there wasn't a bus due",
          "for two million years.")
tf <- tempfile()
writeLines(text, con = tf)

# Backend set up by init_tokenizers()
init_tokenizers()
anno <- run_annotators(tf)
anno %>% get_token()

# Running this requires that Python and spaCy be installed
init_spaCy()
anno <- run_annotators(tf)
anno %>% get_token()

# Running this requires that Java and coreNLP be installed
#init_coreNLP(lib_loc = "~/local/core_nlp_files/stanford-corenlp-full-2016-10-31/")
init_coreNLP()
anno <- run_annotators(tf)
anno %>% get_token()

#######################################################################
# SECTION 3: A data model for the NLP pipeline

# Print each table of the data model for the Obama annotation object
obama %>% get_document()

obama %>% get_token(include_root = TRUE)

obama %>% get_dependency(get_token = TRUE)

obama %>% get_coreference()

obama %>% get_entity()

obama %>% get_sentence()

# Dimensions of the vector table next to those of the token table
obama %>% get_vector() %>% dim()
obama %>% get_token() %>% dim()

#######################################################################
# SECTION 4: State of the Union addresses

# Initialize the spaCy pipeline and annotate every file returned by
# sotu_dir() to create the sotu dataset:
library(sotu)
init_spaCy()
sotu <- sotu_dir() %>% run_annotators()

# Attach the metadata to the document table. The uri column is first
# reduced to its base file name; the join then uses whatever columns
# the two tables have in common.
meta <- sotu_meta
sotu$document <- sotu$document %>%
  mutate(uri = basename(uri)) %>%
  left_join(meta)

# Deciles of the number of token rows per (id, sid) pair
get_token(sotu) %>%
  count(id, sid) %>%
  use_series(n) %>%
  quantile(probs = seq(0, 1, by = 0.1))

# Lemmas with the 42 highest counts among tokens tagged NOUN (ties
# included), ordered from most to least frequent
get_token(sotu) %>%
  filter(upos == "NOUN") %>%
  count(lemma) %>%
  top_n(42, wt = n) %>%
  arrange(desc(n)) %$%
  lemma

# Plot for Figure 1: token rows per document against year, colored by
# address type, with a smoothed trend line on top
get_token(sotu) %>%
  count(id) %>%
  left_join(get_document(sotu)) %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_line(color = grey(0.8)) +
  geom_point(mapping = aes(color = sotu_type)) +
  geom_smooth() +
  scale_color_viridis(discrete = TRUE, end = 0.7, option = "C") +
  labs(
    x = "Year",
    y = "Number of words",
    color = "SOTU Address type"
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
ggsave("num_tokens.pdf", width = 8, height = 6)

# Entities of type "GPE" with the 26 highest counts (ties included),
# ordered from most to least frequent
get_entity(sotu) %>%
  filter(entity_type == "GPE") %>%
  count(entity) %>%
  top_n(26, wt = n) %>%
  arrange(desc(n)) %$%
  entity

# Format "word => target lemma" strings for the "dobj" dependencies of
# one year's documents, keeping only targets whose entry in
# word_frequency has a frequency below `max_frequency`.
#
# target_year:   value of the `year` column to keep
# max_frequency: exclusive upper bound on the `frequency` column
rare_direct_objects <- function(target_year, max_frequency) {
  get_dependency(sotu, get_token = TRUE) %>%
    left_join(get_document(sotu)) %>%
    filter(year == target_year, relation == "dobj") %>%
    select(id, start = word, word = lemma_target) %>%
    left_join(word_frequency) %>%
    filter(frequency < max_frequency) %>%
    select(id, start, word) %$%
    sprintf("%s => %s", start, word)
}

rare_direct_objects(2001, 0.001)

rare_direct_objects(2002, 0.0005)

# Principal components of the tf-idf matrix built from tokens tagged
# NN / NNS, combined with the document table by tidy_pca()
pca <- get_token(sotu) %>%
  filter(pos %in% c("NN", "NNS")) %>%
  get_tfidf(min_df = 0.05, max_df = 0.95,
            type = "tfidf", tf_weight = "dnorm") %>%
  use_series(tfidf) %>%
  tidy_pca(get_document(sotu))

# Plot for Figure 2: documents on the first two components, colored by
# ten year bins; only the first row seen for each president is labelled
ggplot(data = pca, mapping = aes(x = PC1, y = PC2)) +
  geom_point(
    mapping = aes(color = cut(year, 10, dig.lab = 4)),
    alpha = 0.35,
    size = 4
  ) +
  geom_text_repel(
    data = pca %>% filter(!duplicated(president)),
    mapping = aes(label = president),
    color = grey(0.4),
    cex = 3
  ) +
  scale_color_viridis(discrete = TRUE, end = 0.9, option = "C") +
  labs(color = "Year") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
ggsave("pca_plot.pdf", width = 8, height = 6)

# Fit a 16-topic LDA model to the raw counts of tokens tagged NN / NNS.
#
# Reproducibility: the default VEM fit in topicmodels takes its seed
# from the `seed` field of the control list, not from R's global RNG,
# and that field defaults to a time-based value. The seed is therefore
# passed explicitly. set.seed(2) is kept so that R's own RNG state is
# unchanged for any later code that draws from it.
set.seed(2)
tm <- get_token(sotu) %>%
  filter(pos %in% c("NN", "NNS")) %>%
  get_tfidf(min_df = 0.05, max_df = 0.95, type = "tf", tf_weight = "raw") %$%
  LDA(tf, k = 16, control = list(seed = 2L, verbose = 1))

# Plot for Figure 3 (data preparation)
# Rebuild the term-frequency object to get at its vocabulary.
mat <- get_token(sotu) %>%
  filter(pos %in% c("NN", "NNS")) %>%
  get_tfidf(min_df = 0.05, max_df = 0.95, type = "tf", tf_weight = "raw")

# Compute the posterior once and take both components from it.
post <- posterior(tm)
terms <- post$terms
topics <- post$topics

# Long table with one row per (document, topic) cell of `topics`.
# NOTE(review): ids are taken positionally from get_document(sotu), which
# assumes the rows of `topics` are in the same order as the document
# table with no document dropped by the filters above -- confirm.
topic_df <- tibble(topic = as.integer(col(topics)),
                   id = get_document(sotu)$id[as.integer(row(topics))],
                   val = as.numeric(topics)) %>%
  left_join(get_document(sotu))

# For each row of `terms`, the (up to) five highest-valued vocabulary
# entries as one comma-separated label.
top_terms <- vapply(seq_len(nrow(terms)), function(k) {
  best <- head(order(terms[k, ], decreasing = TRUE), 5L)
  paste(mat$vocab[best], collapse = ", ")
}, character(1))

# Reorder the topics for display: each topic is ranked by the position
# (within that topic's rows) where year * val is largest.
index <- rank(-1 * tapply(topic_df$year * topic_df$val, topic_df$topic, which.max))
topic_df$topic_new <- index[topic_df$topic]
top_terms_df <- tibble(top_terms, topic = seq_along(top_terms))
top_terms_df$topic_new <- index[top_terms_df$topic]

# Figure 3: one row of points per (reordered) topic, sized by the
# posterior value, with the topic's top-terms label drawn just above
# the row at the mean year. The colour legend is suppressed with
# guides(colour = "none"); the older `FALSE` spelling is deprecated.
ggplot(topic_df, aes(year, topic_new)) +
  geom_point(aes(size = val, color = factor(topic_new))) +
  geom_text(data = top_terms_df, x = mean(topic_df$year), size = 7,
            aes(y = topic_new + 0.4, label = top_terms,
                color = factor(topic_new)),
            show.legend = FALSE) +
  scale_color_viridis(discrete = TRUE, end = 0.7, option = "C") +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "bottom",
        axis.title.x = element_text(size = 16),
        axis.text.x = element_text(size = 14)) +
  labs(x = "Year", size = "Posterior probability") +
  guides(colour = "none")
ggsave("tm_sotu.pdf", height = 11, width = 8.5)

# Tokens tagged NN / NNS from documents after 2000, with a combined
# "id-sid" key (new_id) that get_tfidf() uses as the document variable.
df <- get_token(sotu) %>%
  left_join(get_document(sotu)) %>%
  filter(year > 2000, pos %in% c("NN", "NNS")) %>%
  mutate(new_id = paste(id, sid, sep = "-"))
mat <- get_tfidf(df, min_df = 0, max_df = 1, type = "tf",
                 tf_weight = "raw", doc_var = "new_id")

# One metadata row per id in `mat`, taken from the first token row
# carrying that new_id. new_id is the only column the two tables share,
# so it is named explicitly as the join key.
#   y     = 1 when the president is Barack Obama, 0 otherwise
#   train = TRUE for the odd years 2001, 2003, ..., 2015
meta <- tibble(new_id = mat$id) %>%
  left_join(df[!duplicated(df$new_id), ], by = "new_id") %>%
  mutate(y = as.numeric(president == "Barack Obama"),
         train = year %in% seq(2001, 2016, by = 2))

# Cross-validated binomial glmnet fit on the training rows only
library(glmnet)
model <- cv.glmnet(
  x = mat$tf[meta$train, ],
  y = meta$y[meta$train],
  family = "binomial"
)

# Coefficients at the 20th lambda of the path, intercept dropped; print
# each vocabulary entry with a non-zero coefficient and its sign
beta <- coef(model, s = model$lambda[20])[-1]
sprintf("%s (%d)", mat$vocab, sign(beta))[beta != 0]

# Plot for Figure 4
# Predicted probabilities at lambda.1se for every row of the matrix.
# predict() hands back a one-column matrix here, so it is flattened to a
# plain numeric vector before being stored as a column of `meta`.
meta$pred <- as.numeric(
  predict(model, newx = mat$tf, type = "response", s = model$lambda.1se)
)

# Box plots of the predictions per year, filled by president with
# "George W. Bush" as the first factor level. The axis labels are set
# once; the original chain set them twice and the later call won.
ggplot(meta, aes(factor(year), pred)) +
  geom_boxplot(aes(fill = relevel(factor(president), "George W. Bush"))) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, end = 0.75, option = "C") +
  coord_flip() +
  labs(fill = "President", x = "Year", y = "Predicted probability") +
  theme(axis.title.x = element_text(size = 12),
        axis.text.x = element_text(size = 10),
        axis.title.y = element_text(size = 12),
        axis.text.y = element_text(size = 10))
ggsave("glmnet_plot.pdf", height = 7, width = 7)
