# This is a stand-alone replication script for the results.
library(osfr)
library(furrr)
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(BayesMallows)
library(parallel)

# The following creates folders in which the figures are saved.
if(!dir.exists("figures")) dir.create("figures")
if(!dir.exists("data")) dir.create("data")

########## Comparison of partition functions, Figure 1
# The simulation results used to create Figure 1 are stored in the OSF Project
# available here: https://osf.io/pj4kb/
# Downloading these files is necessary to reproduce Figure 1, and is done with the 
# following lines. The files are stored in directory "SimulationResults" in the current working directory.
osf_project <- osf_retrieve_node("pj4kb")
if(!dir.exists("SimulationResults")) dir.create("SimulationResults")
osf_retrieve_file("5f3f7b62746a8100cc1a5ed7") %>% 
  osf_download()

# The following computes the asymptotic estimates:
n_items_vec <- seq(from = 50, to = 400, by = 50)
alpha_vec <- seq(from = .1, to = 20, by = .1)

plan(multisession)
asymptotic_estimates <- future_pmap_dfr(list(n_items_vec), function(n_items){
  tibble(
    alpha = !!alpha_vec,
    n_items = !!n_items,
    asymptotic = c(BayesMallows:::asymptotic_partition_function(
      !!alpha_vec, !!n_items, "footrule", K = 1000, n_iterations = 1000)),
  )
}, .progress = TRUE)

# The following extracts the importance sampling estimates from the downloaded simulations
importance_sampling <- file.path("SimulationResults", list.files("./SimulationResults/", pattern = "items.rds$")) %>% 
  map_dfr(function(x){
    readRDS(x)$imps_estimates %>% 
      mutate(n_items = as.integer(str_extract(x, "[0-9]+"))) %>% 
      filter(nmc == max(nmc)) %>% 
      select(n_items, alpha, logz)
  })

# The following computes the exact normalizing constant
comp_exact <- function(alpha_vector, n_items, metric) {
  map_dbl(alpha_vector, function(alpha) {
    card <- BayesMallows:::partition_function_data %>% 
      filter(metric == !!metric, n_items == !!n_items, type == "cardinalities") %>% 
      pull(values) %>% 
      unlist()
    BayesMallows:::get_partition_function(n_items = n_items, alpha = alpha, 
                                          cardinalities = card, metric = metric)
  })
}

exact <- tibble(
  n_items = 50,
  alpha = alpha_vec
) %>% 
  mutate(
    exact = comp_exact(alpha_vec, n_items = 50, metric = "footrule")
  )

# The following joins the different computations
simres <- importance_sampling %>% 
  inner_join(asymptotic_estimates, by = c("n_items", "alpha")) %>% 
  left_join(exact, by = c("n_items", "alpha"))

simres2 <- simres %>% 
  pivot_longer(cols = c("logz", "asymptotic", "exact")) %>% 
  mutate(
    name = recode(name, logz = "IS", asymptotic = "IPFP",
                  exact = "Exact")
  ) %>% 
  na.omit() %>% 
  filter(n_items %in% c(50, 200, 400))

# The following creates Figure 1
ggplot(simres2, aes(x = alpha, y = value, group = name, color = name)) +
  geom_line() +
  facet_wrap(vars(n_items), scales = "free_y", 
             labeller = as_labeller(function(x) paste(x, "items"))) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  xlab(expression(alpha)) +
  ylab(expression(log(Z))) +
  scale_color_tableau(palette = "Classic Color Blind")

ggsave(filename = "./figures/large_items_plots.pdf", height = 6, width = 16, units = "cm")

######### Analysis of Complete Rankings
# Convergence diagnostics
bmm_test <- compute_mallows(potato_visual, seed = 432)
trace_alpha <- assess_convergence(bmm_test) +
  theme_bw()

trace_alpha

# Save the trace plot for alpha
ggplot2::ggsave(
  filename = "./figures/potato-trace-alpha.pdf",
  plot = trace_alpha,
  width = 12, height = 6, units = "cm"
)

trace_rho <- assess_convergence(bmm_test, parameter = "rho", items = 1:5) + 
  theme_bw() +
  labs(color = NULL) +
  scale_color_tableau(palette = "Classic Color Blind")
trace_rho

# Save the trace plot for rho
ggplot2::ggsave(
  filename = "./figures/potato-trace-rho.pdf",
  plot = trace_rho,
  width = 12, height = 6, units = "cm"
)

# Posterior distributions
bmm_visual <- compute_mallows(potato_visual, nmc = 501000, seed = 432)
# Confirm convergence (not shown in paper)
assess_convergence(bmm_visual)
assess_convergence(bmm_visual, parameter = "rho", items = 1:5)

# Set burnin
bmm_visual$burnin <- 1000

# Plot posterior distribution of alpha
posterior_alpha <- plot(bmm_visual) +
  theme_bw()
posterior_alpha

ggplot2::ggsave(
  filename = "./figures/potato-posterior-alpha.pdf",
  plot = posterior_alpha,
  width = 12, height = 6, units = "cm"
)

# Posterior intervals for alpha
potato_alpha_intervals <- compute_posterior_intervals(bmm_visual, decimals = 1L)
potato_alpha_intervals
save(potato_alpha_intervals, file = "./data/potato_alpha_intervals.RData")

# Posterior distribution of rho
posterior_rho <- plot(bmm_visual, parameter = "rho", items = 1:20) +
  theme_bw()

posterior_rho

ggplot2::ggsave(
  filename = "./figures/potato-posterior-rho.pdf",
  plot = posterior_rho,
  width = 16, height = 16, units = "cm"
)

# Jumping over the scale parameter, timing experiment described on page 9.
# The time it takes to run will necessarily vary between computers.
# It is commented out because it takes a long time to run, and is not directly
# part of the examples in the paper.
# Uncomment the next four lines to run it:
# microbenchmark(
#   compute_mallows(potato_visual, nmc = 501000, alpha_jump = 1, seed = 432),
#   compute_mallows(potato_visual, nmc = 501000, alpha_jump = 10, seed = 432)
# )

# Here is the output
# expr      min       lq     mean   median       uq
# compute_mallows(potato_visual, nmc = 501000, alpha_jump = 1) 3.951633 4.163972 4.236114 4.218533 4.308207
# compute_mallows(potato_visual, nmc = 501000, alpha_jump = 10) 1.819980 1.966559 2.024505 2.021911 2.063133
# max neval
# 4.541703   100
# 2.364689   100

# Varying the distance metric
bmm <- compute_mallows(potato_visual, metric = "spearman", nmc = 501000, seed = 432)

######### Preference Data
# First few rows of beach preference dataset
head(beach_preferences)

# Generate transitive closure
beach_tc <- generate_transitive_closure(beach_preferences)

# Generate initial ranking
beach_init_rank <- generate_initial_ranking(beach_tc)

# Give names to the beaches
colnames(beach_init_rank) <- paste("Beach", 1:ncol(beach_init_rank))

# Look at all preferences stated by assessor 1 involving beach 2
filter(beach_preferences, 
       assessor == 1, bottom_item == 2 | top_item == 2)

# Implied ordering for assessor 1, involving beach 2
filter(beach_tc,
       assessor == 1, bottom_item == 2 | top_item == 2)

# Convergence diagnostics
bmm_test <- compute_mallows(rankings = beach_init_rank,
                            preferences = beach_tc, save_aug = TRUE, seed = 432)

# Assess the convergence of alpha and rho (not shown in paper)
assess_convergence(bmm_test)
assess_convergence(bmm_test, parameter = "rho")

# Assess convergence of augmented rankings
# Trace plots for items 2, 6, and 15 for assessor 1
trace_1 <- assess_convergence(bmm_test, parameter = "Rtilde", 
                              items = c(2, 6, 15), assessors = 1) +
  theme_bw() +
  labs(color = NULL) +
  scale_color_tableau(palette = "Classic Color Blind")

trace_1

# Save the plot
ggplot2::ggsave(
  filename = "./figures/beach-trace-assessor-1.pdf",
  plot = trace_1,
  width = 10, height = 6, units = "cm"
)

# Confirm that no orderings between item 1 and 15 are implied for assessor 2
filter(beach_tc, assessor == 2, 
       (bottom_item == 1 & top_item == 15) | (bottom_item == 15 & top_item == 1))

# Find the corresponding trace plot for assessor 2
trace_2 <- assess_convergence(bmm_test, parameter = "Rtilde",
                              items = c(1, 15), assessors = 2) +
  theme_bw() +
  labs(color = NULL) +
  scale_color_tableau(palette = "Classic Color Blind")

trace_2

# Save the plot
ggplot2::ggsave(
  filename = "./figures/beach-trace-assessor-2.pdf",
  plot = trace_2,
  width = 10, height = 6, units = "cm"
)

# Trace plot of Rtilde for more assessors
trace_3 <- assess_convergence(bmm_test, parameter = "Rtilde",
                              items = 13:15, assessors = 1:8) +
  theme_bw() +
  labs(color = NULL) +
  scale_color_tableau(palette = "Classic Color Blind")

trace_3

ggplot2::ggsave(
  filename = "./figures/beach-trace-more-assessors.pdf",
  plot = trace_3,
  width = 16, height = 12, units = "cm"
)

# Posterior distributions
bmm_beaches <- compute_mallows(
  rankings = beach_init_rank,
  preferences = beach_tc,
  nmc = 102000,
  save_aug = TRUE,
  seed = 432
)
bmm_beaches$burnin <- 2000

# Compute posterior intervals
beaches_rho_intervals <- compute_posterior_intervals(bmm_beaches, parameter = "rho")
beaches_rho_intervals
save(beaches_rho_intervals, file = "./data/beaches_rho_intervals.RData")

# Cumulative probability consensus ranking
beaches_cp_consensus <- compute_consensus(bmm_beaches, type = "CP")
beaches_cp_consensus
save(beaches_cp_consensus, file = "./data/beaches_cp_consensus.RData")

# Top-k rankings
beaches_top_3 <- plot_top_k(bmm_beaches, rel_widths = c(1, 5))
beaches_top_3

ggplot2::ggsave(
  filename = "./figures/beaches-top-3.pdf",
  beaches_top_3,
  height = 12, width = 18, units = "cm"
)

# Find all the beaches that are among the top-3 of assessors 1-5 with more than 90 % probability:
predict_top_k_df <- predict_top_k(bmm_beaches) %>%
  filter(prob > 0.9, assessor %in% 1:5)
predict_top_k_df
save(predict_top_k_df, file = "./data/predict_top_k_df.RData")

######### Clustering
# First six rows of the sushi dataset
head(sushi_rankings, 6)

# Convergence diagnostics
# Note, the argument 'setup_strategy = "sequential"' is a workaround for a bug in
# RStudio v1.3.959 causing makeCluster() to hang on mac OS.
# See details here: https://github.com/rstudio/rstudio/issues/6692
cl <- makeCluster(4, setup_strategy = "sequential")
bmm <- compute_mallows_mixtures(n_clusters = c(1, 4, 7, 10), 
                                rankings = sushi_rankings, nmc = 5000, 
                                save_clus = FALSE, include_wcd = FALSE, seed = 432,
                                cl = cl)
stopCluster(cl)

# Trace plot for alpha for each number of mixtures
sushi_alpha_trace <- assess_convergence(bmm) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_color_tableau(palette = "Classic Color Blind")

sushi_alpha_trace

# Save the plot
ggplot2::ggsave(
  filename = "./figures/sushi-alpha-trace.pdf",
  plot = sushi_alpha_trace,
  height = 8, width = 10, units = "cm"
)

# Trace plot for cluster probability tau
sushi_tau_trace <- assess_convergence(bmm, parameter = "cluster_probs") +
  theme_bw() +
  theme(legend.position = "none") +
  scale_color_tableau(palette = "Classic Color Blind")

sushi_tau_trace

ggplot2::ggsave(
  filename = "./figures/sushi-tau-trace.pdf",
  plot = sushi_tau_trace,
  height = 8, width = 10, units = "cm"
)

# Deciding on the number of mixtures
# The call to compute_mallows_mixtures() below takes about 30 minutes to complete,
# and the resulting object has hence also been made available in the OSF repository 
# mentioned above.
# Set 'recompute' to TRUE in order to re-run the computation, and to FALSE in order
# to download the resulting model object
recompute <- TRUE

if(recompute){
  cl <- makeCluster(4, setup_strategy = "sequential")
  bmm <- compute_mallows_mixtures(n_clusters = 1:10, 
                                  rankings = sushi_rankings, 
                                  nmc = 100000, rho_thinning = 10, 
                                  save_clus = FALSE, include_wcd = TRUE, 
                                  seed = 432, cl = cl)
  stopCluster(cl)
} else {
  # This download the object 'bmm' and puts it in the directory "MixtureDistributions"
  if(!dir.exists("MixtureDistributions")) dir.create("MixtureDistributions")
  osf_retrieve_file("5f3f8f8ff579150086eacf67") %>% 
    osf_download()
  
  bmm <- readRDS("MixtureDistributions/bmm_various_mixtures.rds")
}

# Elbow plot
sushi_elbow <- plot_elbow(bmm, burnin = 5000) +
  theme_bw()
sushi_elbow

ggplot2::ggsave(
  filename = "./figures/sushi-elbow.pdf",
  plot = sushi_elbow,
  height = 6, width = 12, units = "cm"
)

# Compute final model with 5 clusters
bmm <- compute_mallows(rankings = sushi_rankings, n_clusters = 5, 
                       save_clus = TRUE, clus_thin = 10, nmc = 100000,
                       rho_thinning = 10, seed = 432)

bmm$burnin <- 5000

# Posterior distribution of cluster assignment
sushi_cluster_assignment <- plot(bmm, parameter = "cluster_assignment")
sushi_cluster_assignment

ggplot2::ggsave(
  filename = "./figures/sushi-cluster-assignment.pdf",
  plot = sushi_cluster_assignment,
  height = 8, width = 16, units = "cm"
)

# Clusterwise CP consensus
# First create the dataframe, and then convert to LaTeX for display as a table in the paper
df <- compute_consensus(bmm) %>%
  select(-cumprob) %>%
  spread(key = cluster, value = item)

names(df)[1] <- ""
tab <- xtable::xtable(df, digits = 0,
                      label = "tab:sushi_consensus",
                      caption = "CP consensus for each of the clusters found for sushi data.")

fileConn <- file("./data/sushi-consensus.txt")
writeLines(print(tab, include.rownames = FALSE), fileConn)
close(fileConn)
