## ----setup, include=FALSE------------------------------------------------
# Global knitr chunk option: use "R>" as the `comment` string for all chunks
# (presumably the prefix put in front of printed output -- see knitr docs).
knitr::opts_chunk$set(comment = "R>")

## ----NoOutlier1, cache=TRUE, warning=FALSE-------------------------------
library("ICSOutlier")
# Data simulation: 1000 independent draws from N(0, 0.1^2) arranged as a
# 500 x 2 matrix. The seed is fixed so that the sample is reproducible.
set.seed(123)
X <- matrix(rnorm(n = 1000, mean = 0, sd = 0.1), nrow = 500, ncol = 2)

## ----NoOutlier1b, cache=TRUE, warning=FALSE------------------------------
# Invariant coordinates of the simulated data with the default arguments of
# ics2(), followed by outlier detection: the component-selection test is
# test = "jarque" at level 0.05 and the distance cut-off uses level 0.025.
icsX <- ics2(X)
icsOutlierJB <- ics.outlier(
  icsX,
  test = "jarque",
  level.test = 0.05,
  level.dist = 0.025
)
print(icsOutlierJB)

## ----NoOutlier2, cache=TRUE, warning=FALSE-------------------------------
# Squared robust Mahalanobis distances computed from the MCD location and
# scatter estimates (alpha = 0.75, i.e. a breakdown point of 25%)
library("robustbase")
MCD <- covMcd(X, alpha = 0.75)
RD <- mahalanobis(X, center = MCD$center, cov = MCD$cov)

# First cut-off: 97.5% quantile of the chi-square distribution with
# ncol(X) degrees of freedom
cutoff.chi.sq <- qchisq(0.975, df = ncol(X))
cutoff.chi.sq

# Second cut-off: based on Green and Martin (2017), at the same 2.5% level
# and with the same MCD alpha as above
library("CerioliOutlierDetection")
cutoff.GM <- hr05CutoffMvnormal(
  n.obs = nrow(X),
  p.dim = ncol(X),
  mcd.alpha = 0.75,
  signif.alpha = 0.025,
  method = "GM14",
  use.consistency.correction = TRUE
)$cutoff.asy
cutoff.GM

## ----NoOutlier2b, cache=TRUE, warning=FALSE, fig.width=4, fig.height=4,  fig.cap="\\label{fig:NoOutlier2b}Squared robust Mahalanobis distances and two different cut-off values.", fig.align='center'----
# Code for the Figure 2
# An observation is drawn as a solid black dot as soon as its squared robust
# distance reaches the smaller of the two cut-offs; all others are grey crosses.
colPoints <- ifelse(RD >= min(cutoff.chi.sq, cutoff.GM), 1, grey(0.5))
pchPoints <- ifelse(RD >= min(cutoff.chi.sq, cutoff.GM), 16, 4)
plot(
  seq_along(RD), RD,
  pch = pchPoints, col = colPoints,
  # leave some head-room above the largest value for the legend
  ylim = c(0, max(RD, cutoff.chi.sq, cutoff.GM) + 2),
  cex.axis = 0.7, cex.lab = 0.7,
  ylab = expression(RD^2), xlab = "Observation Number"
)
# Reference lines: dashed = chi-square cut-off, dotted = GM cut-off
abline(h = c(cutoff.chi.sq, cutoff.GM), lty = c("dashed", "dotted"))
legend(
  "topleft",
  legend = c(expression(paste(chi[p]^2, " cut-off")), "GM cut-off"),
  lty = c("dashed", "dotted"),
  cex = 0.7, ncol = 2, bty = "n"
)

## ----HTP1, cache=TRUE, warning=FALSE-------------------------------------
# HTP dataset
library("ICSOutlier")
set.seed(123)
data(HTP)
# Indices of the two observations that are highlighted in the later figures
outliers <- c(581, 619)

# default ICS
icsHTP <- ics2(HTP)

# Outlier detection with selection of components based on normality tests
# by default it can take quite long as mDist = 10000 so we choose to
# use all but one available cores to parallelize the simulations.
# detectCores() may return NA and returns 1 on a single-core machine, so the
# worker count is clamped to at least 1 instead of passing NA or 0.
library(parallel)
icsOutlierDA <- ics.outlier(
  icsHTP,
  ncores = max(1, detectCores() - 1, na.rm = TRUE),
  iseed = 123
)
summary(icsOutlierDA)

## ----HTP1a, cache=TRUE, warning=FALSE, fig.width=4.5, fig.height=4, fig.align='center', fig.cap="\\label{fig:HTP1}Squared ICS distances for HTP data with default parameters."----
# Code for the Figure 3
# Default plot of the outlier-detection result, then the two observations
# listed in `outliers` are marked (pch = 5) and labelled with their index.
plot(icsOutlierDA, cex.lab = 0.7, cex.axis = 0.7)
points(
  x = outliers,
  y = icsOutlierDA@ics.distances[outliers],
  pch = 5
)
text(
  x = outliers,
  y = icsOutlierDA@ics.distances[outliers],
  labels = outliers,
  pos = 2,
  cex = 0.7
)

## ----HTP2, cache=TRUE, warning=FALSE, fig.width=4.5, fig.height=3, fig.align='center',  fig.cap="\\label{fig:HTP2}Screeplot of ICS eigenvalues for HTP data and default parameters."----
# Code for the Figure 4
# Screeplot of the ICS object, with all text elements shrunk to 70%
screeplot(
  icsHTP,
  cex.lab = 0.7,
  cex.axis = 0.7,
  cex.names = 0.7,
  cex.main = 0.7
)

## ----HTP2b, cache=TRUE, warning=FALSE------------------------------------
# ICS distances based on the first three invariant components
ics.dist.scree <- ics.distances(icsHTP, index = 1:3)
# Simulated cut-off for the same three components.
# by default it can take quite long as m = 10000, so we choose to
# use all but one available cores to parallelize the simulations.
# detectCores() may return NA and returns 1 on a single-core machine, so the
# worker count is clamped to at least 1 instead of passing NA or 0.
library(parallel)
ics.cutOff <- dist.simu.test(
  icsHTP, 1:3,
  ncores = max(1, detectCores() - 1, na.rm = TRUE),
  iseed = 123
)
ics.cutOff

## ----HTP4, cache=TRUE, warning=FALSE-------------------------------------
# FSRMCD
library("CerioliOutlierDetection")
# The per-observation level alpha solves (1 - alpha)^n = 0.975 with
# n = nrow(HTP), i.e. alpha = 1 - 0.975^(1/n).
FSRMCD <- cerioli2010.fsrmcd.test(
  HTP,
  signif.alpha = 1 - 0.975^(1 / nrow(HTP)),
  mcd.alpha = 0.75
)
# Two critical values: one for points included in the reweighted MCD
# (weights == 1) and one for points excluded from it (weights == 0).
FSRMCD.cutoffs <- unique(FSRMCD$critvalfcn(FSRMCD$signif.alpha))
FSRMCD.cutoffs

## ----HTP5, cache=TRUE, warning=FALSE-------------------------------------
# SIGN1
library("mvoutlier")
# The result's x.dist and const components are used for the comparison plot
SIGN1 <- sign1(
  HTP,
  qcrit = 0.975
)

## ----HTP6, cache=TRUE, warning=FALSE, fig.width=5.5, fig.height=2.8, fig.align='center', fig.cap="\\label{fig:HTP6}Squared ICS distances with 3 components against FSRMCD (left panel) and SIGN1 (right panel) measures of outlierness for HTP data."----
# Code for the Figure 5
# Two panels side by side, with a narrow right margin
par(mfrow = c(1, 2), mar = c(4, 4, 2, 0.2))

# Comparison ICS vs FSRMCD
# Observations at or above the ICS cut-off are solid black dots, the others
# grey crosses; the same symbols are reused in the second panel.
colPoints <- ifelse(ics.dist.scree >= ics.cutOff, 1, grey(0.5))
pchPoints <- ifelse(ics.dist.scree >= ics.cutOff, 16, 4)
plot(
  x = FSRMCD$mahdist.rw, y = ics.dist.scree,
  col = colPoints, pch = pchPoints,
  cex.lab = 0.7, cex.axis = 0.7, cex.main = 0.7,
  main = "ICS vs FSRMCD", ylab = "ICS Distances", xlab = "FSRMCD"
)                                             # This dollar sign works around the mathjax bug: $
# Mark and label the two observations listed in `outliers`
points(x = FSRMCD$mahdist.rw[outliers], y = ics.dist.scree[outliers], pch = 5)
text(
  x = FSRMCD$mahdist.rw[outliers], y = ics.dist.scree[outliers],
  labels = outliers, pos = 2, cex = 0.7
)
# Horizontal line: ICS cut-off; vertical lines: FSRMCD critical values
abline(h = ics.cutOff, v = FSRMCD.cutoffs, lty = "dashed")

# Comparison ICS vs SIGN1
plot(
  x = SIGN1$x.dist, y = ics.dist.scree,
  col = colPoints, pch = pchPoints,
  cex.lab = 0.7, cex.axis = 0.7, cex.main = 0.7,
  main = "ICS vs SIGN1", ylab = "ICS Distances", xlab = "SIGN1"
)
points(x = SIGN1$x.dist[outliers], y = ics.dist.scree[outliers], pch = 5)
text(
  x = SIGN1$x.dist[outliers], y = ics.dist.scree[outliers],
  labels = outliers, pos = 2, cex = 0.7
)
# Horizontal line: ICS cut-off; vertical line: SIGN1$const
abline(h = ics.cutOff, v = SIGN1$const, lty = "dashed")

## ----RelData1, cache=TRUE, warning=FALSE---------------------------------
# ReliabilityData example: the observations 414 and 512 are suspected to be outliers
library("ICSOutlier")
library("REPPlab")
# detectCores() is used below, so attach parallel here as the other chunks do
library(parallel)
set.seed(123)
data(ReliabilityData)
outliers <- c(414, 512)


# ICS with MLE Cauchy and the Mean-Cov
icsReliabilityData <- ics2(ReliabilityData, S1 = tM, S2 = MeanCov,
                           S1args = list(df = 1))

# Outlier detection with selection of components based on simulations
# it can take quite long as mEig = 5000 and mDist = 5000, so we choose
# to use all but one available cores to parallelize the simulations.
# detectCores() may return NA and returns 1 on a single-core machine, so the
# worker count is clamped to at least 1 instead of passing NA or 0.
icsOutlierPA <- ics.outlier(icsReliabilityData, method = "simulation",
                            level.test = 0.05, mEig = 5000,
                            level.dist = 0.01, mDist = 5000,
                            ncores = max(1, detectCores() - 1, na.rm = TRUE),
                            iseed = 123)
icsOutlierPA

## ----RelData2, cache=TRUE, warning=FALSE---------------------------------
# LOF: Local Outlier Factor
library("Rlof")
# LOF values for the settings 5:50, then the row-wise maximum of the result
# is kept as a single score per row.
X.lof <- lof(ReliabilityData, 5:50, cores = 2)
X.lof.max <- apply(X.lof, MARGIN = 1, FUN = max)

## ----RelData3, cache=TRUE, warning=FALSE,  results='hide'----------------
# ABOD: Angle-Based Outlier Factor
library("abodOutlier")
# Slow to run, even with the randomized variant of the method
X.abod <- abod(
  ReliabilityData,
  method = "randomized"
)


## ----RelData4, cache=TRUE, warning=FALSE, fig.width=5.5, fig.height=2.8, fig.align='center', fig.cap="\\label{fig:RelData4}Squared ICS distances with 39 components against LOF (left panel) and ABOD (right panel) measures of outlierness for Reliability data."----
# Code for the Figure 6
# Two panels side by side, with a narrow right margin
par(mfrow = c(1, 2), mar = c(4, 4, 2, 0.2))

# Comparison ICS vs LOF
plot(
  x = X.lof.max, y = icsOutlierPA@ics.distances,
  pch = 4,
  cex.lab = 0.7, cex.axis = 0.7, cex.main = 0.7,
  main = "ICS vs LOF", ylab = "ICS Distances", xlab = "LOF"
)
# Label the two observations listed in `outliers` to the right of their point
text(
  x = X.lof.max[outliers], y = icsOutlierPA@ics.distances[outliers],
  labels = outliers, pos = 4, cex = 0.7
)

# Comparison ICS vs ABOD
plot(
  x = X.abod, y = icsOutlierPA@ics.distances,
  pch = 4,
  cex.lab = 0.7, cex.axis = 0.7, cex.main = 0.7,
  main = "ICS vs ABOD", ylab = "ICS Distances", xlab = "ABOD"
)
text(
  x = X.abod[outliers], y = icsOutlierPA@ics.distances[outliers],
  labels = outliers, pos = 4, cex = 0.7
)

## ----hbkData1,  cache=TRUE, warning=FALSE--------------------------------
library("rrcov")
# HBK data set
data(hbk)
# Fixed seed for the steps that follow in this example
set.seed(123)

# ICS with MCD estimates and the usual estimates
# CovMcd() is wrapped so that the result is a list holding the location
# estimate first and the scatter estimate second. Any further arguments
# are passed on to CovMcd() unchanged.
myMCD <- function(x, ...) {
  fit <- CovMcd(x, ...)
  list(location = fit@center, scatter = fit@cov)
}
# ICS of the first three hbk columns with the MCD wrapper as first scatter
# and MeanCov as second scatter
icsHBK_mcd <- ics2(hbk[, 1:3], S1 = myMCD, S2 = MeanCov, S1args = list(alpha = 0.75))

# Outlier detection with selection of components based on the D'Agostino test for skewness
# it can take quite long as mEig = 10000 and mDist = 10000, so we choose
# to use all but one available cores to parallelize the simulations.
# detectCores() may return NA and returns 1 on a single-core machine, so the
# worker count is clamped to at least 1 instead of passing NA or 0.
library(parallel)
icsOutlierDA.MCD <- ics.outlier(icsHBK_mcd, mEig = 10000, level.dist = 0.025,
                                mDist = 10000,
                                ncores = max(1, detectCores() - 1, na.rm = TRUE),
                                iseed = 123,
                                pkg = c("ICSOutlier", "rrcov"))
icsOutlierDA.MCD

# Squared robust Mahalanobis distances of the first three hbk columns, from
# MCD estimates with a breakdown point of 25%.
# Note: MCD, RD, cutoff.chi.sq and cutoff.GM overwrite the objects of the same
# name created for the simulated data earlier in the script.
MCD <- covMcd(hbk[, 1:3], alpha = 0.75)
RD <- mahalanobis(hbk[, 1:3], center = MCD$center, cov = MCD$cov)

# First cut-off: 97.5% quantile of the chi-square distribution
cutoff.chi.sq <- qchisq(0.975, df = ncol(hbk[, 1:3]))
cutoff.chi.sq

# Second cut-off: based on Green and Martin (2017)
library("CerioliOutlierDetection")
cutoff.GM <- hr05CutoffMvnormal(
  n.obs = nrow(hbk[, 1:3]),
  p.dim = ncol(hbk[, 1:3]),
  mcd.alpha = 0.75,
  signif.alpha = 0.025,
  method = "GM14",
  use.consistency.correction = TRUE
)$cutoff.asy
cutoff.GM

# ICS with non robust estimates
icsHBK <- ics2(hbk[,1:3], S1 = MeanCov, S2 = Mean3Cov4)

# Outlier detection with selection of components based on the D'Agostino test for
# skewness. It can take quite long as mEig = 10000 and mDist = 10000, so we choose
# to use all but one available cores to parallelize the simulations.
# detectCores() may return NA and returns 1 on a single-core machine, so the
# worker count is clamped to at least 1 instead of passing NA or 0.
# Note: this overwrites the icsOutlierDA object created for the HTP data.
icsOutlierDA <- ics.outlier(icsHBK, mEig = 10000, level.dist = 0.025,
                            mDist = 10000,
                            ncores = max(1, detectCores() - 1, na.rm = TRUE),
                            iseed = 123,
                            pkg = c("ICSOutlier", "rrcov"))
icsOutlierDA


## ----hbkData2, cache=TRUE, warning=FALSE, fig.width=5.5, fig.height=4.31, fig.align='center', fig.cap="\\label{fig:hbkData2}(Left) Squared robust Mahalanobis distances based on the MCD, (middle) squared ICS distances with MCD and Mean-Cov, (right) squared ICS distances with default scatters."----
# Code for the Figure 7
# Three panels in a single row, with narrow left and right margins
par(mfrow = c(1, 3), mar = c(4, 2, 2, 0.2))

# Robust Mahalanobis distance with MCD estimates
# Symbols are driven by the chi-square cut-off only; both cut-offs are drawn.
colPoints <- ifelse(RD >= cutoff.chi.sq, 1, grey(0.5))
pchPoints <- ifelse(RD >= cutoff.chi.sq, 16, 4)
plot(
  RD,
  col = colPoints, pch = pchPoints,
  xlab = "Observation Number",
  cex.lab = 0.7, cex.axis = 0.7,
  main = "Robust MD", cex.main = 0.7
)
abline(h = c(cutoff.chi.sq, cutoff.GM), lty = 1:2)
legend(
  "topright",
  legend = c(expression(paste(chi[p]^2, " cut-off")), "GM cut-off"),
  lty = 1:2, cex = 0.7, bty = "n"
)

# ICS with MCD estimates and regular covariance
plot(
  icsOutlierDA.MCD,
  cex.lab = 0.7, cex.axis = 0.7,
  main = "ICS MCD-COV", cex.main = 0.7
)

# ICS with default non-robust estimates
plot(
  icsOutlierDA,
  cex.lab = 0.7, cex.axis = 0.7,
  main = "ICS COV-COV4", cex.main = 0.7
)


## ----hbkData3, cache=TRUE, warning=FALSE, fig.width=4, fig.height=4, fig.align='center', fig.cap="\\label{fig:hbkData3}Scatter plot of the invariant coordinates of the hbk data for the default scatter combinations. The two outlier groups are marked with different colors."----
# Code for the Figure 8
# Colours follow the row order of the data: the first 10 observations get
# colour 3, the next 4 get colour 2 and the remaining 61 get colour 1.
plot(
  icsHBK,
  col = rep(3:1, times = c(10, 4, 61))
)

