General Introduction

This notebook presents an example for the PPMT algorithm.

Simulation of a reference data set

The data set is a mixture of 4 Gaussian random vectors in \(\mathbb{R}^3\).

np  <- 10000
nd  <- 3
nit <- 200
ndir<- 100
flag.ns <- FALSE
set.seed(123)

# mixing probabilities
pg <- c(0.25, 0.5, 0.2, 0.05)
ng <- table(sample(1:length(pg), prob = pg, size = np, replace = TRUE))

# center of the Gaussians
mu  <- vanDerCorput(n = length(pg), nd = nd)
mu  <- matrix(mu$getValues(), nrow = mu$getNRows(), ncol = mu$getNCols())

# dispersion of the Gaussians
ls <- list()
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.25,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.2,0.01,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.1,0.1,0.05))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.01,0.25))

X <- data.frame()
for (k in seq_along(pg)){
  U <- rep(1,ng[k]) %*% t(mu[k,]) + matrix(rnorm(ng[k]*nd), nrow = ng[k], ncol = nd) %*% t(chol(ls[[k]]))
  X <- rbind(X,U)
}

# creation of a database

data <- Db_createFromSamples(nech = np)
data["x1"] <- X[,1]
data["x2"] <- X[,2]
data["x3"] <- X[,3]

dbStatisticsMono(data, c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
##       Minimum    Maximum       Mean   St. Dev.
## x1     -0.092      0.980      0.423      0.219
## x2     -0.097      1.402      0.680      0.275
## x3     -0.031      1.116      0.407      0.174
## NULL
# histograms and correlation
ggplot() + plot.hist(data, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(data, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(data, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "x2", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

Application of PPMT

Normal score

# normal score
if(flag.ns){
  data["x1.NS"] = VectorHelper_normalScore(data["x1"])
  data["x2.NS"] = VectorHelper_normalScore(data["x2"])
  data["x3.NS"] = VectorHelper_normalScore(data["x3"])
} else {
  data["x1.NS"] = data["x1"]
  data["x2.NS"] = data["x2"]
  data["x3.NS"] = data["x3"]
}
err = data$setName("x1.NS","Y1")
err = data$setName("x2.NS","Y2")
err = data$setName("x3.NS","Y3")

PPMT

nd  <- 100
nit <- 100

err = data$setName("Y1","Z1")
err = data$setName("Y2","Z2")
err = data$setName("Y3","Z3")

# normal score
data["Y1"] = VectorHelper_normalScore(data["Z1"])
data["Y2"] = VectorHelper_normalScore(data["Z2"])
data["Y3"] = VectorHelper_normalScore(data["Z3"])

# Launching PPMT and fitting it on the vector of Input data
ppmt = PPMT_create(ndir=nd, FALSE, EDirGen_fromKey("vdc"))
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2", "Y3"))
err = ppmt$fitFromMatrix(Xmat, niter = nit, FALSE)

# scores vs. iterations
plot(x = 1:nit, y = log(ppmt$getSerieScore(), base = 10), type = "l", col = "red", lty = 3,
     xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")

# adding the results to the data base
iuid = data$addColumns(Xmat$getValues(), radix = "U")
dbStatisticsMono(data, c("x*","Y*","U-*"),
                       opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
##        Minimum    Maximum       Mean   St. Dev.
##  x1     -0.092      0.980      0.423      0.219
##  x2     -0.097      1.402      0.680      0.275
##  x3     -0.031      1.116      0.407      0.174
##  Y1     -3.719      3.719      0.000      0.999
##  Y2     -3.719      3.719      0.000      0.999
##  Y3     -3.719      3.719      0.000      0.999
## U-1     -3.598      3.714      0.000      0.999
## U-2     -3.546      3.903      0.000      0.999
## U-3     -3.680      3.850      0.000      0.999
## NULL

Results

Initial Gaussian values

# Q-Q plots for initial Gaussian
probas = (1:99)/(1+99)
q2 = VectorHelper_qnormVec(probas)

# First components
q1 = VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "First Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)