General Introduction

This notebook presents an example for the PPMT algorithm.

Simulation of a reference data set

The data set is a mixture of 4 Gaussian random vectors in \(\mathbb{R}^3\).

np  <- 10000
nd  <- 3
nit <- 200
ndir<- 100
flag.ns <- FALSE
set.seed(123)

# mixing probabilities
pg <- c(0.25, 0.5, 0.2, 0.05)
ng <- table(sample(1:length(pg), prob = pg, size = np, replace = TRUE))

# center of the Gaussians
mu  <- vanDerCorput(n = length(pg), nd = nd)
mu  <- matrix(mu$getValues(), nrow = mu$getNRows(), ncol = mu$getNCols())

# dispersion of the Gaussians
ls <- list()
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.25,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.2,0.01,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.1,0.1,0.05))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.01,0.25))

X <- data.frame()
for (k in seq_along(pg)){
  U <- rep(1,ng[k]) %*% t(mu[k,]) + matrix(rnorm(ng[k]*nd), nrow = ng[k], ncol = nd) %*% t(chol(ls[[k]]))
  X <- rbind(X,U)
}

# creation of a database

data <- Db_createFromSamples(nech = np)
data["x1"] <- X[,1]
data["x2"] <- X[,2]
data["x3"] <- X[,3]

dbStatisticsMono(data, c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
##       Minimum    Maximum       Mean   St. Dev.
## x1     -0.092      0.980      0.423      0.219
## x2     -0.097      1.402      0.680      0.275
## x3     -0.031      1.116      0.407      0.174
## NULL
# histograms and correlation
ggplot() + plot.hist(data, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(data, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(data, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "x2", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)

Application of PPMT

Normal score

# normal score
if(flag.ns){
  data["x1.NS"] = VectorHelper_normalScore(data["x1"])
  data["x2.NS"] = VectorHelper_normalScore(data["x2"])
  data["x3.NS"] = VectorHelper_normalScore(data["x3"])
} else {
  data["x1.NS"] = data["x1"]
  data["x2.NS"] = data["x2"]
  data["x3.NS"] = data["x3"]
}
err = data$setName("x1.NS","Y1")
err = data$setName("x2.NS","Y2")
err = data$setName("x3.NS","Y3")

PPMT

nd  <- 100
nit <- 100

err = data$setName("Y1","Z1")
err = data$setName("Y2","Z2")
err = data$setName("Y3","Z3")

# normal score
data["Y1"] = VectorHelper_normalScore(data["Z1"])
data["Y2"] = VectorHelper_normalScore(data["Z2"])
data["Y3"] = VectorHelper_normalScore(data["Z3"])

# Launching PPMT and fitting it on the vector of Input data
ppmt = PPMT_create(ndir=nd, FALSE, EDirGen_fromKey("vdc"))
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2", "Y3"))
err = ppmt$fitFromMatrix(Xmat, niter = nit, FALSE)

# scores vs. iterations
plot(x = 1:nit, y = log(ppmt$getSerieScore(), base = 10), type = "l", col = "red", lty = 3,
     xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")

# adding the results to the data base
iuid = data$addColumns(Xmat$getValues(), radix = "U")
dbStatisticsMono(data, c("x*","Y*","U-*"),
                       opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
##        Minimum    Maximum       Mean   St. Dev.
##  x1     -0.092      0.980      0.423      0.219
##  x2     -0.097      1.402      0.680      0.275
##  x3     -0.031      1.116      0.407      0.174
##  Y1     -3.719      3.719      0.000      0.999
##  Y2     -3.719      3.719      0.000      0.999
##  Y3     -3.719      3.719      0.000      0.999
## U-1     -3.598      3.714      0.000      0.999
## U-2     -3.546      3.903      0.000      0.999
## U-3     -3.680      3.850      0.000      0.999
## NULL

Results

Initial Gaussian values

# Q-Q plots for initial Gaussian
probas = (1:99)/(1+99)
q2 = VectorHelper_qnormVec(probas)

# First components
q1 = VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "First Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

# Second components
q1 = VectorHelper_quantiles(data$getColumns("Y2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "Second Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

# Third components
q1 = VectorHelper_quantiles(data$getColumns("Y3"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "Second Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

p = ggplot()
p = p + plot.correlation(data, namex = "Y1", namey = "Y2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "Y1", namey = "Y3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "Y2", namey = "Y3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)

Final Gaussian values

# First components
q1 = VectorHelper_quantiles(data$getColumns("U-1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "First Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

# Second components
q1 = VectorHelper_quantiles(data$getColumns("U-2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

# Third components
q1 = VectorHelper_quantiles(data$getColumns("U-3"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles", 
     main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)

p = ggplot()
p = p + plot.correlation(data, namex = "U-1", namey = "U-2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "U-1", namey = "U-3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(data, namex = "U-2", namey = "U-3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)

Back transform

Implementation of the transform

library(FNN)

PPMT.anam <- function(this, X){
  X  <- as.matrix(X)
  np <- dim(X)[1]
  nd <- dim(X)[2]
  stopifnot(nd == dim(this$X)[2])
  
  res<- get.knnx(data = this$X, query = X, k = 1+nd)
  Y  <- matrix(NaN, nrow = np, ncol = nd)
  for (i in 1:np) {
    idx <- res$nn.index[i,]
    w   <- res$nn.dist[i,]
    if (max(w == 0)){
      w   <- as.numeric(w == 0)
    } else {
      w   <- 1/w^2
      w   <- w / sum(w)
    }
    Y[i,] <- t(w) %*% this$Z[idx,]
  }
  Y
}

Simulations and application of the transform

# mapping Y to Z
anam <- list(
  Z = as.matrix(cbind(data["Z1"], data["Z2"], data["Z3"])),
  X = as.matrix(cbind(data["U-1"], data["U-2"], data["U-3"]))
  )

# simulations
ntest <- 10000
Y <- matrix(rnorm(n = ntest*3), nrow = ntest, ncol = 3)
Z <- PPMT.anam(this = anam, X = Y)

simu <- Db_createFromSamples(nech = ntest)
simu["x1"] <- Z[,1]
simu["x2"] <- Z[,2]
simu["x3"] <- Z[,3]

# Results
dbStatisticsMono(simu, c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
##       Minimum    Maximum       Mean   St. Dev.
## x1     -0.071      0.947      0.422      0.215
## x2      0.007      1.389      0.681      0.272
## x3     -0.006      1.092      0.408      0.173
## NULL
# histograms and correlation
ggplot() + plot.hist(simu, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(simu, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + plot.hist(simu, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p = ggplot()
p = p + plot.correlation(simu, namex = "x1", namey = "x2", bins=100, flagDiag = TRUE) 
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(simu, namex = "x1", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)

p = ggplot()
p = p + plot.correlation(simu, namex = "x2", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)

# Functions
fun_corr <- function(i1, i2){
plot(Z[,i1], Z[,i2], pch = 19, cex = 0.5, col = "red",
     xlab = paste0("Z", i1), ylab = paste0("Z", i2), asp = 1)
points(anam$Z[,i1], anam$Z[,i2], pch = 19, cex = 0.1, col = "black")
legend("topleft", legend = c("Initial", "Simulated"), col = c("black", "red"), pch = 19)
  NULL
}

fun_qq <- function(i1, nq = 100){
  p <- (1:(nq-1))/nq
  q_ini <- quantile(x = anam$Z[,i1], probs = p)
  q_sim <- quantile(x = Z[,i1], probs = p)
  
  plot(q_sim, q_ini, xlab = "Simulated values", ylab = "Initial values", asp = 1, 
       main = paste0("Q-Q plot - Z", i1), type = "b")
  abline(a = 0, b = 1, col = "red", lty = 3)
  NULL
}

# Q-Q plots between initial an final variables
fun_qq(1, nq = 100)

## NULL
fun_qq(2, nq = 100)

## NULL
fun_qq(3, nq = 100)

## NULL
# correations
fun_corr(1,2)

## NULL
fun_corr(1,3)

## NULL
fun_corr(2,3)

## NULL

Discussion and further works

References