This notebook presents an example for the PPMT algorithm.
The data set is a mixture of 4 Gaussian random vectors in \(\mathbb{R}^3\).
np <- 10000
nd <- 3
nit <- 200
ndir<- 100
flag.ns <- FALSE
set.seed(123)
# mixing probabilities
pg <- c(0.25, 0.5, 0.2, 0.05)
ng <- table(sample(1:length(pg), prob = pg, size = np, replace = TRUE))
# center of the Gaussians
mu <- vanDerCorput(n = length(pg), nd = nd)
mu <- matrix(mu$getValues(), nrow = mu$getNRows(), ncol = mu$getNCols())
# dispersion of the Gaussians
ls <- list()
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.25,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.2,0.01,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.1,0.1,0.05))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.01,0.25))
X <- data.frame()
for (k in seq_along(pg)){
U <- rep(1,ng[k]) %*% t(mu[k,]) + matrix(rnorm(ng[k]*nd), nrow = ng[k], ncol = nd) %*% t(chol(ls[[k]]))
X <- rbind(X,U)
}
# creation of a database
data <- Db_createFromSamples(nech = np)
data["x1"] <- X[,1]
data["x2"] <- X[,2]
data["x3"] <- X[,3]
dum = data$statistics(c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## x1 -0.092 0.980 0.423 0.219
## x2 -0.097 1.402 0.680 0.275
## x3 -0.031 1.116 0.407 0.174
# histograms and correlation
ggplot() + plot.hist(data, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot()
p = p + plot.correlation(data, name1 = "x1", name2 = "x2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "x1", name2 = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "x2", name2 = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
# normal score
if(flag.ns){
data["x1.NS"] = VectorHelper_normalScore(data["x1"])
data["x2.NS"] = VectorHelper_normalScore(data["x2"])
data["x3.NS"] = VectorHelper_normalScore(data["x3"])
} else {
data["x1.NS"] = data["x1"]
data["x2.NS"] = data["x2"]
data["x3.NS"] = data["x3"]
}
err = data$setName("x1.NS","Y1")
err = data$setName("x2.NS","Y2")
err = data$setName("x3.NS","Y3")
nd <- 100
nit <- 100
err = data$setName("Y1","Z1")
err = data$setName("Y2","Z2")
err = data$setName("Y3","Z3")
# normal score
data["Y1"] = VectorHelper_normalScore(data["Z1"])
data["Y2"] = VectorHelper_normalScore(data["Z2"])
data["Y3"] = VectorHelper_normalScore(data["Z3"])
# Launching PPMT and fitting it on the vector of Input data
ppmt = PPMT_create(ndir=nd, FALSE, EDirGen_fromKey("vdc"))
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2", "Y3"))
err = ppmt$fitFromMatrix(Xmat, niter = nit, FALSE)
# scores vs. iterations
plot(x = 1:nit, y = log(ppmt$getSerieScore(), base = 10), type = "l", col = "red", lty = 3,
xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")
# adding the results to the data base
iuid = data$addColumns(Xmat$getValues(), radix = "U")
dum = data$statistics(c("x*",
"Y*","U-*"),opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## x1 -0.092 0.980 0.423 0.219
## x2 -0.097 1.402 0.680 0.275
## x3 -0.031 1.116 0.407 0.174
## Y1 -3.719 3.719 0.000 0.999
## Y2 -3.719 3.719 0.000 0.999
## Y3 -3.719 3.719 0.000 0.999
## U-1 -3.598 3.714 0.000 0.999
## U-2 -3.546 3.903 0.000 0.999
## U-3 -3.680 3.850 0.000 0.999
# Q-Q plots for initial Gaussian
probas = (1:99)/(1+99)
q2 = VectorHelper_qnormVec(probas)
# First components
q1 = VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "First Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Second components
q1 = VectorHelper_quantiles(data$getColumns("Y2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Third components
q1 = VectorHelper_quantiles(data$getColumns("Y3"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
p = ggplot()
p = p + plot.correlation(data, name1 = "Y1", name2 = "Y2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "Y1", name2 = "Y3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "Y2", name2 = "Y3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)
# First components
q1 = VectorHelper_quantiles(data$getColumns("U-1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "First Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Second components
q1 = VectorHelper_quantiles(data$getColumns("U-2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Third components
q1 = VectorHelper_quantiles(data$getColumns("U-3"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
p = ggplot()
p = p + plot.correlation(data, name1 = "U-1", name2 = "U-2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "U-1", name2 = "U-3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, name1 = "U-2", name2 = "U-3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)
library(FNN)
PPMT.anam <- function(this, X){
X <- as.matrix(X)
np <- dim(X)[1]
nd <- dim(X)[2]
stopifnot(nd == dim(this$X)[2])
res<- get.knnx(data = this$X, query = X, k = 1+nd)
Y <- matrix(NaN, nrow = np, ncol = nd)
for (i in 1:np) {
idx <- res$nn.index[i,]
w <- res$nn.dist[i,]
if (max(w == 0)){
w <- as.numeric(w == 0)
} else {
w <- 1/w^2
w <- w / sum(w)
}
Y[i,] <- t(w) %*% this$Z[idx,]
}
Y
}
# mapping Y to Z
anam <- list(
Z = as.matrix(cbind(data["Z1"], data["Z2"], data["Z3"])),
X = as.matrix(cbind(data["U-1"], data["U-2"], data["U-3"]))
)
# simulations
ntest <- 10000
Y <- matrix(rnorm(n = ntest*3), nrow = ntest, ncol = 3)
Z <- PPMT.anam(this = anam, X = Y)
simu <- Db_createFromSamples(nech = ntest)
simu["x1"] <- Z[,1]
simu["x2"] <- Z[,2]
simu["x3"] <- Z[,3]
# Results
dum = simu$statistics(c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## x1 -0.071 0.947 0.422 0.215
## x2 0.007 1.389 0.681 0.272
## x3 -0.006 1.092 0.408 0.173
# histograms and correlation
ggplot() + plot.hist(simu, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(simu, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(simu, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot()
p = p + plot.correlation(simu, name1 = "x1", name2 = "x2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(simu, name1 = "x1", name2 = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(simu, name1 = "x2", name2 = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (simulation)")
ggPrint(p)
# Functions
fun_corr <- function(i1, i2){
plot(Z[,i1], Z[,i2], pch = 19, cex = 0.5, col = "red",
xlab = paste0("Z", i1), ylab = paste0("Z", i2), asp = 1)
points(anam$Z[,i1], anam$Z[,i2], pch = 19, cex = 0.1, col = "black")
legend("topleft", legend = c("Initial", "Simulated"), col = c("black", "red"), pch = 19)
NULL
}
fun_qq <- function(i1, nq = 100){
p <- (1:(nq-1))/nq
q_ini <- quantile(x = anam$Z[,i1], probs = p)
q_sim <- quantile(x = Z[,i1], probs = p)
plot(q_sim, q_ini, xlab = "Simulated values", ylab = "Initial values", asp = 1,
main = paste0("Q-Q plot - Z", i1), type = "b")
abline(a = 0, b = 1, col = "red", lty = 3)
NULL
}
# Q-Q plots between initial an final variables
fun_qq(1, nq = 100)
## NULL
fun_qq(2, nq = 100)
## NULL
fun_qq(3, nq = 100)
## NULL
# correations
fun_corr(1,2)
## NULL
fun_corr(1,3)
## NULL
fun_corr(2,3)
## NULL
other indices ?
implementations of the back transform (hermite expension or nearest neighbors
Do we need to apply a initial normal, sphering, and ppmt?