This notebook presents an example for the PPMT algorithm.
The data set is a mixture of 4 Gaussian random vectors in \(\mathbb{R}^3\).
np <- 10000
nd <- 3
nit <- 200
ndir<- 100
flag.ns <- FALSE
set.seed(123)
# mixing probabilities
pg <- c(0.25, 0.5, 0.2, 0.05)
ng <- table(sample(1:length(pg), prob = pg, size = np, replace = TRUE))
# center of the Gaussians
mu <- vanDerCorput(n = length(pg), nd = nd)
mu <- matrix(mu$getValues(), nrow = mu$getNRows(), ncol = mu$getNCols())
# dispersion of the Gaussians
ls <- list()
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.25,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.2,0.01,0.1))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.1,0.1,0.05))
ls[[1+length(ls)]] <- 0.05 * diag(c(0.25,0.01,0.25))
X <- data.frame()
for (k in seq_along(pg)){
U <- rep(1,ng[k]) %*% t(mu[k,]) + matrix(rnorm(ng[k]*nd), nrow = ng[k], ncol = nd) %*% t(chol(ls[[k]]))
X <- rbind(X,U)
}
# creation of a database
data <- Db_createFromSamples(nech = np)
data["x1"] <- X[,1]
data["x2"] <- X[,2]
data["x3"] <- X[,3]
dbStatisticsMono(data, c("x*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
## Minimum Maximum Mean St. Dev.
## x1 -0.092 0.980 0.423 0.219
## x2 -0.097 1.402 0.680 0.275
## x3 -0.031 1.116 0.407 0.174
## NULL
# histograms and correlation
ggplot() + plot.hist(data, name = "x1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "x2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "x3", color = "gray", fill = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, namex = "x1", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
p = ggplot()
p = p + plot.correlation(data, namex = "x2", namey = "x3", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Gaussian mixture (data)")
ggPrint(p)
# normal score
if(flag.ns){
data["x1.NS"] = VectorHelper_normalScore(data["x1"])
data["x2.NS"] = VectorHelper_normalScore(data["x2"])
data["x3.NS"] = VectorHelper_normalScore(data["x3"])
} else {
data["x1.NS"] = data["x1"]
data["x2.NS"] = data["x2"]
data["x3.NS"] = data["x3"]
}
err = data$setName("x1.NS","Y1")
err = data$setName("x2.NS","Y2")
err = data$setName("x3.NS","Y3")
nd <- 100
nit <- 100
err = data$setName("Y1","Z1")
err = data$setName("Y2","Z2")
err = data$setName("Y3","Z3")
# normal score
data["Y1"] = VectorHelper_normalScore(data["Z1"])
data["Y2"] = VectorHelper_normalScore(data["Z2"])
data["Y3"] = VectorHelper_normalScore(data["Z3"])
# Launching PPMT and fitting it on the vector of Input data
ppmt = PPMT_create(ndir=nd, FALSE, EDirGen_fromKey("vdc"))
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2", "Y3"))
err = ppmt$fitFromMatrix(Xmat, niter = nit, FALSE)
# scores vs. iterations
plot(x = 1:nit, y = log(ppmt$getSerieScore(), base = 10), type = "l", col = "red", lty = 3,
xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")
# adding the results to the data base
iuid = data$addColumns(Xmat$getValues(), radix = "U")
dbStatisticsMono(data, c("x*","Y*","U-*"),
opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))$display()
## Minimum Maximum Mean St. Dev.
## x1 -0.092 0.980 0.423 0.219
## x2 -0.097 1.402 0.680 0.275
## x3 -0.031 1.116 0.407 0.174
## Y1 -3.719 3.719 0.000 0.999
## Y2 -3.719 3.719 0.000 0.999
## Y3 -3.719 3.719 0.000 0.999
## U-1 -3.598 3.714 0.000 0.999
## U-2 -3.546 3.903 0.000 0.999
## U-3 -3.680 3.850 0.000 0.999
## NULL
# Q-Q plots for initial Gaussian
probas = (1:99)/(1+99)
q2 = VectorHelper_qnormVec(probas)
# First components
q1 = VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "First Initial Variable")
abline(a = 0, b = 1, col = "red", lty = 3)