This notebook presents an example of the PPMT (Projection Pursuit Multivariate Transform) algorithm.
We create a reference data set (lognormal distribution) based on a model that we simulate using the Turning Bands algorithm.
# Reference grid: 100 x 100 cells of size 0.01 covering the unit square
grd <- DbGrid_create(x0 = c(0.0, 0.0), dx = c(0.01, 0.01), nx = c(100, 100))

# Simulate two independent Gaussian random fields on the grid
# (Turning Bands, one simulation each) and store them as Y1 and Y2
model1 <- Model_createFromParam(ECov_GAUSSIAN(), range = 0.2, sill = 1.0)
err <- simtub(NULL, dbout = grd, model = model1, nbsimu = 1)
err <- grd$setName("Simu", "Y1")

model2 <- Model_createFromParam(ECov_EXPONENTIAL(), range = 0.1, sill = 1.0)
err <- simtub(NULL, dbout = grd, model = model2, nbsimu = 1)
err <- grd$setName("Simu", "Y2")

# Non-linear combination of the two fields (quadratic in Y1, linear in Y2)
grd["Z"] <- grd["Y1"] * grd["Y1"] + 0.5 * grd["Y1"] + 0.2 * grd["Y2"]
Two transforms are applied to the simulated variables:
a normal score transform
a lognormal transform
# Normal-score (Gaussian anamorphosis) transform of Y1 and Z
grd["Y1.NS"] <- VectorHelper_normalScore(grd["Y1"])
grd["Z.NS"] <- VectorHelper_normalScore(grd["Z"])
# Drop the raw variables and keep only the two normal scores,
# renamed Y1 and Y2 for the rest of the notebook
err <- grd$deleteColumns(c("Y1","Y2","Z"))
err <- grd$setName("Y1.NS", "Y1")
err <- grd$setName("Z.NS", "Y2")
# Histograms of the two normal-scored variables on the grid
ggplot() + plot.hist(grd, name = "Y1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(grd, name = "Y2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Cross plot of Y1 vs Y2 (100 bins, with the first diagonal)
p = ggplot()
p = p + plot.correlation(grd, name1 = "Y1", name2 = "Y2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (grid)")
ggPrint(p)
# Maps of the two Gaussian variables on the grid
ggplot() + plot.grid(grd, "Y1")
ggplot() + plot.grid(grd, "Y2")
## Lognormal transform
# Z_i = m_i * exp(sigma_i * Y_i - sigma_i^2 / 2), so that E[Z_i] = m_i
# (the MEAN column of the statistics below confirms 1.0 and 0.5)
m_1 = 1.0; sigma_1 = 0.25
m_2 = 0.5; sigma_2 = 0.5
grd["Z1"] = m_1 * exp(sigma_1 * grd["Y1"] - 1/2 * sigma_1 * sigma_1 )
grd["Z2"] = m_2 * exp(sigma_2 * grd["Y2"] - 1/2 * sigma_2 * sigma_2 )
# Summary statistics of the lognormal (Z*) and Gaussian (Y*) variables
dum = grd$statistics(c("Z*","Y*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Z1 0.382 2.456 1.000 0.254
## Z2 0.069 2.833 0.500 0.266
## Y1 -3.719 3.719 0.000 0.999
## Y2 -3.719 3.719 0.000 0.999
# Histograms of the two lognormal variables on the grid
ggplot() + plot.hist(grd, name = "Z1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(grd, name = "Z2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Cross plot of Z1 vs Z2 (100 bins, with the first diagonal)
ggPrint(
  ggplot() +
    plot.correlation(grd, name1 = "Z1", name2 = "Z2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Initial lognormal variables (grid)")
)
# Maps of the two lognormal variables
ggplot() + plot.grid(grd, "Z1")
ggplot() + plot.grid(grd, "Z2")
# Extract a random subsample of nump points from the grid;
# these play the role of the "observed" data set
nump = 500
data = Db_createSamplingDb(grd, number=nump, names = c("x1", "x2", "Z1", "Z2"))
# Statistics of the sampled lognormal variables (close to the grid values)
dum = data$statistics(c("Z*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Z1 0.431 1.902 0.991 0.255
## Z2 0.100 1.699 0.502 0.258
# Histograms and cross plot of the sampled lognormal variables
ggplot() + plot.hist(data, name = "Z1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "Z2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot()
p = p + plot.correlation(data, name1 = "Z1", name2 = "Z2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial lognormal variables (data)")
ggPrint(p)
# Maps: grid values as background, sample points colored by their value
p = ggplot()
p = p + plot.grid(grd,"Z1")
p = p + plot.point(data, name_color = "Z1")
p = p + plot.decoration(title = "Initial lognormal variable - Z1")
ggPrint(p)
p = ggplot()
p = p + plot.grid(grd,"Z2")
p = p + plot.point(data, name_color = "Z2")
p = p + plot.decoration(title = "Initial lognormal variable - Z2")
ggPrint(p)
# PPMT settings: number of projection directions and fitting iterations
ndir <- 100
niter <- 100
# Use the ad-hoc KNN back-transform rather than the Hermite method
flag.knn <- TRUE

# Normal-score transform of the sampled data, stored as Y1 / Y2
data["Z1.NS"] <- VectorHelper_normalScore(data["Z1"])
data["Z2.NS"] <- VectorHelper_normalScore(data["Z2"])
err <- data$setName("Z1.NS", "Y1")
err <- data$setName("Z2.NS", "Y2")
# Launching PPMT and fitting it on the vector of Input data
# NOTE(review): the object built in this if/else is dead code — it is
# overwritten by the PPMT_create() call below, so the "vdc" / "hmt"
# configuration chosen here is never used. Confirm which construction
# is actually intended.
if(flag.knn){
ppmt = PPMT_create(ndir=ndir, FALSE, EDirGen_fromKey("vdc"))
} else {
ppmt = PPMT_create(ndir=ndir, FALSE, EDirGen_fromKey("vdc"),
EGaussInv_fromKey("hmt"))
}
# Input matrix: the two normal-scored variables of the data set
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2"))
# Effective PPMT object (replaces the one created above)
ppmt = PPMT_create(ndir=ndir, alpha=2.)
# Fit the transform; Xmat ends up holding the Gaussianized values
# (they are added to the Db as U-* columns further below)
err = ppmt$fitFromMatrix(Xmat, niter, FALSE)
# Convergence diagnostic: log of the Wasserstein-distance index vs. iteration
plot(x = 1:niter, y = ppmt$getSerieScore(TRUE), type = "l", col = "black", lty = 1,
xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")
Results
# Adding the PPMT results (transformed matrix) to the data base as U-1 / U-2
iuid = data$addColumns(Xmat$getValues(), radix = "U")
# Both the initial normal scores (Y*) and the PPMT outputs (U-*)
# should be standard Gaussian marginally
dum = data$statistics(c("Y*","U-*"),opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Y1 -2.879 2.879 0.000 0.989
## Y2 -2.879 2.879 0.000 0.989
## U-1 -2.833 2.941 0.000 0.989
## U-2 -2.884 2.864 0.000 0.989
# Q-Q plots of the initial Gaussian scores against theoretical
# standard-normal quantiles (99 regularly spaced probabilities)
probas <- (1:99) / (1 + 99)
q2 <- VectorHelper_qnormVec(probas)

# Initial Gaussian variables
# First component
q1 <- VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
     main = "First Initial Variable")  # typo fix: "Intial" -> "Initial"
abline(a = 0, b = 1, col = "red", lty = 3)

# Second component
q1 <- VectorHelper_quantiles(data$getColumns("Y2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
     main = "Second Initial Variable")  # typo fix: "Intial" -> "Initial"
abline(a = 0, b = 1, col = "red", lty = 3)

# Cross plot of the two initial Gaussian scores
p <- ggplot()
p <- p + plot.correlation(data, name1 = "Y1", name2 = "Y2", bins = 100, flagDiag = TRUE)
p <- p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)
# Final Gaussian variables (PPMT outputs U-1 / U-2): same Q-Q diagnostics,
# reusing the theoretical quantiles q2 computed above
# First components
q1 = VectorHelper_quantiles(data$getColumns("U-1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "First Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Second components
q1 = VectorHelper_quantiles(data$getColumns("U-2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Cross plot: after PPMT the two components should look bi-Gaussian
p = ggplot()
p = p + plot.correlation(data, name1 = "U-1", name2 = "U-2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)
If the gaussianToRaw function is used, the Hermite method should be used. This function does not work yet, so the ad-hoc KNN function is used instead.
# Non-conditional simulation: ntest independent bi-Gaussian points
ntest <- 1000
Y <- matrix(rnorm(n = ntest*2), nrow = ntest, ncol = 2)
simu <- Db_createFromSamples(nech = ntest)
simu["Y1"] <- Y[,1]
simu["Y2"] <- Y[,2]
if (flag.knn){
  library(FNN)
  # Ad-hoc back-transform from Gaussian scores to raw values, by
  # inverse-squared-distance interpolation over the 1 + ndim nearest
  # neighbours among the fitted data pairs.
  #
  # this: list with X (Gaussian coordinates of the data, np x nd)
  #       and Z (raw values of the data, np x nd)
  # X:    matrix of Gaussian points to back-transform (m x nd)
  # Returns an m x nd matrix of interpolated raw values.
  PPMT.anam <- function(this, X){
    X <- as.matrix(X)
    np <- dim(X)[1]
    nd <- dim(X)[2]
    stopifnot(nd == dim(this$X)[2])
    res <- get.knnx(data = this$X, query = X, k = 1 + nd)
    Y <- matrix(NaN, nrow = np, ncol = nd)
    for (i in seq_len(np)) {
      idx <- res$nn.index[i, ]
      w <- res$nn.dist[i, ]
      if (any(w == 0)) {
        # Exact match(es): keep only the coincident data point(s).
        # Bug fix: these indicator weights must be normalized too,
        # otherwise duplicated data points would be summed instead
        # of averaged.
        w <- as.numeric(w == 0)
      } else {
        # Inverse squared distance weighting
        w <- 1 / w^2
      }
      w <- w / sum(w)
      Y[i, ] <- t(w) %*% this$Z[idx, ]
    }
    Y
  }
  # Anchor of the mapping: Gaussian scores (U-*) vs raw values (Z*)
  anam <- list(
    Z = as.matrix(cbind(data["Z1"], data["Z2"])),
    X = as.matrix(cbind(data["U-1"], data["U-2"]))
  )
  Z <- PPMT.anam(this = anam, X = Y)
  simu["Z.Y1"] <- Z[, 1]
  simu["Z.Y2"] <- Z[, 2]
  # Superposition of simulated values (black) and initial data (red)
  plot(Z[, 1], Z[, 2],
       main = "Simulated raw variable", xlab = "Z1", ylab = "Z2",
       pch = 19, cex = 0.5)
  points(anam$Z[, 1], anam$Z[, 2], pch = 19, cex = 0.5, col = "red")
  legend("topleft", legend = c("data", "simulated"), pch = 19, col = c("red", "black"))
} else {
  # Hermite-based back-transform provided by the PPMT object
  ppmt$gaussianToRaw(simu, names = c("Y1", "Y2"))
}
# Cross plot of the simulated independent Gaussian variables
ggPrint(
  ggplot() +
    plot.correlation(simu, name1 = "Y1", name2 = "Y2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Initial Gaussian variables (simulation)")
)
# Cross plot of the back-transformed (raw) simulated variables
ggPrint(
  ggplot() +
    plot.correlation(simu, name1 = "Z.Y1", name2 = "Z.Y2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Simulations of the raw variables")
)
Open questions: other indices?
Implementations of the back transform (Hermite expansion or nearest neighbors)?
Do we need to apply an initial normal score transform, sphering, and PPMT?