This notebook presents an example of the PPMT (Projection Pursuit Multivariate Transform) algorithm.
We create a reference data set (lognormal distribution) based on a model that we simulate using the Turning Bands algorithm.
# Reference grid: 100 x 100 cells of size 0.01 covering the unit square
grd <- DbGrid_create(x0 = c(0.0, 0.0), dx = c(0.01, 0.01), nx = c(100, 100))

# Simulate two independent Gaussian random fields on the grid
# (Turning Bands, one simulation each) and store them as Y1 and Y2
model1 <- Model_createFromParam(ECov_GAUSSIAN(), range = 0.2, sill = 1.0)
err <- simtub(NULL, dbout = grd, model = model1, nbsimu = 1)
err <- grd$setName("Simu", "Y1")

model2 <- Model_createFromParam(ECov_EXPONENTIAL(), range = 0.1, sill = 1.0)
err <- simtub(NULL, dbout = grd, model = model2, nbsimu = 1)
err <- grd$setName("Simu", "Y2")

# Non-linear combination of the two fields (quadratic in Y1, linear in Y2)
grd["Z"] <- grd["Y1"] * grd["Y1"] + 0.5 * grd["Y1"] + 0.2 * grd["Y2"]
Two transforms are applied to the simulated variables:
a normal score transform
a lognormal transform
# Normal-score (Gaussian anamorphosis) transform of Y1 and Z
grd["Y1.NS"] <- VectorHelper_normalScore(grd["Y1"])
grd["Z.NS"] <- VectorHelper_normalScore(grd["Z"])
# Drop the raw variables and keep only the two normal scores,
# renamed Y1 and Y2 for the rest of the notebook
err <- grd$deleteColumns(c("Y1","Y2","Z"))
err <- grd$setName("Y1.NS", "Y1")
err <- grd$setName("Z.NS", "Y2")
# Histograms of the two normal-scored variables on the grid
ggplot() + plot.hist(grd, name = "Y1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(grd, name = "Y2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Cross plot of Y1 vs Y2 (100 bins, with the first diagonal)
p = ggplot()
p = p + plot.correlation(grd, name1 = "Y1", name2 = "Y2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial Gaussian variables (grid)")
ggPrint(p)
# Maps of the two Gaussian variables on the grid
ggplot() + plot.grid(grd, "Y1")
ggplot() + plot.grid(grd, "Y2")
## Lognormal transform
# Z_i = m_i * exp(sigma_i * Y_i - sigma_i^2 / 2), so that E[Z_i] = m_i
# (the MEAN column of the statistics below confirms 1.0 and 0.5)
m_1 = 1.0; sigma_1 = 0.25
m_2 = 0.5; sigma_2 = 0.5
grd["Z1"] = m_1 * exp(sigma_1 * grd["Y1"] - 1/2 * sigma_1 * sigma_1 )
grd["Z2"] = m_2 * exp(sigma_2 * grd["Y2"] - 1/2 * sigma_2 * sigma_2 )
# Summary statistics of the lognormal (Z*) and Gaussian (Y*) variables
dum = grd$statistics(c("Z*","Y*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Z1 0.382 2.456 1.000 0.254
## Z2 0.069 2.833 0.500 0.266
## Y1 -3.719 3.719 0.000 0.999
## Y2 -3.719 3.719 0.000 0.999
# Histograms of the two lognormal variables on the grid
ggplot() + plot.hist(grd, name = "Z1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(grd, name = "Z2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Cross plot of Z1 vs Z2 (100 bins, with the first diagonal)
ggPrint(
  ggplot() +
    plot.correlation(grd, name1 = "Z1", name2 = "Z2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Initial lognormal variables (grid)")
)
# Maps of the two lognormal variables
ggplot() + plot.grid(grd, "Z1")
ggplot() + plot.grid(grd, "Z2")
# Extract a random subsample of nump points from the grid;
# these play the role of the "observed" data set
nump = 500
data = Db_createSamplingDb(grd, number=nump, names = c("x1", "x2", "Z1", "Z2"))
# Statistics of the sampled lognormal variables (close to the grid values)
dum = data$statistics(c("Z*"), opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Z1 0.431 1.902 0.991 0.255
## Z2 0.100 1.699 0.502 0.258
# Histograms and cross plot of the sampled lognormal variables
ggplot() + plot.hist(data, name = "Z1", color = "gray", fill = "orange")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() + plot.hist(data, name = "Z2", color = "gray", fill = "skyblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot()
p = p + plot.correlation(data, name1 = "Z1", name2 = "Z2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Initial lognormal variables (data)")
ggPrint(p)
# Maps: grid values as background, sample points colored by their value
p = ggplot()
p = p + plot.grid(grd,"Z1")
p = p + plot.point(data, name_color = "Z1")
p = p + plot.decoration(title = "Initial lognormal variable - Z1")
ggPrint(p)
p = ggplot()
p = p + plot.grid(grd,"Z2")
p = p + plot.point(data, name_color = "Z2")
p = p + plot.decoration(title = "Initial lognormal variable - Z2")
ggPrint(p)
# PPMT settings: number of projection directions and fitting iterations
ndir <- 100
niter <- 100
# Use the ad-hoc KNN back-transform rather than the Hermite method
flag.knn <- TRUE

# Normal-score transform of the sampled data, stored as Y1 / Y2
data["Z1.NS"] <- VectorHelper_normalScore(data["Z1"])
data["Z2.NS"] <- VectorHelper_normalScore(data["Z2"])
err <- data$setName("Z1.NS", "Y1")
err <- data$setName("Z2.NS", "Y2")
# Launching PPMT and fitting it on the vector of Input data
# NOTE(review): the object built in this if/else is dead code — it is
# overwritten by the PPMT_create() call below, so the "vdc" / "hmt"
# configuration chosen here is never used. Confirm which construction
# is actually intended.
if(flag.knn){
ppmt = PPMT_create(ndir=ndir, FALSE, EDirGen_fromKey("vdc"))
} else {
ppmt = PPMT_create(ndir=ndir, FALSE, EDirGen_fromKey("vdc"),
EGaussInv_fromKey("hmt"))
}
# Input matrix: the two normal-scored variables of the data set
Xmat <- data$getColumnsAsMatrix(names = c("Y1", "Y2"))
# Effective PPMT object (replaces the one created above)
ppmt = PPMT_create(ndir=ndir, alpha=2.)
# Fit the transform; Xmat ends up holding the Gaussianized values
# (they are added to the Db as U-* columns further below)
err = ppmt$fitFromMatrix(Xmat, niter, FALSE)
# Convergence diagnostic: log of the Wasserstein-distance index vs. iteration
plot(x = 1:niter, y = ppmt$getSerieScore(TRUE), type = "l", col = "black", lty = 1,
xlab = "Iterations", ylab = "Logarithm of index", main = "Wasserstein distance")
Results
# Adding the PPMT results (transformed matrix) to the data base as U-1 / U-2
iuid = data$addColumns(Xmat$getValues(), radix = "U")
# Both the initial normal scores (Y*) and the PPMT outputs (U-*)
# should be standard Gaussian marginally
dum = data$statistics(c("Y*","U-*"),opers=EStatOption_fromKeys(c("MINI","MAXI","MEAN","STDV")))
## MINI MAXI MEAN STDV
## Y1 -2.879 2.879 0.000 0.989
## Y2 -2.879 2.879 0.000 0.989
## U-1 -2.833 2.941 0.000 0.989
## U-2 -2.884 2.864 0.000 0.989
# Q-Q plots of the initial Gaussian scores against theoretical
# standard-normal quantiles (99 regularly spaced probabilities)
probas <- (1:99) / (1 + 99)
q2 <- VectorHelper_qnormVec(probas)

# Initial Gaussian variables
# First component
q1 <- VectorHelper_quantiles(data$getColumns("Y1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
     main = "First Initial Variable")  # typo fix: "Intial" -> "Initial"
abline(a = 0, b = 1, col = "red", lty = 3)

# Second component
q1 <- VectorHelper_quantiles(data$getColumns("Y2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
     main = "Second Initial Variable")  # typo fix: "Intial" -> "Initial"
abline(a = 0, b = 1, col = "red", lty = 3)

# Cross plot of the two initial Gaussian scores
p <- ggplot()
p <- p + plot.correlation(data, name1 = "Y1", name2 = "Y2", bins = 100, flagDiag = TRUE)
p <- p + plot.decoration(title = "Initial Gaussian variables (data)")
ggPrint(p)
# Final Gaussian variables (PPMT outputs U-1 / U-2): same Q-Q diagnostics,
# reusing the theoretical quantiles q2 computed above
# First components
q1 = VectorHelper_quantiles(data$getColumns("U-1"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "First Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Second components
q1 = VectorHelper_quantiles(data$getColumns("U-2"), probas)
plot(q1, q2, xlab = "Experimental quantiles", ylab = "Theoretical quantiles",
main = "Second Final Variable")
abline(a = 0, b = 1, col = "red", lty = 3)
# Cross plot: after PPMT the two components should look bi-Gaussian
p = ggplot()
p = p + plot.correlation(data, name1 = "U-1", name2 = "U-2", bins=100, flagDiag = TRUE)
p = p + plot.decoration(title = "Final Gaussian variables (data)")
ggPrint(p)
If the gaussianToRaw function is used, the Hermite method should be used. This function does not work yet, so the ad-hoc KNN function is used instead.
# Non-conditional simulation: ntest independent bi-Gaussian points
ntest <- 1000
Y <- matrix(rnorm(n = ntest*2), nrow = ntest, ncol = 2)
simu <- Db_createFromSamples(nech = ntest)
simu["Y1"] <- Y[,1]
simu["Y2"] <- Y[,2]
if (flag.knn){
  library(FNN)
  # Ad-hoc back-transform from Gaussian scores to raw values, by
  # inverse-squared-distance interpolation over the 1 + ndim nearest
  # neighbours among the fitted data pairs.
  #
  # this: list with X (Gaussian coordinates of the data, np x nd)
  #       and Z (raw values of the data, np x nd)
  # X:    matrix of Gaussian points to back-transform (m x nd)
  # Returns an m x nd matrix of interpolated raw values.
  PPMT.anam <- function(this, X){
    X <- as.matrix(X)
    np <- dim(X)[1]
    nd <- dim(X)[2]
    stopifnot(nd == dim(this$X)[2])
    res <- get.knnx(data = this$X, query = X, k = 1 + nd)
    Y <- matrix(NaN, nrow = np, ncol = nd)
    for (i in seq_len(np)) {
      idx <- res$nn.index[i, ]
      w <- res$nn.dist[i, ]
      if (any(w == 0)) {
        # Exact match(es): keep only the coincident data point(s).
        # Bug fix: these indicator weights must be normalized too,
        # otherwise duplicated data points would be summed instead
        # of averaged.
        w <- as.numeric(w == 0)
      } else {
        # Inverse squared distance weighting
        w <- 1 / w^2
      }
      w <- w / sum(w)
      Y[i, ] <- t(w) %*% this$Z[idx, ]
    }
    Y
  }
  # Anchor of the mapping: Gaussian scores (U-*) vs raw values (Z*)
  anam <- list(
    Z = as.matrix(cbind(data["Z1"], data["Z2"])),
    X = as.matrix(cbind(data["U-1"], data["U-2"]))
  )
  Z <- PPMT.anam(this = anam, X = Y)
  simu["Z.Y1"] <- Z[, 1]
  simu["Z.Y2"] <- Z[, 2]
  # Superposition of simulated values (black) and initial data (red)
  plot(Z[, 1], Z[, 2],
       main = "Simulated raw variable", xlab = "Z1", ylab = "Z2",
       pch = 19, cex = 0.5)
  points(anam$Z[, 1], anam$Z[, 2], pch = 19, cex = 0.5, col = "red")
  legend("topleft", legend = c("data", "simulated"), pch = 19, col = c("red", "black"))
} else {
  # Hermite-based back-transform provided by the PPMT object
  ppmt$gaussianToRaw(simu, names = c("Y1", "Y2"))
}
# Cross plot of the simulated independent Gaussian variables
ggPrint(
  ggplot() +
    plot.correlation(simu, name1 = "Y1", name2 = "Y2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Initial Gaussian variables (simulation)")
)
# Cross plot of the back-transformed (raw) simulated variables
ggPrint(
  ggplot() +
    plot.correlation(simu, name1 = "Z.Y1", name2 = "Z.Y2", bins = 100, flagDiag = TRUE) +
    plot.decoration(title = "Simulations of the raw variables")
)
Open questions: other indices?
Implementations of the back transform (Hermite expansion or nearest neighbors)?
Do we need to apply an initial normal score transform, sphering, and PPMT?