Initialisation

Note: the Generalized Eigen problem is solved using the function geigen in the package geigen.

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(align="center")
rm(list=ls())
library(gstlearn)

## Loading required package: ggplot2

## Loading required package: ggpubr

## Loading required package: ggnewscale

## 
## Attaching package: 'gstlearn'

## The following object is masked from 'package:base':
## 
##     message

library(geigen)
rm(list=ls())

Data set

Simulation of correlated fields on a grid and extraction of scattered points.

# grid of samples
nx_S = c(100,100)
dx_S = c(0.01, 0.01)
grid = DbGrid_create(nx = nx_S, dx = dx_S)
grid$display()

## 
## Data Base Grid Characteristics
## ==============================
## 
## Data Base Summary
## -----------------
## File is organized as a regular grid
## Space dimension              = 2
## Number of Columns            = 3
## Total number of samples      = 10000
## 
## Grid characteristics:
## ---------------------
## Origin :       0.000      0.000
## Mesh   :       0.010      0.010
## Number :         100        100
## 
## Variables
## ---------
## Column = 0 - Name = rank - Locator = NA
## Column = 1 - Name = x1 - Locator = x1
## Column = 2 - Name = x2 - Locator = x2

## NULL

np = grid$getNSample()
nv = 3 # Number of variables
m_Z = c(1, 2, 3)
S_Z = c(0.25, 3.0, 1.5)

LM_1 = matrix(c(1.0, 0.5, 0.2, 0.0, sqrt(1-0.5^2), 0.3, 0.0, 0.0, sqrt(1 - 0.2^2 - 0.3^2)), 
              nrow = nv, ncol = nv, byrow = FALSE)
LM_2 = matrix(c(1.0, 0.6, 0.5, 0.0, sqrt(1-0.6^2), 0.3, 0.0, 0.0, sqrt(1 - 0.5^2 - 0.3^2)), 
              nrow = nv, ncol = nv, byrow = FALSE)
LM_3 = matrix(c(1.0, 0.1, 0.2, 0.0, sqrt(1-0.1^2), 0.3, 0.0, 0.0, sqrt(1 - 0.2^2 - 0.3^2)), 
              nrow = nv, ncol = nv, byrow = FALSE)

# Simulation of the Gaussian factors for structure #1
m1 = Model_createFromParam(ECov_NUGGET(), sill= 1.0)
err = simtub(NULL, grid, m1, nbsimu = nv, namconv=NamingConvention("U1"))
U1 = matrix(grid$getColumns(names = "U1.*"), nrow = grid$getNSample(), ncol = nv)
# Simulation of the Gaussian factors for structure #2
m2 = Model_createFromParam(ECov_EXPONENTIAL(), range=0.1, sill=1.)
err = simtub(NULL, grid, m2, nbsimu = nv, namconv=NamingConvention("U2"))
U2 = matrix(grid$getColumns(names = "U2.*"), nrow = grid$getNSample(), ncol = nv)
# Simulation of the Gaussian factors for structure #2
m3 = Model_createFromParam(ECov_CUBIC(), range=0.25, sill=1.)
err = simtub(NULL, grid, m3, nbsimu = nv, namconv=NamingConvention("U3"))
U3 = matrix(grid$getColumns(names = "U3.*"), nrow = grid$getNSample(), ncol = nv)

# Correlated variables
Z = outer(X = rep(1, np), Y = m_Z, FUN = "*") + 
  (U1 %*% t(LM_1) + U2 %*% t(LM_2) + U3 %*% t(LM_3)) %*% diag(S_Z)

grid$setColumn(Z[,1], name = "Z1")

## NULL

grid$setColumn(Z[,2], name = "Z2")

## NULL

grid$setColumn(Z[,3], name = "Z3")

## NULL

for (i in 1:nv) {
  nm_var = paste0("Z", i)
  p = ggplot(asp=1) + 
    plot.raster(grid, nm_var) + 
    plot.decoration(xlab = "Easting", ylab = "Northing", title = nm_var)
  plot.end(p)
}

## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic argument:
## • asp = 1
## ℹ Did you misspell an argument name?

## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic argument:
## • asp = 1
## ℹ Did you misspell an argument name?

## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic argument:
## • asp = 1
## ℹ Did you misspell an argument name?

# Data extraction
np = 500
data = Db_createSamplingDb(grid, number=np, names=c("x1", "x2", "Z1", "Z2", "Z3"))
data$setLocators("Z*", ELoc_Z())

## NULL

data$display()

## 
## Data Base Characteristics
## =========================
## 
## Data Base Summary
## -----------------
## File is organized as a set of isolated points
## Space dimension              = 2
## Number of Columns            = 6
## Total number of samples      = 500
## 
## Variables
## ---------
## Column = 0 - Name = rank - Locator = NA
## Column = 1 - Name = x1 - Locator = x1
## Column = 2 - Name = x2 - Locator = x2
## Column = 3 - Name = Z1 - Locator = z1
## Column = 4 - Name = Z2 - Locator = z2
## Column = 5 - Name = Z3 - Locator = z3

## NULL

Statistics and variography on the data set

# Statistics (vector of means and covariance matrix)
data_Z = matrix(data$getColumns(names = "Z*"), nrow = data$getNSample(), ncol = nv)
mZ = apply(X = data_Z, 2, mean)
varZ = var(data_Z)

# Computing the experimental variogram
nlag = 10
dlag  = 0.025
varioparam = VarioParam_createOmniDirection(nlag=nlag, dlag=dlag)
vario_raw  = Vario_computeFromDb(varioparam, db=data)

# Fitting the variogram model on the experimental variogram
model_raw = Model_create()
err = model_raw$fit(vario_raw, 
                    types = ECov_fromKeys(c("NUGGET", "EXPONENTIAL", "CUBIC"))
                    )
model_raw$display()

## 
## Model characteristics
## =====================
## Space dimension              = 2
## Number of variable(s)        = 3
## Number of basic structure(s) = 3
## Number of drift function(s)  = 0
## Number of drift equation(s)  = 0
## 
## Covariance Part
## ---------------
## Nugget Effect
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.036     -0.086      0.039
##       [  1,]     -0.086      3.002     -1.581
##       [  2,]      0.039     -1.581      0.833
## Exponential
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.089      0.971      0.209
##       [  1,]      0.971     14.298      5.518
##       [  2,]      0.209      5.518      3.322
## - Range        =       0.050
## - Theo. Range  =       0.017
## Cubic
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.101      0.182      0.092
##       [  1,]      0.182      8.213      1.105
##       [  2,]      0.092      1.105      2.435
## - Range        =       0.231
## Total Sill
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.226      1.068      0.340
##       [  1,]      1.068     25.513      5.043
##       [  2,]      0.340      5.043      6.590
## 
## Known Mean(s)      0.000      0.000      0.000

## NULL

multi.varmod(vario_raw, model_raw)

# Variogram matrix for lag 'ilag'
ilag = 3
Gamma_h = matrix(NaN, nrow = nv, ncol = nv)
for (ivar in 1:nv) {
  for (jvar in 1:nv) {
    Gamma_h[ivar, jvar] = vario_raw$getGgVec(idir = 0,ivar-1,jvar-1)[ilag]
  }
}

Principal Component Analysis

The vectors \(\Phi\) are solution of the Eigen problem:

\[ \Sigma_0 \Phi = \Phi \Lambda \] where \(\Sigma_0\) is the covariance matrix and \(\Lambda\) is the diagonal matrix of the Eigen values.

The linear transform converts the centered data \({ \bf Z}\) into the orthogonal and normalized principal components,
\({ \bf Y = (Z - m) \times M_{Z \rightarrow PCA}}\). The back transform is defined by \({ \bf Z = m + Y \times M_{PCA \rightarrow Z}}\).

The transform matrices are:

\({ \bf M_{Z \rightarrow PCA} = \Phi \Lambda^{-1/2}}\)
\({ \bf M_{PCA \rightarrow Z} = \Lambda^{1/2} \Phi^{T}}\)

Defining the covariance matrix of the raw data as \({\bf \Sigma_0 = (Z - m)^{T} (Z - m) / np}\), the covariance matrix of the principal components is

\[ {\bf Cov(Y) = Y^{T} \, Y / np = M_{Z \rightarrow PCA}^T [(Z-m)^T (Z-m) / np] M_{Z \rightarrow PCA} = M_{Z \rightarrow PCA}^T \Sigma_0 M_{Z \rightarrow PCA} = \Lambda^{-1/2} \Phi^T \Sigma_0 \Phi \Lambda^{-1/2} = I } \] Hence, the principal components are orthogonal, normalized, and centered.

data$deleteColumns("U*")

## NULL

# linear transform
res = eigen(varZ)
M_Z2Y = res$vectors %*% diag(1/sqrt(res$values))
M_Y2Z = diag(sqrt(res$values)) %*% t(res$vectors)
round(M_Y2Z %*% M_Z2Y, 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

# Z -> Y
data_PCA = (data_Z - outer(X = rep(1.0, np), Y = mZ)) %*% M_Z2Y
round(var(data_PCA), 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

round(apply(data_PCA, 2, mean), 8)

## [1] 0 0 0

# factors are centered, normalized, and without correlation (for h = 0)

# Y -> Z
ZZ = outer(X = rep(1.0, np), Y = mZ) + data_PCA %*% M_Y2Z

# Back transform must give the initial values back
round(range(abs(data_Z - ZZ)),8)

## [1] 0 0

# adding the transform to the data base
data$setColumn(tab = data_PCA[, 1], name = "U1")

## NULL

data$setColumn(tab = data_PCA[, 2], name = "U2")

## NULL

data$setColumn(tab = data_PCA[, 3], name = "U3")

## NULL

data$setLocators("U*", ELoc_Z())

## NULL

# Fitting the variogram model on the experimental variogram
vario_PCA  = Vario_computeFromDb(varioparam, db=data)
model_PCA = Model_create()
err = model_PCA$fit(vario_PCA, 
                    types = ECov_fromKeys(c("NUGGET", "EXPONENTIAL", "CUBIC"))
                    )
model_PCA$display()

## 
## Model characteristics
## =====================
## Space dimension              = 2
## Number of variable(s)        = 3
## Number of basic structure(s) = 3
## Number of drift function(s)  = 0
## Number of drift equation(s)  = 0
## 
## Covariance Part
## ---------------
## Nugget Effect
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.276      0.054      0.049
##       [  1,]      0.054      0.330     -0.014
##       [  2,]      0.049     -0.014      0.331
## Exponential
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.701     -0.100     -0.204
##       [  1,]     -0.100      0.510      0.083
##       [  2,]     -0.204      0.083      0.226
## - Range        =       0.144
## - Theo. Range  =       0.048
## Cubic
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.056      0.044      0.149
##       [  1,]      0.044      0.209     -0.015
##       [  2,]      0.149     -0.015      0.501
## - Range        =       0.247
## Total Sill
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      1.032     -0.002     -0.006
##       [  1,]     -0.002      1.049      0.054
##       [  2,]     -0.006      0.054      1.059
## 
## Known Mean(s)      0.000      0.000      0.000

## NULL

multi.varmod(vario = vario_PCA, model = model_PCA)

Min/Max Autocorelation factors

The vectors \(\Psi\) are solution of the Generalized Eigen problem:

\[ \Gamma_{\Delta} \Psi = \Sigma_0 \Psi \Lambda \] where \(\Gamma_{\Delta}\) is the variogram matrix at lag \(\Delta\), \(\Sigma_0\) the covariance matrix, and \(\lambda\) is the diagonal matrix of the eigen values. In addition, the solution verifies \({ \bf \Psi^{-1} \Sigma_0 \Psi = I}\).

The linear transform converts the centered data \({ \bf Z}\) into the orthogonal and normalized Min/Max autocorrelation factors,
\({ \bf F = (Z - m) \times M_{Z \rightarrow F}}\). The back transform is defined by \({ \bf Z = m + F \times M_{F \rightarrow Z}}\).

The transform matrices are:

\({ \bf M_{Z \rightarrow F} = \Psi}\)
\({ \bf M_{F \rightarrow Z} = \Psi^{-1}}\)

The covariance matrix of the MAFs is

\[ {\bf Cov(F) = F^{T} \, F / np = M_{Z \rightarrow F}^T [(Z-m)^T (Z-m) / np] M_{Z \rightarrow F} = \Psi^T \Sigma_0 \Psi = I } \] Hence, the factors are orthogonal, normalized, and centered.

data$deleteColumns("F*")

## NULL

# Linear transform MAF
res_maf = geigen(A = Gamma_h, B = varZ)
M_Z2F = res_maf$vectors
M_F2Z = solve(M_Z2F)
round(M_F2Z %*% M_Z2F, 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

# Z -> MAF
data_MAF = (data_Z - outer(X = rep(1.0, np), Y = mZ)) %*% M_Z2F
round(var(data_MAF), 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

round(apply(data_MAF, 2, mean), 8)

## [1] 0 0 0

# factors are centered, normalized, and without correlation (for h = 0)

# Y -> Z
ZZ = outer(X = rep(1.0, np), Y = mZ) + data_MAF %*% M_F2Z

# Back transform must give the initial values back
round(range(abs(data_Z - ZZ)),8)

## [1] 0 0

# adding the transform to the data base
data$setColumn(tab = data_MAF[, 1], name = "F1")

## NULL

data$setColumn(tab = data_MAF[, 2], name = "F2")

## NULL

data$setColumn(tab = data_MAF[, 3], name = "F3")

## NULL

data$setLocators("F*", ELoc_Z())

## NULL

# Fitting the variogram model on the experimental variogram
vario_MAF  = Vario_computeFromDb(varioparam, db=data)
model_MAF = Model_create()
err = model_MAF$fit(vario_MAF, 
                    types = ECov_fromKeys(c("NUGGET", "EXPONENTIAL", "SPHERICAL"))
                    )
model_MAF$display()

## 
## Model characteristics
## =====================
## Space dimension              = 2
## Number of variable(s)        = 3
## Number of basic structure(s) = 3
## Number of drift function(s)  = 0
## Number of drift equation(s)  = 0
## 
## Covariance Part
## ---------------
## Nugget Effect
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.233      0.005      0.002
##       [  1,]      0.005      0.407      0.012
##       [  2,]      0.002      0.012      0.000
## Exponential
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.074      0.058      0.012
##       [  1,]      0.058      0.045      0.013
##       [  2,]      0.012      0.013      0.602
## - Range        =       0.042
## - Theo. Range  =       0.014
## Spherical
## - Sill matrix:
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.686     -0.093      0.013
##       [  1,]     -0.093      0.602     -0.049
##       [  2,]      0.013     -0.049      0.437
## - Range        =       0.180
## Total Sill
##                  [,  0]     [,  1]     [,  2]
##       [  0,]      0.994     -0.031      0.028
##       [  1,]     -0.031      1.054     -0.024
##       [  2,]      0.028     -0.024      1.040
## 
## Known Mean(s)      0.000      0.000      0.000

## NULL

multi.varmod(vario_MAF, model_MAF)

Linear transform in gstlearn

We now compare with the linear transforms implemented in gstlearn.

PCA

change_sign <- function(data, namex, namey)
{
  v1 = data[namex]
  v2 = data[namey]
  if (sum(abs(v1 + v2) < sum(abs(v1 - v2))))
    data[namey] = -data[namey]
}

err = data$setLocators("Z*", ELoc_Z())
pca = PCA(nvar = nv)
err = pca$pca_compute(db = data, verbose = FALSE)
err = pca$dbZ2F(data, verbose = FALSE, namconv = NamingConvention(prefix = "PCA", FALSE))

# the eigen values are identical
round(range(abs(abs(pca$getEigVals()) - abs(res$values))),8)

## [1] 0 0

# getting the computed factors
data_gsPCA = matrix(data$getColumns(names = "PCA.*"), 
                    nrow = data$getNSample(), ncol = nv)

# Statistics (mean and variance) compared to the values of eigen
round(apply(X = data_gsPCA, MARGIN = 2, mean), 8)

## [1] 0 0 0

round(t(data_gsPCA) %*% data_gsPCA / (np-1), 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

# The factors are centered, normalized, and with a null correlation for h = 0

for (i in 1:nv) {
  titre = paste0("PCA #", i)
  change_sign(data, paste0("PCA.",i), paste0("U",i))
  p = plot.init() +
    plot.correlation(data, 
                   namex = paste0("U",i), 
                   namey = paste0("PCA.", i), 
                   asPoint = TRUE, col = "red", 
                   flagDiag = TRUE) +   
    plot.decoration(xlab = "Eigen problem", ylab = "gstlearn", title = titre)
  plot.end(p)
}

# Back transform
err = data$setLocators("PCA*", ELoc_Z())
err = pca$dbF2Z(data, verbose = FALSE, namconv = NamingConvention(prefix = "ZZ.PCA", FALSE))
for (i in 1:nv) {
  titre = paste0("Z", i)
  p = plot.init() +
    plot.correlation(data, 
                   namex = paste0("Z",i), 
                   namey = paste0("ZZ.PCA.", i), 
                   asPoint = TRUE, col = "red", 
                   flagDiag = TRUE) +   
    plot.decoration(xlab = "Initial values", ylab = "Back transformed values", title = titre)
  plot.end(p)
}

MAF

err = data$setLocators(c("Z1", "Z2", "Z3"), ELoc_Z(), cleanSameLocator = TRUE)
maf = PCA(nvar = nv)
err = maf$maf_compute(db = data, varioparam = varioparam, ilag0 = ilag-1, verbose = FALSE)
err = maf$dbZ2F(data, verbose = FALSE, namconv = NamingConvention(prefix = "MAF", FALSE))

# Check the ordering of the Generalized Eigen values
reorder = (abs(maf$getEigVal(0)) - abs(res_maf$values[1])) > 0.001

# The eigen values are identical (they are between 0 and 1 as 1-lambda is a correlation

sequence = seq(1, nv)
if (reorder) 
  sequence = seq(nv, 1)
round(range(abs(abs(maf$getEigVals()) - abs(res_maf$values[sequence]))),8)

## [1] 0 0

# computing the factors
data_gsMAF = matrix(data$getColumns(names = "MAF.*"), 
                    nrow = data$getNSample(), ncol = nv)
# Statistics (mean and variance)
round(apply(X = data_gsMAF, MARGIN = 2, mean), 8)

## [1] 0 0 0

round(t(data_gsMAF) %*% data_gsMAF / (np -1), 8)

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

for (i in 1:nv) {
  j = sequence[i]
  titre = paste0("MAF #", i)
  change_sign(data, paste0("MAF.",i), paste0("F",j))
  p = plot.init() +
    plot.correlation(data, 
                   namex = paste0("F",j), 
                   namey = paste0("MAF.", i), 
                   asPoint = TRUE, col = "red", 
                   flagDiag = TRUE) +   
    plot.decoration(xlab = "Eigen problem", ylab = "gstlearn", title = titre)
  plot.end(p)
}

# back transform
err = data$setLocators(paste0("MAF.", 1:nv), ELoc_Z(), cleanSameLocator = TRUE)
err = maf$dbF2Z(data, verbose = FALSE, namconv = NamingConvention(prefix = "ZZ.MAF", FALSE))

for (i in 1:nv) {
  titre = paste0("Z", i)
  p = plot.init() +
    plot.correlation(data, 
                   namex = paste0("Z",i), 
                   namey = paste0("ZZ.MAF.", i), 
                   asPoint = TRUE, col = "red", 
                   flagDiag = TRUE) +   
    plot.decoration(xlab = "Initial values", ylab = "Back transformed values", title = titre)
  plot.end(p)
}

Test of PCA and MAF in gstlearn

D. Renard, N. Desassis, X. Freulon

May 31, 2023