Loading libraries
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library("gstlearn")
##
## Attaching package: 'gstlearn'
## The following objects are masked from 'package:base':
##
## message, toString
library("ggplot2")
library(ggpubr)
OptCst_defineByKey("ASP",0)
## NULL
set.seed(43243)
opers = EStatOption_fromKeys(c("NUM","MINI","MAXI","MEAN","STDV"))
This script demonstrates the capabilities of the cross-validation feature in gstlearn. A fictitious data set is generated by sampling a given simulation; it is then used to demonstrate the cross-validation tool.
A data set is first generated, composed of a few samples scattered randomly within a 100 by 100 square. The number of samples is set to a small value so as to keep the results legible. It could be raised (say, to 100) to obtain better results, but very large values should be avoided since the test is performed with a unique neighborhood, whose kriging system involves all samples at once. If the number of samples is raised, the variable turnPrintON below may be switched off to avoid generating tedious printouts.
Generating the data set
nech = 10
turnPrintON = TRUE
data = Db_createFromBox(nech, # number of samples
coormin = c(0,0),
coormax = c(100,100))
# Plotting the sampling locations generated
ggplot()+
plot.point(db = data,
color = "red",
size = 0.5) +
plot.decoration(title = "Sampling locations")
A spherical model is defined with a universality condition (drift of order 0), and a non-conditional simulation is performed at the sampling locations. The generated values, renamed as "data", will from now on constitute the data set.
model = Model_createFromParam(type = ECov_SPHERICAL(), range = 30, sill = 4)
model$setDriftIRF(0)
## NULL
err = simtub(NULL,
data,
model = model)
data$setName("Simu","data")
## NULL
data
##
## Data Base Characteristics
## =========================
##
## Data Base Summary
## -----------------
## File is organized as a set of isolated points
## Space dimension = 2
## Number of Columns = 4
## Total number of samples = 10
##
## Variables
## ---------
## Column = 0 - Name = rank - Locator = NA
## Column = 1 - Name = x-1 - Locator = x1
## Column = 2 - Name = x-2 - Locator = x2
## Column = 3 - Name = data - Locator = z1
ggplot()+
plot.point(data, nameSize="data",
color = "red")+
plot.decoration(title = "Measurement values")
Now cross-validation is performed. This requires the definition of a neighborhood, chosen to be unique due to the small number of samples. If necessary, a moving neighborhood could be used.
neigh = NeighUnique()
err = xvalid(db = data,
model = model,
neigh = neigh,
flag_xvalid_est=1,
flag_xvalid_std=1,
namconv=NamingConvention("Xvalid",TRUE,TRUE,FALSE))
The cross-validation feature offers several types of outputs, according to the flags:

* flag_xvalid_est indicates whether the function must return the estimation error Z*-Z (flag_xvalid_est=1) or the estimate Z* itself (flag_xvalid_est=-1)
* flag_xvalid_std indicates whether the function must return the normalized error (Z*-Z)/S (flag_xvalid_std=1) or the standard deviation S (flag_xvalid_std=-1)
For a complete demonstration, all options are used. Note the use of NamingConvention which explicitly maintains the Z locator on the input variable (= "data").
The cross-validation is performed again, but the storing option is changed, as well as the radix given to the output variables.
err = xvalid(db = data,
model = model,
neigh = neigh,
flag_xvalid_est=-1,
flag_xvalid_std=-1,
namconv=NamingConvention("Xvalid2",TRUE, TRUE, FALSE))
data
##
## Data Base Characteristics
## =========================
##
## Data Base Summary
## -----------------
## File is organized as a set of isolated points
## Space dimension = 2
## Number of Columns = 8
## Total number of samples = 10
##
## Variables
## ---------
## Column = 0 - Name = rank - Locator = NA
## Column = 1 - Name = x-1 - Locator = x1
## Column = 2 - Name = x-2 - Locator = x2
## Column = 3 - Name = data - Locator = z1
## Column = 4 - Name = Xvalid.data.esterr - Locator = NA
## Column = 5 - Name = Xvalid.data.stderr - Locator = NA
## Column = 6 - Name = Xvalid2.data.estim - Locator = NA
## Column = 7 - Name = Xvalid2.data.stdev - Locator = NA
Checking the results gathered on the first sample
data[1,0:8]
## [1] 1.000000 22.701345 83.641175 2.501800 -1.951677 -1.094985 0.550123
## [8] 1.782378
The printed values correspond to the following information:

* the sample rank: 1
* the sample X coordinate: 22.7
* the sample Y coordinate: 83.6
* the data value Z: 2.5
* the cross-validation error Z*-Z: -1.952
* the standardized cross-validation error (Z*-Z)/S: -1.095
* the cross-validation estimate Z*: 0.550
* the standard deviation of the cross-validation error S: 1.782
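These figures can be cross-checked against one another. By construction, the estimate equals the data value plus the estimation error (Z* = Z + (Z*-Z)), and the standard deviation is recovered by dividing the estimation error by the standardized error (S = (Z*-Z) / ((Z*-Z)/S)). The following minimal sketch verifies these identities on the first sample, assuming the bracket accessor used elsewhere in this script also reads single columns by name.
z      = data["data"][1]
esterr = data["Xvalid.data.esterr"][1]
stderr = data["Xvalid.data.stderr"][1]
z + esterr        # should reproduce Xvalid2.data.estim = 0.550
esterr / stderr   # should reproduce Xvalid2.data.stdev = 1.782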
These results can also be double-checked by requesting a full dump of all the information when processing the first sample. The next chunk doesn't store any result: it simply produces an output on the terminal to better understand the process.
if (turnPrintON){
OptDbg_setReference(1)
err = xvalid(data,
model = model,
neigh = neigh,
flag_xvalid_est=1,
flag_xvalid_std=1,
namconv=NamingConvention("Xvalid3",TRUE, TRUE, FALSE))
OptDbg_setReference(0)
}
##
## Target location
## ---------------
## Sample #1 (from 10)
## Coordinate #1 = 22.701345
## Coordinate #2 = 83.641175
##
## Data selected in neighborhood
## -----------------------------
## Rank Sample x1 x2
## 1 1 22.701 83.641
## 2 2 82.323 43.955
## 3 3 15.270 3.385
## 4 4 55.428 19.935
## 5 5 93.142 79.864
## 6 6 85.694 97.845
## 7 7 73.690 37.455
## 8 8 32.792 43.164
## 9 9 32.178 78.710
## 10 10 64.591 82.042
##
## LHS of Kriging matrix (compressed)
## ==================================
## Number of active samples = 10
## Total number of equations = 11
## Reduced number of equations = 11
##
## Rank 1 2 3 4 5
## Flag 1 2 3 4 5
## 1 1 4.000 0.000 0.000 0.000 0.000
## 2 2 0.000 4.000 0.000 0.000 0.000
## 3 3 0.000 0.000 4.000 0.000 0.000
## 4 4 0.000 0.000 0.000 4.000 0.000
## 5 5 0.000 0.000 0.000 0.000 4.000
## 6 6 0.000 0.000 0.000 0.000 0.654
## 7 7 0.000 1.932 0.000 0.139 0.000
## 8 8 0.000 0.000 0.000 0.000 0.000
## 9 9 1.954 0.000 0.000 0.000 0.000
## 10 10 0.000 0.000 0.000 0.000 0.012
## 11 11 1.000 1.000 1.000 1.000 1.000
##
## Rank 6 7 8 9 10
## Flag 6 7 8 9 10
## 1 1 0.000 0.000 0.000 1.954 0.000
## 2 2 0.000 1.932 0.000 0.000 0.000
## 3 3 0.000 0.000 0.000 0.000 0.000
## 4 4 0.000 0.139 0.000 0.000 0.000
## 5 5 0.654 0.000 0.000 0.000 0.012
## 6 6 4.000 0.000 0.000 0.000 0.085
## 7 7 0.000 4.000 0.000 0.000 0.000
## 8 8 0.000 0.000 4.000 0.000 0.000
## 9 9 0.000 0.000 0.000 4.000 0.000
## 10 10 0.085 0.000 0.000 0.000 4.000
## 11 11 1.000 1.000 1.000 1.000 1.000
##
## Rank 11
## Flag 11
## 1 1 1.000
## 2 2 1.000
## 3 3 1.000
## 4 4 1.000
## 5 5 1.000
## 6 6 1.000
## 7 7 1.000
## 8 8 1.000
## 9 9 1.000
## 10 10 1.000
## 11 11 0.000
##
## Cross-validation results
## ========================
## Target Sample = 1
## Variable Z1
## - True value = 2.502
## - Estimated value = 0.550
## - Estimation Error = -1.952
## - Std. deviation = 1.782
## - Normalized Error = -1.095
## NULL
It appears that:

* the cross-validation system is dimensioned to the whole data set (here 11 equations: 10 samples plus one drift condition). Indeed, for a unique neighborhood, the kriging system is established once with all the data; the system obtained by suppressing each datum in turn is then derived using the Schur complement.
* the printout recalls the measured value (2.502), the estimated value (0.550), the standard deviation of the estimation error (1.782) and the cross-validation estimation error (-1.952).
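To illustrate this shortcut, here is a minimal, self-contained sketch for the simpler case of simple kriging without drift (an assumption made purely to keep the algebra short; gstlearn itself also handles the drift term). With Q = C^-1 the inverse of the full covariance matrix C, the classical identity states that the leave-one-out error at sample i equals (Qz)_i / Q_ii and the leave-one-out variance equals 1 / Q_ii, so a single inversion yields all cross-validation results at once. All names below are local to this illustration.
# spherical covariance, matching the model used above
sph <- function(h, range = 30, sill = 4)
  ifelse(h < range, sill * (1 - 1.5 * h / range + 0.5 * (h / range)^3), 0)
set.seed(43243)
n  <- 10
xy <- cbind(runif(n, 0, 100), runif(n, 0, 100))            # sample locations
C  <- sph(as.matrix(dist(xy)))                             # full covariance matrix
z  <- as.vector(t(chol(C + 1e-8 * diag(n))) %*% rnorm(n))  # simulated data
Q  <- solve(C)                                             # single inversion
loo_err <- as.vector(Q %*% z) / diag(Q)  # Z - Z* (note gstlearn reports Z* - Z)
loo_var <- 1 / diag(Q)                   # leave-one-out variance
# brute-force check: re-solve the kriging system without sample i
brute <- sapply(seq_len(n), function(i) {
  w <- solve(C[-i, -i], C[-i, i])  # simple kriging weights
  z[i] - sum(w * z[-i])
})
all.equal(loo_err, brute)  # TRUE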
The result can also be checked using a moving neighborhood which has been tuned to cover a pseudo-unique neighborhood.
neighM = NeighMoving_create()
if (turnPrintON){
OptDbg_setReference(1)
err = xvalid(db = data,
model = model,
neigh = neighM,
flag_xvalid_est = 1,
flag_xvalid_std = 1,
namconv = NamingConvention("Xvalid4, TRUE, TRUE, FALSE"))
OptDbg_setReference(0)
}
##
## Target location
## ---------------
## Sample #1 (from 10)
## Coordinate #1 = 22.701345
## Coordinate #2 = 83.641175
##
## Data selected in neighborhood
## -----------------------------
## Rank Sample x1 x2 Sector
## 1 2 82.323 43.955 1
## 2 3 15.270 3.385 1
## 3 4 55.428 19.935 1
## 4 5 93.142 79.864 1
## 5 6 85.694 97.845 1
## 6 7 73.690 37.455 1
## 7 8 32.792 43.164 1
## 8 9 32.178 78.710 1
## 9 10 64.591 82.042 1
##
## LHS of Kriging matrix (compressed)
## ==================================
## Number of active samples = 9
## Total number of equations = 10
## Reduced number of equations = 10
##
## Rank 1 2 3 4 5
## Flag 1 2 3 4 5
## 1 1 4.000 0.000 0.000 0.000 0.000
## 2 2 0.000 4.000 0.000 0.000 0.000
## 3 3 0.000 0.000 4.000 0.000 0.000
## 4 4 0.000 0.000 0.000 4.000 0.654
## 5 5 0.000 0.000 0.000 0.654 4.000
## 6 6 1.932 0.000 0.139 0.000 0.000
## 7 7 0.000 0.000 0.000 0.000 0.000
## 8 8 0.000 0.000 0.000 0.000 0.000
## 9 9 0.000 0.000 0.000 0.012 0.085
## 10 10 1.000 1.000 1.000 1.000 1.000
##
## Rank 6 7 8 9 10
## Flag 6 7 8 9 10
## 1 1 1.932 0.000 0.000 0.000 1.000
## 2 2 0.000 0.000 0.000 0.000 1.000
## 3 3 0.139 0.000 0.000 0.000 1.000
## 4 4 0.000 0.000 0.000 0.012 1.000
## 5 5 0.000 0.000 0.000 0.085 1.000
## 6 6 4.000 0.000 0.000 0.000 1.000
## 7 7 0.000 4.000 0.000 0.000 1.000
## 8 8 0.000 0.000 4.000 0.000 1.000
## 9 9 0.000 0.000 0.000 4.000 1.000
## 10 10 1.000 1.000 1.000 1.000 0.000
##
## RHS of Kriging matrix (compressed)
## ==================================
## Number of active samples = 9
## Total number of equations = 10
## Reduced number of equations = 10
## Number of right-hand sides = 1
## Punctual Estimation
##
## Rank Flag 1
## 1 1 0.000
## 2 2 0.000
## 3 3 0.000
## 4 4 0.000
## 5 5 0.000
## 6 6 0.000
## 7 7 0.000
## 8 8 1.954
## 9 9 0.000
## 10 10 1.000
##
## (Co-) Kriging weights
## =====================
## Rank x1 x2 Data Z1*
## 1 82.323 43.955 1.266 0.045
## 2 15.270 3.385 2.184 0.064
## 3 55.428 19.935 -2.917 0.063
## 4 93.142 79.864 0.870 0.055
## 5 85.694 97.845 -0.730 0.054
## 6 73.690 37.455 2.955 0.040
## 7 32.792 43.164 -0.573 0.064
## 8 32.178 78.710 0.824 0.553
## 9 64.591 82.042 -0.157 0.063
## Sum of weights 1.000
##
## Drift coefficients
## ==================
## Rank Lagrange Coeff
## 1 -0.256 0.289
##
## Cross-validation results
## ========================
## Target Sample = 1
## Variable Z1
## - True value = 2.502
## - Estimated value = 0.550
## - Estimation Error = -1.952
## - Std. deviation = 1.782
## - Normalized Error = -1.095
## NULL
The next chunks display the graphic outputs expected following a cross-validation process. They are the plots produced by the function draw.xvalid, namely:

* the base map of the absolute value of the standardized cross-validation error
* the histogram of the standardized cross-validation error
* the scatter plot of the standardized error as a function of the estimate
* the scatter plot of the true value as a function of the estimate
# plotting absolute value of the standardized error
ggplot()+
plot.point(data,
nameSize = "Xvalid.data.stderr",
color = "red",
flagAbsSize = TRUE)+
plot.decoration(title="Standardized error (absolute value)")
# plotting standardized error
ggplot()+
plot.point(data,
nameSize = "Xvalid.data.stderr",
color = "blue",
flagAbsSize = FALSE)+
plot.decoration(title = "Standardized error")
# plotting the histogram of the standardized error
ggplot()+
plot.hist(data,
name = "Xvalid.data.stderr",
bins=30,
color = "cornflowerblue",
fill="cyan")+
plot.decoration(xlab="Estimation Error", title="Cross-Validation")
# plotting the correlation between the estimate and the standardized error
ggplot()+
plot.correlation(data,
namex = "Xvalid2.data.estim",
namey = "Xvalid.data.stderr",
asPoint = TRUE)+
plot.decoration(title = "Cross-Validation",
xlab = "kriging estimate",
ylab = "kriging error")
# plotting the correlation between the estimate and the measured values
ggplot()+
plot.correlation(data,
namex = "Xvalid2.data.estim",
namey = "data",
flagRegr = TRUE,
regrColor = "purple",
asPoint = TRUE,
flagSameAxes = TRUE)+
plot.decoration(title = "Cross-Validation",
xlab = "kriging estimate",
ylab = "measured values")
We now illustrate the K-fold option: 'dbin' contains scattered points, and each sample is randomly assigned to one of K folds. Note that K-fold cross-validation is not available for a unique neighborhood.
N = 300
K = 10
dbin = Db_createFillRandom(ndat = N, nvar = 0, seed = 125)
dbin["folds"] = sample(1:K, size = dbin$getSampleNumber(TRUE), replace = TRUE)
p <- ggDefaultGeographic() +
plot.point(dbin, nameColor = "folds", palette = "Spectral",
flagLegendColor = TRUE, flagLegendLabel = "Folds") +
plot.decoration(xlab = "X", ylab = "Y", title = "Simulated Folds")
ggPrint(p)
We define a Model and simulate the variable at sample locations.
model = Model()
err = model$addCovFromParam(type = ECov_EXPONENTIAL(), range = 0.2, sill = 1)
err = model$setDriftIRF(0,0)
err = simtub(dbin = NULL, dbout = dbin, model = model, nbsimu = 1, seed = 235, nbtuba = 1000,
namconv = NamingConvention("Y"))
We define a (moving) neighborhood.
neigh = NeighMoving_create(nmini = 10, nmaxi = 10)
We perform the cross-validation (using Ordinary Kriging) without the folds: it is based on the Leave-One-Out algorithm.
err = dbin$deleteColumns(names = "*.xval.*")
err = dbin$setLocators("Y", locatorType = ELoc_Z(), cleanSameLocator = TRUE)
err = xvalid(db = dbin, model = model, neigh = neigh, flag_kfold = FALSE,
flag_xvalid_est = 1, flag_xvalid_std = 1, namconv = NamingConvention("OK.xval.LOO"))
We now perform the same task, but with the K-fold option.
err = dbin$setLocators("Y", locatorType = ELoc_Z(), cleanSameLocator = TRUE)
err = dbin$setLocator("folds", locatorType = ELoc_C(), cleanSameLocator = TRUE)
err = xvalid(db = dbin, model = model, neigh = neigh, flag_kfold = TRUE,
flag_xvalid_est = 1, flag_xvalid_std = 1, namconv = NamingConvention("OK.xval.kfold"))
knitr::kable(dbStatisticsMono(dbin, names = c("Y", "OK.xval.LOO.Y.*", "OK.xval.kfold.Y.*"),
opers = opers)$toTL(),
digits = 3, caption = paste0("Cross validation scores (K = ", K, ")"))
| | Number | Minimum | Maximum | Mean | St. Dev. |
|---|---|---|---|---|---|
| Y | 300 | -2.859 | 2.595 | 0.081 | 0.965 |
| OK.xval.LOO.Y.esterr | 300 | -1.897 | 1.778 | -0.011 | 0.703 |
| OK.xval.LOO.Y.stderr | 300 | -2.498 | 2.560 | -0.007 | 1.033 |
| OK.xval.kfold.Y.esterr | 300 | -1.897 | 1.890 | -0.009 | 0.713 |
| OK.xval.kfold.Y.stderr | 300 | -2.498 | 2.590 | -0.003 | 1.038 |

Table: Cross validation scores (K = 10)
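As a complement to this table, classical cross-validation scores can be computed from the stored columns: the mean error (ideally close to 0), the root mean squared error, and the mean squared standardized error (ideally close to 1). Here is a minimal sketch, assuming the column names created by the NamingConvention calls above; xvalScores is a hypothetical helper, not part of gstlearn.
xvalScores <- function(db, radix) {
  esterr <- db[paste0(radix, ".Y.esterr")]
  stderr <- db[paste0(radix, ".Y.stderr")]
  c(ME   = mean(esterr),          # mean error: unbiasedness, ideally ~ 0
    RMSE = sqrt(mean(esterr^2)),  # root mean squared error
    MSSE = mean(stderr^2))        # mean squared standardized error, ideally ~ 1
}
rbind(LOO   = xvalScores(dbin, "OK.xval.LOO"),
      kfold = xvalScores(dbin, "OK.xval.kfold"))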
p1 <- ggDefault() +
plot.hist(dbin, name = "OK.xval.LOO.Y.stderr", bins = 20, fill = "orange") +
plot.decoration(xlab = "Normalised estimation error (Z*-Z)/S",
title = "Leave-One-Out cross validation")
p2 <- ggDefault() +
plot.hist(dbin, name = "OK.xval.kfold.Y.stderr", bins = 20, fill = "skyblue") +
plot.decoration(xlab = "Normalised estimation error (Z*-Z)/S",
title = paste0("K-fold cross validation (K = ", K, ")"))
ggarrange(p1, p2, nrow = 1, ncol = 2)