This notebook presents an example for Discrete-Disjunctive Kriging (DDK). The theory is not detailed here, see for example:
Rivoirard, J. (1994). Introduction to disjunctive kriging and non-linear geostatistics. Number
551.021 R626i. Clarendon Press
import numpy as np
import matplotlib.pyplot as plt
import gstlearn as gl
import gstlearn.plot as gp
We create a reference data set (lognormal distribution) based on a model that we define, using simtub (based on Turning Bands).
# parameters for the simulation
m = 1.
sig = 0.5
# initialization of the grid
grd = gl.DbGrid.create(x0=(0.0,0.0), dx=(0.01,0.01), nx=(100,100))
# simulation from a model
model = gl.Model.createFromParam(gl.ECov.EXPONENTIAL, range=0.2)
gl.simtub(dbin = None, dbout = grd, model = model, nbsimu = 1)
grd.setName("Simu", "Y")
grd["Z"] = m * np.exp(sig * grd["Y"].squeeze() - sig**2 / 2)
# Data set (10% of the grid)
data = gl.Db.createSamplingDb(grd, 0.1, flag_add_rank=0)
data.useSel = True
# plots
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(15,6))
ax1.raster(grd,"Z")
ax1.decoration(title="Initial variable")
ax2.histogram(grd, name='Z', bins = 25, color="orange")
ax2.decoration(xlabel = "Raw variable", title="Histogram of the initial variable")
fig, ax3 = plt.subplots(figsize=(6,6))
ax3.raster(grd,"Z")
ax3.gstpoint(data,name_size="Z", color='yellow', flagLegendSymbol=False)
ax3.decoration(title="Random data subsample")
We define cutoffs values corresponding to quantiles 0\%, 30\%, 50\%, 70\% and 90\%.
zcut = np.quantile(data['Z'], q = [0.3, 0.5, 0.7, 0.9])
print("\n{:^40}".format("Coupures sur la variable Z"))
print("{:^10}{:^10}{:^10}{:^10}".format(0.3, 0.5, 0.7, 0.9))
print("{:^10.3f}{:^10.3f}{:^10.3f}{:^10.3f}".format(*zcut),'\n')
mylimits = gl.Limits(zcut, True) #defines limits based on the cutoff values
# True for addFromZero, so that we add the interval [0, z_1[
mylimits.display()
Coupures sur la variable Z 0.3 0.5 0.7 0.9 0.701 0.927 1.194 1.711 Bound( 1 ) : [ 0 ; 0.701469 [ Bound( 2 ) : [ 0.701469 ; 0.926697 [ Bound( 3 ) : [ 0.926697 ; 1.1939 [ Bound( 4 ) : [ 1.1939 ; 1.71068 [
From the cutoff values defined above, the variable Z is discretized on the four intervals delimited by the cutoff values. The indicators of the intervals are $\mathbb{1}(z_i \le Z < z_{i+1})$. The fifth indicator ($\mathbb{1}(Z \ge z_{5})$) is not computed because it can be deducted from the four other ones as their sum equals to one.
# Compute the indicators (4 new variables)
iud_Indic = mylimits.toIndicator(data, name='Z', OptionIndicator=1)
# Compute the discretized version of the variable
iud_Mean = mylimits.toIndicator(data, name='Z', OptionIndicator=0)
# statistics on the indicators
w = data.statistics(["Indicator.Z.Class*"], verbose = False) #by default : mean. How to change ? gl.EStatOption.MEAN doesn't work
w = list(w) + [1 - np.sum(w)]
print("Proportions = ", np.round(w,3))
Nclass = len(w) # total number of indicators, including the fifth one which is not computed
Proportions = [0.3 0.2 0.2 0.2 0.1]
print(data)
ax = data.plot(name_color="*Mean")
ax.decoration(title="Discretized variable")
Data Base Characteristics ========================= Data Base Summary ----------------- File is organized as a set of isolated points Space dimension = 2 Number of Columns = 10 Maximum Number of UIDs = 10 Total number of samples = 1000 Variables --------- Column = 0 - Name = rank - Locator = NA Column = 1 - Name = x1 - Locator = x1 Column = 2 - Name = x2 - Locator = x2 Column = 3 - Name = Y - Locator = NA Column = 4 - Name = Z - Locator = NA Column = 5 - Name = Indicator.Z.Class.1 - Locator = NA Column = 6 - Name = Indicator.Z.Class.2 - Locator = NA Column = 7 - Name = Indicator.Z.Class.3 - Locator = NA Column = 8 - Name = Indicator.Z.Class.4 - Locator = NA Column = 9 - Name = Indicator.Z.Mean - Locator = z1
# Locate Z
data.clearLocators(gl.ELoc.Z)
data.setLocator("Z", gl.ELoc.Z)
# Variogram parameters (omnidirectional)
varioParam = gl.VarioParam.createOmniDirection(npas=10, dpas=0.05)
# Compute variogram
var_Z = gl.Vario.computeFromDb(varioParam, data)
# fit model
mod_Z = gl.Model()
opt=gl.Option_AutoFit()
opt.setWmode(2) # weighted proportional to the number of pairs and inverse proportional to the distance
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)
# plot
ax = gp.varmod(var_Z, mod_Z, flagLegend=True)
ax.decoration(title = "Variogram of the data sample")
mod_Z.display()
Model characteristics ===================== Space dimension = 2 Number of variable(s) = 1 Number of basic structure(s) = 2 Number of drift function(s) = 0 Number of drift equation(s) = 0 Covariance Part --------------- Exponential - Sill = 0.196 - Range = 0.149 - Theo. Range = 0.050 Exponential - Sill = 0.195 - Range = 2.652 - Theo. Range = 0.885 Total Sill = 0.392
data.clearLocators(gl.ELoc.Z)
data.setLocator("Indicator.Z.Mean", gl.ELoc.Z)
var_Z = gl.Vario.computeFromDb(varioParam, data)
mod_Z = gl.Model()
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)
ax = gp.varmod(var_Z, mod_Z, flagLegend=True)
ax.decoration(title = "Variogram of the discretized variable")
mod_Z.display()
Model characteristics ===================== Space dimension = 2 Number of variable(s) = 1 Number of basic structure(s) = 2 Number of drift function(s) = 0 Number of drift equation(s) = 0 Covariance Part --------------- Exponential - Sill = 0.045 - Range = 0.048 - Theo. Range = 0.016 Exponential - Sill = 0.080 - Range = 0.356 - Theo. Range = 0.119 Total Sill = 0.125
data.clearLocators(gl.ELoc.Z)
data.setLocator("Indicator.Z.Class*", gl.ELoc.Z)
var_Z = gl.Vario.computeFromDb(varioParam, data)
mod_Z = gl.Model()
mod_Z.fit(var_Z, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)
ax = gp.varmod(var_Z, mod_Z, flagLegend=False)
gp.decoration(ax,title = "Simple and cross variograms of the indicators of Z", fontsize=20)
The indicators are spatially correlated. A method for disjunctive kriging would consist in cokriging all indicators. Instead, we will decompose them into factors that are not correlated spatially, so that they can all be estimated seperately by kriging.
data.clearLocators(gl.ELoc.Z)
data.setLocator("Indicator.Z.Class*", gl.ELoc.Z)
maf_I = gl.PCA()
maf_I.maf_compute_interval(data, hmin=0.045, hmax=0.055)
maf_I.display()
#extract matrices of conversion between factors and indicators
#Mz2f = np.reshape(maf_I.getZ2F(),(Nclass-1,Nclass-1)).T
#Mf2z = np.reshape(maf_I.getF2Z(),(Nclass-1,Nclass-1)).T
PCA Contents ------------ Means [, 0] [ 0,] 0.300 [ 1,] 0.200 [ 2,] 0.200 [ 3,] 0.200 Covariance Matrix [, 0] [, 1] [, 2] [, 3] [ 0,] 0.210 -0.060 -0.060 -0.060 [ 1,] -0.060 0.160 -0.040 -0.040 [ 2,] -0.060 -0.040 0.160 -0.040 [ 3,] -0.060 -0.040 -0.040 0.160 Variogram Matrix at lag h [, 0] [, 1] [, 2] [, 3] [ 0,] 0.144 -0.073 -0.038 -0.026 [ 1,] -0.073 0.161 -0.036 -0.037 [ 2,] -0.038 -0.036 0.141 -0.048 [ 3,] -0.026 -0.037 -0.048 0.144 Matrix MZ2F to transform standardized Variables Z into Factors F [, 0] [, 1] [, 2] [, 3] [ 0,] -2.838 1.704 1.189 0.974 [ 1,] -1.885 2.027 2.240 -1.518 [ 2,] -1.005 3.698 0.465 0.290 [ 3,] -0.603 2.066 2.951 1.282 Y = (Z - m) * MZ2F) Matrix MF2Z to back-transform Factors F into standardized Variables Z [, 0] [, 1] [, 2] [, 3] [ 0,] -2.838 -1.885 -1.005 -0.603 [ 1,] 1.704 2.027 3.698 2.066 [ 2,] 1.189 2.240 0.465 2.951 [ 3,] 0.974 -1.518 0.290 1.282 Z = m + Y * MF2Z
# Calcul des maf comme fonction de l'index
maf = np.reshape(maf_I.mafOfIndex(), (Nclass, Nclass)).T
## Check that the factors are orthogonal
tab = np.dot(np.dot(maf.T, np.diag(w)),maf).round(10)
print("Correlation between factors MAF")
print(tab)
## plot MAF as a function of the index
fig, ax = plt.subplots(figsize = (10,6))
for k in range(Nclass-1):
ax.plot(range(Nclass), np.sign(maf[0, k+1]) * maf[:,k+1], marker='.', label="MAF-"+str(k+1))
ax.set_xticks(range(Nclass), range(1, Nclass+1))
ax.hlines(0, 0, Nclass-1, color='black', linestyle='dashed', linewidth=0.7)
ax.set_xlabel("Indice des classes")
ax.set_ylabel("MAF")
l = ax.legend()
Correlation between factors MAF [[ 1. 0. -0. -0. -0. ] [ 0. 4.67232537 0.17352826 0.0912897 0.69401709] [-0. 0.17352826 6.60200317 0.26790826 -0.15252915] [-0. 0.0912897 0.26790826 6.44320235 -0.09621599] [-0. 0.69401709 -0.15252915 -0.09621599 5.95249859]]
maf_I.dbZ2F(data) # calculate MAF on the data points (new variables in data)
vario_maf = gl.Vario.computeFromDb(varioParam, data)
model_maf = gl.Model()
model_maf.fit(vario_maf, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto = opt)
ax = gp.varmod(vario_maf, model_maf, flagLegend=False)
gp.decoration(ax, title = "Variograms of the MAFs", fontsize=20)
# models of the MAFs
fig, axs = plt.subplots(2,2, figsize=(11,7))
varios, models = [], []
for f in range(Nclass-1):
vario_reduced = gl.Vario(vario_maf)
vario_reduced.reduce(varcols = [f], dircols = [])
model = gl.Model()
opt=gl.Option_AutoFit()
opt.setWmode(0)
model.fit(vario_reduced, [gl.ECov.NUGGET, gl.ECov.EXPONENTIAL, gl.ECov.EXPONENTIAL], mauto=opt)
varios.append(vario_reduced)
models.append(model)
gp.varmod(vario_reduced, model, axs_old=axs.flat[f])
axs.flat[f].set_title(f"Simple variogram of MAF{f+1}")
#print("\nModel MAF"+str(f+1))
#model.display()
For example, we can obtain an estimation of the variable Z with DDK. Here we only consider the first two MAFs as an example, and because MAF3 and MAF4 are not very structured and have a very small range. MAF1 and MAF2 reprensent the pattern of the most continuity, and large scale structures.
First, we can do the kriging of MAF1 and MAF2 separately, as they are built to be orthogonal.
#define neighborhood
neigh = gl.NeighMoving.create(nmaxi=30, radius=0.5)
# Kriging MAF1
data.setLocator("F*1", gl.ELoc.Z, cleanSameLocator=True)
gl.kriging(data, grd, models[0], neigh)
# Kriging MAF2
data.setLocator("F*2", gl.ELoc.Z, cleanSameLocator=True)
err = gl.kriging(data, grd, models[1], neigh)
Then, we obtain the estimate of any variable that can be expressed as a linear combination of the indicators, and that are thus also a linear combination of the MAFs.
### outils de gstlearn : a ameliorer ? Ci-dessous permet de trouver l'estimation des indicatrices à partir des MAFs:
# gridData.setLocator("Kriging.MAF*.estim", gl.ELoc.Z)
# maf_I.dbF2Z(gridData) # estimation of the indicators
### mais 2 manques:
# avoir directement l'estimation d'une autre variable (ici S15), à partir des coefficient de la combinaison linéaire des indicatrices
# ne prendre en compte que les premiers MAFs (eg dans le cas où les derniers MAFs ne sont pas structurés, c'est inutile de les kriger)
# en attendant j'utilise la fonction définie ci-dessous
def vecF2myvar(maf, VecF, coefs=None, mean=None, SigmaF=None):
"""Compute values of a variable that is a linear combination of Z variables, from vectors of factors F.
maf: object containing infos on the MAF computation (class PCA)
VecF: table containing values of the factors, shape (Ndata*NF), where NF is the number of factors used (if NF is lower
than the total number of factors, then only the first ones are used, the others are considered not structured spatially)
coefs : coefficients of the variable along all variables Z
mean : mean of the variable.
SigmaF: table of the standard deviation of the estimation error of the factors, same shape as VecF. If given, the standard
deviation of the output variable are also computed and returned.
eg: coefs = [1,0,0,0,0] is the first variable (indicator), with mean = 0 if not specified.
"""
VecF = np.atleast_2d(VecF)
Ndata, NF = VecF.shape
Nvar = maf.getNVar()
if NF < Nvar:
print("The number of factors is less than the number of variables. The other factors will be neglected.")
Mf2z = np.reshape(maf.getF2Zs(),(Nvar, Nvar)).T
sigma = np.array(maf.getSigmas())
means = np.array(maf.getMeans())
mean_N = 1 - np.sum(means)
coefs = np.atleast_2d(coefs).T # one column = coefs for 1 variable
if len(coefs) > Nvar + 1:
raise ValueError(f"Too many coefficients were given ({len(coefs)}), coefs should be of length {Nvar} or {Nvar+1}.")
if mean is None and len(coefs) == Nvar + 1:
all_means = np.array(list(means) + [mean_N])
mean = np.sum(coefs.T*all_means, axis=1)
elif mean is None:
raise ValueError("The mean of the variable should be given when only the N-1 coefficients are given.")
if len(coefs) == Nvar:
c_N = 1/mean_N*(mean - np.sum(coefs.T*means, axis=1))
coefs = np.append(coefs, np.atleast_2d(c_N).T, axis=1) # add c_N
if len(coefs) != Nvar + 1:
print("Wrong len for coefs : ", len(coefs))
Vweights = (sigma*(coefs[:-1] - coefs[-1]).T).T # w_i = (c_i - c_N)*sigma_i
M = np.dot(Mf2z[:NF,:], Vweights)
VecMyvar = np.dot(VecF, M) + mean
if SigmaF is not None:
VecSigmaMyvar = np.sqrt(np.dot(SigmaF**2, M**2))
return np.squeeze(VecMyvar), VecSigmaMyvar.squeeze()
else:
return VecMyvar.squeeze()
# extract values of the first two MAFs (the other ones have only nugget effect)
grd.useSel=True
VecMAF = grd["Krig*estim"]
SigmaMAF = grd["Krig*stdev"]
# estimation of the indicators
INDestim, INDstdev = vecF2myvar(maf_I, VecMAF, coefs=np.eye(5), SigmaF=SigmaMAF)
for i_indicator in range(5):
grd[f"Kriging.Indicator{i_indicator+1}.estim"] = INDestim[:,i_indicator]
# estimation of S15
mean_per_class = np.unique(data["*Mean"])
Zestim, Zstdev = vecF2myvar(maf_I, VecMAF, coefs=mean_per_class, SigmaF=SigmaMAF, mean=np.mean(data['Z']))
grd["Kriging.Z.estim"] = Zestim # Z*
grd["Kriging.Z.stdev"] = Zstdev # standard deviation of the estimation error Var(Z*-Z) = S
The number of factors is less than the number of variables. The other factors will be neglected. The number of factors is less than the number of variables. The other factors will be neglected.
fig, axs = plt.subplots(1,2, figsize=(10,5))
axs[0].raster(grd,"Kriging.F*1.estim")
axs[0].decoration(title='Estimation of MAF1')
axs[1].raster(grd,"Kriging.F*2.estim")
axs[1].decoration(title='Estimation of MAF2')
fig, axs = plt.subplots(1,3, figsize=(15,5))
axs[0].raster(grd,"Z")
axs[0].decoration(title='Initial Variable Z')
axs[1].raster(grd,"Kriging.Z.estim")
axs[1].decoration(title='Estimation of Z with DDK')
axs[2].raster(grd,"Kriging.Z.stdev")
axs[2].decoration(title='Standard deviation of kriging')