In [1]:
import gstlearn as gl
import gstlearn.plot as gp
import gstlearn.document as gdoc
import numpy as np
import matplotlib.pyplot as plt
gdoc.setNoScroll()
Constraints on drifts¶
This tutorial has two essential tasks:
- cross-check the calculation of the Log-Likelihood (by hand or with gstlearn)
- design the definition pattern for the linkage of the drift functions in the multivariable framework
Parameters¶
In [2]:
# Data
np.random.seed(123)   # fixed seed so the notebook is reproducible on re-run
ndat = 100            # number of data samples
ndim = 2              # space dimension
# Model
rangev = 0.2          # range of the Matern covariance
sill = 1.0            # sill (variance) of the structured component
nugget = 0.1          # nugget effect variance
In [3]:
def estimCoeff(Z, Covmat, drift, A=None, c=None):
    """Generalized least-squares estimation of the drift coefficients.

    Minimizes (Z - drift @ beta)' Covmat^-1 (Z - drift @ beta), optionally
    subject to the linear equality constraints A @ beta = c.

    Parameters
    ----------
    Z : (n,) array
        Data vector.
    Covmat : (n, n) array
        Covariance matrix of the data.
    drift : (n, p) array
        Drift (design) matrix.
    A : (k, p) array, optional
        Constraint matrix (one row per constraint).
    c : (k,) sequence, optional
        Constraint right-hand side, encoding A @ beta = c.

    Returns
    -------
    (p,) array of estimated coefficients, or np.nan when the constraint
    dimensions are inconsistent (kept for backward compatibility).
    """
    if A is not None and c is not None:
        # One row of A per entry of c, one column of A per drift coefficient.
        if A.shape[0] != len(c) or A.shape[1] != drift.shape[1]:
            return np.nan
    # Solve linear systems instead of forming explicit inverses
    # (numerically sounder, mathematically identical).
    invcov_drift = np.linalg.solve(Covmat, drift)   # Covmat^-1 @ drift
    u = drift.T @ invcov_drift                      # drift' Covmat^-1 drift
    estimatedCoeffs = np.linalg.solve(u, invcov_drift.T @ Z)
    if A is None or c is None:
        return estimatedCoeffs
    # Project the unconstrained estimate onto the affine space A @ beta = c.
    invu_At = np.linalg.solve(u, A.T)               # u^-1 @ A'
    temp = invu_At @ np.linalg.inv(A @ invu_At)
    return estimatedCoeffs - temp @ A @ estimatedCoeffs + temp @ c
def computeLogLikelihoodByHand(Z, Covmat, drift, coeffs=None, A=None, c=None):
    """Gaussian log-likelihood of Z with covariance Covmat and a linear drift.

    When `coeffs` is None, the drift coefficients are first estimated by
    generalized least squares (optionally under the constraints A @ beta = c).
    """
    if coeffs is None:
        coeffs = estimCoeff(Z, Covmat, drift, A, c)
    # Residuals: data minus the fitted drift.
    residuals = Z - coeffs @ drift.T
    # Whiten the residuals with the Cholesky factor L (Covmat = L @ L').
    chol = np.linalg.cholesky(Covmat)
    whitened = np.linalg.solve(chol, residuals)
    quad = whitened.T @ whitened                        # residual' Covmat^-1 residual
    logdet = 2.0 * np.sum(np.log(np.diag(chol)))        # log |Covmat| from the factor
    return -0.5 * (quad + logdet + len(Z) * np.log(2.0 * np.pi))
def printCoeffs(title, coeffs, ndec=6):
    """Print a labelled vector of coefficients rounded to `ndec` decimals."""
    # Single f-string replaces the original's redundant str() wrapping and
    # string concatenation; the printed text is identical.
    print(f"{title} : {np.round(coeffs, ndec)}")
Monovariate case¶
Model¶
In [4]:
# Monovariate model: Matern structure plus a nugget effect
model = gl.Model.createFromParam(gl.ECov.MATERN, param=1, range=rangev, sill=sill)
model.addCovFromParam(gl.ECov.NUGGET, sill=nugget)
# Display the model (last expression of the cell)
model
Out[4]:
Model characteristics ===================== Space dimension = 2 Number of variable(s) = 1 Number of basic structure(s) = 2 Number of drift function(s) = 0 Number of drift equation(s) = 0 Covariance Part --------------- Matern (Third Parameter = 1) - Sill = 1.000 - Range = 0.200 - Theo. Range = 0.058 Nugget Effect - Sill = 0.100 Total Sill = 1.100 Known Mean(s) 0.000
Data¶
In [5]:
# Random data locations; simulate a Gaussian field with Turning Bands
dat = gl.Db.createFillRandom(ndat, ndim, 0)
dat["drift"] = dat["x-1"]   # copy of the first coordinate, candidate drift variable
gl.simtub(None, dat, model)
dat
Out[5]:
Data Base Characteristics ========================= Data Base Summary ----------------- File is organized as a set of isolated points Space dimension = 2 Number of Columns = 5 Total number of samples = 100 Variables --------- Column = 0 - Name = rank - Locator = NA Column = 1 - Name = x-1 - Locator = x1 Column = 2 - Name = x-2 - Locator = x2 Column = 3 - Name = drift - Locator = NA Column = 4 - Name = Simu - Locator = z1
In [6]:
# Add a known constant mean (0.5) to the simulated variable
truecoeffs = [0.5]
dat["Simu"] = truecoeffs[0] + dat["Simu"]
# dat.setLocator("drift",gl.ELoc.F)
dat
Out[6]:
Data Base Characteristics ========================= Data Base Summary ----------------- File is organized as a set of isolated points Space dimension = 2 Number of Columns = 5 Total number of samples = 100 Variables --------- Column = 0 - Name = rank - Locator = NA Column = 1 - Name = x-1 - Locator = x1 Column = 2 - Name = x-2 - Locator = x2 Column = 3 - Name = drift - Locator = NA Column = 4 - Name = Simu - Locator = z1
In [7]:
# Attach an order-0 IRF drift (universality condition) to the model
model.setDriftIRF(0, 0)
model
Out[7]:
Model characteristics ===================== Space dimension = 2 Number of variable(s) = 1 Number of basic structure(s) = 2 Number of drift function(s) = 1 Number of drift equation(s) = 1 Covariance Part --------------- Matern (Third Parameter = 1) - Sill = 1.000 - Range = 0.200 - Theo. Range = 0.058 Nugget Effect - Sill = 0.100 Total Sill = 1.100 Drift Part ---------- Universality_Condition
In [8]:
# Export the drift and covariance matrices as numpy arrays (toTL)
X = model.evalDriftMat(dat).toTL()
Covmat = model.evalCovMatSym(dat).toTL()
In [9]:
# Impose the mean: A @ beta = c with A = [1], c = [0.3]
A = np.array([1]).reshape(1, 1)
c = [0.3]
coeffs = estimCoeff(dat["Simu"], Covmat, X, A, c)
printCoeffs("a=0.3", coeffs)
a=0.3 : [0.3]
The case without constraint can be emulated using the log-likelihood principle:
- calculated by hand:
In [10]:
# Log-likelihood computed with the helper defined above (no constraint)
print(
    "Computed manually : "
    + str(np.round(computeLogLikelihoodByHand(dat["Simu"], Covmat, X), 6))
)
Computed manually : -120.377463
- calculated within gstlearn
In [11]:
# Same value computed internally by gstlearn
# NOTE(review): second argument presumably toggles the verbose report — confirm
likelihoodG = model.computeLogLikelihood(dat, True)
Likelihood calculation: - Number of active samples = 100 - Number of variables = 1 - Length of Information Vector = 100 - Number of drift conditions = 1 Log-Determinant = -33.879144 Quadratic term = 90.846364 Log-likelihood = -120.377463
- using the Vecchia approximation
In [12]:
# Approximate log-likelihood via the Vecchia approximation
# NOTE(review): 4 is presumably the number of Vecchia neighbors — confirm
likelihoodV = gl.logLikelihoodVecchia(dat, model, 4, True)
Log-Determinant = -33.200255 Quadratic term = 89.332909 Log-likelihood = -119.960180
Bivariate¶
In [13]:
# Bivariate model: sill matrix built from standard deviations s1, s2
# and correlation r
s1 = 0.4
s2 = 2.0
r = 0.8
sills = np.array([[s1**2, r * s1 * s2], [r * s1 * s2, s2**2]])
model = gl.Model.createFromParam(gl.ECov.MATERN, param=1, range=rangev, sills=sills)
In [14]:
# New data set with two simulated variables
ndat = 200
dat = gl.Db.createFillRandom(ndat, ndim, 0, 2)
dat["drift"] = dat["x-1"]
gl.simtub(None, dat, model)
dat
Out[14]:
Data Base Characteristics ========================= Data Base Summary ----------------- File is organized as a set of isolated points Space dimension = 2 Number of Columns = 8 Total number of samples = 200 Variables --------- Column = 0 - Name = rank - Locator = NA Column = 1 - Name = x-1 - Locator = x1 Column = 2 - Name = x-2 - Locator = x2 Column = 3 - Name = f-1 - Locator = f1 Column = 4 - Name = f-2 - Locator = f2 Column = 5 - Name = drift - Locator = NA Column = 6 - Name = Simu.1 - Locator = z1 Column = 7 - Name = Simu.2 - Locator = z2
In [15]:
# Scatter of the two simulated variables (visual check of their correlation)
ax = plt.scatter(dat["Simu.1"], dat["Simu.2"])
In [16]:
# Add linear drifts (intercept + slope in x-1) to both variables
truecoeffs1 = [0.5, 3]
truecoeffs2 = [1.5, -2]
model.setDriftIRF(0, 1)
dat["Simu.1"] = truecoeffs1[0] + truecoeffs1[1] * dat["drift"] + dat["Simu.1"]
dat["Simu.2"] = truecoeffs2[0] + truecoeffs2[1] * dat["drift"] + dat["Simu.2"]
dat.setLocator("drift", gl.ELoc.F)
No constraint¶
In [17]:
# Matrices for the bivariate case; Z stacks the two variables end to end
Covmat = model.evalCovMatSym(dat).toTL()
X = model.evalDriftMat(dat).toTL()
Z = dat["Simu*"]
Z = Z.T.reshape(-1)
In [18]:
# Unconstrained estimation; plot data and fitted drift line for each variable
coeffs = estimCoeff(Z, Covmat, X)
fig, ax = gp.init(1, 2)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
gp.close()
In [19]:
# Display the estimated coefficients
printCoeffs("No Constraint", coeffs)
No Constraint : [ 0.58471 3.06693 1.693381 -1.421991]
This option can be emulated in gstlearn
In [20]:
# Same unconstrained estimation performed internally by gstlearn
likelihood = model.computeLogLikelihood(dat, True)
Likelihood calculation: - Number of active samples = 200 - Number of variables = 2 - Length of Information Vector = 400 - Number of drift conditions = 4 Log-Determinant = -676.517039 Quadratic term = 484.076013 Log-likelihood = -271.354901
Means of both variables are imposed¶
In [21]:
# Constraints beta_0 = 0.5 (mean of variable 1) and beta_2 = 1.5 (mean of variable 2)
A = np.array([[1, 0, 0, 0], [0, 0, 1, 0]])
c = [0.5, 1.5]
coeffs = estimCoeff(Z, Covmat, X, A, c)
In [22]:
# Plot data and fitted drift line for each variable
fig, ax = gp.init(1, 2)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
gp.close()
In [23]:
# Display the constrained coefficients
printCoeffs("a0=0.5 and b0=1.5", coeffs)
a0=0.5 and b0=1.5 : [ 0.5 3.18586 1.5 -1.15049]
Same coefficients for mean and drift coefficients¶
In [24]:
# Constraints beta_0 = beta_2 and beta_1 = beta_3 (identical drifts on both variables)
A = np.array([[1, 0, -1, 0], [0, 1, 0, -1]])
c = [0, 0]
coeffs = estimCoeff(Z, Covmat, X, A, c)
In [25]:
# Plot data and fitted drift line for each variable
fig, ax = gp.init(1, 2)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
gp.close()
In [26]:
# Display the constrained coefficients (both variables share the same drift)
printCoeffs("a0=b0 and a1=b1", coeffs)
a0=b0 and a1=b1 : [0.399932 3.815084 0.399932 3.815084]
This can be emulated with the current flagLinked option
In [27]:
# NOTE(review): setFlagLinked appears to tie the drift coefficients of the
# variables together (drift conditions drop from 4 to 2 in the output) — confirm
model.setFlagLinked(True)
likelihood = model.computeLogLikelihood(dat, True)
Likelihood calculation: - Number of active samples = 200 - Number of variables = 2 - Length of Information Vector = 400 - Number of drift conditions = 2 Log-Determinant = -676.517039 Quadratic term = 519.260332 Log-likelihood = -288.947060
Means are equal¶
In [28]:
# Single constraint: equal means (beta_0 = beta_2)
A = np.array([[1, 0, -1, 0]])
c = [0]
coeffs = estimCoeff(Z, Covmat, X, A, c)
In [29]:
# Plot data and fitted drift line for each variable
fig, ax = gp.init(1, 2)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
gp.close()
In [30]:
# Display the constrained coefficients
printCoeffs("a0=b0", coeffs)
a0=b0 : [0.399932 3.326353 0.399932 0.39397 ]
Means are linked, coefficient of drift on the first variable is imposed¶
In [31]:
# beta_0 = beta_2 (equal means) and beta_1 = 1 (fixed slope on variable 1)
A = np.array([[1, 0, -1, 0], [0, 1, 0, 0]])
c = [0, 1]
coeffs = estimCoeff(Z, Covmat, X, A, c)
In [32]:
# Plot data and fitted drift line for each variable
fig, ax = gp.init(1, 2)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
gp.close()
In [33]:
# Display the constrained coefficients
printCoeffs("a0=b0 and a1=1", coeffs)
a0=b0 and a1=1 : [ 1.266229 1. 1.266229 -5.262681]
Multivariate¶
We test the case of 3 variables for running a specific test.
In [34]:
# Trivariate model with a random positive-definite sill matrix
sills = gl.MatrixSymmetric.createRandomDefinitePositive(3)
model = gl.Model.createFromParam(gl.ECov.MATERN, param=1, range=rangev, sills=sills)
In [35]:
# Display the model contents
model
Out[35]:
Model characteristics
=====================
Space dimension = 2
Number of variable(s) = 3
Number of basic structure(s) = 1
Number of drift function(s) = 0
Number of drift equation(s) = 0
Covariance Part
---------------
Matern (Third Parameter = 1)
- Sill matrix:
[, 0] [, 1] [, 2]
[ 0,] 5.167 -3.000 1.843
[ 1,] -3.000 3.831 2.299
[ 2,] 1.843 2.299 6.110
- Range = 0.200
- Theo. Range = 0.058
Total Sill
[, 0] [, 1] [, 2]
[ 0,] 5.167 -3.000 1.843
[ 1,] -3.000 3.831 2.299
[ 2,] 1.843 2.299 6.110
Known Mean(s) 0.000 0.000 0.000
In [36]:
# New data set for the trivariate case
# NOTE(review): the 4th argument matches the f-1/f-2 columns in the output —
# presumably the number of external drift variables; confirm
ndat = 200
dat = gl.Db.createFillRandom(ndat, ndim, 0, 2)
dat["drift"] = dat["x-1"]
gl.simtub(None, dat, model)
dat
Out[36]:
Data Base Characteristics ========================= Data Base Summary ----------------- File is organized as a set of isolated points Space dimension = 2 Number of Columns = 9 Total number of samples = 200 Variables --------- Column = 0 - Name = rank - Locator = NA Column = 1 - Name = x-1 - Locator = x1 Column = 2 - Name = x-2 - Locator = x2 Column = 3 - Name = f-1 - Locator = f1 Column = 4 - Name = f-2 - Locator = f2 Column = 5 - Name = drift - Locator = NA Column = 6 - Name = Simu.1 - Locator = z1 Column = 7 - Name = Simu.2 - Locator = z2 Column = 8 - Name = Simu.3 - Locator = z3
In [37]:
# Add linear drifts (intercept + slope in x-1) to the three simulated variables
truecoeffs1 = [0.5, 3]
truecoeffs2 = [1.5, -2]
truecoeffs3 = [-0.5, -2]
model.setDriftIRF(0, 1)
dat["Simu.1"] = truecoeffs1[0] + truecoeffs1[1] * dat["drift"] + dat["Simu.1"]
dat["Simu.2"] = truecoeffs2[0] + truecoeffs2[1] * dat["drift"] + dat["Simu.2"]
dat["Simu.3"] = truecoeffs3[0] + truecoeffs3[1] * dat["drift"] + dat["Simu.3"]
dat.setLocator("drift", gl.ELoc.F)
In [38]:
# Matrices for the trivariate case; Z stacks the three variables end to end
Covmat = model.evalCovMatSym(dat).toTL()
X = model.evalDriftMat(dat).toTL()
Z = dat["Simu*"]
Z = Z.T.reshape(-1)
In [39]:
# Unconstrained estimation; one panel per variable with its fitted drift line
coeffs = estimCoeff(Z, Covmat, X)
fig, ax = gp.init(1, 3, figsize=[15, 4])
printCoeffs("Coefficients", coeffs)
ax[0].scatter(dat["x-1"], dat["Simu.1"])
ax[0].plot([0, 1], [coeffs[0], coeffs[0] + coeffs[1]])
ax[1].scatter(dat["x-1"], dat["Simu.2"])
ax[1].plot([0, 1], [coeffs[2], coeffs[2] + coeffs[3]])
# Bug fix: the third panel plotted "Simu.2" again although its drift line
# uses coeffs[4:6], which belong to the third variable.
ax[2].scatter(dat["x-1"], dat["Simu.3"])
ax[2].plot([0, 1], [coeffs[4], coeffs[4] + coeffs[5]])
gp.close()
Coefficients : [ 0.679814 2.990676 1.092411 -0.961489 -0.969794 -0.317018]
Means of all variables are equal¶
This test is meant to check the way to constrain the mean value of all (three) variables to be equal: this is done by setting constraints to the drift coefficients of the variables taken two by two.
In [40]:
# Chained pairwise equalities make all three means equal:
# beta_0 = beta_2 and beta_2 = beta_4
A = np.array([[1, 0, -1, 0, 0, 0], [0, 0, 1, 0, -1, 0]])
c = [0, 0]
coeffs = estimCoeff(Z, Covmat, X, A, c)
printCoeffs("Coefficients", coeffs)
Coefficients : [ 1.887024 1.295792 1.887024 -2.0771 1.887024 -4.327899]