In [1]:
import gstlearn as gl
import gstlearn.plot as gp
import gstlearn.document as gdoc
import numpy as np
import matplotlib.pyplot as plt

gdoc.setNoScroll()

Constraints on drifts¶

This tutorial has two essential taks:

  • cross-check the calculation of the Log-Likelihood (by hand or with gstlearn)
  • design the definition pattern for the linkage of the drift functions in the multivariable framework

Parameters¶

In [2]:
# Data
np.random.seed(123)
ndat = 100
ndim = 2

# Model
rangev = 0.2
sill = 1.
nugget = 0.1
In [3]:
# Z : vecteur des données
# Covmat : matrice de covariance
# drift : matrice de drift
# A et c permettent d'encoder les contraintes sur le vecteur des coefficients beta :
# sous la forme A * beta = c
def estimCoeff(Z,Covmat,drift,A=None,c=None):
    if A is not None and c is not None: 
        if A.shape[0]!= len(c) or  A.shape[1]!=drift.shape[1]:
            return np.nan
    
    invcovmat = np.linalg.inv(Covmat)
    invu = np.linalg.inv(drift.T@invcovmat@drift)
    estimatedCoeffs = invu@drift.T@invcovmat@Z
    
    if A is None or c is None :
        return estimatedCoeffs
    
    temp = invu@A.T@np.linalg.inv(A@invu@A.T)
    return estimatedCoeffs - temp@A@estimatedCoeffs+temp@c

def computeLogLikelihoodByHand(Z,Covmat,drift,coeffs=None,A=None,c=None):
    if coeffs is None:
        coeffs = estimCoeff(Z,Covmat,drift,A,c)
    Zc = Z - coeffs@drift.T
    cholcovmat = np.linalg.cholesky(Covmat)
    Zcstd = np.linalg.solve(cholcovmat,Zc) 
    quad = Zcstd.T@Zcstd
    logdet = 2. * np.sum(np.log(np.diag(cholcovmat)))
    return -0.5 * (quad + logdet + len(Z) * np.log(2.* np.pi))

def printCoeffs(title, coeffs, ndec=6):
    print(title + " : " + f"{str(np.round(coeffs,ndec))}")

Monovariate case¶

Model¶

In [4]:
model = gl.Model.createFromParam(gl.ECov.MATERN,param=1,range=rangev,sill=sill)
model.addCovFromParam(gl.ECov.NUGGET,sill=nugget)
model
Out[4]:
Model characteristics
=====================
Space dimension              = 2
Number of variable(s)        = 1
Number of basic structure(s) = 2
Number of drift function(s)  = 0
Number of drift equation(s)  = 0

Covariance Part
---------------
Matern (Third Parameter = 1)
- Sill         =      1.000
- Range        =      0.200
- Theo. Range  =      0.058
Nugget Effect
- Sill         =      0.100
Total Sill     =      1.100
Known Mean(s)     0.000

Data¶

In [5]:
dat = gl.Db.createFillRandom(ndat, ndim, 0)
dat["drift"] = dat["x-1"]
gl.simtub(None,dat,model)
dat
Out[5]:
Data Base Characteristics
=========================

Data Base Summary
-----------------
File is organized as a set of isolated points
Space dimension              = 2
Number of Columns            = 5
Total number of samples      = 100

Variables
---------
Column = 0 - Name = rank - Locator = NA
Column = 1 - Name = x-1 - Locator = x1
Column = 2 - Name = x-2 - Locator = x2
Column = 3 - Name = drift - Locator = NA
Column = 4 - Name = Simu - Locator = z1
In [6]:
truecoeffs = [0.5]
dat["Simu"] = truecoeffs[0] + dat["Simu"]
#dat.setLocator("drift",gl.ELoc.F)
dat
Out[6]:
Data Base Characteristics
=========================

Data Base Summary
-----------------
File is organized as a set of isolated points
Space dimension              = 2
Number of Columns            = 5
Total number of samples      = 100

Variables
---------
Column = 0 - Name = rank - Locator = NA
Column = 1 - Name = x-1 - Locator = x1
Column = 2 - Name = x-2 - Locator = x2
Column = 3 - Name = drift - Locator = NA
Column = 4 - Name = Simu - Locator = z1
In [7]:
model.setDriftIRF(0,0)
model
Out[7]:
Model characteristics
=====================
Space dimension              = 2
Number of variable(s)        = 1
Number of basic structure(s) = 2
Number of drift function(s)  = 1
Number of drift equation(s)  = 1

Covariance Part
---------------
Matern (Third Parameter = 1)
- Sill         =      1.000
- Range        =      0.200
- Theo. Range  =      0.058
Nugget Effect
- Sill         =      0.100
Total Sill     =      1.100

Drift Part
----------
Universality_Condition
In [8]:
X = model.evalDriftMat(dat).toTL()
Covmat = model.evalCovMatSym(dat).toTL()
In [9]:
A = np.array([1]).reshape(1,1)
c = [0.3]

coeffs = estimCoeff(dat["Simu"],Covmat,X,A,c)
printCoeffs("a=0.3", coeffs)
a=0.3 : [0.3]

This lack of constraint can be emulated using the LogLikelihood principle:

  • calculated by hand:
In [10]:
print(f"Computed manually : " + str(np.round(computeLogLikelihoodByHand(dat["Simu"],Covmat,X),6)))
Computed manually : -120.377463
  • calculated within gstlearn
In [11]:
likelihoodG = model.computeLogLikelihood(dat, True)
Likelihood calculation:
- Number of active samples     = 100
- Number of variables          = 1
- Length of Information Vector = 100
- Number of drift conditions = 1
Optimal Drift coefficients = 
     0.728
Log-Determinant = -33.879144
Quadratic term  = 90.846364
Log-likelihood  = -120.377463
  • using the Vecchia approximation
In [12]:
likelihoodV = gl.logLikelihoodVecchia(dat, model, 4, True)
Lower Vecchia Matrix
--------------------
- Number of rows    = 100
- Number of columns = 100
- Sparse Format
               [,  0]    [,  1]    [,  2]    [,  3]    [,  4]    [,  5]    [,  6]
     [  0,]     1.000         .         .         .         .         .         .
     [  1,]    -0.011     1.000         .         .         .         .         .
     [  2,]    -0.274     0.000     1.000         .         .         .         .
     [  3,]     0.000    -0.001     0.000     1.000         .         .         .
     [  4,]    -0.171    -0.002    -0.001     0.000     1.000         .         .
     [  5,]    -0.028    -0.142     0.003         .    -0.022     1.000         .
     [  6,]     0.000    -0.026         .     0.000         .     0.001     1.000
(Ncols=7[from 100],Nrows=7[from 100])

Diagonal of Vecchia Matrix
               [,  0]    [,  1]    [,  2]    [,  3]    [,  4]    [,  5]    [,  6]
     [  0,]     0.909     0.909     0.983     0.909     0.937     0.929     0.910
     [  7,]     0.909     0.910     1.190     1.099     0.930     1.022     0.951
     [ 14,]     0.995     0.952     0.959     2.441     2.026     1.933     1.013
     [ 21,]     0.910     0.917     1.359     0.941     0.911     1.139     0.911
     [ 28,]     0.933     0.961     1.126     1.361     1.934     2.438     1.028
     [ 35,]     1.817     1.363     1.309     3.203     0.931     1.230     1.736
     [ 42,]     0.984     0.910     1.567     0.946     1.246     2.879     1.127
     [ 49,]     1.381     1.987     1.332     1.233     0.996     1.946     1.331
     [ 56,]     3.414     1.258     1.157     1.210     1.351     1.993     1.212
     [ 63,]     1.573     2.322     0.971     1.060     2.079     1.220     1.334
     [ 70,]     0.936     1.189     1.639     1.910     1.943     1.915     1.685
     [ 77,]     2.417     1.301     4.589     1.714     1.137     4.998     1.026
     [ 84,]     1.148     1.636     1.558     1.415     1.014     0.996     3.764
     [ 91,]     1.897     2.804     1.283     3.324     2.089     2.169     1.425
     [ 98,]     1.296     2.566
Optimal Drift coefficients = 
     0.725
Log-Determinant = -33.200255
Quadratic term  = 89.332909
Log-likelihood  = -119.960180

Bivariate¶

In [13]:
s1 = 0.4
s2 = 2.0
r  = 0.8
sills = np.array([[s1**2,r*s1*s2],[r*s1*s2,s2**2]])
model = gl.Model.createFromParam(gl.ECov.MATERN,param=1,range=rangev,sills=sills)
In [14]:
ndat=200
dat = gl.Db.createFillRandom(ndat, ndim, 0,2)
dat["drift"] = dat["x-1"]
gl.simtub(None,dat,model)
dat
Out[14]:
Data Base Characteristics
=========================

Data Base Summary
-----------------
File is organized as a set of isolated points
Space dimension              = 2
Number of Columns            = 8
Total number of samples      = 200

Variables
---------
Column = 0 - Name = rank - Locator = NA
Column = 1 - Name = x-1 - Locator = x1
Column = 2 - Name = x-2 - Locator = x2
Column = 3 - Name = f-1 - Locator = f1
Column = 4 - Name = f-2 - Locator = f2
Column = 5 - Name = drift - Locator = NA
Column = 6 - Name = Simu.1 - Locator = z1
Column = 7 - Name = Simu.2 - Locator = z2
In [15]:
ax = plt.scatter(dat["Simu.1"],dat["Simu.2"])
No description has been provided for this image
In [16]:
truecoeffs1 = [0.5, 3]
truecoeffs2 = [1.5,-2]
model.setDriftIRF(0,1)
dat["Simu.1"] =  truecoeffs1[0] + truecoeffs1[1] * dat["drift"] + dat["Simu.1"]
dat["Simu.2"] =  truecoeffs2[0] + truecoeffs2[1] * dat["drift"] + dat["Simu.2"]
dat.setLocator("drift",gl.ELoc.F)

No constraint¶

In [17]:
Covmat = model.evalCovMatSym(dat).toTL()
X = model.evalDriftMat(dat).toTL()
Z = dat["Simu*"]
Z = Z.T.reshape(-1)
In [18]:
coeffs = estimCoeff(Z,Covmat,X)
fig,ax = gp.init(1,2)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
gp.close()
No description has been provided for this image
In [19]:
printCoeffs("No Constraint",coeffs)
No Constraint : [ 0.591383  3.049498  1.968406 -2.140452]

This option can be emulated in gstlearn

In [20]:
likelihood = model.computeLogLikelihood(dat, True)
Likelihood calculation:
- Number of active samples     = 200
- Number of variables          = 2
- Length of Information Vector = 400
- Number of drift conditions = 4
Optimal Drift coefficients = 
     0.591     3.049     1.968    -2.140
Log-Determinant = -676.517039
Quadratic term  = 484.076013
Log-likelihood  = -271.354901

Means of both variables are imposed¶

In [21]:
A = np.array([[1,0,0,0],[0,0,1,0]])
c = [0.5,1.5]
coeffs=estimCoeff(Z,Covmat,X,A,c)
In [22]:
fig,ax = gp.init(1,2)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
gp.close()
No description has been provided for this image
In [23]:
printCoeffs("a0=0.5 and b0=1.5", coeffs)
a0=0.5 and b0=1.5 : [ 0.5       3.177797  1.5      -1.482825]

Same coefficients for mean and drift coefficients¶

In [24]:
A = np.array([[1,0,-1,0],[0,1,0,-1]])
c = [0,0]
coeffs = estimCoeff(Z,Covmat,X,A,c)
In [25]:
fig,ax = gp.init(1,2)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
gp.close()
No description has been provided for this image
In [26]:
printCoeffs("a0=b0 and a1=b1", coeffs)
a0=b0 and a1=b1 : [0.361879 3.91449  0.361879 3.91449 ]

This can be emulated with the current flagLinked option

In [27]:
model.setFlagLinked(True)
likelihood = model.computeLogLikelihood(dat, True)
Likelihood calculation:
- Number of active samples     = 200
- Number of variables          = 2
- Length of Information Vector = 400
- Number of drift conditions = 2
Optimal Drift coefficients = 
     0.362     3.914
Log-Determinant = -676.517039
Quadratic term  = 528.711994
Log-likelihood  = -293.672891

Means are equal¶

In [28]:
A = np.array([[1,0,-1,0]])
c = [0]
coeffs = estimCoeff(Z,Covmat,X,A,c)
In [29]:
fig,ax = gp.init(1,2)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
gp.close()
No description has been provided for this image
In [30]:
printCoeffs("a0=b0", coeffs)
a0=b0 : [0.361879 3.371714 0.361879 0.11506 ]

Means are linked, coefficient of drift on the first variable is imposed¶

In [31]:
A = np.array([[1,0,-1,0],[0,1,0,0]])
c = [0,1]
coeffs = estimCoeff(Z,Covmat,X,A,c)
In [32]:
fig,ax = gp.init(1,2)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
gp.close()
No description has been provided for this image
In [33]:
printCoeffs("a0=b0 and a1=1", coeffs)
a0=b0 and a1=1 : [ 1.245068  1.        1.245068 -5.651889]

Multivariate¶

We test the case of 3 variables for running a specific test.

In [34]:
sills = gl.MatrixSymmetric.createRandomDefinitePositive(3)
model = gl.Model.createFromParam(gl.ECov.MATERN,param=1,range=rangev,sills=sills)
In [35]:
model
Out[35]:
Model characteristics
=====================
Space dimension              = 2
Number of variable(s)        = 3
Number of basic structure(s) = 1
Number of drift function(s)  = 0
Number of drift equation(s)  = 0

Covariance Part
---------------
Matern (Third Parameter = 1)
- Sill matrix:
               [,  0]    [,  1]    [,  2]
     [  0,]     5.167    -3.000     1.843
     [  1,]    -3.000     3.831     2.299
     [  2,]     1.843     2.299     6.110
- Range        =      0.200
- Theo. Range  =      0.058
Total Sill
               [,  0]    [,  1]    [,  2]
     [  0,]     5.167    -3.000     1.843
     [  1,]    -3.000     3.831     2.299
     [  2,]     1.843     2.299     6.110

Known Mean(s)     0.000     0.000     0.000
In [36]:
ndat=200
dat = gl.Db.createFillRandom(ndat, ndim, 0,2)
dat["drift"] = dat["x-1"]
gl.simtub(None,dat,model)
dat
Out[36]:
Data Base Characteristics
=========================

Data Base Summary
-----------------
File is organized as a set of isolated points
Space dimension              = 2
Number of Columns            = 9
Total number of samples      = 200

Variables
---------
Column = 0 - Name = rank - Locator = NA
Column = 1 - Name = x-1 - Locator = x1
Column = 2 - Name = x-2 - Locator = x2
Column = 3 - Name = f-1 - Locator = f1
Column = 4 - Name = f-2 - Locator = f2
Column = 5 - Name = drift - Locator = NA
Column = 6 - Name = Simu.1 - Locator = z1
Column = 7 - Name = Simu.2 - Locator = z2
Column = 8 - Name = Simu.3 - Locator = z3
In [37]:
truecoeffs1 = [ 0.5, 3]
truecoeffs2 = [ 1.5,-2]
truecoeffs3 = [-0.5,-2]
model.setDriftIRF(0,1)
dat["Simu.1"] =  truecoeffs1[0] + truecoeffs1[1] * dat["drift"] + dat["Simu.1"]
dat["Simu.2"] =  truecoeffs2[0] + truecoeffs2[1] * dat["drift"] + dat["Simu.2"]
dat["Simu.3"] =  truecoeffs3[0] + truecoeffs3[1] * dat["drift"] + dat["Simu.3"]
dat.setLocator("drift",gl.ELoc.F)
In [38]:
Covmat = model.evalCovMatSym(dat).toTL()
X = model.evalDriftMat(dat).toTL()
Z = dat["Simu*"]
Z = Z.T.reshape(-1)
In [39]:
coeffs = estimCoeff(Z,Covmat,X)
fig,ax = gp.init(1,3, figsize=[15,4])
print(coeffs)
ax[0].scatter(dat["x-1"],dat["Simu.1"])
ax[0].plot([0,1],[coeffs[0],coeffs[0]+coeffs[1]])
ax[1].scatter(dat["x-1"],dat["Simu.2"])
ax[1].plot([0,1],[coeffs[2],coeffs[2]+coeffs[3]])
ax[2].scatter(dat["x-1"],dat["Simu.2"])
ax[2].plot([0,1],[coeffs[4],coeffs[4]+coeffs[5]])
gp.close()
[ 0.89610178  2.34510063  1.71794026 -1.729727   -0.00333606 -1.53702255]
No description has been provided for this image

Means of all variables are equal¶

This test is meant to check the way to constrain the mean value of all (three) variables to be equal: this is done by setting constraints to the drift coefficients of the variables taken two by two.

In [40]:
A = np.array([[1,0,-1,0,0,0],[0,0,1,0,-1,0]])
c = [0, 0]
coeffs = estimCoeff(Z,Covmat,X,A,c)
In [41]:
coeffs
Out[41]:
array([ 2.05802776,  0.71379393,  2.05802776, -2.20719891,  2.05802776,
       -4.43111105])
In [ ]: