GPvecchia tutorial

Marcin Jurek, Daniel Zilber, Matthias Katzfuss

2024-02-08

In this vignette, we will demonstrate the main capabilities of the GPvecchia package. These include estimating parameters and making spatial predictions. We also show how to use the package for processing non-linear, non-Gaussian data by combining the Vecchia with a Laplace approximation.

We start by importing the GPvecchia library.

library(GPvecchia)
library(Matrix)
library(fields)
#> Loading required package: spam
#> Spam version 2.10-0 (2023-10-23) is loaded.
#> Type 'help( Spam)' or 'demo( spam)' for a short introduction 
#> and overview of this package.
#> Help for individual functions is also obtained by adding the
#> suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
#> 
#> Attaching package: 'spam'
#> The following object is masked from 'package:Matrix':
#> 
#>     det
#> The following objects are masked from 'package:base':
#> 
#>     backsolve, forwardsolve
#> Loading required package: viridisLite
#> 
#> Try help(fields) to get started.

Simulating data for illustration

To illustrate our method, we simulate a small dataset. First, consider a unit square and randomly select observation locations. (Set spatial.dim=1 to consider data on the one-dimensional unit interval.)

set.seed(1988)
spatial.dim=2
n=50
if(spatial.dim==1){
  locs=matrix(runif(n),ncol=1)
} else {
  locs <- cbind(runif(n),runif(n))
}

Next, we define the covariance function of the field as well as the scale of the measurement error

beta=2
sig2=1; range=.1; smooth=1.5
covparms =c(sig2,range,smooth)
covfun <- function(locs) sig2*MaternFun(fields::rdist(locs),covparms)
nuggets=rep(.1,n)

We are now ready to simulate the field and visualize it as a sanity check.

Om0 <- covfun(locs)+diag(nuggets)
z=as.numeric(t(chol(Om0))%*%rnorm(n))
data=z+beta

# plot simulated data
if(spatial.dim==1) {
  plot(locs,data)
} else {
  fields::quilt.plot(locs,data, nx=n, ny=n)
}

We also create a grid of \(n.p\) locations at which we would like to make predictions.

n.p=100
if(spatial.dim==1){  #  1-D case
  locs.pred=matrix(seq(0,1,length=n.p),ncol=1)
} else {   # 2-D case
  grid.oneside=seq(0,1,length=round(sqrt(n.p)))
  locs.pred=as.matrix(expand.grid(grid.oneside,grid.oneside)) # grid of pred.locs
}
n.p=nrow(locs.pred)

Basic functions for parameter estimation and prediction

Let us now estimate the mean and covariance parameters using the default settings, which assume a spatially constant mean or trend, and a Matern covariance structure. Note that the following code might take a minute or so to run.

vecchia.est=vecchia_estimate(data,locs,,output.level=0)
#>   Nelder-Mead direct search function minimizer
#> function value for initial parameters = 61.133162
#>   Scaled convergence tolerance is 9.10955e-07
#> Stepsize computed as 0.100000
#> BUILD              5 61.381731 60.754055
#> LO-REDUCTION       7 61.182128 60.754055
#> LO-REDUCTION       9 61.133162 60.754055
#> LO-REDUCTION      11 61.070965 60.754055
#> LO-REDUCTION      13 60.846320 60.754055
#> LO-REDUCTION      15 60.820974 60.754055
#> LO-REDUCTION      17 60.788447 60.754055
#> EXTENSION         19 60.781072 60.725514
#> LO-REDUCTION      21 60.764690 60.725514
#> HI-REDUCTION      23 60.764237 60.725514
#> REFLECTION        25 60.754055 60.724434
#> REFLECTION        27 60.745117 60.719430
#> REFLECTION        29 60.728191 60.719258
#> EXTENSION         31 60.725514 60.708763
#> HI-REDUCTION      33 60.724434 60.708763
#> LO-REDUCTION      35 60.719430 60.708763
#> LO-REDUCTION      37 60.719258 60.708763
#> EXTENSION         39 60.715353 60.697299
#> LO-REDUCTION      41 60.710655 60.697299
#> LO-REDUCTION      43 60.710599 60.697299
#> REFLECTION        45 60.708763 60.697132
#> EXTENSION         47 60.705199 60.692044
#> LO-REDUCTION      49 60.697702 60.692044
#> LO-REDUCTION      51 60.697299 60.692044
#> LO-REDUCTION      53 60.697132 60.692044
#> HI-REDUCTION      55 60.693648 60.692044
#> EXTENSION         57 60.693185 60.690552
#> LO-REDUCTION      59 60.692802 60.690552
#> LO-REDUCTION      61 60.692353 60.690552
#> LO-REDUCTION      63 60.692044 60.690552
#> REFLECTION        65 60.691268 60.690489
#> LO-REDUCTION      67 60.691153 60.690489
#> REFLECTION        69 60.690624 60.690421
#> HI-REDUCTION      71 60.690604 60.690260
#> EXTENSION         73 60.690552 60.690088
#> EXTENSION         75 60.690489 60.689662
#> LO-REDUCTION      77 60.690421 60.689662
#> LO-REDUCTION      79 60.690260 60.689662
#> LO-REDUCTION      81 60.690088 60.689662
#> LO-REDUCTION      83 60.689910 60.689643
#> EXTENSION         85 60.689750 60.689335
#> LO-REDUCTION      87 60.689668 60.689335
#> HI-REDUCTION      89 60.689662 60.689335
#> EXTENSION         91 60.689643 60.689126
#> EXTENSION         93 60.689521 60.688750
#> LO-REDUCTION      95 60.689485 60.688750
#> EXTENSION         97 60.689335 60.688626
#> EXTENSION         99 60.689126 60.687839
#> LO-REDUCTION     101 60.688960 60.687839
#> EXTENSION        103 60.688750 60.687295
#> LO-REDUCTION     105 60.688626 60.687295
#> EXTENSION        107 60.688189 60.686386
#> LO-REDUCTION     109 60.687866 60.686386
#> EXTENSION        111 60.687839 60.685778
#> REFLECTION       113 60.687295 60.685655
#> REFLECTION       115 60.686674 60.685381
#> EXTENSION        117 60.686386 60.684860
#> LO-REDUCTION     119 60.685778 60.684860
#> LO-REDUCTION     121 60.685655 60.684860
#> LO-REDUCTION     123 60.685381 60.684860
#> LO-REDUCTION     125 60.685048 60.684821
#> REFLECTION       127 60.684991 60.684749
#> LO-REDUCTION     129 60.684956 60.684749
#> LO-REDUCTION     131 60.684860 60.684749
#> EXTENSION        133 60.684821 60.684642
#> REFLECTION       135 60.684765 60.684638
#> HI-REDUCTION     137 60.684761 60.684638
#> HI-REDUCTION     139 60.684749 60.684638
#> EXTENSION        141 60.684679 60.684568
#> REFLECTION       143 60.684677 60.684557
#> REFLECTION       145 60.684642 60.684517
#> EXTENSION        147 60.684638 60.684430
#> LO-REDUCTION     149 60.684568 60.684430
#> LO-REDUCTION     151 60.684557 60.684430
#> EXTENSION        153 60.684517 60.684361
#> LO-REDUCTION     155 60.684482 60.684361
#> LO-REDUCTION     157 60.684457 60.684361
#> HI-REDUCTION     159 60.684430 60.684361
#> REFLECTION       161 60.684410 60.684343
#> EXTENSION        163 60.684389 60.684311
#> LO-REDUCTION     165 60.684363 60.684311
#> HI-REDUCTION     167 60.684361 60.684311
#> LO-REDUCTION     169 60.684343 60.684311
#> LO-REDUCTION     171 60.684329 60.684309
#> LO-REDUCTION     173 60.684320 60.684309
#> HI-REDUCTION     175 60.684314 60.684309
#> REFLECTION       177 60.684311 60.684303
#> REFLECTION       179 60.684311 60.684303
#> LO-REDUCTION     181 60.684309 60.684303
#> REFLECTION       183 60.684309 60.684301
#> REFLECTION       185 60.684305 60.684299
#> LO-REDUCTION     187 60.684303 60.684299
#> LO-REDUCTION     189 60.684303 60.684299
#> REFLECTION       191 60.684301 60.684298
#> HI-REDUCTION     193 60.684300 60.684298
#> EXTENSION        195 60.684299 60.684296
#> HI-REDUCTION     197 60.684299 60.684296
#> LO-REDUCTION     199 60.684299 60.684296
#> EXTENSION        201 60.684298 60.684293
#> LO-REDUCTION     203 60.684297 60.684293
#> LO-REDUCTION     205 60.684297 60.684293
#> REFLECTION       207 60.684296 60.684293
#> REFLECTION       209 60.684295 60.684293
#> REFLECTION       211 60.684294 60.684292
#> LO-REDUCTION     213 60.684293 60.684292
#> REFLECTION       215 60.684293 60.684292
#> Exiting from Nelder Mead minimizer
#>     217 function evaluations used

Based on these parameter estimates, we can then make predictions at the grid of locations we had specified above.

preds=vecchia_pred(vecchia.est,locs.pred)

Finally, we compare the approximate predictions with the best possible ones (i.e. those obtained using analytic expressions for conditional mean in the Gaussian distribution).

##  exact prediction
mu.exact=as.numeric(covfun(rbind(locs,locs.pred))[,1:n]%*%solve(Om0,data-beta))+beta
cov.exact=covfun(rbind(locs,locs.pred))-
  covfun(rbind(locs,locs.pred))[,1:n]%*%solve(Om0,t(covfun(rbind(locs,locs.pred))[,1:n]))
var.exact=diag(cov.exact)
cov.exact.pred=cov.exact[n+(1:n.p),n+(1:n.p)]


### plot Vecchia and exact predictions
if(spatial.dim==1) {
  plot(locs,z)
  lines(locs.pred,preds$mean.pred,col='blue')
  lines(locs.pred,preds$mean.pred-1.96*sqrt(preds$var.pred),col='blue',lty=2)
  lines(locs.pred,preds$mean.pred+1.96*sqrt(preds$var.pred),col='blue',lty=2)
  lines(locs.pred,mu.exact[n+(1:n.p)],col='red')
  lines(locs.pred,mu.exact[n+(1:n.p)]-1.96*sqrt(var.exact[n+(1:n.p)]),col='red',lty=2)
  lines(locs.pred,mu.exact[n+(1:n.p)]+1.96*sqrt(var.exact[n+(1:n.p)]),col='red',lty=2)
} else {
  sdrange=range(sqrt(c(preds$var.pred,var.exact[n+(1:n.p)])))
  defpar = par(mfrow=c(2,3))
  fields::quilt.plot(locs,z, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,preds$mean.pred, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,sqrt(preds$var.pred),zlim=sdrange, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs,z, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,mu.exact[n+(1:n.p)], nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,sqrt(var.exact[n+(1:n.p)]),zlim=sdrange, nx=sqrt(n.p), ny=sqrt(n.p))
  par(defpar)
}

More details on likelihood evaluation

Let’s take a closer look at how the likelihood is evaluated using Vecchia. Most importantly, we can specify a parameter, \(m\). Its value determines the number of “neighbours” of each point, or, in other words, how many other points a given point conditions on. The larger this parameter, the more accurate and expensive the approximation will be.

m=20
vecchia.approx=vecchia_specify(locs,m)
vecchia_likelihood(z,vecchia.approx,covparms,nuggets)
#> [1] -61.09357

Note that the function vecchia_specify determines the general properties of the approximation, but it does not depend on the data or the specific parameter values. Hence, it does not have to be re-run when searching over different parameter values in an estimation procedure.

We can also compare the results to the exact likelihood:

library(mvtnorm)
#> 
#> Attaching package: 'mvtnorm'
#> The following objects are masked from 'package:spam':
#> 
#>     rmvnorm, rmvt
dmvnorm(z,mean=rep(0,n),sigma=Om0,log=TRUE)
#> [1] -61.09842

In this case the approximation is very good. In general, \(m=20\) is a good value, and \(m\) should usually be between 10 and 40. For one-dimensional space, we can get good approximations even with \(m=5\) or smaller.

More details on spatial prediction

Similar to the previous section we next specify the approximation and indicate at which locations prediction is desired.

m=30
vecchia.approx=vecchia_specify(locs,m,locs.pred=locs.pred)
preds=vecchia_prediction(z,vecchia.approx,covparms,nuggets)
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord

It is also possible to print the entire predictive covariance matrix. We do it here only for the purpose of illustration. If \(n.p\) is very large, this matrix might use up a lot of memory and we generally do not recommend plotting it directly.

Sigma=V2covmat(preds)$Sigma.pred
cov.range=quantile(rbind(Sigma,cov.exact.pred),c(.01,.99))
defpar = par(mfrow=c(1,2))
fields::image.plot(cov.exact.pred,zlim=cov.range)
fields::image.plot(Sigma,zlim=cov.range)

par(mfrow=c(defpar))

Linear combinations

We might sometimes be interested in a linear combination of the predicted values. In particular, we can limit our attention to only a subset of our predictions. This can be accomplished by specifying the linear combination coefficients as a matrix. As an example, we assume we are only interested in predictions at the unobserved prediction locations (not at the first n observed locations):

H=Matrix::sparseMatrix(i=1:(n+n.p),j=1:(n+n.p),x=1)[(n+1):(n+n.p),]

# compute variances of Hy
lincomb.vars=vecchia_lincomb(H,preds$U.obj,preds$V.ord)
plot(preds$var.pred,lincomb.vars)

As another example, we consider the overall mean of the process at all prediction locations. Using the vecchia_lincomb() function enables us to get the variance estimates easily.

mean(preds$mu.pred)
#> [1] -0.03611141

# compute entire covariance matrix of Hy (here, 1x1)
H=Matrix::sparseMatrix(i=rep(1,n.p),j=n+(1:n.p),x=1/n.p)
lincomb.cov=vecchia_lincomb(H,preds$U.obj,preds$V.ord,cov.mat=TRUE)

Other GP approximations as special cases

By specifying appropriate options in vecchia_specify, we can do everything described above for several other GP approximations: Modified predictive process, FSA, MRA, latent, standard Vecchia

Setting \(M=1\) results in block full-scale approximation, specifically one with \(r_0 = \frac{m}{2}\) knots spread over the entire domain and the remaining locations being partitioned into blocks of size \(<\frac{m}{2}+1\).

m=20
mra.options.fulls=list(M=1)
blockFS = vecchia_specify(locs, m, 'maxmin', conditioning='mra', mra.options=mra.options.fulls, verbose=TRUE)
#> MRA params: m=19; J=4; r=10,10; M=1

Another popular existing approximation method, modified predictive process (MPP), can also be obtained by specifying appropriate parameter settings:

mra.options.mpproc=list(r=c(m,1))
MPP = vecchia_specify(locs, m, 'maxmin', conditioning='mra', mra.options=mra.options.mpproc, verbose=TRUE)
#> MRA params: m=20; J=32; r=20; M=0

As we can see, MPP can be viewed as a special case of the multi-resolution approximation (MRA).

A general MRA is obtained my specifying all of its three parameters

mra.options.mra = list(r=c(10, 5, 5), M=2, J=2)
MRA_rJM = vecchia_specify(locs, m, 'maxmin', conditioning='mra', mra.options=mra.options.mra, verbose=TRUE)
#> Warning in get.mra.params(n, mra.options, m): M, r set for MRA. If parameter m
#> was given, it will be overridden
#> MRA params: m=22; J=2,2; r=10,5,7; M=2

We should note two things to note about this full specifiction of an MRA. First, providing all three \(r\),\(J\) and \(M\) overrides whatever value of \(m\) was provided. Second, in order to be able to place a knot at each point of the grid, the provided parameters might need to be adjusted.

Finally, we can also use the GPvecchia package to specify a Nearest Neighbour Gaussian Process (NNGP) approximation. This can be accomplished as shown below.

NNGP = vecchia_specify(locs, m, cond.yz='y')

We can now easily compare different approximation methods and compare it with SGV and exact likelihood.

vecchia_likelihood(z,blockFS,covparms,nuggets)
#> [1] -61.24514
vecchia_likelihood(z,MPP,covparms,nuggets)
#> [1] -63.68507
vecchia_likelihood(z,MRA_rJM,covparms,nuggets)
#> [1] -61.00043
vecchia_likelihood(z,NNGP,covparms,nuggets)
#> [1] -61.09416
vecchia_likelihood(z, vecchia_specify(locs, m), covparms, nuggets)
#> [1] -61.09357
dmvnorm(z,mean=rep(0,n),sigma=Om0,log=TRUE)
#> [1] -61.09842

Non-Gaussian data

Here we demonstrate how GPVecchia can fit a latent model to non-Gaussian data using the Vecchia-Laplace method. We simulate data by first generating a correlated latent field without noise, assuming the same covariance and locations generated earlier:

# simulate latent process
y=as.numeric(t(chol(Om0))%*%rnorm(n))

Then we sample a single non-Gaussian value for each latent value. The variability introduced by the sampling induces heteroskedasticity, in contrast the the constant noise added to the Gaussian case. Below we use a logistic model for binary data, but there are implementations for count and continuous positive (right-skewed) data as well.

data.model = "logistic"

# simulate data
if(data.model=='poisson'){
  z = rpois(n, exp(y))
} else if(data.model=='logistic'){
  z = rbinom(n,1,prob = exp(y)/(1+exp(y)))
} else if(data.model=='gamma'){
  z = rgamma(n, shape = default_lh_params$alpha, rate = default_lh_params$alpha*exp(-y))
}else{
  print('Error: Distribution not implemented yet.')
}

# plot simulated data, 1 or 2D
defpar = par(mfrow=c(1,2))
if(spatial.dim==1) {
  plot(locs,y, main = "latent")
  plot(locs,z, main = "observed")
} else {
  fields::quilt.plot(locs,y, main = "Latent")
  fields::quilt.plot(locs,z, main = "Observed")
}

par(defpar)

Given the simulated data, we now can efficiently estimate the latent field by specifying the number of conditioning points \(m\) described earlier. Interweaved ordering is best for 1D data while response-first (‘zy’) ordering is best for higher dimensions.

m=10
if(spatial.dim==1){
  vecchia.approx=vecchia_specify(locs,m) #IW ordering
} else {
  vecchia.approx=vecchia_specify(locs,m,cond.yz='zy') #RF ordering
}

With the approximated covariance structure, we can calculate the posterior estimate for the latent field using the Vecchia-Laplace method and plot the result. Pure Laplace approximation is included for comparison; even with a small value for \(m\), we can get a result similar to Laplace but with much lower cost.

posterior = calculate_posterior_VL(z,vecchia.approx,likelihood_model=data.model,
                                   covparms = covparms)
if (spatial.dim==1){
  par(mfrow=c(1,1))
  ord = order(locs) # order so that lines appear correctly
  y_limits = c(min(y, posterior$mean[ord]), max(y, posterior$mean[ord]))
  plot(locs[ord], y[ord], type = "l", ylim = y_limits )
  lines(locs[ord], posterior$mean[ord], type = "l", col=3, lwd=3)
  legend("bottomright", legend = c("Latent", "VL"), col= c(1,3), lwd=c(1,3))
} else if (spatial.dim==2){
  dfpar = par(mfrow=c(1,2))
  # ordering unnecessary; we are using a scatter plot rather than lines
  quilt.plot(locs, y, main= "Truth")
  quilt.plot(locs, posterior$mean,  main= "VL m=10")
  par(defpar)
  
}

Non-Gaussian predictions

Predictions are computed as before, using Vecchia-Laplace methods where needed

######  specify prediction locations   #######
n.p=30^2
if(spatial.dim==1){  #  1-D case
  locs.pred=matrix(seq(0,1,length=n.p),ncol=1)
} else {   # 2-D case
  grid.oneside=seq(0,1,length=round(sqrt(n.p)))
  locs.pred=as.matrix(expand.grid(grid.oneside,grid.oneside)) # grid of pred.locs
}
n.p=nrow(locs.pred)

######  specify Vecchia approximation   #######
vecchia.approx.pred = vecchia_specify(locs, m=10, locs.pred=locs.pred)
###  carry out prediction
preds = vecchia_laplace_prediction(posterior, vecchia.approx.pred, covparms)

# plotting predicitions
if (spatial.dim==1){
  defpar = par(mfrow=c(1,1))
  ord = order(locs) # order so that lines appear correctly
  plot(locs[ord], y[ord], type = "l", xlim=c(0,1.2), ylim = c(-1,3))
  lines(locs, posterior$mean, type = "p", col=4, lwd=3, lty=1)
  lines(locs.pred, preds$mu.pred, type = "l", col=3, lwd=3, lty=1)
  lines(locs.pred,preds$mu.pred+sqrt(preds$var.pred), type = "l", lty = 3, col=3)
  lines(locs.pred,preds$mu.pred-sqrt(preds$var.pred), type = "l", lty = 3, col=3)
  legend("topleft", legend = c("Latent", "VL: Pred", "VL: 1 stdev"), 
         col= c(1,3,3), lwd=c(1,2,1), lty = c(1,1,3))
  par(defpar)
} else if (spatial.dim==2){
  defpar =  par(mfrow=c(1,2))
  # ordering unnecessary; we are using a scatter plot rather than lines
  quilt.plot(locs, y, main= "True Latent", 
             xlim = c(0,1), ylim = c(0,1), nx=64, ny=64)
  quilt.plot(locs.pred, preds$mu.pred,  main= "VL Prediction",nx = 30, ny=30)
  par(defpar)
}

Parameter estimation

The likelihood of the data for a set of parameters can be computed efficiently using the command below.

vecchia_laplace_likelihood(z,vecchia.approx,likelihood_model=data.model,covparms = covparms)
#> [1] -35.38816

This can be used for parameter estimation by evaluating the likelihood over a grid of parameter values or in an iterative optimization method such as Nelder-Mead. Reparameterizing the parameters improves performance.

# currently set up for covariance estimation 
vecchia.approx=vecchia_specify(locs, m=10, cond.yz = "zy") # for posterior
vecchia.approx.IW = vecchia_specify(locs, m=10) # for integrated likelihood
if (spatial.dim==1) vecchia.approx=vecchia.approx.IW

vl_likelihood = function(x0){
  theta = exp(x0)
  covparms=c(theta[1], theta[2], theta[3]) # sigma range smoothness
  prior_mean = 0 # can be a parameter as well
  # Perform inference on latent mean with Vecchia Laplace approximation
  vll = vecchia_laplace_likelihood(z,vecchia.approx, likelihood_model=data.model,
                                   covparms, return_all = FALSE,
                                   likparms = default_lh_params, prior_mean = prior_mean,
                                   vecchia.approx.IW = vecchia.approx.IW)
  return(-vll)

}
x0 = log(c(.07,1.88, 1.9))
vl_likelihood(x0)
# Issues with R aborting, maxit set to 1
res = optim(x0, vl_likelihood, method = "Nelder-Mead", control = list("trace" = 1, "maxit" = 1))
exp(res$par[1:3])
vl_likelihood(x0)