Simulated data analysis

Make some data

## Draw n samples from a p-dimensional N(mu, V) via the Cholesky factor of V.
rmvn <- function(n, mu = 0, V = matrix(1)) {
    p <- length(mu)
    if (any(is.na(match(dim(V), p))))
        stop("Dimension problem!")
    D <- chol(V)
    ## Each row of matrix(rnorm(n * p), ncol = p) %*% D is one N(0, V) draw.
    t(matrix(rnorm(n * p), ncol = p) %*% D + rep(mu, rep(n, p)))
}
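
As a quick sanity check (my addition, not part of the original analysis), the sample covariance of many rmvn draws should be close to V; V.test below is an arbitrary example matrix.

## Sanity check: the sample covariance of 10,000 draws should approximate V.test.
V.test <- matrix(c(1, 0.5, 0.5, 2), nrow = 2)
round(cov(t(rmvn(10000, mu = c(0, 0), V = V.test))), 2)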

set.seed(1)
n <- 100
coords <- cbind(runif(n, 0, 1), runif(n, 0, 1))

x <- cbind(1, rnorm(n))

B <- as.matrix(c(1, 5))

sigma.sq <- 5
tau.sq <- 1
phi <- 3/0.5

D <- as.matrix(dist(coords))
R <- exp(-phi * D)
w <- rmvn(1, rep(0, n), sigma.sq * R)
y <- rnorm(n, x %*% B + w, sqrt(tau.sq))
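
Before fitting, it can help to eyeball the simulated surface. The short sketch below (my addition) colors the locations by deciles of the latent w to confirm there is visible spatial structure.

## Color locations by deciles of the simulated spatial random effects w.
w.col <- rev(terrain.colors(10))[cut(w, quantile(w, seq(0, 1, length.out = 11)),
    include.lowest = TRUE, labels = FALSE)]
plot(coords, col = w.col, pch = 19, xlab = "Easting", ylab = "Northing")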

Fit a Latent NNGP model

library(spNNGP)

n.samples <- 500

starting <- list(phi = phi, sigma.sq = 5, tau.sq = 1)

tuning <- list(phi = 0.5, sigma.sq = 0.5, tau.sq = 0.5)

priors <- list(phi.Unif = c(3/1, 3/0.01), sigma.sq.IG = c(2, 5), tau.sq.IG = c(2,
    1))

cov.model <- "exponential"

m.s <- spNNGP(y ~ x - 1, coords = coords, starting = starting, method = "latent",
    n.neighbors = 5, tuning = tuning, priors = priors, cov.model = cov.model, n.samples = n.samples,
    return.neighbor.info = TRUE, n.omp.threads = 2)
----------------------------------------
    Building the neighbor list
----------------------------------------
----------------------------------------
Building the neighbors of neighbors list
----------------------------------------
----------------------------------------
    Model description
----------------------------------------
NNGP Latent model fit with 100 observations.

Number of covariates 2 (including intercept if specified).

Using the exponential spatial correlation model.

Using 5 nearest neighbors.

Number of MCMC samples 500.

Priors and hyperpriors:
    beta flat.
    sigma.sq IG hyperpriors shape=2.00000 and scale=5.00000
    tau.sq IG hyperpriors shape=2.00000 and scale=1.00000
    phi Unif hyperpriors a=3.00000 and b=300.00000

Source compiled with OpenMP support and model fit using 2 thread(s).
----------------------------------------
        Sampling
----------------------------------------
Sampled: 100 of 500, 20.00%
Report interval Metrop. Acceptance rate: 58.00%
Overall Metrop. Acceptance rate: 58.00%
-------------------------------------------------
Sampled: 200 of 500, 40.00%
Report interval Metrop. Acceptance rate: 53.00%
Overall Metrop. Acceptance rate: 55.50%
-------------------------------------------------
Sampled: 300 of 500, 60.00%
Report interval Metrop. Acceptance rate: 52.00%
Overall Metrop. Acceptance rate: 54.33%
-------------------------------------------------
Sampled: 400 of 500, 80.00%
Report interval Metrop. Acceptance rate: 42.00%
Overall Metrop. Acceptance rate: 51.25%
-------------------------------------------------
Sampled: 500 of 500, 100.00%
Report interval Metrop. Acceptance rate: 55.00%
Overall Metrop. Acceptance rate: 52.00%
-------------------------------------------------
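
The summaries below use all 500 samples, as in the original. In practice you would typically discard an initial burn-in first; a minimal sketch (my addition), assuming coda is attached via spNNGP:

## Keep samples 251-500 before summarizing.
theta.post <- window(m.s$p.theta.samples, start = floor(0.5 * n.samples) + 1)
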
round(summary(m.s$p.beta.samples)$quantiles[, c(3, 1, 5)], 2)
    50% 2.5% 97.5%
x1 2.11  1.4  2.79
x2 4.87  4.5  5.23
round(summary(m.s$p.theta.samples)$quantiles[, c(3, 1, 5)], 2)
           50% 2.5% 97.5%
sigma.sq  3.95 2.17  6.12
tau.sq    0.92 0.26  2.38
phi      11.77 4.45 25.91
plot(w, apply(m.s$p.w.samples, 1, median), xlab = "True w", ylab = "Posterior median w")
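
Given the short chains, mixing diagnostics are also worth a glance; a sketch (my addition) using coda, which spNNGP loads:

## Trace/density plots and effective sample sizes for the covariance parameters.
plot(m.s$p.theta.samples)
coda::effectiveSize(m.s$p.theta.samples)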

Take a look at the neighbor ordering (just for fun)

names(m.s$neighbor.info)
 [1] "n.neighbors"      "n.indx"           "nn.indx"          "nn.indx.lu"      
 [5] "u.indx"           "u.indx.lu"        "ui.indx"          "ord"             
 [9] "nn.indx.run.time" "u.indx.run.time" 
ord <- m.s$neighbor.info$ord
n.indx <- m.s$neighbor.info$n.indx

s <- 50
plot(m.s$coords[ord, ], xlab = "Easting", ylab = "Northing")
points(m.s$coords[ord, ][s, , drop = FALSE], cex = 2, col = "red")
points(m.s$coords[ord, ][n.indx[[s]], , drop = FALSE], pch = 19, col = "blue")
abline(v = m.s$coords[ord, ][s, 1], lty = 3)

# To step through every location's neighbor set, uncomment and run:
# for (s in 2:n) {
#     plot(m.s$coords[ord, ], xlab = "Easting", ylab = "Northing")
#     points(m.s$coords[ord, ][s, , drop = FALSE], cex = 2, col = "red")
#     points(m.s$coords[ord, ][n.indx[[s]], , drop = FALSE], pch = 19, col = "blue")
#     abline(v = m.s$coords[ord, ][s, 1], lty = 3)
#     readline(prompt = "Pause. Press <Enter> to continue...")
# }
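
A quick structural check (my addition): neighbor sets are built only from locations earlier in the ordering, so just the first few ordered locations should have fewer than the requested 5 neighbors.

## Tabulate neighbor-set sizes across the ordered locations.
table(sapply(n.indx, length))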

Harvard Forest canopy height analysis

Here we consider forest canopy height (m) data measured using NASA Goddard's LiDAR, Hyperspectral, and Thermal (G-LiHT) Airborne Imager over a subset of the Harvard Forest Simes Tract, MA, collected in summer 2012. G-LiHT is a sampling LiDAR system that records only strips of canopy height across the landscape. We would like to use the Harvard Forest data to assess whether the current density of LiDAR measurements can be reduced, which would allow wider strips to be collected. Ultimately, interest is in creating wall-to-wall maps of forest canopy height with associated uncertainty.

Let’s load the necessary packages and the canopy height data, which are part of the spNNGP package. Here too, we subset the data and divide it into a model set and a holdout (testing) set.

library(geoR)
library(raster)
library(leaflet)

load("CHM.rda")

CHM <- CHM[CHM[, 3] > 0, ]

set.seed(1)
mod <- sample(1:nrow(CHM), 25000)
ho <- sample((1:nrow(CHM))[-mod], 10000)

CHM.mod <- CHM[mod, ]
CHM.ho <- CHM[ho, ]
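
A quick look at the model set before mapping (my addition; the three CHM columns are easting, northing, and canopy height):

## Dimensions and canopy height summary for the model set.
dim(CHM.mod)
summary(CHM.mod[, 3])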

Let’s again start with a leaflet basemap then overlay the canopy height data. Recall, leaflet maps expect data to be in a geographic coordinate system (i.e., longitude and latitude), so we first need to reproject the CHM data (just for visualization purposes; we’ll fit the model using the projected coordinates).

chm.r <- rasterFromXYZ(CHM)
proj4string(chm.r) <- "+proj=utm +zone=18 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0"
chm.r.ll <- projectRaster(chm.r, crs = "+proj=longlat +datum=WGS84")

pal <- colorNumeric(rev(terrain.colors(50)), domain = values(chm.r.ll), na.color = "transparent")

base.map <- leaflet(width = "100%") %>%
    addProviderTiles("Esri.WorldImagery", group = "Satellite") %>%
    addProviderTiles("Esri.WorldShadedRelief", group = "Terrain")

base.map %>%
    addRasterImage(chm.r.ll, colors = pal, opacity = 1, group = "Canopy height") %>%
    addLegend("bottomright", pal = pal, values = values(chm.r.ll), opacity = 1, title = "<center>Canopy height (m)</center>") %>%
    addLayersControl(baseGroups = c("Satellite", "Terrain"), overlayGroups = c("Canopy height"),
        options = layersControlOptions(collapsed = FALSE))

Let’s try to fit a variogram to the data to get a sense of the spatial structure. The variog function calculates the \(n\times n\) Euclidean distance matrix to construct the empirical variogram. When \(n\) is large you will likely run out of memory (for the full 25,000-observation model set, the distance matrix alone requires several gigabytes), so you might need to consider only a subset of your data.

sub <- 1:10000

# note, max intersite distance is ~1.5km
v <- variog(coords = CHM.mod[sub, 1:2], data = CHM.mod[sub, 3], uvec = (seq(0, 500,
    length = 30)))
variog: computing omnidirectional variogram
plot(v, xlab = "Distance (m)")
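
The empirical variogram is also handy for choosing starting values for the covariance parameters. Below is a sketch (my addition) using geoR’s variofit; the initial partial sill, range, and nugget are eyeballed from the plot, not taken from the original analysis.

## Eyeball-fit an exponential variogram: ini.cov.pars = c(partial sill, range).
v.fit <- variofit(v, ini.cov.pars = c(15, 50), cov.model = "exponential",
    nugget = 2.5)
v.fit
lines(v.fit)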

Now let’s fit some spatial regression models using NNGP random effects.

n.samples <- 1000

starting <- list(phi = 3/50, sigma.sq = 15, tau.sq = 2.5)

tuning <- list(phi = 0.05, sigma.sq = 0.01, tau.sq = 0.01)

priors <- list(phi.Unif = c(3/1000, 3/10), sigma.sq.IG = c(2, 10), tau.sq.IG = c(2,
    5))

cov.model <- "exponential"
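
A note on the phi prior (my addition): for the exponential model, correlation drops to roughly 0.05 at distance 3/phi, so the Unif(3/1000, 3/10) prior corresponds to effective spatial ranges between about 10 m and 1000 m, consistent with the ~1.5 km maximum intersite distance noted above.

## Effective ranges (in meters) implied by the bounds of the phi prior.
3 / priors$phi.Unif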

## Response model
m.r <- spNNGP(CHM.mod[, 3] ~ 1, coords = CHM.mod[, 1:2], starting = starting, method = "response",
    n.neighbors = 10, tuning = tuning, priors = priors, cov.model = cov.model, n.samples = n.samples,
    n.omp.threads = 2, n.report = 100)
----------------------------------------
    Building the neighbor list
----------------------------------------
----------------------------------------
    Model description
----------------------------------------
NNGP Response model fit with 25000 observations.

Number of covariates 1 (including intercept if specified).

Using the exponential spatial correlation model.

Using 10 nearest neighbors.

Number of MCMC samples 1000.

Priors and hyperpriors:
    beta flat.
    sigma.sq IG hyperpriors shape=2.00000 and scale=10.00000
    tau.sq IG hyperpriors shape=2.00000 and scale=5.00000
    phi Unif hyperpriors a=0.00300 and b=0.30000

Source compiled with OpenMP support and model fit using 2 thread(s).
----------------------------------------
        Sampling
----------------------------------------
Sampled: 100 of 1000, 10.00%
Report interval Metrop. Acceptance rate: 39.00%
Overall Metrop. Acceptance rate: 39.00%
-------------------------------------------------
Sampled: 200 of 1000, 20.00%
Report interval Metrop. Acceptance rate: 40.00%
Overall Metrop. Acceptance rate: 39.50%
-------------------------------------------------
Sampled: 300 of 1000, 30.00%
Report interval Metrop. Acceptance rate: 35.00%
Overall Metrop. Acceptance rate: 38.00%
-------------------------------------------------
Sampled: 400 of 1000, 40.00%
Report interval Metrop. Acceptance rate: 37.00%
Overall Metrop. Acceptance rate: 37.75%
-------------------------------------------------
Sampled: 500 of 1000, 50.00%
Report interval Metrop. Acceptance rate: 47.00%
Overall Metrop. Acceptance rate: 39.60%
-------------------------------------------------
Sampled: 600 of 1000, 60.00%
Report interval Metrop. Acceptance rate: 41.00%
Overall Metrop. Acceptance rate: 39.83%
-------------------------------------------------
Sampled: 700 of 1000, 70.00%
Report interval Metrop. Acceptance rate: 46.00%
Overall Metrop. Acceptance rate: 40.71%
-------------------------------------------------
Sampled: 800 of 1000, 80.00%
Report interval Metrop. Acceptance rate: 45.00%
Overall Metrop. Acceptance rate: 41.25%
-------------------------------------------------
Sampled: 900 of 1000, 90.00%
Report interval Metrop. Acceptance rate: 45.00%
Overall Metrop. Acceptance rate: 41.67%
-------------------------------------------------
Sampled: 1000 of 1000, 100.00%
Report interval Metrop. Acceptance rate: 33.00%
Overall Metrop. Acceptance rate: 40.80%
-------------------------------------------------
round(summary(m.r$p.beta.samples)$quantiles[c(3, 1, 5)], 2)
  50%  2.5% 97.5% 
21.07 20.80 21.34 
round(summary(m.r$p.theta.samples)$quantiles[, c(3, 1, 5)], 3)
            50%   2.5%  97.5%
sigma.sq 15.422 14.390 16.377
tau.sq    2.250  1.992  2.445
phi       0.048  0.044  0.052
plot(m.r$p.beta.samples)

plot(m.r$p.theta.samples)

m.r$run.time
   user  system elapsed 
197.858   0.027  98.959 

Now prediction for the holdout set.

burn.in <- floor(0.5 * n.samples)

p.r <- predict(m.r, X.0 = as.matrix(rep(1, nrow(CHM.ho))), coords.0 = CHM.ho[, c("x",
    "y")], sub.sample = list(start = burn.in, thin = 2), n.report = 5000, n.omp.threads = 2)
----------------------------------------
    Prediction description
----------------------------------------
NNGP Response model fit with 25000 observations.

Number of covariates 1 (including intercept if specified).

Using the exponential spatial correlation model.

Using 10 nearest neighbors.

Number of MCMC samples 251.

Predicting at 10000 locations.


Source compiled with OpenMP support and model fit using 2 threads.
-------------------------------------------------
        Predicting
-------------------------------------------------
Location: 5000 of 10000, 50.00%
Location: 10000 of 10000, 100.00%
y.hat.r <- apply(p.r$p.y.0, 1, mean)

plot(CHM.ho[, 3], y.hat.r, main = "Response NNGP model", xlab = "True canopy height",
    ylab = "Posterior predictive distribution mean")