In this section we apply ordinary kriging to predict the spatial distribution of zinc in Lake Rangsdorf based on sediment samples taken during a field survey in 2017.


Data preparation

We start with our data set lake_data_sf, an object of class sf.

library(sf)
load(url("https://userpage.fu-berlin.de/soga/data/r-data/Lake_Rangsdorf.RData"))
str(lake_data_sf)
## Classes 'sf' and 'data.frame':   32 obs. of  23 variables:
##  $ ID             : chr  "10a" "11a" "12a" "13b" ...
##  $ depth_m        : num  0.68 0.66 1.93 3.98 4.95 0.93 3.92 8.4 5.15 4 ...
##  $ visual_depth_cm: int  34 28 15 15 18 20 18 18 15 15 ...
##  $ elec_cond_muS  : int  305 274 680 825 903 416 1060 1278 795 910 ...
##  $ pH             : num  7.19 7.33 7.25 7.22 7.05 7.31 6.82 7.02 6.99 7.01 ...
##  $ LOI550         : num  1.04 1.27 27.32 36.93 38.33 ...
##  $ LOI990         : num  0.2 0.19 17.46 18.25 15.5 ...
##  $ LOI            : num  98.5 98.3 33 21.6 26.4 ...
##  $ C              : num  0.85 0.93 19 24.4 23.7 1.79 24.7 23.1 22.8 25.4 ...
##  $ N              : num  NA NA 1.18 1.85 1.89 NA 1.94 1.82 1.75 2.09 ...
##  $ Ca_mg_g        : num  9.83 3.78 172.12 182.74 193.42 ...
##  $ Fe_mg_g        : num  0.52 0.42 4.73 8.58 10.12 ...
##  $ K_mg_g         : num  0.22 0.13 0.34 0.43 0.5 0.14 0.55 0.64 0.41 0.52 ...
##  $ Pb_mug_g       : num  1.37 1.81 18.49 34.19 31 ...
##  $ Mg_mg_g        : num  0.15 0.13 1.41 1.47 1.42 0.2 1.46 1.45 1.35 1.44 ...
##  $ Mn_mug_g       : num  15.4 12 481.6 454.4 546.7 ...
##  $ Na_mg_g        : num  0.26 0.22 0.54 0.7 0.88 0.32 0.74 0.72 0.63 0.82 ...
##  $ PO4_mg_g       : num  0.3 0.37 4.83 4.93 5.31 1.18 5.34 5.97 4.83 5.51 ...
##  $ S_mg_g         : num  0.77 0.43 8.24 17.18 20.03 ...
##  $ Sr_mug_g       : num  7.87 6.77 238.28 261.05 245.53 ...
##  $ Zi_mug_g       : num  11.5 17.4 84.2 94.9 98.5 ...
##  $ geometry       :sfc_POINT of length 32; first list element:  'XY' num  391829 5794052
##  $ dist           : num  27.54 6.85 10.09 68.47 93.63 ...
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:22] "ID" "depth_m" "visual_depth_cm" "elec_cond_muS" ...

Here, we are only interested in the Zi_mug_g variable. Hence we subset the data set. Note that we do not have to specify the geometry column to be included. After subsetting to one variable the object is still of class sf.

lake_zinc <- lake_data_sf[, "Zi_mug_g"]
str(lake_zinc)
## Classes 'sf' and 'data.frame':   32 obs. of  2 variables:
##  $ Zi_mug_g: num  11.5 17.4 84.2 94.9 98.5 ...
##  $ geometry:sfc_POINT of length 32; first list element:  'XY' num  391829 5794052
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA
##   ..- attr(*, "names")= chr "Zi_mug_g"

For future usage we cast the sf object into a spatial object of class Spatial.

library(sp)
lake_zinc_sp <- as(lake_zinc, "Spatial")

Sample variogram

For the sake of reproducibility we first define a formula object, that defines the dependent variable as a linear model of independent variable.

# formula object
f <- formula(Zi_mug_g ~ 1)

Now, we build the sample variogram using the variogram() function.

library(gstat)
ev <- variogram(f, lake_zinc_sp)
plot(ev)


Variogram modelling

We build a variogram model using the a self written best fit algorithm function using the gstat package, and plot it.

models <- c("Sph", "Exp", "Gau", "Ste")
fits <- lapply(models, function(m) {
  try(fit.variogram(ev, vgm(psill = 100, model = m, range = 1000, nugget = 10)), silent = TRUE)
})
fits <- Filter(function(x) !inherits(x, "try-error"), fits)
rss <- sapply(fits, attr, "SSErr")
vm_best <- fits[[which.min(rss)]]
plot(ev, model = vm_best)


Model evaluation

We evaluate the variogram model by applying leave-one-out cross-validation (LOOCV), using the krige.cv() command.

OK_LOOCV <- krige.cv(
  formula = f,
  locations = lake_zinc_sp,
  model = vm_best
)

We plot the model residuals using the bubble() function. We add the lake shore line to the plot. Note that we use the :: notation to make sure we apply the layer() function from the correct package namespace.

install.packages('latticeExtra')
## The following package(s) will be installed:
## - latticeExtra [0.6-30]
## These packages will be installed into "~/UNI/Tutorium/GitHub/soga/Soga-R/300/30900_geostatistics/renv/library/R-4.3/x86_64-w64-mingw32".
## 
## # Installing packages --------------------------------------------------------
## - Installing latticeExtra ...                   OK [linked from cache]
## Successfully installed 1 package in 38 milliseconds.
library(latticeExtra)
## Lade nötiges Paket: lattice
library(sp)
bubble(OK_LOOCV,
  "residual",
  main = "Ordinary Kriging"
)+
  latticeExtra::layer(sp.polygons(as(lake, "Spatial")))


Spatial prediction

Finally, we may use our model to predict the zinc concentration in Lake Rangsdorf. Note that we could apply the krige() function, a wrapper method around the gstat() and predict() functions. This time however, we use another handy function, the interpolate function from the raster package.

First we build a gstat object using the gstat() function.

g <- gstat(
  formula = f,
  data = lake_zinc_sp,
  model = vm_best
)

Then, we use the predict function, which accepts a SpatialPixelsDataFrame object and a model object.

library(gstat)
library(terra)
# Kriging-prediction
zinc_pred <- predict(g, newdata = lake_interpolation)
## [using ordinary kriging]
zinc<- rast(zinc_pred)

The resulting RasterLayer can be processed like any other raster data set. Hence we mask out the area outside the lake shoreline. Then we plot it using the generic plot() function.

# masking
zinc <- mask(zinc, vect(lake))

plot(zinc, main = "Zinc concentration [µg/g]")
plot(vect(shoreline), add = TRUE)
plot(vect(lake_zinc), add = TRUE, pch = 16, cex = 0.75)


Exercise: Now it is your turn. Model the concentration of lead (Pb_mug_g) in Lake Rangsdorf and visualize the residuals and results.

## Your code here...

# subset the data
lake_lead <- NULL
str(lake_lead)

# sp and formula abject
lake_lead_sp <- NULL
f <- NULL

# sample variogram
lead_ev <- NULL
plot(lead_ev)

# variogram model
lead_vm <- NULL
plot(lead_vm)

# model evaluation
lead_OK_LOOCV <- NULL

# interpolation
g <- NULL
lead <- NULL

# masking
lead <- NULL

# plotting
plot(lead,
  main = expression(paste("Lead concentration Lake Rangsdorf [", mu, "g/g]"))
)
plot(shoreline$geometry, add = TRUE)
plot(lake_lead$geometry, add = TRUE, pch = 16, cex = 0.75)
Show code
install.packages('latticeExtra')
library(lattice)
library(sf)
library(sp)
library(gstat)
library(raster)
library(latticeExtra)

# 1. Subset the data 
lake_lead <- lake_data_sf[, "Pb_mug_g"]
str(lake_lead)

# 2. sp object and formula
lake_lead_sp <- as(lake_lead, "Spatial")
f <- formula(Pb_mug_g ~ 1)

# 3. calculationg variogram
lead_ev <- variogram(f, lake_lead_sp)
plot(lead_ev)

#starting values
nugget_guess <- min(lead_ev$gamma)               
psill_guess <- max(lead_ev$gamma) - nugget_guess 
range_guess <- lead_ev$dist[which.max(lead_ev$gamma >= (max(lead_ev$gamma)*0.95))]

# 4. function for fitting models and calculationg residuals
fit_vgm_model <- function(model_name, variogram_data) {
 
  start_model <- vgm(psill = psill_guess, model = model_name, range = range_guess, nugget = nugget_guess)  
  fit <- fit.variogram(variogram_data, model = start_model)
  ssq <- attr(fit, "SSErr")
  list(fit_model = fit, error = ssq)
}
 

# 5. testing various models (w/o Matern)
model_names <- c("Sph", "Exp", "Gau")

fits <- lapply(model_names, fit_vgm_model, variogram_data = lead_ev)
errors <- sapply(fits, function(x) x$error)

best_idx <- which.min(errors)
best_fit <- fits[[best_idx]]$fit_model
best_model_name <- model_names[best_idx]

cat("Bestes Modell ohne Matern:", best_model_name, "\n")
plot(lead_ev, best_fit)

# 6. Matern-model with kappa-optimisation
kappas <- seq(0.1, 2, by=0.1)
errors_kappa <- numeric(length(kappas))
fits_kappa <- vector("list", length(kappas))

for(i in seq_along(kappas)) {
  mod <- vgm(psill = psill_guess, model = "Mat", range = range_guess, nugget = nugget_guess, kappa = kappas[i])
  fit <- fit.variogram(lead_ev, model = mod)
  errors_kappa[i] <- attr(fit, "SSErr")
  fits_kappa[[i]] <- fit
}

best_kappa_idx <- which.min(errors_kappa)
best_matern_fit <- fits_kappa[[best_kappa_idx]]
best_kappa <- kappas[best_kappa_idx]

cat("best Matern kappa:", best_kappa, "\n")
plot(lead_ev, best_matern_fit)

# 7. compare Matern-model with best fit
if(attr(best_matern_fit, "SSErr") < attr(best_fit, "SSErr")) {
  lead_vm <- best_matern_fit
  cat("Matern-model ist being used.\n")
} else {
  lead_vm <- best_fit
  cat("No better model found, remain with previous model.\n")
}

# 8. Modell-Evaluation mit krige.cv()
lead_OK_LOOCV <- krige.cv(
  formula = f,
  locations = lake_lead_sp,
  model = lead_vm
)

bubble(lead_OK_LOOCV,
  "residual",
  main = "Ordinary Kriging"
)+
  latticeExtra::layer(sp.polygons(as(lake, "Spatial")))

# 9. preparing interpolation
g <- gstat(
  formula = f,
  data = lake_lead_sp,
  model = lead_vm
)

lead <- interpolate(
  object = raster(lake_interpolation),
  model = g
)

# 10. Masking & Plot
lead <- mask(lead, as(lake, "Spatial"))

plot(lead,
  main = expression(paste("Lead concentration Lake Rangsdorf [", mu, "g/g]"))
)
plot(shoreline$geometry, add = TRUE)
plot(lake_lead$geometry, add = TRUE, pch = 16, cex = 0.75)



Citation

The E-Learning project SOGA-R was developed at the Department of Earth Sciences by Kai Hartmann, Joachim Krois and Annette Rudolph. You can reach us via mail by soga[at]zedat.fu-berlin.de.

Creative Commons License
You may use this project freely under the Creative Commons Attribution-ShareAlike 4.0 International License.

Please cite as follow: Hartmann, K., Krois, J., Rudolph, A. (2023): Statistics and Geodata Analysis using R (SOGA-R). Department of Earth Sciences, Freie Universitaet Berlin.