Collection of training data for remote sensing model building

Tutorial: EON Summer School 2024

Author

Paul Magdon, University of Applied Sciences and Arts (HAWK)

Published

August 27, 2024

Setup & Installation

library(sf)
library(RStoolbox)
library(terra)
library(ggplot2)
library(mapview)
library(kableExtra)
library(dplyr)
library(rprojroot)
library(patchwork)
library(rmarkdown)

Data

In this tutorial we will work with a Sentinel-2 scene from 18/06/2022 from the National Park, Harz. We will also use the boundary of the National Park to define our study area. Before we can start you may download the S2 Scene from the following link: S2-download. Place this file into the data folder of this tutorial.

# create a string containing the current working directory
wd=paste0(find_rstudio_root_file(),"/tdv_session/data/")

#Import the boundary of the n
np_boundary = st_transform(st_read(paste0(wd,"nlp-harz_aussengrenze.gpkg")),25832)

s2  <- terra::rast(paste0(wd,"S2B_MSIL2A_20220618T102559_N0400_R10_resampled_harz_np.tif"))

names(s2)<-c('blue','green','red','vnir1','vnir2','vnir3','nir1','nir2','swir')

s2 <-terra::mask(s2,np_boundary)

Anaylsing the spectral variablity within the study area

If we have no access to prior information on our target variable in the study area we can use the spectral variability as a proxy for the variability of the target variable. By using the spectral variability as a sampling criterion we also ensure, that we cover the spectral range with our sampling.

Dimension reduction (PCA)

In a fist step we reduce the dimensions of the 9 Sentinel-2 bands while maintaining most of the information, using a principal component analysis (PCA).

# Calculation of the principlal components using the RStoolbox
pca<-RStoolbox::rasterPCA(s2,nSamples = 5000, spca=TRUE )


# Extracting the first three components
rgb_raster <- subset(pca$map, 1:3)

# Function to scale the pixel values to 0-255
scale_fun <- function(x) {
  # Calculation of the 2% and 98% quantile
  q <- quantile(x, c(0.02, 0.98), na.rm = TRUE)
  
  # scaling the values
  x <- (x - q[1]) / (q[2] - q[1]) * 255
  
  # restrict the values to 0-255
  x <- pmin(pmax(x, 0), 255)
  
  return(x)
}

# Scaling of each band
for (i in 1:3) {
  rgb_raster[[i]] <- app(rgb_raster[[i]], scale_fun)
}

# Plot the first three principal components as RGB
plotRGB(rgb_raster, r = 1, g = 2, b = 3)

# Show importance of componentes
summary(pca$model)

From the output of the PCA we see that we can capture 92% of the variability with the first two components. Thus we will only use the PC1 and PC2 for the subsequent analysis.

Unsupervised clustering

In the next step we run an unsupervised classification of the PC1 and PC2 to get a clustered map. For the unsupervised classification we need to take a decision on the number of classes/clusters to be created. Here we will take \(n=5\) classes. However, depending on the target variable this value need to be adjusted.

set.seed(2222)
cluster <- RStoolbox::unsuperClass(pca$map[[c('PC1','PC2')]], nSamples = 100, nClasses = 5, nStarts = 5)


## Plots
colors <- rainbow(5)
plot(cluster$map, col = colors, legend = TRUE, axes = TRUE, box =TRUE)

The map shows a clear spatial patterns related to the elevation, tree species and vitality status of the Nationalpark forests.

Create a stratified sample

In the next step we take a stratified random sample with \(n=10\) points from each of the 5 spectral classes.

# Draw a stratified random sample from the raster
samples <- terra::spatSample(cluster$map, size=10, method = "stratified",
                             na.rm = TRUE, xy = TRUE, cells = TRUE)
# convert to sf object
sf_samples <- sf::st_as_sf(samples, coords = c("x", "y"),crs = 25832)

# convert the classes to factors
sf_samples$class_unsupervised <- as.factor(sf_samples$class_unsupervised)

# Show map using ggplot
ggplot()+geom_sf(data = np_boundary,fill=NA)+
geom_sf(data = sf_samples,aes(color = as.factor(class_unsupervised)), size = 0.5) +
  scale_color_manual(values = rainbow(5), name = "Class Unsupervised") +
  ggtitle("Stratified Random Sample for Training") +
  theme_minimal()+
  coord_sf(crs = st_crs(25832))

We can now print the sample plot list as following:

kableExtra::kable(samples[c('x','y','class_unsupervised')], caption='Training plot list') %>%
  kable_styling(fixed_thead = T) %>% scroll_box(height = "400px")

Implement a plot design

# Create a training data set by extracting the mean value of all pixels touching
# a buffered area with 13m around the plot center
plots <- sf::st_buffer(sf_samples,dist = 13)
train<-terra::extract(s2,plots,fun='mean',bind=FALSE,na.rm=TRUE)

plots <- plots %>% mutate(ID=row_number())
train <- plots %>% left_join(train, by= "ID")
mapview::mapview(train, zcol="class_unsupervised",
        map.types = c("Esri.WorldShadedRelief", "OpenStreetMap.DE"))+
  mapview(np_boundary,alpha.regions = 0.2, aplha = 1)

Compare the pixel value range between the sample and the image

image.sample <- terra::spatSample(s2, size = 100000, method = "random", as.df = TRUE)
image.sample$group<-'image'

train.df<- train[,names(s2)]
train.df <- sf::st_drop_geometry(train.df)
train.df$group<-'train'

df <- rbind(image.sample,train.df)

blue <-ggplot(df, aes(blue,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

green <-ggplot(df, aes(green,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

nir1<-ggplot(df, aes(nir1,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

swir<-ggplot(df, aes(swir,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

blue+green+nir1+swir+plot_layout(ncol=2)