Cerdocyon thous Data Preparation

Author

Florencia Grattarola

Published

October 1, 2024

Data preparation for modelling for Cerdocyon thous, with the covariates bio_16, bio_19, npp, and tree (selected in the previous step).

library(knitr)
library(lubridate)
library(tmap)
library(leaflet)
library(patchwork)
library(terra)
library(sf)
sf::sf_use_s2(FALSE)
library(tidyverse)
equalareaCRS <-  '+proj=laea +lon_0=-73.125 +lat_0=0 +datum=WGS84 +units=m +no_defs'
latam <- st_read('data/latam.gpkg', layer = 'latam', quiet = T)
countries <- st_read('data/latam.gpkg', layer = 'countries', quiet = T)
latam_land <- st_read('data/latam.gpkg', layer = 'latam_land', quiet = T)
latam_raster <- rast('data/latam_raster.tif', lyrs='latam')
latam_countries <- rast('data/latam_raster.tif', lyrs='countries')

# IUCN range distribution
cthous_IUCN <- sf::st_read('big_data/cthous_IUCN.shp', quiet = T) %>% sf::st_transform(crs=equalareaCRS)

MAP.IUCN <- tm_graticules(alpha = 0.3) +
    tm_shape(countries) +
    tm_fill(col='grey90') +
    tm_borders(col='grey60', alpha = 0.4) +
    tm_shape(cthous_IUCN) +
    tm_fill(col='grey20', alpha = 0.4) +
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

MAP.IUCN

Data

Species data (PA & PO)

# presence-only rasters
PO_cthous_time1_raster <- terra::rast('data/species_POPA_data/PO_cthous_time1_raster.tif')
PO_cthous_time2_raster <- terra::rast('data/species_POPA_data/PO_cthous_time2_raster.tif')

# blobs
PA_cthous_time1_blobs <- readRDS('data/species_POPA_data/PA_cthous_time1_blobs.rds')
PA_cthous_time2_blobs <- readRDS('data/species_POPA_data/PA_cthous_time2_blobs.rds')

Covariates

The following covariates were chosen using the ‘variableSelection’ script:

  • bio_16: Temperature Annual Range
  • bio_19: Mean Temperature of Warmest Quarter
  • npp: Net Primary Production (tree)
  • tree: Percentage of No Tree Cover
Code
bio <- rast(here::here('big_data/bio_high.tif'))

npp <- rast(here::here('big_data/npp_high.tif'))
npp <- resample(npp, bio, method='bilinear') 

|---------|---------|---------|---------|
=========================================
                                          
Code
names(npp) <- 'npp'
tree <- rast(here::here('big_data/tree_high.tif'))
tree <- resample(tree, bio, method='bilinear') 

|---------|---------|---------|---------|
=========================================
                                          
Code
names(tree) <- 'tree'

# environmental layers for PA
env <- c(bio[[c('bio_16','bio_19')]], npp, tree) %>% 
  classify(cbind(NaN, NA))

|---------|---------|---------|---------|
=========================================
                                          
Code
# resample and scaled values for PO
env_100k_unscaled <- terra::resample(env, latam_raster, method='bilinear') %>% 
  classify(cbind(NaN, NA))

env_100k_resampled <- env_100k_unscaled %>% scale() # scaled values to 0 mean and sd of 1

Plots

npp.plot <- tm_graticules(alpha = 0.3) +  
    tm_shape(env_100k_unscaled[["npp"]]) +
    tm_raster(palette = 'Greens', midpoint = NA, style= "cont", n=20)  + 
    tm_shape(latam) +
    tm_borders(alpha = 0.3) + 
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

tree.plot <- tm_graticules(alpha = 0.3) +  
    tm_shape(env_100k_unscaled[["tree"]]) +
    tm_raster(palette = 'BuGn', midpoint = NA, style= "cont")  + 
    tm_shape(latam) +
    tm_borders(alpha = 0.3) + 
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

bio_16.plot <- tm_graticules(alpha = 0.3) +  
    tm_shape(env_100k_unscaled[["bio_16"]]) +
    tm_raster(palette = 'PuBu', midpoint = NA, style= "cont")  + 
    tm_shape(latam) +
    tm_borders(alpha = 0.3) + 
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

bio_19.plot <- tm_graticules(alpha = 0.3) +  
    tm_shape(env_100k_unscaled[["bio_19"]]) +
    tm_raster(palette = 'YlOrBr', midpoint = NA, style= "cont")  + 
    tm_shape(latam) +
    tm_borders(alpha = 0.3) + 
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

npp.plot 

tree.plot

bio_16.plot

bio_19.plot

Thinning parametres

  • acce: Accessibility
  • country: Country of origin
Code
acce_100k_resampled <- terra::rast('big_data/acce.tif') %>% 
  classify(cbind(NaN, NA)) # values are scaled

Plot

acce.plot <- tm_graticules(alpha = 0.3) +  
    tm_shape(acce_100k_resampled[["acce"]]) +
    tm_raster(palette = 'RdYlBu', midpoint = NA, style= "cont", n=10)  + 
    tm_shape(latam) +
    tm_borders(alpha = 0.3) + 
    tm_layout(legend.outside = T, frame.lwd = 0.3, scale=1.2, legend.outside.size = 0.1)

acce.plot

Data for the model

Presence-only data

Code
# calculate area in each raster cell
PO_cthous.area <- terra::cellSize(PO_cthous_time1_raster[[1]], unit='m', transform=F) %>% 
  classify(cbind(NaN, NA)) %>% terra::values()

# calculate coordinates for each raster cell
PO_cthous.coords <- terra::xyFromCell(PO_cthous_time1_raster, cell=1:ncell(PO_cthous_time1_raster)) %>% as.data.frame()
names(PO_cthous.coords) <- c('X', 'Y')

# number of cells in the rasters (both the same)
PO_cthous.cell <- 1:ncell(PO_cthous_time1_raster)

# with the environmental variables values for each cell
PO_cthous.env <- env_100k_resampled[] # scaled values

# with the mean accessibility value for each cell
PO_cthous.acce <- acce_100k_resampled # scaled values

# calculate point count in each raster cell
PO_cthous_time1.count <- PO_cthous_time1_raster %>%
    classify(cbind(NaN, NA)) %>% terra::values() %>% as_tibble %>% 
  dplyr::select(count)

PO_cthous_time2.count <- PO_cthous_time2_raster %>%
    classify(cbind(NaN, NA)) %>% terra::values() %>% as_tibble %>% 
  dplyr::select(count)

PO_cthous.countries <- latam_countries %>% classify(cbind(NaN, NA))

# save time1 and time2 datasets 
PO_cthous_time1.model.data <- data.frame(PO_cthous.coords,
                                            pixel = PO_cthous.cell[],
                                            area = PO_cthous.area,
                                            pt.count = PO_cthous_time1.count,
                                            env = PO_cthous.env,
                                            acce = PO_cthous.acce[], 
                                            country = PO_cthous.countries[])

PO_cthous_time2.model.data <- data.frame(PO_cthous.coords,
                                            pixel = PO_cthous.cell[],
                                            area = PO_cthous.area,
                                            pt.count = PO_cthous_time2.count,
                                            env = PO_cthous.env,
                                            acce = PO_cthous.acce[], 
                                            country = PO_cthous.countries[])
  • Check data
Code
head(PO_cthous_time1.model.data) %>% kable()
X Y pixel area count env.bio_16 env.bio_19 env.npp env.tree acce countries
-4357686 3867905 1 1e+10 NA NA NA NA NA NA NA
-4257686 3867905 2 1e+10 NA NA NA NA NA NA NA
-4157686 3867905 3 1e+10 0 -0.9146447 0.2152231 -1.194929 -1.055997 0.0115694 47
-4057686 3867905 4 1e+10 0 -0.9552867 0.6444218 -1.597718 -1.303418 0.0209173 47
-3957686 3867905 5 1e+10 NA -0.7612649 0.8823676 -1.786366 -1.391160 0.0270081 NA
-3857686 3867905 6 1e+10 NA -0.8330892 0.8484078 -1.858230 -1.407916 0.0456958 NA
Code
head(PO_cthous_time2.model.data) %>% kable()
X Y pixel area count env.bio_16 env.bio_19 env.npp env.tree acce countries
-4357686 3867905 1 1e+10 NA NA NA NA NA NA NA
-4257686 3867905 2 1e+10 NA NA NA NA NA NA NA
-4157686 3867905 3 1e+10 0 -0.9146447 0.2152231 -1.194929 -1.055997 0.0115694 47
-4057686 3867905 4 1e+10 0 -0.9552867 0.6444218 -1.597718 -1.303418 0.0209173 47
-3957686 3867905 5 1e+10 NA -0.7612649 0.8823676 -1.786366 -1.391160 0.0270081 NA
-3857686 3867905 6 1e+10 NA -0.8330892 0.8484078 -1.858230 -1.407916 0.0456958 NA

Presence-absence data

Code
# calculate area, coordinates, and point count in each raster cell
PA_cthous.area.time1 <- as.numeric(PA_cthous_time1_blobs$blobArea) 
PA_cthous.area.time2 <- as.numeric(PA_cthous_time2_blobs$blobArea) 

PA_cthous.coords.time1 <- st_coordinates(st_centroid(PA_cthous_time1_blobs)) %>% as_tibble()
PA_cthous.coords.time2 <- st_coordinates(st_centroid(PA_cthous_time2_blobs)) %>% as_tibble()

PA_cthous.pixel.time1 <- terra::cells(PO_cthous_time1_raster, vect(st_centroid(PA_cthous_time1_blobs))) 
PA_cthous.pixel.time2 <- terra::cells(PO_cthous_time2_raster, vect(st_centroid(PA_cthous_time2_blobs))) 

# mode to calculate the most frequent value for the blob - this needs to be changed to getting the %are for each landcover
mode_raster  <- function(x, na.rm = FALSE) {
  if(na.rm){ x = x[!is.na(x)]}
  ux <- unique(x)
  return(ux[which.max(tabulate(match(x, ux)))])
}

PA_cthous.env.time1 <- terra::extract(x = env, 
                                      y = vect(PA_cthous_time1_blobs), 
                                      fun = mean, na.rm=TRUE) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) %>% scale()

PA_cthous.env.time2 <- terra::extract(x = env, 
                                      y = vect(PA_cthous_time2_blobs), 
                                      fun = mean, na.rm=TRUE) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) %>% scale()

PA_cthous_time1.model.data <- data.frame(pixel = PA_cthous.pixel.time1, 
                                            PA_cthous.coords.time1,
                                            area = PA_cthous.area.time1,
                                            presabs = PA_cthous_time1_blobs$presence,
                                            effort = PA_cthous_time1_blobs$effort,
                                            env = PA_cthous.env.time1[,2:5])

PA_cthous_time2.model.data <- data.frame(pixel = PA_cthous.pixel.time2, 
                                            PA_cthous.coords.time2,
                                            area = PA_cthous.area.time2,
                                            presabs = PA_cthous_time2_blobs$presence,
                                            effort = PA_cthous_time2_blobs$effort,
                                            env = PA_cthous.env.time2[,2:5])
  • Check data
Code
head(PA_cthous_time1.model.data) %>% kable()
pixel.ID pixel.cell X Y area presabs effort env.bio_16 env.bio_19 env.npp env.tree
1 1661 -1768526 1918184 241873943 0 1043 0.5372054 0.7938779 0.8189866 0.4985362
2 1660 -1844290 1927648 537081878 0 2404 0.8588915 1.1876186 0.3621302 0.2921552
3 1661 -1731415 1940145 2241827859 0 11078 0.5154249 0.7879039 0.8676071 0.4185537
4 1660 -1820735 1948513 162791738 0 1291 0.6693835 1.0099642 0.6329681 0.4079050
5 1661 -1782702 1959695 756798182 0 1779 0.4230620 0.7830680 0.7204713 0.4614976
6 1567 -2552096 2041811 19990863 0 3300 -0.3871083 0.2393677 -0.0070323 -1.0809188
Code
head(PA_cthous_time2.model.data) %>% kable()
pixel.ID pixel.cell X Y area presabs effort env.bio_16 env.bio_19 env.npp env.tree
1 3734 -817257.8 -439796.5 2.229446e+09 0 11095 0.8846129 0.6549251 -0.7710085 -0.5731781
2 2527 -1139813.8 954656.5 1.542295e+03 0 12276 1.4883146 1.4538964 -0.2779545 1.1800694
3 2527 -1168048.8 958569.1 1.049520e-01 0 1585 1.4579390 1.4145635 -0.5703401 1.6631377
4 2527 -1116769.5 965197.0 7.494574e+01 0 1440 1.1933258 1.1143948 -0.0384092 1.4473626
5 2527 -1108298.2 967725.2 2.546836e+01 0 400 1.5573632 1.4691857 0.0808761 1.2961172
6 2528 -1086336.4 1003862.9 2.398904e-01 0 7164 -0.5632262 -0.9662805 0.5782478 1.0532113

Save data

saveRDS(PO_cthous_time1.model.data, 'data/species_POPA_data/data_cthous_PO_time1.rds')
saveRDS(PO_cthous_time2.model.data, 'data/species_POPA_data/data_cthous_PO_time2.rds')
saveRDS(PA_cthous_time1.model.data, 'data/species_POPA_data/data_cthous_PA_time1.rds')
saveRDS(PA_cthous_time2.model.data, 'data/species_POPA_data/data_cthous_PA_time2.rds')

Extra: Datasets in the covariate space

The range (and interquartile range) of values for each covariate that is sampled by each dataset

Code
getRangeIQR <- function(env_PX){
  env_PX.df <- tibble(env= character(),
                          range = numeric(),
                     iqr = numeric(),
                     stringsAsFactors=FALSE)
  
  for (i in 1:ncol(env_PX)){
    df <- tibble(env = names(env_PX[i]),
             range = range(env_PX[[i]], na.rm=T)[2],
             iqr=  IQR(env_PA[[i]], na.rm = T))
    env_PX.df <- rbind(env_PX.df, df)
  }
  return(env_PX.df)
}

# covariates for PA
env_PA <- terra::extract(x = env, 
                         y = rbind(vect(PA_cthous_time1_blobs), 
                                   vect(PA_cthous_time2_blobs)), 
                                     fun = mean, na.rm=TRUE) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.nan(.), NA, .))) 

summary(env_PA[-1])
     bio_16           bio_19             npp             tree         
 Min.   :-9.938   Min.   : 0.8405   Min.   :    0   Min.   :  0.2019  
 1st Qu.: 9.595   1st Qu.:17.0648   1st Qu.: 6963   1st Qu.: 19.7279  
 Median :12.088   Median :20.2466   Median : 9498   Median : 37.9586  
 Mean   :13.080   Mean   :20.3751   Mean   : 9781   Mean   : 44.8148  
 3rd Qu.:16.787   3rd Qu.:23.8671   3rd Qu.:12746   3rd Qu.: 65.9958  
 Max.   :22.600   Max.   :29.0108   Max.   :21085   Max.   :200.0000  
Code
env_PA.RangeIQR <- getRangeIQR(env_PA[-1]) %>% mutate(data='PA')

for (i in 2:ncol(env_PA)){
    range <- range(env_PA[[i]], na.rm=T)[2]
    iqr <- IQR(env_PA[[i]], na.rm = T)
    print(str_glue('PA {names(env_PA[i])} -> range: {range} - IQR: {iqr}'))
}
PA bio_16 -> range: 22.6000003814697 - IQR: 7.19196185126157
PA bio_19 -> range: 29.01082732813 - IQR: 6.80222331451684
PA npp -> range: 21085.2897600446 - IQR: 5783.17029000419
PA tree -> range: 200 - IQR: 46.2679435287136
Code
# covariates layers for PO
env_PO <- terra::resample(env, latam_raster, method='bilinear') %>% 
  classify(cbind(NaN, NA)) %>% as.data.frame()

summary(env_PO)
     bio_16           bio_19             npp                 tree         
 Min.   :-9.787   Min.   : 0.8727   Min.   :    1.935   Min.   :  0.1423  
 1st Qu.: 5.157   1st Qu.:16.0414   1st Qu.: 4567.937   1st Qu.: 14.4756  
 Median :14.076   Median :22.3359   Median : 8232.417   Median : 29.7706  
 Mean   :11.862   Mean   :20.1836   Mean   : 7782.353   Mean   : 35.7296  
 3rd Qu.:18.966   3rd Qu.:25.5338   3rd Qu.:10876.451   3rd Qu.: 57.8680  
 Max.   :22.770   Max.   :28.7862   Max.   :17393.445   Max.   :111.9899  
Code
env_PO.RangeIQR <- getRangeIQR(env_PO) %>% mutate(data='PO')

for (i in 1:ncol(env_PO)){
    range <- range(env_PO[[i]], na.rm=T)[2]
    iqr <- IQR(env_PO[[i]], na.rm = T)
    print(str_glue('PO {names(env_PO[i])} -> range: {range} - IQR: {iqr}'))
}
PO bio_16 -> range: 22.7698287963867 - IQR: 13.8090486526489
PO bio_19 -> range: 28.7862033843994 - IQR: 9.49241542816162
PO npp -> range: 17393.4453125 - IQR: 6308.51477050781
PO tree -> range: 111.989860534668 - IQR: 43.3923239707947