library(countrycode)
library(janitor)
library(patchwork)
library(knitr)
library(measurements)
library(lubridate)
library(terra)
library(sf)
::sf_use_s2(FALSE)
sflibrary(tidyverse)
Species data
Data cleaning for presence-absence and presence-only records.
- R Libraries
- Basemaps
<- '+proj=laea +lon_0=-73.125 +lat_0=0 +datum=WGS84 +units=m +no_defs'
equalareaCRS
<- st_read('data/latam.gpkg', layer = 'latam', quiet = T)
latam <- st_read('data/latam.gpkg', layer = 'countries', quiet = T)
countries <- st_read('data/latam.gpkg', layer = 'latam_land', quiet = T)
latam_land <- rast('data/latam_raster.tif', lyrs='latam')
latam_raster <- rast('data/latam_raster.tif', lyrs='countries') latam_countries
- Species list
<- read_csv('data/carnivores_list.csv') carnivores_list
Data
Presence/absence data
Terms standardised:
- recordID
- species
- temporalSpan
- yearEnd
- decimalLatitude
- decimalLongitude
- area_m2
- countryCode
- stateProvince
- locality
- presence
- abundance
- effort
- effortUnit
- dataset
<- read_csv('data/raw/OWN_literature_extraction.csv')
pointsAndSurveys_Literature
<- pointsAndSurveys_Literature %>%
data_PA_Ab_1 ::clean_names('lower_camel') %>%
janitorfilter(speciesLatin %in% carnivores_list$sp) %>%
mutate(hemisphere_lat=ifelse(grepl('S', latitude), 'S', ifelse(grepl('N', latitude), 'N', NA))) %>%
mutate(hemisphere_lon=ifelse(grepl('W', longitude), 'W', ifelse(grepl('E', longitude), 'E', NA))) %>%
mutate(latitude = str_squish(str_replace_all(latitude, '°|’|S|W|E|N', ' '))) %>%
mutate(longitude = str_squish(str_replace_all(longitude, '°|’|S|W|E|N', ' '))) %>%
rowwise() %>%
mutate(latitude = ifelse(latLonUnit=='deg', conv_unit(latitude, from = 'deg_dec_min', to = 'dec_deg'), latitude)) %>%
mutate(longitude = ifelse(latLonUnit=='deg', conv_unit(longitude, from = 'deg_dec_min', to = 'dec_deg'), longitude)) %>%
ungroup() %>%
mutate(latitude=ifelse(latLonUnit=='deg' & hemisphere_lat=='S', as.numeric(latitude)*-1, as.numeric(latitude))) %>%
mutate(longitude=ifelse(latLonUnit=='deg' & hemisphere_lon=='W', as.numeric(longitude)*-1, as.numeric(longitude))) %>%
mutate(dateStart=dmy(dateStart), dateEnd=dmy(dateEnd)) %>%
filter(!(year(dateEnd)>=2014 & year(dateStart)<2014)) %>%
mutate(area_m2=ifelse(areaUnit=='ha', area*10000, area*1e+6)) %>%
mutate(countryCode=countrycode::countrycode(countryCode,
origin = 'country.name',
destination = 'iso2c')) %>%
mutate(temporalSpan=ifelse(temporalSpan!='G', dateEnd-dateStart, temporalSpan)) %>%
mutate(effort=ifelse(effortUnit=='camera trap hours', effort/24, effort)) %>%
mutate(effortUnit='days') %>%
mutate(independentLocation=str_c(latitude,':',longitude)) %>%
mutate(independentYearSpan=str_c(year(dateStart),':',year(dateEnd))) %>%
mutate(dataset='pointsAndSurveys_Literature') %>%
select(recordID=recordId, date_start=dateStart, date_end=dateEnd,
species=speciesLatin,
temporalSpan,
independentLocation,
independentYearSpan,decimalLatitude=latitude,
decimalLongitude=longitude,
area_m2=area,
countryCode,
stateProvince,
locality,
presence,# not all of the records have abundance data
abundance,
effort,
effortUnit,
dataset)
<-
pointsAndSurveys_DataPaper read_csv('data/raw/NEOTROPICAL_CARNIVORES_DATASET_2020-04.csv',
guess_max = 90000) # Neotropical carnivores Data Paper
<-
data_PA_Ab_2 %>%
pointsAndSurveys_DataPaper # filter the records to include only the Neotropical Mammals
filter(SPECIES %in% carnivores_list$sp) %>%
# filter only the records taken with camera traps -> assumption: these will reflect presence-absece data
filter(METHOD=='Camera trap') %>%
# filter records with no coordinates
filter(!is.na(LAT_Y)) %>%
# filter occurrences that don't have month and year when they were recorded
filter(!is.na(COL_END_YR)&!is.na(COL_START_MO)&!is.na(COL_END_MO)&!is.na(COL_START_YR)) %>%
# one area has a range instead of a value, I took the mean of the range as area
mutate(AREA_HA=ifelse(AREA_HA=='700-1000', mean(c(700, 1000)), AREA_HA)) %>%
# filter records with no area of study recorded / or with sampling level area and precision recorded
filter(!is.na(AREA_HA) | (is.na(AREA_HA) & !is.na(PRECISION_m) & SAMPLING_LEVEL=='AREA')) %>%
mutate(area_ha=as.numeric(AREA_HA),
area_m2=ifelse(!is.na(AREA_HA),
*10000, # 1 hectare are 10000 meters / will be used for buffers
area_ha%>%
PRECISION_m)) # to have the same values as the presence-only I kept country codes
mutate(countryCode=countrycode::countrycode(COUNTRY,
origin = 'country.name',
destination = 'iso2c')) %>%
# since the dataset has no day, dates were assumed to start and end the first day of the month
mutate(date_start=lubridate::dmy(str_c('1', COL_START_MO, COL_START_YR))) %>%
mutate(date_end=lubridate::dmy(str_c('1', COL_END_MO, COL_END_YR))) %>%
# the temporal span was calculated -> there are errors in some dates, and the temporal span is negative
mutate(temporalSpan=ifelse(date_end-date_start<0, date_start-date_end, date_end-date_start)) %>%
filter(!(year(date_end)>=2014 & year(date_start)<2014)) %>%
# decimal places for some records corrected
mutate(EFFORT=ifelse(grepl('\\.', EFFORT) & (temporalSpan>=31 | temporalSpan>=30),
str_remove(EFFORT, '\\.' ), EFFORT)) %>%
# the sampling effort was standardized to days
mutate(samplingEffort=ifelse(grepl('hours', EFFORT_UNIT, ignore.case = T), as.numeric(EFFORT)/24,
ifelse(grepl('days', EFFORT_UNIT, ignore.case = T), as.numeric(EFFORT), NA))) %>%
# filter those without sampling effort (no data was provided and it cannot be derived)
filter(!is.na(samplingEffort)) %>%
mutate(effortUnit='days') %>%
# calculate the number of camera traps needed
mutate(numCameras=floor(samplingEffort/temporalSpan)) %>%
# filter unreliable values
mutate(temporalSpan=ifelse(numCameras==Inf & temporalSpan==0 & samplingEffort>31, NA,
ifelse(numCameras==Inf & temporalSpan==0 & samplingEffort<=31,
floor(as.numeric(EFFORT)), temporalSpan))) %>%
filter(!is.na(temporalSpan)) %>%
mutate(independentLocation=str_c(LAT_Y,':',LONG_X)) %>%
mutate(independentYearSpan=str_c(COL_START_YR,':',COL_END_YR)) %>%
::select(recordID=ID,
dplyr
date_start, date_end,species=SPECIES,
temporalSpan,
independentLocation,
independentYearSpan,decimalLatitude=LAT_Y,
decimalLongitude=LONG_X,
area_m2,
countryCode,stateProvince=STATE,
locality=SITE,
presence=OCCUR,
abundance=N_RECORDS, # not all of the records have abundance data
effort=samplingEffort,
effortUnit,dataset=DATASET)%>%
mutate(dataset='pointsAndSurveys_DataPaper') # in total 34,389 records
# Generate 0 (zeros) for locations where the species hasn't been recorded
# this can very probably be done in a more efficient way, but this one works!
<- bind_rows(data_PA_Ab_1, data_PA_Ab_2) %>%
data_PA_Ab_0 pivot_wider(names_from = species,
values_from = c(presence, abundance, effort),
values_fill = c(list(presence=0),
list(abundance=0),
list(effort=0))) %>%
pivot_longer(cols=starts_with(c('presence_', 'abundance_', 'effort_')),
names_to=c('metric', 'species'),
names_sep='_') %>%
pivot_wider(names_from = metric,
values_from = value,
values_fill = c(list(value=0))) %>%
distinct(species, independentYearSpan, independentLocation, presence, .keep_all = T) %>%
group_by(species, independentYearSpan, independentLocation) %>%
mutate(presence=sum(presence, na.rm = T),
abundance=sum(abundance, na.rm = T),
effort=max(effort, na.rm = T)) %>%
distinct(species, independentYearSpan, independentLocation, .keep_all = T) %>%
ungroup() %>% group_by(independentLocation, independentYearSpan) %>%
mutate(effort=max(effort, na.rm = T)) %>%
ungroup()
Check the data
Code
%>%
data_PA_Ab_0 mutate(period=ifelse(year(date_end)<2014, 'time1', 'time2')) %>%
filter(species %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis')) %>%
filter(presence==1) %>%
group_by(species, period) %>% count() %>%
pivot_wider(names_from = species, values_from = n, values_fill = 0) %>%
::adorn_totals() %>% kable() janitor
period | Cerdocyon thous | Chrysocyon brachyurus | Eira barbara | Herpailurus yagouaroundi | Leopardus pardalis | Leopardus wiedii | Nasua nasua | Pteronura brasiliensis |
---|---|---|---|---|---|---|---|---|
time1 | 886 | 174 | 819 | 290 | 1584 | 393 | 878 | 15 |
time2 | 1106 | 212 | 826 | 312 | 1379 | 327 | 1029 | 6 |
Total | 1992 | 386 | 1645 | 602 | 2963 | 720 | 1907 | 21 |
Code
bind_rows(data_PA_Ab_1, data_PA_Ab_2) %>%
mutate(period=ifelse(year(date_end)<2014, 'time1', 'time2')) %>%
filter(species %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis')) %>%
filter(presence==1) %>%
::st_as_sf(coords=c('decimalLongitude', 'decimalLatitude')) %>%
sf::st_set_crs(4326) %>%
sfggplot() +
geom_sf(data = latam, fill='white', size=0.2) +
geom_sf(aes(col=year(date_end))) +
facet_grid(species~period) +
scale_color_distiller(palette = 'Greens', direction = 1) +
labs(col='Year') +
theme_bw()
Presence-only data
Terms standardised:
- recordID,
- species,
- year,
- eventDate,
- decimalLatitude,
- decimalLongitude,
- countryCode, - stateProvince,
- locality,
- dataset
# The data download and cleaning can be found in the GBIF_download.R script
<- read_csv('data/raw/0240776-230224095556074_CLEAN.csv',
occurrencePoints_GBIF guess_max = 130000) # GBIF data (cleaned)
<- occurrencePoints_GBIF %>%
data_PO_1 # filter only presences
filter(occurrenceStatus == 'PRESENT') %>%
# rename the species yaguarundi and filter our species of interest
mutate(species=ifelse(species=='Puma yagouaroundi', 'Herpailurus yagouaroundi', species)) %>%
filter(species %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis')) %>%
# filter records from unrealistic or unwanted) locations: USA, Denmark, Belgium and France
filter(!countryCode %in% c('US', 'DK', 'BE', 'FR')) %>%
# filter occurrences that don't have the year when they were recorded
filter(!is.na(year) & year>=2000 & year<=2020) %>%
# transform eventdate to class datetime
mutate(eventDate=lubridate::ymd_hms(eventDate)) %>%
# remove duplicates according to species, date, latitude and longitude
group_by(species, eventDate, decimalLatitude, decimalLongitude) %>%
slice_head(n=1) %>% ungroup() %>%
# select columns of interest
::select(recordID=gbifID,
dplyr
species,
year, eventDate,
decimalLatitude,
decimalLongitude,
countryCode,
stateProvince,%>%
locality) mutate(dataset='occurrencePoints_GBIF')
<- pointsAndSurveys_DataPaper %>%
data_PO_2 filter(SPECIES %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis')) %>%
filter(!is.na(COL_END_YR)&!is.na(COL_START_MO)&!is.na(COL_END_MO)&!is.na(COL_START_YR)) %>%
filter(DATA_TYPE=='Count data') %>%
filter(METHOD %in% c('Opportunistic', 'Line transect', 'Active searching', 'Roadkill', 'Museum')) %>%
mutate(countryCode=countrycode::countrycode(COUNTRY,
origin = 'country.name',
destination = 'iso2c')) %>%
mutate(date_start=lubridate::dmy(str_c('1', COL_START_MO, COL_START_YR))) %>%
mutate(date_end=lubridate::dmy(str_c('1', COL_END_MO, COL_END_YR))) %>%
filter(date_start==date_end) %>% mutate(year=year(date_start)) %>%
filter(!is.na(date_end) & year(date_start)>=2000 & year(date_end)<=2020) %>%
group_by(SPECIES, date_start, LAT_Y, LONG_X) %>%
slice_head(n=1) %>% ungroup() %>%
::select(recordID=ID,
dplyrspecies=SPECIES,
eventDate=date_start,
year, decimalLatitude=LAT_Y,
decimalLongitude=LONG_X,
countryCode,stateProvince=STATE,
locality=SITE) %>%
mutate(dataset='occurrencePoints_NeotropCarnivores')
<- bind_rows(data_PO_1, data_PO_2) %>%
data_PO_0 filter(species %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis'))
Check the data
Code
bind_rows(data_PO_1, data_PO_2) %>%
mutate(period=ifelse(year(eventDate)<2014, 'time1', 'time2')) %>%
filter(species %in% c('Herpailurus yagouaroundi', 'Leopardus pardalis',
'Cerdocyon thous', 'Chrysocyon brachyurus',
'Eira barbara', 'Leopardus wiedii',
'Nasua nasua', 'Pteronura brasiliensis')) %>%
group_by(species, period) %>% count() %>%
pivot_wider(names_from = species, values_from = n, values_fill = 0) %>%
::adorn_totals()%>% kable() janitor
period | Cerdocyon thous | Chrysocyon brachyurus | Eira barbara | Herpailurus yagouaroundi | Leopardus pardalis | Leopardus wiedii | Nasua nasua | Pteronura brasiliensis |
---|---|---|---|---|---|---|---|---|
time1 | 1112 | 335 | 294 | 216 | 378 | 106 | 469 | 103 |
time2 | 1891 | 140 | 1446 | 588 | 2212 | 448 | 1515 | 96 |
Total | 3003 | 475 | 1740 | 804 | 2590 | 554 | 1984 | 199 |
Code
bind_rows(data_PO_1, data_PO_2) %>%
mutate(period=ifelse(year(eventDate)<2014, 'time1', 'time2')) %>%
::st_as_sf(coords=c('decimalLongitude', 'decimalLatitude')) %>%
sf::st_set_crs(4326) %>%
sfggplot() +
geom_sf(data = latam, fill='white', size=0.2) +
geom_sf(aes(col=year)) +
facet_grid(species~period) +
scale_color_distiller(palette = 'Greens', direction = 1) +
labs(col='Year') +
theme_bw()
Calculate the ratio of the global PO effort between time1 and time2
<- occurrencePoints_GBIF %>%
globalEffort filter(occurrenceStatus == 'PRESENT') %>%
filter(!countryCode %in% c('US', 'DK', 'BE', 'FR')) %>%
mutate(eventDate=lubridate::ymd_hms(eventDate), year=year(eventDate)) %>%
filter(!is.na(year) & year>=2000 & year<=2020) %>%
mutate(species=ifelse(species=='Puma yagouaroundi', 'Herpailurus yagouaroundi', species)) %>%
# select columns of interest
::select(recordID=gbifID,
dplyr
species, year, eventDate,
decimalLatitude,
decimalLongitude, %>%
countryCode) mutate(period=ifelse(year<2014, 'time1', 'time2'))
<- globalEffort %>%
globalEffort_sf ::st_as_sf(coords=c('decimalLongitude', 'decimalLatitude')) %>%
sf::st_set_crs(4326) %>% st_transform(crs=equalareaCRS)
sf
<- st_join(latam, globalEffort_sf, left=TRUE) %>%
globalEffort_time group_by(gridID, period) %>%
summarise(effort=n(), .groups = 'drop') %>% st_drop_geometry()
%>% select(-gridID) %>% ungroup() %>%
globalEffort_time ::adorn_totals()%>% kable() janitor
period | effort |
---|---|
time1 | 2056 |
time2 | 7571 |
Total | 9627 |
<- globalEffort_time %>% filter(period=='time1') %>% pull(effort) /
globalEffort_value %>% filter(period=='time2') %>% pull(effort) * 100
globalEffort_time
ggplot(globalEffort_sf) +
geom_sf(data = latam, fill='white', size=0.2) +
geom_sf(aes(col=species, alpha=0.5), show.legend = F) +
facet_grid(~period) +
scale_color_brewer(palette = 'Set3') +
labs(col='Year', title= paste0('Records increase ',
round(globalEffort_value, 1), '% from time1 to time2' )) +
theme_bw()
Save files
# R object species data
saveRDS(data_PO_0, 'data/data_PO.Rds')
saveRDS(data_PA_Ab_0, 'data/data_PA.Rds')
# csv species data
write_csv(data_PO_0, 'data/data_PO.csv')
write_csv(data_PA_Ab_0, 'data/data_PA.csv')