Comparing Statistical Models

Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(terra)
terra 1.8.70

Attaching package: 'terra'

The following object is masked from 'package:tidyr':

    extract
Code
library(geodata)
library(tidyterra)

Attaching package: 'tidyterra'

The following object is masked from 'package:stats':

    filter
Code
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

Getting Started

This exercise is going to rely on simulated data so you don’t need to join a repository. You’ll just need to get our credentials and login to a new RStudio session. I’m using simulation because one of the best ways to test out whether you’ve specified your model correctly is to simulate the process with known coefficients. If your statistical model returns those values, then, assuming you’ve simulated a reasonable generating process, you should be able to trust that the model you fit is actually doing what you want it to.

Get the Data

Code
elevation <- geodata::elevation_30s(country = "CRI", 
                                    path = tempdir())
climate <- geodata::worldclim_country(country = "CRI", 
                                      var = "bio", 
                                      res = 0.5, 
                                      path = tempdir())
names(climate) <- str_extract(names(climate), "bio_\\d+")
names(elevation) <- "elev"

climate_df <- as.data.frame(climate, na.rm=TRUE)

climate_subset <- climate[[sample(1:nlyr(climate),size = 8)]]
climate_aligned <- resample(climate_subset, elevation)

# Stack the predictors
predictors <- c(elevation, climate_aligned)


plot(predictors)

Generate a sample

Code
set.seed(123)
n_points <- 750

# Sample random locations from the study area
sample_points <- spatSample(elevation, size = n_points, 
                           method = "random", na.rm = TRUE, 
                           xy = TRUE, values = FALSE)

# Extract environmental values at these points
env_values <- terra::extract(predictors, sample_points)

species_data <- tibble(
  x = sample_points[,1],
  y = sample_points[,2]) %>% 
  bind_cols(env_values) %>% 
  drop_na()  # Remove any NA values

species_data_scl <- species_data %>% 
  mutate(across(colnames(env_values), ~ as.numeric(scale(.x))))

Specify predictors

Code
K <- nlyr(predictors)

gen_predictors <- function(K, style = c("random", "correlated"), K_corr = NULL, rho = NULL){
  if(style == "random"){
    preds <- runif(n=K, min = -3, max = 3)
  }else{
    if (is.null(K_corr) | is.null(rho)) {
      stop("You selected method = 'correlated', so you must supply values for K_corr and rho")
    }
    # build correlation matrix for correlated block
  Sigma <- matrix(rho, nrow = K_corr, ncol = K_corr)
  diag(Sigma) <- 1
  # generate correlated variables
  corr_preds <- MASS::mvrnorm(n = 1, mu = rep(0, K_corr), Sigma = Sigma)
  random_preds <- runif(n=(K-K_corr), min = -3, max = 3)
  preds <- sample(c(corr_preds, random_preds))
  }
return(preds)
} 

known_betas <- gen_predictors(K=9, style = "correlated", K_corr = 4, rho=0.7)

Simulate Outcomes

Code
species_data_preds <- species_data_scl %>%
  mutate(
    across(
      all_of(colnames(env_values)),
      ~ .x * known_betas[which(colnames(env_values) == cur_column())]
    )
  ) %>% 
  mutate(. , linpred = rowSums(across(all_of(colnames(env_values)))),
         prob = plogis(linpred),
         presence = rbinom(n(), size = 1, prob = prob)) 


# Check the prevalence
species_data_preds %>%
  summarize(
    prevalence = mean(presence),
    n_present = sum(presence),
    n_absent = sum(presence == 0)
  )  
# A tibble: 1 × 3
  prevalence n_present n_absent
       <dbl>     <int>    <int>
1      0.501       376      374

Check Out Values

Code
ggplot() +
  geom_spatraster(data = elevation) +
  scale_fill_viridis_c(name = "Elevation (m)", alpha = 0.7) +
  geom_point(data = species_data_preds, 
             aes(x = x, y = y, color = factor(presence)),
             size = 1, alpha = 0.6) +
  scale_color_manual(values = c("0" = "blue", "1" = "red"),
                     labels = c("Absent", "Present"),
                     name = "Species") +
  labs(title = "Species Distribution") +
  theme_minimal() +
  theme(legend.position = "right")

Code
# Compare environmental values between presence and absence
# Reshape data for easier plotting
plot_data <- species_data_preds %>%
  select(all_of(c(colnames(env_values), "presence"))) %>%
  pivot_longer(cols = all_of(colnames(env_values)),
               names_to = "variable",
               values_to = "value") %>%
  mutate(
    presence = factor(presence, labels = c("Absent", "Present")),
    variable = factor(variable, 
                     levels = c(colnames(env_values)),
                     labels = c(colnames(env_values)))
  )

ggplot(plot_data, aes(x = presence, y = value, fill = presence)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~variable, scales = "free_y", ncol = 3) +
  scale_fill_manual(values = c("Absent" = "blue", "Present" = "red")) +
  labs(title = "Environmental Conditions by Presence/Absence",
       x = "Species Presence",
       y = "Value") +
  theme_minimal() +
  theme(legend.position = "none")

Fit Logistic Regression

Code
vars <- species_data_preds %>%
  select(colnames(env_values)) %>%
  names()

form <- reformulate(vars, response = "presence")
form
presence ~ elev + bio_18 + bio_13 + bio_14 + bio_4 + bio_1 + 
    bio_12 + bio_10 + bio_6
Code
model <- glm(form ,
             data = species_data_preds,
             family = binomial(link = "logit"))

# Model summary
summary(model)

Call:
glm(formula = form, family = binomial(link = "logit"), data = species_data_preds)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.0599     0.1370  -0.437 0.662000    
elev         -0.1117     1.4446  -0.077 0.938351    
bio_18        3.2527     0.8440   3.854 0.000116 ***
bio_13        0.9740     0.1623   6.003 1.94e-09 ***
bio_14        1.2394     0.1385   8.952  < 2e-16 ***
bio_4         1.2373     0.2518   4.914 8.93e-07 ***
bio_1         1.3859     5.9439   0.233 0.815640    
bio_12        1.1884     0.4542   2.616 0.008886 ** 
bio_10       -1.7922   118.9681  -0.015 0.987981    
bio_6         1.9436     0.5715   3.401 0.000672 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1039.72  on 749  degrees of freedom
Residual deviance:  463.04  on 740  degrees of freedom
AIC: 483.04

Number of Fisher Scoring iterations: 6
Code
# Compare estimated coefficients to true values
tibble(
  Parameter = c("Intercept", colnames(env_values)),
  True_Value = c(0, known_betas),
  Estimated = coef(model),
  Difference = Estimated - True_Value,
  Pct_Error = (Difference / True_Value) * 100
)
# A tibble: 10 × 5
   Parameter True_Value Estimated Difference Pct_Error
   <chr>          <dbl>     <dbl>      <dbl>     <dbl>
 1 Intercept      0       -0.0599    -0.0599    -Inf  
 2 elev          -1.12    -0.112      1.01       -90.0
 3 bio_18         0.295    3.25       2.96      1003. 
 4 bio_13         1.46     0.974     -0.488      -33.4
 5 bio_14        -2.72     1.24       3.96      -146. 
 6 bio_4         -2.10     1.24       3.34      -159. 
 7 bio_1         -2.64     1.39       4.03      -152. 
 8 bio_12        -0.802    1.19       1.99      -248. 
 9 bio_10        -0.131   -1.79      -1.66      1272. 
10 bio_6          2.63     1.94      -0.688      -26.1

Fit Random Forest

Code
species_data_preds_rf <- species_data_preds %>% 
  mutate(presence = as.factor(presence))

rf_model <- caret::train(
    form,
    data = species_data_preds_rf,
    method = "ranger",
    importance = "impurity",
    tuneGrid = expand.grid(
        "mtry" = 4,
        "splitrule" = "gini",
        "min.node.size" = 5
    ),
    num.trees = 100
)

varimp_tbl <- rf_model$finalModel$variable.importance %>%
  enframe(name = "variable", value = "importance") %>%
  mutate(importance_scaled = importance / max(importance)) %>%
  arrange(desc(importance_scaled))

# View top variables
head(varimp_tbl, 10)
# A tibble: 9 × 3
  variable importance importance_scaled
  <chr>         <dbl>             <dbl>
1 bio_14        131.              1    
2 bio_4          52.9             0.404
3 bio_12         43.9             0.335
4 bio_13         41.4             0.316
5 bio_18         24.2             0.185
6 bio_10         17.0             0.130
7 elev           16.4             0.125
8 bio_1          15.4             0.118
9 bio_6          14.0             0.107
Code
# Plot
ggplot(varimp_tbl, aes(x = reorder(variable, importance_scaled),
                       y = importance_scaled)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Variable", y = "Scaled importance (0–1)",
       title = "Random Forest Variable Importance (Ranger)") +
  theme_minimal()

Compare Model Implications

Code
scale_stack <- scale(predictors)
pred_map <- terra::predict(
  object = scale_stack,
  model = model,
  type = "response"
)


pred_map_rf <- terra::predict(
  object = scale_stack,
  model = rf_model$finalModel$forest,
  type = "response"
)