Models for Spatial Autocorrelation

Code
library(tigris)
To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.
Code
library(sf)
Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
Code
library(spdep)
Loading required package: spData
Code
library(spatialreg)
Loading required package: Matrix

Attaching package: 'spatialreg'
The following objects are masked from 'package:spdep':

    get.ClusterOption, get.coresOption, get.mcOption,
    get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
    set.coresOption, set.mcOption, set.VerboseOption,
    set.ZeroPolicyOption
Code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
library(ggplot2)

options(tigris_use_cache = TRUE)
Code
cnty <- tigris::counties(state = "ID", year = 2020, class = "sf") %>%
  st_transform(5070)  # Albers projection for distance calculations
Code
# Contiguity neighbors (queen)
nb_contig <- poly2nb(cnty, queen = TRUE)
plot(cnty$geometry)
plot(nb_contig, st_coordinates(
  st_centroid(cnty)), add=TRUE, col='red')
Warning: st_centroid assumes attributes are constant over geometries

Code
lw_contig <- nb2listw(nb_contig, style = "W")

# Distance neighbors: threshold = 100 km
coords <- st_centroid(cnty) |> st_coordinates()
Warning: st_centroid assumes attributes are constant over geometries
Code
nb_dist <- dnearneigh(coords, 0, 100000)   # 60 km
Warning in dnearneigh(coords, 0, 1e+05): neighbour object has 2 sub-graphs
Code
plot(cnty$geometry)
plot(nb_dist, st_coordinates(
  st_centroid(cnty)), add=TRUE, col='blue')
Warning: st_centroid assumes attributes are constant over geometries

Code
lw_dist <- nb2listw(nb_dist, style = "W", zero.policy = TRUE)
Code
# Simulate SAR-Lag: y = (I - rho W)^(-1)(beta*X + eps)
sim_sar <- function(X, W, rho, sigma=1){
  I <- diag(length(X))
  eps <- rnorm(length(X), 0, sigma)
  as.numeric(solve(I - rho * W) %*% (X + eps))
}

# Simulate SEM (CAR-like): y = X + u, where u = (I - lambda W)^(-1) eps
sim_sem <- function(X, W, lambda, sigma=1){
  I <- diag(length(X))
  eps <- rnorm(length(X), 0, sigma)
  u <- solve(I - lambda * W) %*% eps
  as.numeric(X + u)
}
Code
# Choose adjacency:
lw <- lw_contig          # contiguity (swap to lw_dist for distance)

Wmat <- listw2mat(lw)
n <- length(lw$neighbours)

# Covariate
X <- rnorm(n)
beta <- 1
rho  <- 0.6   # spatial lag strength
lam  <- 0.6   # spatial error strength

y <- sim_sem(X, beta, lambda = lam, W = Wmat)
data <- cnty  %>%  mutate(y = y, X = X)
Code
fit_lm  <- lm(y ~ X, data = data)
fit_sar <- lagsarlm(y ~ X, data = data, listw = lw)
fit_sem <- errorsarlm(y ~ X, data = data, listw = lw)

summary(fit_lm)

Call:
lm(formula = y ~ X, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.72894 -0.57174  0.07531  0.57193  2.81319 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.05183    0.16747   0.309    0.759    
X            1.23815    0.17806   6.954 1.69e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.111 on 42 degrees of freedom
Multiple R-squared:  0.5351,    Adjusted R-squared:  0.5241 
F-statistic: 48.35 on 1 and 42 DF,  p-value: 1.694e-08
Code
summary(fit_sar)

Call:lagsarlm(formula = y ~ X, data = data, listw = lw)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.07840 -0.57578  0.17613  0.50159  2.02895 

Type: lag 
Coefficients: (asymptotic standard errors) 
            Estimate Std. Error z value  Pr(>|z|)
(Intercept) 0.045598   0.141723  0.3217    0.7476
X           1.027034   0.156333  6.5695 5.048e-11

Rho: 0.46585, LR test value: 10.143, p-value: 0.0014485
Asymptotic standard error: 0.13134
    z-value: 3.5468, p-value: 0.00038991
Wald statistic: 12.58, p-value: 0.00038991

Log likelihood: -60.96573 for lag model
ML residual variance (sigma squared): 0.88344, (sigma: 0.93992)
Number of observations: 44 
Number of parameters estimated: 4 
AIC: 129.93, (AIC for lm: 138.07)
LM test for residual autocorrelation
test value: 1.7164, p-value: 0.19015
Code
summary(fit_sem)

Call:errorsarlm(formula = y ~ X, data = data, listw = lw)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.70218 -0.49116  0.14118  0.65169  2.33306 

Type: error 
Coefficients: (asymptotic standard errors) 
            Estimate Std. Error z value  Pr(>|z|)
(Intercept) 0.037287   0.274559  0.1358     0.892
X           1.019187   0.167417  6.0877 1.145e-09

Lambda: 0.45243, LR test value: 5.1191, p-value: 0.023664
Asymptotic standard error: 0.16398
    z-value: 2.759, p-value: 0.0057981
Wald statistic: 7.612, p-value: 0.0057981

Log likelihood: -63.47767 for error model
ML residual variance (sigma squared): 0.9939, (sigma: 0.99694)
Number of observations: 44 
Number of parameters estimated: 4 
AIC: 134.96, (AIC for lm: 138.07)
Code
coef_tbl <- tibble(
  model = c("OLS","SAR","SEM"),
  beta  = c(coef(fit_lm)["X"], 
            coef(fit_sar)["X"], 
            coef(fit_sem)["X"])
)

ggplot(coef_tbl, aes(model, beta)) +
  geom_col(fill = "steelblue") +
  geom_hline(yintercept = 1, linetype="dashed") +
  labs(title = "Estimated β under different models",
       subtitle = "Dashed line = true β = 1")

Code
# Storage
results <- data.frame(param=numeric(), beta_hat=numeric(), model=character())

# LM on SAR-generated data
for(r in rho_vals){
  y <- gen_SAR(r)
  fit <- lm(y ~ a)
  results <- rbind(results,
                   data.frame(param=r, beta_hat=coef(fit)[2], model="LM_on_SAR"))
}

# LM on SEM-generated data
for(l in lam_vals){
  y <- gen_SEM(l)
  fit <- lm(y ~ a)
  results <- rbind(results,
                   data.frame(param=l, beta_hat=coef(fit)[2], model="LM_on_SEM"))
}

# Print results
print(results)