Generalized Linear Models (GLM)#

Gaussian Linear Regression#

import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
gdf = gpd.read_file(
    "https://github.com/algarciach/AnalisisGeoespacial/raw/main/Covid19_model/Data/covid19_municipios_antioquia.gpkg"
)
gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   codigo_municipio     125 non-null    object  
 1   nombre_municipio     125 non-null    object  
 2   codigo_subregion     125 non-null    object  
 3   nombre_subregion     125 non-null    object  
 4   area_municipio       125 non-null    float64 
 5   altitud              125 non-null    float64 
 6   temperatura          125 non-null    float64 
 7   humedad_relativa     125 non-null    float64 
 8   poblacion            125 non-null    int64   
 9   urbanizacion         125 non-null    float64 
 10  densidad             125 non-null    float64 
 11  muertes_covid19      125 non-null    int64   
 12  recuperados_covid19  125 non-null    int64   
 13  cfr                  125 non-null    float64 
 14  geometry             125 non-null    geometry
dtypes: float64(7), geometry(1), int64(3), object(4)
memory usage: 14.8+ KB
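Before transforming the response it can help to look at how cfr is distributed; the log transform applied in the next cell is a common way to reduce skewness in a strictly positive variable. A minimal sketch using the data already loaded:

# Quick look at the response before transforming it
print(gdf["cfr"].describe())

gdf["cfr"].hist(bins=30, figsize=(6, 4))
plt.xlabel("cfr")
plt.ylabel("Count")
plt.title("Distribution of cfr across municipalities")
plt.show()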
gdf["y_log"] = np.log(gdf["cfr"] + 1)
var = ["area_municipio", "altitud", "temperatura", "humedad_relativa", "urbanizacion", "densidad"]

st = StandardScaler()
X_scaled = st.fit_transform(gdf[var])

X = sm.add_constant(X_scaled)
y = gdf["y_log"]

model = sm.OLS(y, X).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  y_log   R-squared:                       0.199
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     4.899
Date:                Sat, 26 Jul 2025   Prob (F-statistic):           0.000166
Time:                        01:02:40   Log-Likelihood:                -33.475
No. Observations:                 125   AIC:                             80.95
Df Residuals:                     118   BIC:                             100.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.3240      0.029     45.473      0.000       1.266       1.382
x1            -0.0201      0.036     -0.555      0.580      -0.092       0.052
x2            -0.2719      0.214     -1.268      0.207      -0.697       0.153
x3            -0.1359      0.213     -0.638      0.525      -0.558       0.286
x4            -0.0455      0.034     -1.341      0.183      -0.113       0.022
x5            -0.0099      0.032     -0.306      0.760      -0.074       0.054
x6            -0.0452      0.033     -1.371      0.173      -0.111       0.020
==============================================================================
Omnibus:                        2.070   Durbin-Watson:                   1.760
Prob(Omnibus):                  0.355   Jarque-Bera (JB):                1.736
Skew:                          -0.059   Prob(JB):                        0.420
Kurtosis:                       3.565   Cond. No.                         16.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
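The standard errors of x2 and x3 are much larger than those of the remaining standardized predictors, which hints at collinearity (plausibly between altitude and temperature). A quick check with variance inflation factors on the same design matrix, as a sketch:

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per predictor; column 0 of X is the intercept, so offset the index by 1
vif = pd.DataFrame({
    "variable": var,
    "VIF": [variance_inflation_factor(X, i + 1) for i in range(len(var))],
})
print(vif)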
# Get the predictions
y_pred = model.predict(X)
gdf["y_pred"] = y_pred
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="y_pred", cmap="viridis", legend=True, ax=ax, linewidth=0.8, edgecolor="0.8"
)
ax.set_title("Mapa de Predicciones del CFR (log) con OLS")
ax.set_axis_off()
plt.show()
[Figure: Mapa de Predicciones del CFR (log) con OLS]
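Because the model was fit on log(cfr + 1), its predictions can be mapped back to the original CFR scale; the sketch below does that and computes a simple in-sample RMSE (the column name cfr_pred_ols is ours):

# Back-transform the OLS predictions from the log scale to the CFR scale
gdf["cfr_pred_ols"] = np.exp(gdf["y_pred"]) - 1

# Simple in-sample error on the original scale
rmse = np.sqrt(np.mean((gdf["cfr"] - gdf["cfr_pred_ols"]) ** 2))
print(f"In-sample RMSE on the CFR scale: {rmse:.3f}")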
# Calculate the residuals
residuals = y - y_pred

# Create figure for residuals
plt.figure(figsize=(15, 5))

for i, variable in enumerate(var):
    plt.subplot(1, len(var), i + 1)
    plt.scatter(gdf[variable], residuals, alpha=0.5)
    plt.axhline(0, color="red", linestyle="--", lw=2)
    plt.title(f"Residuals vs {variable}")
    plt.xlabel(variable)
    plt.ylabel("Residuals")
    plt.grid(True)

plt.tight_layout()
plt.show()
[Figure: OLS residuals vs. each predictor]

Logistic Regression#

from sklearn.metrics import roc_curve, roc_auc_score
gdf["evento_cfr"] = np.where(gdf["cfr"] > 3, 1, 0)

var = ["area_municipio", "altitud", "temperatura", "humedad_relativa", "urbanizacion", "densidad"]
y = gdf["evento_cfr"]

st = StandardScaler()
X_scaled = st.fit_transform(gdf[var])
X = sm.add_constant(X_scaled)

model = sm.Logit(y, X).fit()
print(model.summary())
Optimization terminated successfully.
         Current function value: 0.573067
         Iterations 10
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             evento_cfr   No. Observations:                  125
Model:                          Logit   Df Residuals:                      118
Method:                           MLE   Df Model:                            6
Date:                Sat, 26 Jul 2025   Pseudo R-squ.:                  0.1701
Time:                        01:03:16   Log-Likelihood:                -71.633
converged:                       True   LL-Null:                       -86.319
Covariance Type:            nonrobust   LLR p-value:                 5.174e-05
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9660      0.722     -1.337      0.181      -2.382       0.450
x1            -0.1266      0.247     -0.513      0.608      -0.610       0.357
x2            -0.8958      1.522     -0.589      0.556      -3.879       2.087
x3            -0.1861      1.506     -0.124      0.902      -3.138       2.766
x4            -0.2994      0.237     -1.266      0.206      -0.763       0.164
x5            -0.0707      0.252     -0.280      0.779      -0.565       0.423
x6            -4.6452      3.622     -1.283      0.200     -11.744       2.454
==============================================================================
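Since the logit link models the log-odds, exponentiating the coefficients gives odds ratios per standard deviation of each (standardized) predictor. A short sketch:

import pandas as pd

# Odds ratios with their 95% confidence intervals
conf = model.conf_int()
odds_ratios = pd.DataFrame({
    "OR": np.exp(model.params),
    "2.5%": np.exp(conf[0]),
    "97.5%": np.exp(conf[1]),
})
print(odds_ratios)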
y_prob = model.predict(X)

gdf["probability_eventCFR"] = y_prob
# Plot map with probabilities predicted by the logistic regression
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="probability_eventCFR",
    cmap="OrRd",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
    vmin=0,
    vmax=1,
)
ax.set_title("Mapa de Probabilidad de Evento CFR > 3 con Regresión Logística")
ax.set_axis_off()
plt.show()
[Figure: Mapa de Probabilidad de Evento CFR > 3 con Regresión Logística]
# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y, y_prob)
auc = roc_auc_score(y, y_prob)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC Curve (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve for Logistic Regression Model")
plt.legend(loc="lower right")
plt.grid()
plt.show()
[Figure: ROC Curve for Logistic Regression Model]
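Besides the ROC curve, a confusion matrix shows how the in-sample classification behaves at a specific cut-off; the 0.5 threshold below is an arbitrary choice, not something dictated by the model:

from sklearn.metrics import confusion_matrix, classification_report

# Classify municipalities with an (arbitrary) 0.5 probability threshold
y_hat = (y_prob >= 0.5).astype(int)
print(confusion_matrix(y, y_hat))
print(classification_report(y, y_hat))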
# Calculate the residuals for the logistic regression model
residuals_logit = y - y_prob
# Create figure for residuals of logistic regression
plt.figure(figsize=(15, 5))
for i, variable in enumerate(var):
    plt.subplot(1, len(var), i + 1)
    plt.scatter(gdf[variable], residuals_logit, alpha=0.5)
    plt.axhline(0, color="red", linestyle="--", lw=2)
    plt.title(f"Residuals vs {variable} (Logit)")
    plt.xlabel(variable)
    plt.ylabel("Residuals")
    plt.grid(True)
plt.tight_layout()
plt.show()
[Figure: Logit residuals vs. each predictor]

Poisson Regression#

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
y = gdf["cfr"]
var = ["altitud", "temperatura", "humedad_relativa", "urbanizacion", "densidad"]  
# No area_municipio for GLM
exposure = np.log(gdf["area_municipio"])

st = StandardScaler()
X_scaled = st.fit_transform(gdf[var])
X = sm.add_constant(X_scaled)
poisson_model_offset = GLM(y, X, family=families.Poisson(), offset=exposure).fit()

print(poisson_model_offset.summary())
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                    cfr   No. Observations:                  125
Model:                            GLM   Df Residuals:                      119
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -328.61
Date:                Sat, 26 Jul 2025   Deviance:                       296.14
Time:                        01:04:28   Pearson chi2:                     448.
No. Iterations:                     6   Pseudo R-squ. (CS):             0.5450
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.9109      0.053    -92.880      0.000      -5.015      -4.807
x1             1.2899      0.408      3.162      0.002       0.490       2.089
x2             0.8924      0.401      2.226      0.026       0.106       1.678
x3            -0.2209      0.049     -4.546      0.000      -0.316      -0.126
x4            -0.0076      0.061     -0.124      0.902      -0.128       0.113
x5             0.1025      0.084      1.218      0.223      -0.062       0.267
==============================================================================
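The summary reports a Pearson chi-square of roughly 448 on 119 residual degrees of freedom, a ratio well above 1, which suggests overdispersion and motivates the negative binomial model in the next section. A quick check, plus the coefficients expressed as rate ratios (with a log link, exponentiated coefficients act multiplicatively):

# Dispersion ratio; values far above 1 indicate overdispersion
dispersion = poisson_model_offset.pearson_chi2 / poisson_model_offset.df_resid
print(f"Pearson chi2 / df_resid: {dispersion:.2f}")

# Exponentiated coefficients are multiplicative rate ratios per standard deviation
print(np.exp(poisson_model_offset.params))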
# Get predictions
y_pred_offset = poisson_model_offset.predict(X)
gdf["predicted_cfr_offset"] = y_pred_offset

# Calculate residuals
gdf["difference_cfr_offset"] = gdf["cfr"] - gdf["predicted_cfr_offset"]
# Plotting the predictions with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="predicted_cfr_offset",
    cmap="YlGnBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title("Mapa de Predicciones de Frecuencia con GLM (Ajustado por Área)")
ax.set_axis_off()
plt.show()
_images/338f4524e2f135af47cfa564fd17e755e43537f70527b8b177adb1783a4eb898.png
# Plotting the residuals for GLM with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="difference_cfr_offset",
    cmap="RdBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title("Mapa de Diferencia (Valor REal vs. Predicción Ajustada por Área)")
ax.set_axis_off()
plt.show()
[Figure: Mapa de Diferencia (Valor Real vs. Predicción Ajustada por Área)]

Negative Binomial Regression#

y = gdf["cfr"]
var = ["altitud", "temperatura", "humedad_relativa", "urbanizacion", "densidad"]  
# No area_municipio for GLM
exposure = np.log(gdf["area_municipio"])

st = StandardScaler()
X_scaled = st.fit_transform(gdf[var])
X = sm.add_constant(X_scaled)

negative_binomial_model_offset = GLM(y, X, family=families.NegativeBinomial(), offset=exposure).fit()
print(negative_binomial_model_offset.summary())
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                    cfr   No. Observations:                  125
Model:                            GLM   Df Residuals:                      119
Model Family:        NegativeBinomial   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -279.07
Date:                Sat, 26 Jul 2025   Deviance:                       17.170
Time:                        01:05:54   Pearson chi2:                     17.7
No. Iterations:                     5   Pseudo R-squ. (CS):            0.03499
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0758      0.104     10.353      0.000       0.872       1.279
x1            -0.4077      0.744     -0.548      0.584      -1.867       1.051
x2            -0.2362      0.751     -0.315      0.753      -1.707       1.235
x3            -0.0571      0.115     -0.495      0.620      -0.283       0.169
x4            -0.0290      0.115     -0.252      0.801      -0.254       0.196
x5            -0.0710      0.130     -0.545      0.586      -0.326       0.184
==============================================================================
/usr/local/Caskroom/miniforge/base/envs/geo/lib/python3.11/site-packages/statsmodels/genmod/families/family.py:1367: ValueWarning: Negative binomial dispersion parameter alpha not set. Using default value alpha=1.0.
  warnings.warn("Negative binomial dispersion parameter alpha not "
# Get predictions for Negative Binomial model with offset
y_pred_nb_offset = negative_binomial_model_offset.predict(X)

gdf["predicted_cfr_nb_offset"] = y_pred_nb_offset

# Calculate residuals for Negative Binomial model with offset
gdf["difference_cfr_nb_offset"] = gdf["cfr"] - gdf["predicted_cfr_nb_offset"]
# Plotting the predictions for Negative Binomial model with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="predicted_cfr_nb_offset",
    cmap="YlGnBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title(
    "Mapa de Predicciones de Frecuencia (Binomial Negativo, Ajustado por Área)"
)
ax.set_axis_off()
plt.show()
[Figure: Mapa de Predicciones de Frecuencia (Binomial Negativo, Ajustado por Área)]
# Plotting the residuals for Negative Binomial model with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="difference_cfr_nb_offset",
    cmap="RdBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title(
    "Mapa de Diferencia (Valor Real vs. Predicción Binomial Negativa Ajustada por Área)"
)
ax.set_axis_off()
plt.show()
[Figure: Mapa de Diferencia (Valor Real vs. Predicción Binomial Negativa Ajustada por Área)]

Zero-Inflated Poisson Model#

from statsmodels.discrete.count_model import ZeroInflatedPoisson
y = gdf["cfr"]
var = ["altitud", "temperatura", "humedad_relativa", "urbanizacion", "densidad"]  
# No area_municipio for ZIP
exposure = np.log(gdf["area_municipio"])

st = StandardScaler()
X_scaled = st.fit_transform(gdf[var])

X = sm.add_constant(X_scaled)
# 'exposure' already holds log(area_municipio), so pass it as an additive offset
zip_model_offset = ZeroInflatedPoisson(y, X, offset=exposure).fit()

print(zip_model_offset.summary())
         Current function value: 1.744450
         Iterations: 35
         Function evaluations: 37
         Gradient evaluations: 37
                     ZeroInflatedPoisson Regression Results                    
===============================================================================
Dep. Variable:                     cfr   No. Observations:                  125
Model:             ZeroInflatedPoisson   Df Residuals:                      119
Method:                            MLE   Df Model:                            5
Date:                 Sat, 26 Jul 2025   Pseudo R-squ.:                 0.01311
Time:                         01:07:25   Log-Likelihood:                -218.06
converged:                       False   LL-Null:                       -220.95
Covariance Type:             nonrobust   LLR p-value:                    0.3266
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
inflate_const   -18.0946    759.870     -0.024      0.981   -1507.411    1471.222
const            -0.6533      0.053    -12.359      0.000      -0.757      -0.550
x1               -0.1380      0.365     -0.378      0.705      -0.854       0.578
x2               -0.0627      0.369     -0.170      0.865      -0.785       0.660
x3               -0.0778      0.054     -1.453      0.146      -0.183       0.027
x4               -0.0088      0.058     -0.153      0.878      -0.122       0.104
x5               -0.0255      0.082     -0.312      0.755      -0.186       0.135
=================================================================================
/usr/local/Caskroom/miniforge/base/envs/geo/lib/python3.11/site-packages/scipy/optimize/_optimize.py:1313: OptimizeWarning: Maximum number of iterations has been exceeded.
  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)
/usr/local/Caskroom/miniforge/base/envs/geo/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
# Get predictions for Zero-Inflated Poisson model with offset
y_pred_zip_offset = zip_model_offset.predict(X)
gdf["predicted_cfr_zip_offset"] = y_pred_zip_offset

gdf["difference_cfr_zip_offset"] = gdf["cfr"] - gdf["predicted_cfr_zip_offset"]
# Plotting the predictions for Zero-Inflated Poisson model with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="predicted_cfr_zip_offset",
    cmap="YlGnBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title("Mapa de Predicciones de Frecuencia (ZIP, Ajustado por Área)")
ax.set_axis_off()
plt.show()
[Figure: Mapa de Predicciones de Frecuencia (ZIP, Ajustado por Área)]
# Plotting the residuals for Zero-Inflated Poisson model with offset
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(
    column="difference_cfr_zip_offset",
    cmap="RdBu",
    legend=True,
    ax=ax,
    linewidth=0.8,
    edgecolor="0.8",
)
ax.set_title("Mapa de Diferencia (Valor Real vs. Predicción ZIP Ajustada por Área)")
ax.set_axis_off()
plt.show()
[Figure: Mapa de Diferencia (Valor Real vs. Predicción ZIP Ajustada por Área)]
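As a rough closing comparison, the log-likelihoods and AICs of the three models fitted on cfr can be placed side by side (the OLS and logistic models use different responses and are left out; note also that the negative binomial AIC depends on the fixed alpha, so read this only as a guide):

import pandas as pd

# Side-by-side comparison of the count-type models fitted on cfr
comparison = pd.DataFrame({
    "model": ["Poisson (offset)", "Negative Binomial", "Zero-Inflated Poisson"],
    "log-likelihood": [
        poisson_model_offset.llf,
        negative_binomial_model_offset.llf,
        zip_model_offset.llf,
    ],
    "AIC": [
        poisson_model_offset.aic,
        negative_binomial_model_offset.aic,
        zip_model_offset.aic,
    ],
})
print(comparison)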