Binomial GLM Predicted Probabilities

T.E.G. · 2018/02/04 · 4 minute read

Here is an example of binomial glm (e.g., logistic regression):

df <- carData::Mroz # U.S. Women's Labor-Force Participation data
glmfit <- glm(lfp ~ k5 + age + inc + lwg, df, family=binomial)
summary(glmfit)
## 
## Call:
## glm(formula = lfp ~ k5 + age + inc + lwg, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7987  -1.1326   0.6516   0.9605   2.3122  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.758672   0.534883   5.158 2.50e-07 ***
## k5          -1.342270   0.190481  -7.047 1.83e-12 ***
## age         -0.059005   0.011246  -5.247 1.55e-07 ***
## inc         -0.024638   0.007199  -3.422  0.00062 ***
## lwg          0.787655   0.148339   5.310 1.10e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.75  on 752  degrees of freedom
## Residual deviance:  925.17  on 748  degrees of freedom
## AIC: 935.17
## 
## Number of Fisher Scoring iterations: 4

Calculating Predicted Probabilities for representative values of k5:

library(tidyverse)
# Take mean values of the other predictors
mean_val_pred <- df %>% 
  select(age, inc, lwg) %>% 
  map_df(., mean)

# Create new data frame for prediction
preddf <- with(df, data.frame(k5=c(0:3), mean_val_pred))

# Calculate predicted probabilities
preddf$pred <- predict(glmfit, preddf, type = "response")
preddf
##   k5      age      inc      lwg       pred
## 1  0 42.53785 20.12897 1.097115 0.64952405
## 2  1 42.53785 20.12897 1.097115 0.32622200
## 3  2 42.53785 20.12897 1.097115 0.11228675
## 4  3 42.53785 20.12897 1.097115 0.03198863

Here is how to plot predicted probabilities using ggplot2:

library(hrbrthemes)

preddf %>% 
  ggplot(aes(k5, pred)) +
  geom_point() +
  geom_line() +
  labs(y="Predicted Probabilities", 
       x="Number of children 5 years old or younger.", 
       title="Predicted Probabilities for Labor Force Participation") +
  theme_ipsum_rc() # from hrbrthemes package

With a grouping variable

We can repeat the same analysis, but this time using a grouping variable (wc).

df <- carData::Mroz # U.S. Women's Labor-Force Participation data
glmfit <- glm(lfp ~ k5 + wc + age + inc + lwg, df, family=binomial)

# Take mean values of the other predictors
mean_val_pred <- df %>% 
  select(age, inc, lwg) %>% 
  map_df(., mean)

# Create new data frame for prediction
# Note that wc is added to the data frame
preddf <- with(df, data.frame(k5=c(0:3), wc=c(rep("yes",4), rep("no",4)), mean_val_pred))

# Calculate predicted probabilities
preddf$pred <- predict(glmfit, preddf, type = "response")
preddf
##   k5  wc      age      inc      lwg       pred
## 1  0 yes 42.53785 20.12897 1.097115 0.78280676
## 2  1 yes 42.53785 20.12897 1.097115 0.46264344
## 3  2 yes 42.53785 20.12897 1.097115 0.17058190
## 4  3 yes 42.53785 20.12897 1.097115 0.04682807
## 5  0  no 42.53785 20.12897 1.097115 0.60102192
## 6  1  no 42.53785 20.12897 1.097115 0.26462269
## 7  2  no 42.53785 20.12897 1.097115 0.07915511
## 8  3  no 42.53785 20.12897 1.097115 0.02012059
preddf %>% 
  ggplot(aes(k5, pred, color=wc)) +
  geom_point() +
  geom_line() +
  labs(y="Predicted Probabilities", 
       x="Number of children 5 years old or younger.", 
       title="Predicted Probabilities for Labor Force Participation",
       color="Wife's College\nAttendance") +
  theme_ipsum_rc() + # from hrbrthemes package 
  theme(legend.position="bottom")

Adding Error Bars

Adding error bars to the plot requires few additional steps:

df <- carData::Mroz # U.S. Women's Labor-Force Participation data
glmfit <- glm(lfp ~ k5 + wc + age + inc + lwg, df, family=binomial)

# Take mean values of the other predictors
mean_val_pred <- df %>% 
  select(age, inc, lwg) %>% 
  map_df(., mean)

# Create new data frame for prediction
# Note that wc is added to the data frame
preddf <- with(df, data.frame(k5=c(0:3), wc=c(rep("yes",4), rep("no",4)), mean_val_pred))

# Calculate predicted probabilities and standard errors
pred_se <- predict(glmfit, preddf, type = "response", se.fit = T)
predfit <- pred_se$fit
low_lim <- predfit  - (1.96*pred_se$se.fit)
upp_lim <- predfit  + (1.96*pred_se$se.fit)

# Add the columns to the data frame
preddf <- cbind(preddf, predfit, low_lim, upp_lim)
preddf
##   k5  wc      age      inc      lwg    predfit       low_lim    upp_lim
## 1  0 yes 42.53785 20.12897 1.097115 0.78280676  0.7205413155 0.84507220
## 2  1 yes 42.53785 20.12897 1.097115 0.46264344  0.3593517348 0.56593514
## 3  2 yes 42.53785 20.12897 1.097115 0.17058190  0.0707470485 0.27041675
## 4  3 yes 42.53785 20.12897 1.097115 0.04682807 -0.0001973055 0.09385345
## 5  0  no 42.53785 20.12897 1.097115 0.60102192  0.5528598110 0.64918403
## 6  1  no 42.53785 20.12897 1.097115 0.26462269  0.1952997156 0.33394567
## 7  2  no 42.53785 20.12897 1.097115 0.07915511  0.0276018221 0.13070839
## 8  3  no 42.53785 20.12897 1.097115 0.02012059 -0.0011171785 0.04135836

To add error bars, geom_errorbar() can be used

preddf %>% 
  ggplot(aes(k5, predfit, color=wc)) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin=low_lim, ymax=upp_lim), width=0.05) +
  labs(y="Predicted Probabilities", 
       x="Number of children 5 years old or younger.", 
       title="Predicted Probabilities for Labor Force Participation",
       color="Wife's College\nAttendance") +
  theme_ipsum_rc() + # from hrbrthemes package 
  theme(legend.position="bottom")