Logistic regression from scratch (Newton-Raphson and Fisher scoring)

In an earlier post, I showed how to do this using iteratively reweighted least squares (IRLS). This is just an alternative route using the Newton-Raphson / Fisher scoring algorithm (for logistic regression with its canonical logit link, the two are the same). For further details, you can look here as well.
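In matrix form, the quantities the code below assembles are the score vector and the Fisher information (which, with the canonical link, is also the negative Hessian of the log-likelihood), and each scoring step updates the coefficients with:

$$
U(\beta) = X^\top (y - \mu), \qquad
\mathcal{I}(\beta) = X^\top W X, \qquad
\beta^{(t+1)} = \beta^{(t)} + \mathcal{I}\!\left(\beta^{(t)}\right)^{-1} U\!\left(\beta^{(t)}\right),
$$

where $\mu = \operatorname{logit}^{-1}(X\beta)$ and $W = \operatorname{diag}\{\mu_i(1-\mu_i)\}$.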

library(MLMusingR)
data(suspend)

m1 <- glm(sus ~ male + gpa * frpl + fight + frmp.c * pminor.c, 
          data = suspend, 
          family = binomial)

### extracting raw components

dat <- model.frame(m1)
fml <- formula(m1)
X <- model.matrix(fml, dat)
y <- model.response(dat, 'numeric')
k <- ncol(X)
#pp <- function(x) 1 / (1 + exp(-x)) #convert logit to pred prob

beta_0 <- rep(0, k) #initialize with zeroes
eta <- X %*% beta_0 #predicted logit
mu <- plogis(eta) #same as pp but built-in; convert logit to pred prob
vr <- (mu * (1 - mu)) #variance

wts <- diag(as.vector(vr)) #create weight matrix
d2 <- t(X) %*% wts %*% X #the Fisher information matrix
u1 <- t(X) %*% (y - mu) #the score function
iter <- 50 #max number of iterations
tol <- 1e-8 #tolerance

### now iterate

for (i in 1:iter){
  #beta_1 <- beta_0 + solve(d2) %*% u1 #update beta
  beta_1 <- beta_0 + chol2inv(chol(d2)) %*% u1 #update beta; invert the information matrix via its Cholesky factor (stabler than a plain solve)
  #print(beta_1) #show estimates
  
  if (all(abs((beta_1 - beta_0) / beta_0) < tol)) break #stop once every coefficient has stopped changing

  eta <- X %*% beta_1 #predicted logit
  mu <- plogis(eta) #this is just pp(eta)
  vr <- mu * (1 - mu) #variance
  
  wts <- diag(as.vector(vr)) #update the weight matrix
  d2 <- t(X) %*% wts %*% X #information matrix
  u1 <- t(X) %*% (y - mu) #score function
  beta_0 <- beta_1
  cat(i, "::") #show iteration number
}
1 ::2 ::3 ::4 ::5 ::6 ::7 ::
### compare results

se <- sqrt(diag(solve(d2))) #standard errors: sqrt of the diagonal of the inverted information matrix
z <- beta_1 / se #Wald z statistics
p.val <- 2 * pt(-abs(z), df = Inf) #two-sided p-values; t with df = Inf is the standard normal
df <- data.frame(beta = beta_1, se = se, z = z, p = p.val)
df
                         beta          se          z            p
(Intercept)     -1.5922023202 0.269404299 -5.9100851 3.419311e-09
male             0.3248969875 0.099383812  3.2691138 1.078849e-03
gpa             -0.7954794245 0.084849355 -9.3751971 6.904665e-21
frpl            -0.5627344684 0.318873753 -1.7647563 7.760473e-02
fight            2.0780999956 0.098472087 21.1034422 7.395171e-99
frmp.c           0.0030040140 0.003189348  0.9418898 3.462491e-01
pminor.c        -0.0022362792 0.002302005 -0.9714485 3.313250e-01
gpa:frpl         0.3872562271 0.109168665  3.5473204 3.891710e-04
frmp.c:pminor.c  0.0001243666 0.000106535  1.1673780 2.430578e-01
### compare with glm

m2 <- update(m1, epsilon = 1e-16) #change tolerance so results match
summary(m2)$coef
                     Estimate  Std. Error    z value     Pr(>|z|)
(Intercept)     -1.5922023202 0.269404299 -5.9100851 3.419311e-09
male             0.3248969875 0.099383812  3.2691138 1.078849e-03
gpa             -0.7954794245 0.084849355 -9.3751971 6.904665e-21
frpl            -0.5627344684 0.318873753 -1.7647563 7.760473e-02
fight            2.0780999956 0.098472087 21.1034422 7.395171e-99
frmp.c           0.0030040140 0.003189348  0.9418898 3.462491e-01
pminor.c        -0.0022362792 0.002302005 -0.9714485 3.313250e-01
gpa:frpl         0.3872562271 0.109168665  3.5473204 3.891710e-04
frmp.c:pminor.c  0.0001243666 0.000106535  1.1673780 2.430578e-01
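As a quick side check (not in the original comparison), Wald confidence intervals follow directly from these standard errors; confint.default() on the fitted glm uses the same normal approximation, so the two should agree (plain confint() on a glm profiles the likelihood instead, so it will differ slightly; ci below is just an illustrative name).

ci <- cbind(lower = as.vector(beta_1) - qnorm(.975) * se,
            upper = as.vector(beta_1) + qnorm(.975) * se) #Wald 95% intervals from the manual fit
ci
confint.default(m2) #normal-approximation intervals from the glm fit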
### equal deviances

-2 * sum((y * log(mu)) + (1 - y) * log(1 - mu))
[1] 3331.017
deviance(m2)
[1] 3331.017
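Because the outcome is binary (0/1), the saturated model's log-likelihood is zero, so the deviance is simply -2 times the model log-likelihood; a one-line check:

-2 * as.numeric(logLik(m2)) #same value as the deviance above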

An alternative: maximizing the log-likelihood directly with optim() (MLE)

m3 <- glm(sus ~ male + gpa * frpl, #gpa * frpl expands to gpa + frpl + gpa:frpl
          data = suspend, 
          family = binomial)
X <- model.matrix(m3)

## the Bernoulli log-likelihood: the function to maximize
mle <- function(b){
  pp <- plogis(X %*% b)
  sum(dbinom(y, 1, pp, log = TRUE))
}

k <- ncol(X) #number of coefficients (columns of the design matrix)
sta <- rep(0, k) #initial values

out <- optim(par = sta, mle, #starting values and the function to maximize
            control = list(fnscale = -1, #-1 so optim maximizes instead of minimizes
            reltol = 1e-15, #tolerance
            maxit = 1000), #cap the iterations so it doesn't keep going
            hessian = TRUE) #return the Hessian at the solution (needed for standard errors)
out$par #coefficients; the tiny differences from glm are due to the optimizer's tolerance
[1] -0.3712208  0.6727835 -0.9884962 -0.5433412  0.4056071
coef(m3) #using built in function
(Intercept)        male         gpa        frpl    gpa:frpl 
 -0.3712205   0.6727835  -0.9884965  -0.5433411   0.4056072 
out$value #the log likelihood
[1] -1908.202
logLik(m3)
'log Lik.' -1908.202 (df=5)
sqrt(diag(solve(-out$hessian))) #standard errors: inverse of the negative Hessian (the observed information)
[1] 0.23858787 0.09314298 0.07768979 0.29413387 0.10102125
sqrt(diag(vcov(m3)))
(Intercept)        male         gpa        frpl    gpa:frpl 
 0.23858792  0.09314278  0.07768976  0.29413406  0.10102131 
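A small variant, not in the original post: optim() also accepts an analytic gradient, which here is just the score function X'(y - mu) from the scoring algorithm above. Paired with a quasi-Newton method such as BFGS, it typically needs far fewer function evaluations than the default Nelder-Mead (score and out2 below are just illustrative names).

score <- function(b){ #gradient of the log-likelihood (the score function)
  pp <- plogis(X %*% b)
  as.vector(t(X) %*% (y - pp))
}
out2 <- optim(par = sta, fn = mle, gr = score,
              method = "BFGS",
              control = list(fnscale = -1, reltol = 1e-15),
              hessian = TRUE)
out2$par #should match coef(m3) and out$par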

– END
