## com23a of October 2018,
# in loco parentis for Celine stk 4021,
# doing my stk 4020 exam 2010 Exercise 2,
# with the 189 babies of Lemeshow and Hosmer
# (imagine that his wife took part in the Olympics, which
# is forever impressive, even though she finished last)

library("numDeriv")

yy <- scan("babies")   # weight of babies, in kg
nn <- length(yy)       # 189
plot(density(yy))
# looks reasonably normal, after all,
# apart from the tails -- this is a qq plot:
plot(qnorm((1:nn)/(nn+1)), sort(yy))

qq <- 0.10   # we wish inference for the 0.10 quantile

## Model 0, the normal:
# can do everything explicitly by math,
# but ok to let nlm do the job:
logL0 <- function(para)
{
  mu <- para[1]
  sigma <- para[2]
  #
  hei <- -log(sigma) - 0.5*(yy-mu)^2/sigma^2 - 0.5*log(2*pi)
  sum(hei)
}
minuslogL0 <- function(para) {-logL0(para)}
nils0 <- nlm(minuslogL0, c(4.444,1), hessian=T)
ML0 <- nils0$estimate
Jhat0 <- nils0$hessian
logL0max <- logL0(ML0)   # -207.9445
se0 <- sqrt(diag(solve(Jhat0)))
show0 <- cbind(ML0, se0)
print(round(show0, 4))
#         ML0    se0
# [1,]  2.9447 0.0529   mu
# [2,]  0.7271 0.0374   sigma
mu0hat <- ML0[1]
sigma0hat <- ML0[2]
kappa0hat <- mu0hat + sigma0hat*qnorm(qq)   # 2.0128,
# close to the 0.10 quantile of the data, 2.038

## then the more demanding three-parameter model:
# F = Phi((y-mu)/sigma)^exp(a)
# f = exp(a) Phi((y-mu)/sigma)^(exp(a)-1) phi((y-mu)/sigma) (1/sigma)
logL1 <- function(para)
{
  mu <- para[1]
  sigma <- para[2]
  a <- para[3]
  #
  hei0 <- -log(sigma) - 0.5*(yy-mu)^2/sigma^2 - 0.5*log(2*pi)
  hei1 <- a + (exp(a)-1)*log(pnorm((yy-mu)/sigma))
  sum(hei0) + sum(hei1)
}
logL1(c(ML0[1], ML0[2], 0))   # equals logL0max, since a = 0 recovers the normal
minuslogL1 <- function(para) {-logL1(para)}
nils1 <- nlm(minuslogL1, c(ML0[1],ML0[2],0), hessian=T)
ML1 <- nils1$estimate
Jhat1 <- nils1$hessian
logL1max <- logL1(ML1)   # -207.0165
se1 <- sqrt(diag(solve(Jhat1)))
show1 <- cbind(ML1, se1)
print(round(show1, 4))
#          ML1    se1
# [1,]  3.6051 0.3751   mu
# [2,]  0.4827 0.1509   sigma
# [3,] -1.2193 0.8528   a
# the Wald ratio for a is -1.429, not significant,
# so we expect a not too far from zero
Delta <- logL1max - logL0max
# 0.928, not quite a big enough increase,
# so Model 0 wins the AIC competition
mu1hat <- ML1[1]
sigma1hat <- ML1[2]
ahat <- ML1[3]
kappa1hat <- mu1hat + sigma1hat*qnorm(qq^(1/exp(ahat)))   # 1.9906,
# so the two kappa estimates are quite close, actually

# a bit more, to get the approximate standard deviation,
# via the delta method:
kappa1 <- function(para)
{
  mu <- para[1]
  sigma <- para[2]
  a <- para[3]
  #
  mu + sigma*qnorm(qq^(1/exp(a)))
}
kappaML <- kappa1(ML1)
cc1 <- grad(kappa1, ML1)
stdevkappa1 <- sqrt( t(cc1) %*% solve(Jhat1) %*% cc1 )
lazylow <- kappaML - 1.645*stdevkappa1
lazyup <- kappaML + 1.645*stdevkappa1
c(lazylow, kappaML, lazyup)
# 1.858, 1.990, 2.123

## plotting the two fitted densities over the histogram:
yval <- seq(0.70, 5.00, by=0.01)
#
f0val <- dnorm((yval-mu0hat)/sigma0hat)*(1/sigma0hat)
#
aux11 <- dnorm((yval-mu1hat)/sigma1hat)*(1/sigma1hat)
aux12 <- exp(ahat)*pnorm((yval-mu1hat)/sigma1hat)^(exp(ahat)-1)
f1val <- aux11*aux12
hist(yy, prob=T, ylim=c(0,0.55), main=" ",
     xlab="birthweight", ylab="density")
matlines(yval, f0val, type="l", lwd=2, col=1)
matlines(yval, f1val, type="l", lwd=2, col=2)

## ok, prior for (mu,sigma) for Model 0:
# we want the prior for sigma to have 0.10 quantile c and 0.90 quantile d,
# i.e. lambda = 1/sigma^2 should have 0.90 quantile 1/c^2 and 0.10 quantile 1/d^2:
cc <- 0.50
dd <- 2.00
1/cc^2   # 4.00
1/dd^2   # 0.25
Q <- function(ab)
{
  a <- ab[1]
  b <- ab[2]
  #
  oi1 <- pgamma(1/cc^2, a, b) - 0.90
  oi2 <- pgamma(1/dd^2, a, b) - 0.10
  oi1^2 + oi2^2
}
oi <- nlm(Q, c(1.5,1.1), hessian=T)
a0 <- oi$estimate[1]   # 1.1733
b0 <- oi$estimate[2]   # 0.6493
Q(c(a0,b0))   # essentially zero
# so with 1/sigma^2 from Gamma(a0,b0) we succeed in having
# 0.50 as the 0.10 quantile and 2.00 as the 0.90 quantile
# of the prior for sigma
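## a quick add-on check (a sketch, not part of the original solution):
# simulating from the fitted Gamma(a0,b0) prior for lambda = 1/sigma^2
# and reading off the quantiles of sigma = 1/sqrt(lambda) should confirm
# the calibration; priorsim, lambdaprior, sigmaprior are new names here:
priorsim <- 10^5
lambdaprior <- rgamma(priorsim, a0, b0)
sigmaprior <- 1/sqrt(lambdaprior)
quantile(sigmaprior, c(0.10, 0.90))
# should come out close to 0.50 and 2.00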
mu0 <- 2.500
nu0 <- 3.333

# Nils Exercise 22:
# prior is GN(a0, b0, mu0, nu0)
# posterior is GN(an, bn, mun, nun)
ybar <- mean(yy)
Q0 <- sum((yy-ybar)^2)
dn <- (1/nn + 1/nu0)^(-1)
an <- a0 + 0.5*nn
bn <- b0 + 0.5*Q0 + 0.5*dn*(ybar - mu0)^2
mun <- (nu0*mu0 + nn*ybar) / (nu0 + nn)
nun <- nu0 + nn

## simulating lots of (mu,lambda) from the posterior,
# then transforming via lambda = 1/sigma^2, sigma = 1/sqrt(lambda):
# sigma is such that 1/sigma^2 is Gamma(an,bn),
# and mu | sigma is N(mun, sigma^2/nun)
sim <- 10^5
lambdasim <- rgamma(sim, an, bn)
sigmasim <- 1/sqrt(lambdasim)
musim <- mun + (sigmasim/sqrt(nun))*rnorm(sim)
plot(musim, sigmasim)
kappa0sim <- musim + sigmasim*qnorm(qq)
hist(kappa0sim)
quantile(kappa0sim, c(0.05,0.50,0.95))
#  0.05  0.50  0.95
# 1.876 2.000 2.112

# choose to represent the model in terms of (mu,lambda,a);
# can then transform back to sigma afterwards
logL3 <- function(para)
{
  mu <- para[1]
  lambda <- para[2]
  a <- para[3]
  #
  sigma <- 1/sqrt(lambda)
  #
  hei0 <- -log(sigma) - 0.5*(yy-mu)^2/sigma^2 - 0.5*log(2*pi)
  hei1 <- a + (exp(a)-1)*log(pnorm((yy-mu)/sigma))
  sum(hei0) + sum(hei1)
}
logL3(c(mu1hat, 1/sigma1hat^2, ahat))   # matches logL1max, so correct

## mcmc for (mu,lambda,a):
ccc <- 4.00   # prior for a is uniform on [-ccc, ccc]
big <- 135    # penalty pushing the log-prior far down outside [-ccc, ccc]
logprior <- function(para)
{
  mu <- para[1]
  lambda <- para[2]
  a <- para[3]
  sigma <- 1/sqrt(lambda)
  #
  aux21 <- log(dgamma(lambda, a0, b0))
  aux22 <- -0.5*nu0*(mu-mu0)^2/sigma^2 - log(sigma/sqrt(nu0)) - 0.5*log(2*pi)
  aux23 <- log(1/(2*ccc))*(abs(a) <= ccc) - big*(abs(a) > ccc)
  #
  aux21 + aux22 + aux23
}
logprior(c(3.3,0.7,0.4))

sim <- 10^4
mcmc <- 0*(1:sim) %*% t(1:3)
mcmc[1, ] <- c(mu1hat, 1/sigma1hat^2, ahat)   # start at the ML, with lambda = 1/sigma^2
uu <- runif(sim)
ok <- 0*(1:sim)
pracc <- 0*(1:sim)
for (ss in 2:sim)
{
  old <- mcmc[ss-1, ]
  prop <- old + 0.11*se1*rnorm(3)   # random walk proposal, roughly scaled by se1
  #
  aux11 <- logprior(prop) - logprior(old)
  aux12 <- logL3(prop) - logL3(old)
  #
  oi <- exp(aux11+aux12)
  pracc[ss] <- min(c(1,oi))
  ok[ss] <- 1*(uu[ss] <= pracc[ss])
  mcmc[ss, ] <- (1-ok[ss])*old + ok[ss]*prop
}
#
plot(mcmc[ ,1:2])
plot(mcmc[ ,3])
mu1sim <- mcmc[ ,1]
lambda1sim <- mcmc[ ,2]
sigma1sim <- 1/sqrt(lambda1sim)
asim <- mcmc[ ,3]
kappa1sim <- mu1sim + sigma1sim*qnorm(qq^(1/exp(asim)))
hist(kappa1sim)
print(quantile(kappa0sim, c(0.05,0.50,0.95)))
print(quantile(kappa1sim, c(0.05,0.50,0.95)))
#  0.05  0.50  0.95
# 1.877 2.000 2.112   via Model 0
# 1.864 2.003 2.115   via Model 1 and mcmc
# 1.854 1.991 2.123   via lazy Bayes and the delta method

## last point: the posterior model probabilities
# p1 f1 / (p1 f1 + p2 f2) and p2 f2 / (p1 f1 + p2 f2),
# with f = \int L_n(\theta) \pi(\theta) d\theta for each of the two models;
# note f = \exp(lmax) \int \exp(ll - lmax) {\pi(\theta)/\pi^*(\theta)} \pi^*(\theta) d\theta

## simple BIC / Laplace-type approximations first:
logf0 <- logL0max - 0.5*length(ML0)*log(nn)
logf1 <- logL1max - 0.5*length(ML1)*log(nn)
aux0 <- logf0 + 213   # adding a common constant, to avoid underflow in exp
aux1 <- logf1 + 213
p0star <- exp(aux0) / (exp(aux0) + exp(aux1))   # 0.8446
p1star <- exp(aux1) / (exp(aux0) + exp(aux1))   # 0.1554
print(c(p0star, p1star))

# more careful Laplace approximation:
# f = exp(lmax) n^{-p/2} |\hat J/n|^{-1/2} (2\pi)^{p/2} \pi(\hat\theta),
# with the log-prior at the ML taken as 0 for (mu,sigma)
# and log(1/8) for the uniform [-4,4] prior for a:
logf0more <- logf0 - 0.5*log(det(Jhat0/nn)) + 0 + 0.5*2*log(2*pi)
logf1more <- logf1 - 0.5*log(det(Jhat1/nn)) + log(1/8) + 0.5*3*log(2*pi)
aux0 <- logf0more + 213
aux1 <- logf1more + 213
p0starmore <- exp(aux0) / (exp(aux0) + exp(aux1))
p1starmore <- exp(aux1) / (exp(aux0) + exp(aux1))
print(c(p0starmore, p1starmore))
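## a possible add-on (a sketch, not part of the original script): the marginal
# likelihoods f0 and f1 can also be estimated by plain Monte Carlo, drawing
# parameters from the priors above and averaging the likelihoods, which gives
# a rough (and for n = 189 rather noisy) check on the Laplace approximations.
# This reuses a0, b0, mu0, nu0, ccc, logL0, logL1, logL0max, logL1max from
# above; B and the *prior names are new:
B <- 10^4
lamprior <- rgamma(B, a0, b0)                      # lambda = 1/sigma^2 from the prior
sigprior <- 1/sqrt(lamprior)
muprior <- mu0 + (sigprior/sqrt(nu0))*rnorm(B)     # mu | sigma from the prior
aprior <- runif(B, -ccc, ccc)                      # a from its uniform prior
ll0 <- apply(cbind(muprior, sigprior), 1, logL0)
ll1 <- apply(cbind(muprior, sigprior, aprior), 1, logL1)
# subtract the maxima before exponentiating, to avoid underflow:
logf0mc <- logL0max + log(mean(exp(ll0 - logL0max)))
logf1mc <- logL1max + log(mean(exp(ll1 - logL1max)))
aux0 <- logf0mc + 213
aux1 <- logf1mc + 213
print(exp(aux0)/(exp(aux0) + exp(aux1)))   # Monte Carlo version of p0star
print(exp(aux1)/(exp(aux0) + exp(aux1)))   # Monte Carlo version of p1star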