rm(list=ls())
library(AER)
data(Affairs, package='AER')
summary(Affairs)
affairs gender age yearsmarried
Min. : 0.000 female:315 Min. :17.50 Min. : 0.125
1st Qu.: 0.000 male :286 1st Qu.:27.00 1st Qu.: 4.000
Median : 0.000 Median :32.00 Median : 7.000
Mean : 1.456 Mean :32.49 Mean : 8.178
3rd Qu.: 0.000 3rd Qu.:37.00 3rd Qu.:15.000
Max. :12.000 Max. :57.00 Max. :15.000
children religiousness education occupation
no :171 Min. :1.000 Min. : 9.00 Min. :1.000
yes:430 1st Qu.:2.000 1st Qu.:14.00 1st Qu.:3.000
Median :3.000 Median :16.00 Median :5.000
Mean :3.116 Mean :16.17 Mean :4.195
3rd Qu.:4.000 3rd Qu.:18.00 3rd Qu.:6.000
Max. :5.000 Max. :20.00 Max. :7.000
rating
Min. :1.000
1st Qu.:3.000
Median :4.000
Mean :3.932
3rd Qu.:5.000
Max. :5.000
table(Affairs$affairs)
0 1 2 3 7 12
451 34 17 19 42 38
## the binary outcome is of interest
Affairs$ynaffair[Affairs$affairs > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
## write data to file
con <- file(description="ynaffair.dat", open="wb")
writeBin(object=as.double(Affairs$ynaffair), con=con)
close(con)
Affairs$ynaffair <- factor(Affairs$ynaffair,
levels = c(0, 1),
labels = c("NO", "Yes"))
table(Affairs$ynaffair)
NO Yes
451 150
## full logistic regression model
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children + religiousness + education + occupation + rating, data = Affairs, family = binomial())
summary(fit.full)
Call:
glm(formula = ynaffair ~ gender + age + yearsmarried + children +
religiousness + education + occupation + rating, family = binomial(),
data = Affairs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5713 -0.7499 -0.5690 -0.2539 2.5191
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.37726 0.88776 1.551 0.120807
gendermale 0.28029 0.23909 1.172 0.241083
age -0.04426 0.01825 -2.425 0.015301 *
yearsmarried 0.09477 0.03221 2.942 0.003262 **
childrenyes 0.39767 0.29151 1.364 0.172508
religiousness -0.32472 0.08975 -3.618 0.000297 ***
education 0.02105 0.05051 0.417 0.676851
occupation 0.03092 0.07178 0.431 0.666630
rating -0.46845 0.09091 -5.153 2.56e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 675.38 on 600 degrees of freedom
Residual deviance: 609.51 on 592 degrees of freedom
AIC: 627.51
Number of Fisher Scoring iterations: 4
## reduced model
fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness + rating, data = Affairs, family = binomial())
summary(fit.reduced)
Call:
glm(formula = ynaffair ~ age + yearsmarried + religiousness +
rating, family = binomial(), data = Affairs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6278 -0.7550 -0.5701 -0.2624 2.3998
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.93083 0.61032 3.164 0.001558 **
age -0.03527 0.01736 -2.032 0.042127 *
yearsmarried 0.10062 0.02921 3.445 0.000571 ***
religiousness -0.32902 0.08945 -3.678 0.000235 ***
rating -0.46136 0.08884 -5.193 2.06e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 675.38 on 600 degrees of freedom
Residual deviance: 615.36 on 596 degrees of freedom
AIC: 625.36
Number of Fisher Scoring iterations: 4
## fit.reduced <- glm(ynaffair ~ age + yearsmarried + religiousness + rating, data = Affairs, family = poisson)
## summary(fit.reduced)
## write data to file
df <- as.matrix(Affairs[c(3,4,6,9)])
N = 601
con <- file(description="x.dat", open="wb")
for(i in 1:N){
writeBin(object=as.vector(as.double(c(1, df[i,])), mode = "numeric"), con=con)
}
close(con)
## compare two model
anova(fit.reduced, fit.full, test = 'Chisq')
Analysis of Deviance Table
Model 1: ynaffair ~ age + yearsmarried + religiousness + rating
Model 2: ynaffair ~ gender + age + yearsmarried + children + religiousness +
education + occupation + rating
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 596 615.36
2 592 609.51 4 5.8474 0.2108
## The nonsignificant chi-square value suggests that the reduced model fits as well as the full model.
## ##########################
## Model Parameters
## ##########################
coef(fit.reduced)
(Intercept) age yearsmarried religiousness rating
1.93083017 -0.03527112 0.10062274 -0.32902386 -0.46136144
exp(coef(fit.reduced))
(Intercept) age yearsmarried religiousness rating
6.8952321 0.9653437 1.1058594 0.7196258 0.6304248
## ##########################
## Predict
## ##########################
testdata <- data.frame(rating=c(1, 2, 3, 4, 5), age=mean(Affairs$age),
yearsmarried=mean(Affairs$yearsmarried),
religiousness=mean(Affairs$religiousness))
testdata$prob <- predict(fit.reduced, newdata=testdata, type="response")
testdata
## ##########################
## overdispersion
##
## Overdispersion occurs when the observed variance of the response variable is
## larger than what would be expected from a binomial distribution.
##
## One way to detect overdispersion is to compare the residual deviance with the
## residual degrees of freedom in your binomial model.
##
## If the ratio phi, is considerably larger than 1, you have evidence of overdispersion.
## ##########################
phi = 615.4/596
## another way for testing Overdispersion
fit <- glm(ynaffair ~ age + yearsmarried + religiousness +
rating, family = binomial(), data = Affairs)
fit.od <- glm(ynaffair ~ age + yearsmarried + religiousness +
rating, family = quasibinomial(), data = Affairs)
pchisq(summary(fit.od)$dispersion * fit$df.residual,
fit$df.residual, lower = F)
[1] 0.340122
## The resulting p-value is clearly not significant, strengthening our belief that Overdispersion is not a problem.
## data
x = matrix(c(1, 10,25,1, 5, 27, 1, 8, 18,1, 11, 11,1, 12, 28, 1,14, 3,1,3, 6, 1,1, 30,1, 16, 26, 1,18, 22), byrow=T, ncol=3)
y = c(0, 0, 0, 0, 1, 1, 1, 1, 1, 1)
x1 = x[,2]
x2 = x[,3]
## logistic regression
model2 = glm(y~x1+x2, family = binomial())
summary(model2)
Call:
glm(formula = y ~ x1 + x2, family = binomial())
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4408 -1.2713 0.8002 0.9391 1.3365
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.146584 2.022968 -0.072 0.942
x1 0.077872 0.126582 0.615 0.538
x2 -0.009927 0.072092 -0.138 0.890
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 13.460 on 9 degrees of freedom
Residual deviance: 13.034 on 7 degrees of freedom
AIC: 19.034
Number of Fisher Scoring iterations: 4
The results of cpp program are as follows: