Tags

, , , , ,

# Continuing from the Part 1...

# For this tutorial I am using code written by Robert Brandon Gramacy of The University of Chicago Booth (I found this to be wonderful and hence though up including into my blog)

# reading csv data into R environment

credit <- read.csv("germancredit1.csv")
# Let us use 80% of the total as training data set
train <- 1:800

## using credit history build a generalized linear model, since we have credit history in the data.
null <- glm(GoodCredit ~ history3, family=binomial, data=credit[train, ])

# similarly, build a model using complete set of data
full <- glm(GoodCredit ~ ., family = binomial, data = credit[train,])

# stepwise regression modelling
reg <- step(null, scope=formula(full), direction="forward", k=log(length(train)))

# results would be as follows:
## Start:  AIC=967.1
## GoodCredit ~ history3
## 
##                   Df Deviance AIC
## + checkingstatus1  3      848 902
## + duration2        1      906 947
## + amount5          1      920 960
## + foreign20        1      926 966
## + installment8     1      926 966
## <none>                    934 967
## + age13            1      929 969
## + savings6         4      910 970
## + housing15        2      924 970
## + others10         2      925 971
## + cards16          1      932 973
## + tele19           1      933 973
## + status9          3      919 973
## + property12       3      920 973
## + residence11      1      934 974
## + liable18         1      934 974
## + otherplans14     2      931 978
## + employ7          4      919 980
## + job17            3      930 984
## + purpose4         9      900 994
## 
## Step:  AIC=901.7
## GoodCredit ~ history3 + checkingstatus1
## 
##                Df Deviance AIC
## + duration2     1      824 884
## + amount5       1      837 898
## + installment8  1      840 900
## + foreign20     1      840 900
## <none>                 848 902
## + others10      2      835 902
## + age13         1      844 904
## + cards16       1      847 907
## + tele19        1      848 908
## + liable18      1      848 908
## + residence11   1      848 908
## + housing15     2      842 909
## + property12    3      836 910
## + status9       3      837 910
## + otherplans14  2      846 913
## + job17         3      844 918
## + savings6      4      838 918
## + employ7       4      838 918
## + purpose4      9      822 936
## 
## Step:  AIC=884
## GoodCredit ~ history3 + checkingstatus1 + duration2
## 
##                Df Deviance AIC
## + installment8  1      816 883
## <none>                 824 884
## + others10      2      811 885
## + foreign20     1      818 885
## + age13         1      820 887
## + cards16       1      822 889
## + tele19        1      822 889
## + status9       3      809 890
## + amount5       1      824 891
## + residence11   1      824 891
## + liable18      1      824 891
## + housing15     2      819 893
## + otherplans14  2      822 896
## + employ7       4      810 897
## + property12    3      819 899
## + savings6      4      814 901
## + job17         3      822 903
## + purpose4      9      793 913
## 
## Step:  AIC=882.9
## GoodCredit ~ history3 + checkingstatus1 + duration2 + installment8
## 
##                Df Deviance AIC
## <none>                 816 883
## + others10      2      804 884
## + age13         1      811 885
## + foreign20     1      812 885
## + status9       3      798 885
## + amount5       1      813 887
## + cards16       1      814 888
## + tele19        1      815 889
## + residence11   1      816 890
## + liable18      1      816 890
## + housing15     2      811 891
## + employ7       4      801 894
## + otherplans14  2      814 895
## + property12    3      811 898
## + savings6      4      806 899
## + job17         3      815 902
## + purpose4      9      786 913

# all other variables are excluded (due to the reason that they are not significant, I guess..)
# make prediction using "response" from train data set (using credit history) 
predreg <- predict(reg, newdata = credit[-train,], type="response") 

# make prediction using "response" from train data set (using complete set)
predfull <- predict(full, newdata = credit[-train,], type="response"

# storing errors 
errorreg <- credit[-train, 1] - (predreg >= .5)
errorfull <- credit[-train, 1] - (predfull >= .5) 

mean(abs(errorreg))
## [1] 0.22
mean(abs(errorfull))
## [1] 0.265
covars <- credit[, 2:21]
Y <- credit$Good
index <- credit$history3[-train]=="A32"
locs <- (covars[-train, c(2, 8)])[index, ]
check <- (covars[-train, 1])[index]
resp <- (Y[-train])[index]+1

duration <- 1:72
installment <- 1:4
grid <- expand.grid(duration2 = duration, installment8 = installment)

par(mfrow = c(2, 2), mai = c(.3, .3, .2, .1), omi = c(.3, .3, .3, .1))

newdata <- cbind(grid, checkingstatus1 = rep("A11", 288), history3 = "A32")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2, 5), xlab="", ylab="")
text(x = 65, y = 3.5,labels = "A11", font = 2)
points(locs[check == "A11", ], pch = c(4, 1)[resp[check == "A11"]])

newdata <- cbind(grid, checkingstatus1 = rep("A12", 288), history3="A32")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A12", font=2)
points(locs[check=="A12",], pch=c(4,1)[resp[check=="A12"]])

newdata <- cbind(grid, checkingstatus1 = rep("A13", 288), history3="A32")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A13", font=2)
points(locs[check=="A13",], pch=c(4,1)[resp[check=="A13"]])

newdata <- cbind(grid, checkingstatus1 = rep("A14", 288), history3="A32")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A14", font=2)
points(locs[check=="A14",], pch=c(4,1)[resp[check=="A14"]])

## Throw on the axis labels
mtext("duration", side=1, font=3, outer=TRUE, line=.5)
mtext("installment", side=2, font=3, outer=TRUE, line=.5)
mtext("One time default borrowers", side=3, font=2, outer=TRUE)

cs0

index <- credit$history3[-train]=="A34"
locs <- (covars[-train,c(2,8)])[index,]
check <- (covars[-train,1])[index]
resp <- (Y[-train])[index]+1
par(mfrow=c(2,2), mai=c(.3,.3,.2,.1), omi=c(.3,.3,.3,.1))

duration <- 1:72
installment <-1:4
grid <- expand.grid( duration2=duration, installment8=installment)

newdata <- cbind(grid, checkingstatus1 = rep("A11", 288), history3="A34")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A11", font=2)
points(locs[check=="A11",], pch=c(4,1)[resp[check=="A11"]])

newdata <- cbind(grid, checkingstatus1 = rep("A12", 288), history3="A34")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A12", font=2)
points(locs[check=="A12",], pch=c(4,1)[resp[check=="A12"]])

newdata <- cbind(grid, checkingstatus1 = rep("A13", 288), history3="A34")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A13", font=2)
points(locs[check=="A13",], pch=c(4,1)[resp[check=="A13"]])

newdata <- cbind(grid, checkingstatus1 = rep("A14", 288), history3="A34")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A14", font=2)
points(locs[check=="A14",], pch=c(4,1)[resp[check=="A14"]])

mtext("duration", side=1, font=3, outer=TRUE, line=.5)
mtext("installment", side=2, font=3, outer=TRUE, line=.5)
mtext("Borrowers with multiple defaults", side=3, font=2, outer=TRUE)

cs1

index <- credit$history3[-train]=="A30"
locs <- (covars[-train,c(2,8)])[index,]
check <- (covars[-train,1])[index]
resp <- (Y[-train])[index]+1
par(mfrow=c(2,2), mai=c(.3,.3,.2,.1), omi=c(.3,.3,.3,.1))


duration <- 1:72
installment <-1:4
grid <- expand.grid( duration2=duration, installment8=installment)

newdata <- cbind(grid, checkingstatus1 = rep("A11", 288), history3="A30")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A11", font=2)
points(locs[check=="A11",], pch=c(4,1)[resp[check=="A11"]])

newdata <- cbind(grid, checkingstatus1 = rep("A12", 288), history3="A30")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A12", font=2)
points(locs[check=="A12",], pch=c(4,1)[resp[check=="A12"]])

newdata <- cbind(grid, checkingstatus1 = rep("A13", 288), history3="A30")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A13", font=2)
points(locs[check=="A13",], pch=c(4,1)[resp[check=="A13"]])

newdata <- cbind(grid, checkingstatus1 = rep("A14", 288), history3="A30")
pred <- predict(reg, newdata)
type <-  matrix((exp(pred)/(1+exp(pred))) > 0.5, ncol=4)
image(x=duration, y=installment, z=type, col = c(2,5), xlab="", ylab="")
text(x=65,y=3.5,labels="A14", font=2)
points(locs[check=="A14",], pch=c(4,1)[resp[check=="A14"]])

mtext("duration", side=1, font=3, outer=TRUE, line=.5)
mtext("installment", side=2, font=3, outer=TRUE, line=.5)
mtext("Clean-history borrowers", side=3, font=2, outer=TRUE)

cs2

There're many other ways we can make predictions using this data set.
I will soon come back with some more tutorials on the topic...

 

Advertisements