Bhawna G. Panwar

14 minute read

Logistic Regression was used to run the Credit dataset.

credit <- read.csv("C:/Users/Bhawna/Documents/blog/data/credit.csv")

# examine the launch data
str(credit)
## 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
##  $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ default             : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
# logisitic regression

# set up trainning and test data sets

indx = sample(1:nrow(credit), as.integer(0.9*nrow(credit)))
indx
##   [1] 504 183 769  61 668 182 950 227 404 823 423 454 236 865 294 174 567
##  [18] 473 317 681  74 200 840 274 365 770 420 509 781 953 137 122 562 462
##  [35] 828 923 470 997 341 705 369 872 777 418  53 927 499 168 607 457  41
##  [52] 129 827 584 640 738 482 976 304 390  36 796 816 356 517 968 242 971
##  [69] 123 754 870 275 563 896 752 717 271 750 815 511 407 530 118  18 841
##  [86] 735  50 734 992 930 684 161 195 965 290  13 106 709 925 600 737 916
## [103] 487 783 329   5 175 895  60 797 500 519 393 701 389 415  49 891 384
## [120] 426  68 219 434 638 111   1  38 879 372 807 198 260  10 821 883 644
## [137] 202 825 715 836 272 616 598  16 830 309 412 749 177 497 375 417 676
## [154] 253 498 296 671 951 114 432 746 824 811 871 978 335 438 894 707 545
## [171] 962 608 100 154 990 104 289   8 906 491 117 138 784 868 131 909  48
## [188] 408  40 905 987 522 189 747 993 793  85 712   2 277 218  15 698 613
## [205] 446 647 708 246 897 843 151 333 533 458 226 964 919 901 995  46 360
## [222] 262 295   7 981 570 286 568 213  17 666  97 192 635 521 342 772 669
## [239] 955 143 541 938 231 364 459 520 952 303  70 791 116 193 866 630 822
## [256] 383 264 849 914 611 693 973 970  80 212 658  59 551 771  14 685 648
## [273] 232 575 234 523 476 947 853 903 744 656 931 645 205 299 406 526 651
## [290] 667 624 889 524 851 633 996 537 478 940 721 306 539 867 199 848  81
## [307] 697 786 256 826 578 979 543 683 141 318 959 326 437 918 216 839  33
## [324] 402 689 653 661 321 655 261 224 954 856   9 577 547 163 729  30 753
## [341] 603 818 363 494 751  71  35 222 431 327 546 732 590 109 101 251 490
## [358] 829 700  34 394 642 727 348 595 719 316  69 994 240 880 377 801 486
## [375] 665 312 276 723 165 844 890  90  72 283 493 265 359 774 270 803 720
## [392] 714 305 552 741  78 433 850  92  94 975 411  91 191 832 946 350 589
## [409] 153  65 606  86 452 706 502 550 366 235 787 929  87 542 776 986 255
## [426] 508 810 269 802 349 439 785 913 120 532 673 601 634 538 492 453  79
## [443] 999 400 217 725 833 331 926 949 391 139 157 663 599 559 513 179 942
## [460] 659 489  95 481 628 380 627 768  37 960 325  67 298 576 512 128 430
## [477] 180 760 657 795  21 581 688 892 444 789 156 267 900 686 501 147 819
## [494] 620 586 631 605 352 654  96  51  75 980 814 249  12 579  63 188 536
## [511] 112 718 281 748 293 780 549 215 842 711 302 190 531 873  89 382 310
## [528] 140 121 702 376 728 319 845 875 445 572 378 113  32 817 887 130 127
## [545] 580 710 775 166 247 535 912 695 862 761 429 440 558 311 337 220 928
## [562] 413 922 300 328 985 878 483 677 238 495 409 831 258 564 435 804 888
## [579] 794 282 660   4 991 171  55 790 904 185 196 778 756 228 664 416 726
## [596] 874 397 460 301 733 618 554 911 641 902 908 557 221 755 503 110 496
## [613] 982 164 983  39 211 465 410 948 347 362 798 716 932 134  76  73 560
## [630] 809 571 379 373 428 604 150 399 678 449  11 480 933 988 924 675 854
## [647] 468 625 820 268 155 582  93 471 967 469 966 323 338 119 263 257 133
## [664] 528 427 507 687 680 588 650 353 886 561 813 448 812 956 340 834 455
## [681] 422 574 569 743 479 386 178  20 893 203 773 167 764 989 210  83 145
## [698] 132 103 233  24 184 898 421 740 884 285 672 921 254 766 244 652 292
## [715] 403 172 506  22  88 591 125 186  25 736 371 297 343 339 425  31 800
## [732] 555 475 361 204 344 881 649  26 860  54 699 529 885 450 313 152 162
## [749] 692 593 385 225 730 612 682 206 910 724 284 534 354 969 696 320 280
## [766] 149 108 441 243 587 527 187 855 442 351 632  27 863 565 939 915 525
## [783] 488 876 907 516  52  43 957 583 273 518 170 181 573 358 370 160 679
## [800] 451 806 690 194 463 346  57 241 759 436 107 158 614 670 485 621 443
## [817]  64 484 920 597 374 124 674  82 899 398 250 937 622  44  77 115 388
## [834]  19  58 722 548   6 917 592 639 742 126 544 148 159 345 945 757 307
## [851] 984 858 935 136 972 368 877 314 864 239 619 882 694 835 623 315 102
## [868] 477 788 419 424 703 357 958 936 401 223 805 731  98 763 229 941 808
## [885] 782  99 259 308 846 713 792 602   3 566 847 779 266 629 556 230
credit_train = credit[indx,]
credit_test = credit[-indx,]

credit_train_labels = credit[indx,17]
credit_test_labels = credit[-indx,17]   

# Check if there are any missing values

library(Amelia)
## Warning: package 'Amelia' was built under R version 3.3.3
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(credit, main = "Missing values vs observed")

# number of missing values in each column

sapply(credit,function(x) sum(is.na(x)))
##     checking_balance months_loan_duration       credit_history 
##                    0                    0                    0 
##              purpose               amount      savings_balance 
##                    0                    0                    0 
##  employment_duration    percent_of_income   years_at_residence 
##                    0                    0                    0 
##                  age         other_credit              housing 
##                    0                    0                    0 
## existing_loans_count                  job           dependents 
##                    0                    0                    0 
##                phone              default 
##                    0                    0
# number of unique values in each column

sapply(credit, function(x) length(unique(x)))
##     checking_balance months_loan_duration       credit_history 
##                    4                   33                    5 
##              purpose               amount      savings_balance 
##                    6                  921                    5 
##  employment_duration    percent_of_income   years_at_residence 
##                    5                    4                    4 
##                  age         other_credit              housing 
##                   53                    3                    3 
## existing_loans_count                  job           dependents 
##                    4                    4                    2 
##                phone              default 
##                    2                    2
# fit the logistic regression model, with all predictor variables

model <- glm(default ~.,family=binomial(link='logit'),data=credit_train)
model
## 
## Call:  glm(formula = default ~ ., family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Coefficients:
##                    (Intercept)        checking_balance> 200 DM  
##                     -2.328e+00                      -9.605e-01  
##     checking_balance1 - 200 DM         checking_balanceunknown  
##                     -2.609e-01                      -1.719e+00  
##           months_loan_duration              credit_historygood  
##                      3.438e-02                       1.037e+00  
##          credit_historyperfect              credit_historypoor  
##                      1.493e+00                       6.539e-01  
##        credit_historyvery good                      purposecar  
##                      1.590e+00                       2.389e-01  
##                    purposecar0                purposeeducation  
##                     -2.067e-01                       6.630e-01  
##    purposefurniture/appliances              purposerenovations  
##                     -2.109e-01                       4.148e-01  
##                         amount        savings_balance> 1000 DM  
##                      8.711e-05                      -1.025e+00  
##    savings_balance100 - 500 DM    savings_balance500 - 1000 DM  
##                     -1.707e-01                      -1.808e-01  
##         savings_balanceunknown    employment_duration> 7 years  
##                     -9.805e-01                      -2.460e-01  
## employment_duration1 - 4 years  employment_duration4 - 7 years  
##                     -2.602e-01                      -8.296e-01  
##  employment_durationunemployed               percent_of_income  
##                      5.115e-03                       3.160e-01  
##             years_at_residence                             age  
##                     -1.293e-02                      -1.288e-02  
##               other_creditnone               other_creditstore  
##                     -5.519e-01                      -8.932e-03  
##                     housingown                     housingrent  
##                     -6.249e-02                       3.896e-01  
##           existing_loans_count                      jobskilled  
##                      3.542e-01                       1.388e-01  
##                  jobunemployed                    jobunskilled  
##                     -4.270e-01                      -3.646e-02  
##                     dependents                        phoneyes  
##                      2.380e-01                      -3.638e-01  
## 
## Degrees of Freedom: 899 Total (i.e. Null);  864 Residual
## Null Deviance:       1105 
## Residual Deviance: 848.8     AIC: 920.8
summary(model)
## 
## Call:
## glm(formula = default ~ ., family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9447  -0.7793  -0.4034   0.7916   2.6465  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -2.328e+00  9.573e-01  -2.432 0.015016 *  
## checking_balance> 200 DM       -9.605e-01  3.833e-01  -2.506 0.012214 *  
## checking_balance1 - 200 DM     -2.609e-01  2.129e-01  -1.226 0.220380    
## checking_balanceunknown        -1.719e+00  2.384e-01  -7.211 5.56e-13 ***
## months_loan_duration            3.438e-02  9.682e-03   3.551 0.000384 ***
## credit_historygood              1.037e+00  2.718e-01   3.815 0.000136 ***
## credit_historyperfect           1.493e+00  4.393e-01   3.397 0.000681 ***
## credit_historypoor              6.539e-01  3.485e-01   1.876 0.060630 .  
## credit_historyvery good         1.590e+00  4.385e-01   3.625 0.000289 ***
## purposecar                      2.389e-01  3.276e-01   0.729 0.465827    
## purposecar0                    -2.067e-01  8.204e-01  -0.252 0.801123    
## purposeeducation                6.630e-01  4.644e-01   1.428 0.153406    
## purposefurniture/appliances    -2.109e-01  3.218e-01  -0.655 0.512227    
## purposerenovations              4.148e-01  5.873e-01   0.706 0.480046    
## amount                          8.711e-05  4.532e-05   1.922 0.054591 .  
## savings_balance> 1000 DM       -1.025e+00  5.218e-01  -1.965 0.049430 *  
## savings_balance100 - 500 DM    -1.707e-01  2.916e-01  -0.585 0.558331    
## savings_balance500 - 1000 DM   -1.808e-01  4.100e-01  -0.441 0.659146    
## savings_balanceunknown         -9.805e-01  2.718e-01  -3.607 0.000310 ***
## employment_duration> 7 years   -2.460e-01  2.945e-01  -0.835 0.403563    
## employment_duration1 - 4 years -2.602e-01  2.434e-01  -1.069 0.285122    
## employment_duration4 - 7 years -8.296e-01  3.029e-01  -2.739 0.006161 ** 
## employment_durationunemployed   5.115e-03  4.365e-01   0.012 0.990650    
## percent_of_income               3.160e-01  8.956e-02   3.528 0.000418 ***
## years_at_residence             -1.293e-02  8.785e-02  -0.147 0.883007    
## age                            -1.288e-02  9.309e-03  -1.383 0.166531    
## other_creditnone               -5.519e-01  2.394e-01  -2.306 0.021126 *  
## other_creditstore              -8.932e-03  4.319e-01  -0.021 0.983499    
## housingown                     -6.249e-02  3.024e-01  -0.207 0.836277    
## housingrent                     3.896e-01  3.459e-01   1.127 0.259936    
## existing_loans_count            3.542e-01  2.008e-01   1.764 0.077705 .  
## jobskilled                      1.388e-01  2.906e-01   0.478 0.632853    
## jobunemployed                  -4.270e-01  6.610e-01  -0.646 0.518238    
## jobunskilled                   -3.646e-02  3.532e-01  -0.103 0.917775    
## dependents                      2.380e-01  2.435e-01   0.978 0.328304    
## phoneyes                       -3.638e-01  2.058e-01  -1.767 0.077193 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1104.59  on 899  degrees of freedom
## Residual deviance:  848.83  on 864  degrees of freedom
## AIC: 920.83
## 
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: default
## 
## Terms added sequentially (first to last)
## 
## 
##                      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                   899    1104.59              
## checking_balance      3  121.790       896     982.80 < 2.2e-16 ***
## months_loan_duration  1   40.938       895     941.86 1.571e-10 ***
## credit_history        4   29.624       891     912.24 5.837e-06 ***
## purpose               5    5.555       886     906.68  0.351953    
## amount                1    0.001       885     906.68  0.979092    
## savings_balance       4   17.427       881     889.26  0.001597 ** 
## employment_duration   4    8.392       877     880.87  0.078215 .  
## percent_of_income     1   10.176       876     870.69  0.001422 ** 
## years_at_residence    1    0.007       875     870.68  0.931580    
## age                   1    3.303       874     867.38  0.069163 .  
## other_credit          2    6.226       872     861.15  0.044477 *  
## housing               2    3.669       870     857.48  0.159663    
## existing_loans_count  1    2.926       869     854.56  0.087181 .  
## job                   3    1.623       866     852.93  0.654123    
## dependents            1    0.937       865     852.00  0.333011    
## phone                 1    3.163       864     848.83  0.075311 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# drop the insignificant predictors, alpha = 0.10

model <- glm(default ~ checking_balance + months_loan_duration + credit_history +  percent_of_income + age,family=binomial(link='logit'),data=credit_train)
model
## 
## Call:  glm(formula = default ~ checking_balance + months_loan_duration + 
##     credit_history + percent_of_income + age, family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Coefficients:
##                (Intercept)    checking_balance> 200 DM  
##                   -1.70840                    -1.12915  
## checking_balance1 - 200 DM     checking_balanceunknown  
##                   -0.39299                    -1.90221  
##       months_loan_duration          credit_historygood  
##                    0.03799                     0.68929  
##      credit_historyperfect          credit_historypoor  
##                    1.66167                     0.62875  
##    credit_historyvery good           percent_of_income  
##                    1.49958                     0.20899  
##                        age  
##                   -0.01213  
## 
## Degrees of Freedom: 899 Total (i.e. Null);  889 Residual
## Null Deviance:       1105 
## Residual Deviance: 902.3     AIC: 924.3
summary(model)
## 
## Call:
## glm(formula = default ~ checking_balance + months_loan_duration + 
##     credit_history + percent_of_income + age, family = binomial(link = "logit"), 
##     data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8981  -0.8016  -0.4600   0.8738   2.4245  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.708395   0.443545  -3.852 0.000117 ***
## checking_balance> 200 DM   -1.129147   0.363849  -3.103 0.001914 ** 
## checking_balance1 - 200 DM -0.392991   0.195108  -2.014 0.043986 *  
## checking_balanceunknown    -1.902211   0.223999  -8.492  < 2e-16 ***
## months_loan_duration        0.037992   0.006934   5.479 4.27e-08 ***
## credit_historygood          0.689289   0.209315   3.293 0.000991 ***
## credit_historyperfect       1.661675   0.407930   4.073 4.63e-05 ***
## credit_historypoor          0.628748   0.329187   1.910 0.056133 .  
## credit_historyvery good     1.499582   0.382487   3.921 8.83e-05 ***
## percent_of_income           0.208988   0.075562   2.766 0.005679 ** 
## age                        -0.012131   0.007591  -1.598 0.110026    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1104.59  on 899  degrees of freedom
## Residual deviance:  902.31  on 889  degrees of freedom
## AIC: 924.31
## 
## Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: default
## 
## Terms added sequentially (first to last)
## 
## 
##                      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                   899    1104.59              
## checking_balance      3  121.790       896     982.80 < 2.2e-16 ***
## months_loan_duration  1   40.938       895     941.86 1.571e-10 ***
## credit_history        4   29.624       891     912.24 5.837e-06 ***
## percent_of_income     1    7.327       890     904.91  0.006793 ** 
## age                   1    2.606       889     902.31  0.106489    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# check Accuracy

fitted.results <- predict(model,newdata=credit_test,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)

misClasificError <- mean(fitted.results != credit_test$default)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0"
# ROC
# Because this data set is so small, it is possible that the test data set
# does not contain both 0 and 1 values.  If this happens the code will not
# run.  And since the test data set is so small the ROC is not useful here
# but the code is provided.

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: methods
p <- predict(model, newdata=credit_test, type="response")
pr <- prediction(p, credit_test$default)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.688483
comments powered by Disqus