Transcript
```Milestone2 Logistic Regression and Naive Bayes Classifier
Kushal Thakkar, Snighdha Petluru, Viral Shah
April 4, 2016
Logistic Regression with different Models
## Logistic models of eliteStatus.
## Each fit yields the intercept and the per-feature coefficients (log-odds).
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.2.4
## Warning: package 'foreach' was built under R version 3.2.4
## Dependent variable regressed on each independent variable on its own (m1-m7).
m1 <- glm(eliteStatus ~ review_count, family = binomial(),
          data = train_vardata)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m2 <- glm(eliteStatus ~ nmonths, family = binomial(), data = train_vardata)
m3 <- glm(eliteStatus ~ fans, family = binomial(), data = train_vardata)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m4 <- glm(eliteStatus ~ total_compliments, family = binomial(),
          data = train_vardata)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## m5 was missing from the extracted text; restored from the Call echoed by
## summary(m5).
m5 <- glm(eliteStatus ~ total_votes, family = binomial(),
          data = train_vardata)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m6 <- glm(eliteStatus ~ nfriends, family = binomial(), data = train_vardata)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m7 <- glm(eliteStatus ~ AverageLeniencyScore, family = binomial(),
          data = train_vardata)
## Multi-predictor models. maxit = 100 raises the iteration cap: the summaries
## of these fits report 31, 36 and 76 Fisher scoring iterations.
m8 <- glm(eliteStatus ~ nmonths + review_count + nfriends + fans +
            total_compliments + total_votes,
          family = binomial(), data = train_vardata, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## m9 drops nfriends and adds AverageLeniencyScore (formula restored from the
## Call echoed by summary(m9)).
m9 <- glm(eliteStatus ~ nmonths + review_count + fans + total_compliments +
            total_votes + AverageLeniencyScore,
          family = binomial(), data = train_vardata, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Full model with every predictor (formula restored from the Call echoed by
## summary(logreg.model)).
logreg.model <- glm(eliteStatus ~ nmonths + review_count + fans +
                      total_compliments + total_votes + nfriends +
                      AverageLeniencyScore,
                    family = binomial(), data = train_vardata, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
library(aod)
## Warning: package 'aod' was built under R version 3.2.4
Coefficients, Intercepts and Odds Ratios of Each Model
##coefficients and intercepts of each of the logistic regression model
##with individual independent feature and the full model
summary(m1)
##
## Call:
## glm(formula = eliteStatus ~ review_count, family = binomial(),
##     data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.2009  -0.1753  -0.1660   2.9205
##
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -4.3328801  0.0149974  -288.9   <2e-16 ***
## review_count  0.0274719  0.0001509   182.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  75625  on 306400  degrees of freedom
## AIC: 75629
##
## Number of Fisher Scoring iterations: 9
summary(m2)
##
## Call:
## glm(formula = eliteStatus ~ nmonths, family = binomial(), data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.3949  -0.3948  -0.2751  -0.1870   3.0569
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.8192167  0.0209684  -229.8   <2e-16 ***
## nmonths      0.0390971  0.0002914   134.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance: 135274  on 306400  degrees of freedom
## AIC: 135278
##
## Number of Fisher Scoring iterations: 6
summary(m3)
##
## Call:
## glm(formula = eliteStatus ~ fans, family = binomial(), data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.2247  -0.2247  -0.2247   2.7173
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.666728   0.011538  -317.8   <2e-16 ***
## fans         0.441793   0.002785   158.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  92066  on 306400  degrees of freedom
## AIC: 92070
##
## Number of Fisher Scoring iterations: 14
summary(m4)
##
## Call:
## glm(formula = eliteStatus ~ total_compliments, family = binomial(),
##     data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.2768  -0.2713  -0.2713   2.5770
##
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)       -3.2836673  0.0097478  -336.9   <2e-16 ***
## total_compliments  0.0407717  0.0003189   127.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  94972  on 306400  degrees of freedom
## AIC: 94976
##
## Number of Fisher Scoring iterations: 12
summary(m5)
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Call:
glm(formula = eliteStatus ~ total_votes, family = binomial(),
data = train_vardata)
Deviance Residuals:
Min
1Q
Median
-8.4904 -0.2259 -0.2112
3Q
-0.2070
Max
2.7649
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.841e+00 1.240e-02 -309.7
<2e-16 ***
160.1
<2e-16 ***
--Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 155622
Residual deviance: 80606
AIC: 80610
on 306401
on 306400
degrees of freedom
degrees of freedom
Number of Fisher Scoring iterations: 14
summary(m6)
##
## Call:
## glm(formula = eliteStatus ~ nfriends, family = binomial(), data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.3001  -0.2864  -0.2864   2.5353
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1729499  0.0092741  -342.1   <2e-16 ***
## nfriends     0.0476524  0.0003641   130.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance: 122759  on 306400  degrees of freedom
## AIC: 122763
##
## Number of Fisher Scoring iterations: 7
summary(m7)
##
## Call:
## glm(formula = eliteStatus ~ AverageLeniencyScore, family = binomial(),
##     data = train_vardata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -0.4412  -0.3924  -0.3822  -0.3626   2.4118
##
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -2.607211   0.007352 -354.64   <2e-16 ***
## AverageLeniencyScore  0.081692   0.006208   13.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance: 155444  on 306400  degrees of freedom
## AIC: 155448
##
## Number of Fisher Scoring iterations: 5
summary(m8)
##
## Call:
## glm(formula = eliteStatus ~ nmonths + review_count + nfriends +
##     fans + total_compliments + total_votes, family = binomial(),
##     data = train_vardata, maxit = 100)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.1996  -0.1682  -0.1495   2.8998
##
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)
## (Intercept)       -4.746e+00  2.672e-02 -177.611  < 2e-16 ***
## nmonths            7.711e-03  4.329e-04   17.811  < 2e-16 ***
## review_count       2.017e-02  2.097e-04   96.211  < 2e-16 ***
## nfriends           6.947e-03  4.083e-04   17.013  < 2e-16 ***
## fans               7.724e-02  2.753e-03   28.055  < 2e-16 ***
## total_compliments -5.015e-04  7.232e-05   -6.934 4.08e-12 ***
## total_votes        8.786e-04  6.511e-05   13.495  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  71628  on 306395  degrees of freedom
## AIC: 71642
##
## Number of Fisher Scoring iterations: 31
summary(m9)
##
## Call:
## glm(formula = eliteStatus ~ nmonths + review_count + fans + total_compliments +
##     total_votes + AverageLeniencyScore, family = binomial(),
##     data = train_vardata, maxit = 100)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.2010  -0.1701  -0.1510   2.9246
##
## Coefficients:
##                        Estimate Std. Error  z value Pr(>|z|)
## (Intercept)          -4.732e+00  2.674e-02 -176.966  < 2e-16 ***
## nmonths               7.500e-03  4.316e-04   17.379  < 2e-16 ***
## review_count          1.992e-02  2.108e-04   94.499  < 2e-16 ***
## fans                  8.763e-02  2.970e-03   29.506  < 2e-16 ***
## total_compliments    -4.816e-04  6.375e-05   -7.554 4.22e-14 ***
## total_votes           1.152e-03  6.576e-05   17.521  < 2e-16 ***
## AverageLeniencyScore  8.946e-02  1.097e-02    8.154 3.51e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  71842  on 306395  degrees of freedom
## AIC: 71856
##
## Number of Fisher Scoring iterations: 36
summary(logreg.model)
##
## Call:
## glm(formula = eliteStatus ~ nmonths + review_count + fans + total_compliments +
##     total_votes + nfriends + AverageLeniencyScore, family = binomial(),
##     data = train_vardata, maxit = 100)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -8.4904  -0.2006  -0.1685  -0.1489   2.9305
##
## Coefficients:
##                        Estimate Std. Error  z value Pr(>|z|)
## (Intercept)          -4.771e+00  2.700e-02 -176.666  < 2e-16 ***
## nmonths               7.718e-03  4.329e-04   17.826  < 2e-16 ***
## review_count          2.015e-02  2.097e-04   96.083  < 2e-16 ***
## fans                  7.717e-02  2.750e-03   28.058  < 2e-16 ***
## total_compliments    -5.041e-04  7.443e-05   -6.773 1.26e-11 ***
## total_votes           8.877e-04  6.524e-05   13.607  < 2e-16 ***
## nfriends              6.831e-03  4.073e-04   16.770  < 2e-16 ***
## AverageLeniencyScore  8.414e-02  1.102e-02    7.638 2.20e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 155622  on 306401  degrees of freedom
## Residual deviance:  71567  on 306394  degrees of freedom
## AIC: 71583
##
## Number of Fisher Scoring iterations: 76
##odds ratios of the outcome
exp(coef(m1))
##  (Intercept) review_count
##   0.01312968   1.02785270
exp(coef(m2))
## (Intercept)     nmonths
## 0.008073108 1.039871482
exp(coef(m3))
## (Intercept)        fans
##  0.02555996  1.55549346
exp(coef(m4))
##       (Intercept) total_compliments
##        0.03749052        1.04161431
exp(coef(m5))
## (Intercept) total_votes
##  0.02148023  1.00808852
exp(coef(m6))
## (Intercept)    nfriends
##  0.04187988  1.04880599
exp(coef(m7))
##          (Intercept) AverageLeniencyScore
##            0.0737399            1.0851213
exp(coef(m8))
##       (Intercept)           nmonths      review_count          nfriends
##       0.008683711       1.007741165       1.020378145       1.006971213
##              fans total_compliments       total_votes
##       1.080306144       0.999498617       1.000879009
exp(coef(m9))
##          (Intercept)              nmonths         review_count
##          0.008806231          1.007528580          1.020124506
##                 fans    total_compliments          total_votes
##          1.091584449          0.999518547          1.001152794
## AverageLeniencyScore
##          1.093580534
exp(coef(logreg.model))
##          (Intercept)              nmonths         review_count
##          0.008476039          1.007747561          1.020356199
##                 fans    total_compliments          total_votes
##          1.080222128          0.999495988          1.000888083
##             nfriends AverageLeniencyScore
##          1.006854137          1.087781343
Correlations of predicted and actual values of different models
## Predict eliteStatus on the test data with every fitted model.
## type = "response" asks predict() for values on the response (probability)
## scale rather than the link (log-odds) scale.
pred1 <- predict(m1, test_vardata, type = "response")
pred2 <- predict(m2, test_vardata, type = "response")
pred3 <- predict(m3, test_vardata, type = "response")
pred4 <- predict(m4, test_vardata, type = "response")
pred5 <- predict(m5, test_vardata, type = "response")
pred6 <- predict(m6, test_vardata, type = "response")
pred7 <- predict(m7, test_vardata, type = "response")
pred8 <- predict(m8, test_vardata, type = "response")
pred9 <- predict(m9, test_vardata, type = "response")
## The full model's prediction is stored as a column of the test data so it
## can be plotted and correlated alongside the actual eliteStatus.
test_vardata$eliteStatusP <- predict(logreg.model, test_vardata,
                                     type = "response")
## Correlation between each model's predicted value and the actual eliteStatus
## in the test data.
cor(pred1, test_vardata$eliteStatus)
## [1] 0.7275434
cor(pred2, test_vardata$eliteStatus)
## [1] 0.2990372
cor(pred3, test_vardata$eliteStatus)
## [1] 0.6869403
cor(pred4, test_vardata$eliteStatus)
## [1] 0.6657621
cor(pred5, test_vardata$eliteStatus)
## [1] 0.7265785
cor(pred6, test_vardata$eliteStatus)
## [1] 0.4689936
cor(pred7, test_vardata$eliteStatus)
## [1] 0.01568872
cor(pred8, test_vardata$eliteStatus)
## [1] 0.7452816
cor(pred9, test_vardata$eliteStatus)
## [1] 0.7448852
cor(test_vardata$eliteStatusP, test_vardata$eliteStatus)
## [1] 0.74534
Graph of Predicted and Actual Elite and Non-elite Users
## Graphical representation of the predicted eliteStatus against nfriends and
## the average leniency score.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
## Scatter plot of the test data; point colour carries the predicted value.
predictplot <- ggplot(
  test_vardata,
  aes(x = nfriends, y = AverageLeniencyScore, color = eliteStatusP)
) +
  geom_point()
## One panel (row) per actual eliteStatus value, so predicted and actual status
## can be compared.
predictplot +
  facet_grid(eliteStatus ~ .) +
  labs(
    x = "Number of Friends",
    y = "Average Leniency Score",
    color = "Predicted Elite Status",
    title = "Actual vs Predicted Elite Status of User"
  )
The graph plots each test user's predicted elite status (point color) by number of friends and average leniency score,
with one panel per actual status (elite vs. non-elite).
## Bayes Classifier
## The first 75% of the rows are the training data; the remaining 25% are the
## test data.
## NOTE(review): setwd() with an absolute, machine-specific path only works on
## the original author's computer; kept so relative paths resolve as before.
setwd("C:/Users/ESHAN/Desktop/Viral/Dropbox/YelpAnalysis/Datasets")
continous_data <- read.csv("continuous random.csv", stringsAsFactors = FALSE)
## floor() makes the split point a whole row index.
n_rows <- nrow(continous_data)
row_divider <- floor(n_rows * 0.75)
train_data_bayes <- continous_data[seq_len(row_divider), ]
## The test rows are strictly after row_divider. The original slice started at
## row_divider itself, which put the boundary row in both sets.
## NOTE(review): output recorded elsewhere in this report appears to have been
## produced with the overlapping split, so test-set counts may be one row
## higher there.
test_data_bayes <- continous_data[seq_len(n_rows) > row_divider, ]
## Convert every column to a factor. lapply() returns a plain list of factors
## (not a data frame); later steps index it with [9] and $eliteStatus.
train_data_bayes <- lapply(train_data_bayes, factor)
test_data_bayes <- lapply(test_data_bayes, factor)
## Check the proportion of elite (1) and non-elite (0) users in each split.
prop.table(table(train_data_bayes$eliteStatus))
##
##          0          1
## 0.92961534 0.07038466
prop.table(table(test_data_bayes$eliteStatus))
##
##          0          1
## 0.92993587 0.07006413
## Apply the naive Bayes classifier.
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.4
## The only predictor is element 9 of the training data, with eliteStatus as
## the class label.
## NOTE(review): which variable sits at position 9 is not visible here -
## confirm, and consider selecting it by name.
elite_classifier <- naiveBayes(train_data_bayes[9],
                               train_data_bayes$eliteStatus)
## Model performance: predicted class for every test row.
elite_classifier_pred <- predict(elite_classifier, test_data_bayes)
summary(elite_classifier_pred)
##     0     1
## 82110 20025
## Cross table of actual vs predicted class. Only counts and row proportions
## are shown; chi-square, table and column proportions are switched off.
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.2.4
CrossTable(test_data_bayes$eliteStatus, elite_classifier_pred,
           prop.chisq = FALSE, prop.t = FALSE, prop.c = FALSE,
           dnn = c("actual", "predicted"))
##
##
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table:  102135
##
##
##              | predicted
##       actual |         0 |         1 | Row Total |
## -------------|-----------|-----------|-----------|
##            0 |     81934 |     13045 |     94979 |
##              |     0.863 |     0.137 |     0.930 |
## -------------|-----------|-----------|-----------|
##            1 |       176 |      6980 |      7156 |
##              |     0.025 |     0.975 |     0.070 |
## -------------|-----------|-----------|-----------|
## Column Total |     82110 |     20025 |    102135 |
## -------------|-----------|-----------|-----------|
## Naive Bayes with a Laplace smoother (laplace = 1), using the same single
## predictor (element 9) as the unsmoothed classifier.
elite_classifier_laplace <- naiveBayes(train_data_bayes[9],
                                       train_data_bayes$eliteStatus,
                                       laplace = 1)
## Prediction with the Laplace-smoothed classifier.
elite_classifier_pred_laplace <- predict(elite_classifier_laplace,
                                         test_data_bayes)
## Cross table of actual vs predicted class (counts and row proportions only).
CrossTable(test_data_bayes$eliteStatus, elite_classifier_pred_laplace,
           prop.chisq = FALSE, prop.t = FALSE, prop.c = FALSE,
           dnn = c("actual", "predicted"))
##
##
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table:  102135
##
##
##              | predicted
##       actual |         0 |         1 | Row Total |
## -------------|-----------|-----------|-----------|
##            0 |     87504 |      7475 |     94979 |
##              |     0.921 |     0.079 |     0.930 |
## -------------|-----------|-----------|-----------|
##            1 |       365 |      6791 |      7156 |
##              |     0.051 |     0.949 |     0.070 |
## -------------|-----------|-----------|-----------|
## Column Total |     87869 |     14266 |    102135 |
## -------------|-----------|-----------|-----------|
##
##
R Markdown
```
