#~~~~~~~~~~~~~~~~~~~~~~~
# Sentiment analysis - Luca Vehbiu
# 06/02/2018
#~~~~~~~~~~~~~~~~~~~~~~~
#Load libraries
require(pacman)
p_load(tidyverse, ggplot2, doParallel, modelr, caret, plotly)
# Set up parallel processing ----
# caret's train() runs its resampling loop on the registered foreach backend,
# so the cluster has to stay alive until the last model has been fitted.
# Cap the pool at 4 workers, but never ask for more workers than cores
# (detectCores() may return NA, hence na.rm).
n_workers <- min(4L, detectCores(), na.rm = TRUE)
cl <- makeCluster(n_workers) # start the worker processes
registerDoParallel(cl) # register them as the foreach backend
getDoParWorkers() # check how many workers are registered
# The cluster is deliberately NOT stopped here: stopping it right after
# registration leaves a dead backend registered, and later train() calls
# then fail (typically with "invalid connection"). Run stopCluster(cl)
# by hand once every model in this script has been trained.
# Labelled small matrices, one file per handset
iphone <- read.csv(file = "iphone_smallmatrix_labeled_8d.csv", header = TRUE)
galaxy <- read.csv(file = "galaxy_smallmatrix_labeled_8d.csv", header = TRUE)
# Validation sets obtained from AWS EMR, one file per run
first.run <- read.csv(file = "firstrun.csv", header = TRUE)
second.run <- read.csv(file = "secondrun.csv", header = TRUE)
third.run <- read.csv(file = "thirdrun.csv", header = TRUE)
# Stack the three runs row-wise into a single validation frame
valid <- do.call(rbind, list(first.run, second.run, third.run))
# Score the validation set with both random forests ----
# model_rf and model_galaxy_rf are fitted further down in this script, so
# those sections must be run first. Fail early with a clear message rather
# than with an "object not found" error from inside add_predictions().
if (!exists("model_rf") || !exists("model_galaxy_rf")) {
  stop("Train model_rf and model_galaxy_rf before scoring `valid`.",
       call. = FALSE)
}
# `var` names the prediction column directly, so no rename step is needed
valid <- add_predictions(valid, model_rf, var = "iphonesentiment")
valid <- add_predictions(valid, model_galaxy_rf, var = "galaxysentiment")
# How often each pair of predicted sentiments occurs
valid %>% count(iphonesentiment, galaxysentiment)
# Side-by-side bars of the two predicted distributions. The data is reshaped
# to long form because dodging only works within a single layer, and the
# colours are set through a manual scale: a constant inside aes() is treated
# as a data value, not as a colour.
valid %>%
  pivot_longer(
    cols = c(iphonesentiment, galaxysentiment),
    names_to = "phone",
    values_to = "sentiment"
  ) %>%
  ggplot(aes(x = sentiment, fill = phone)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(
    values = c(iphonesentiment = "blue", galaxysentiment = "green")
  )
# Check the distribution of phone sentiments ----
# Violins use the labelled small matrices; histograms use `valid`.
plot_ly(iphone, x = ~iphonesentiment, type = "violin")
# I() passes "green" through as a literal colour; a bare string would be
# treated as a data value and mapped onto plotly's default colour scale.
plot_ly(valid, x = ~iphonesentiment, type = "histogram", color = I("green"))
plot_ly(galaxy, x = ~galaxysentiment, type = "violin")
plot_ly(valid, x = ~galaxysentiment, type = "histogram")
# Oversample sentiment class 2 to correct for class imbalance ----
# NOTE(review): no.redundant is not created anywhere in this script; it is
# presumably the iPhone matrix with redundant features removed - TODO confirm.
# NOTE(review): the extra rows are added before the train/test split, so
# copies of one row can end up in both partitions and inflate test accuracy.
# Consider oversampling the training partition only.
set.seed(568) # make the drawn rows reproducible
n_extra <- 1500 # number of class-2 rows to append
dupli <- no.redundant %>% filter(iphonesentiment == 2)
if (nrow(dupli) > 0) {
  # Sample without replacement when enough rows exist (as before); fall back
  # to sampling with replacement instead of erroring when there are fewer.
  extra_idx <- sample(
    seq_len(nrow(dupli)),
    size = n_extra,
    replace = nrow(dupli) < n_extra
  )
  dupli <- dupli[extra_idx, ]
  no.redundant <- rbind(no.redundant, dupli)
} else {
  warning("No rows with iphonesentiment == 2; nothing was oversampled.",
          call. = FALSE)
}
## iPhone: outcome as factor, resampling scheme and train/test partition
# A factor outcome makes caret fit classification models
no.redundant[["iphonesentiment"]] <-
  as.factor(no.redundant[["iphonesentiment"]])
set.seed(568)
# Repeated cross-validation: 2 folds, repeated 2 times
trctrl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 2
)
# Row indices for a 75% training partition, returned as a matrix (list = FALSE)
a <- createDataPartition(
  y = no.redundant[["iphonesentiment"]],
  p = 0.75,
  list = FALSE
)
# Note: the data frame `train` shares its name with caret::train(); calls to
# train() still reach the function because R skips non-function bindings
# when it looks up the name being called.
train <- no.redundant[a, ]
test <- no.redundant[-a, ]
# Fit the candidate iPhone models ----
# The same seed is set before every train() call so that each model is
# resampled on the same folds; the resamples() comparison relies on that.
# allowParallel is an argument of trainControl() (default TRUE), not of
# train(), where it would only be forwarded to the model-fitting function.
# It is therefore not passed here. `importance` is a randomForest argument,
# so it is kept for method = "rf" only.

# Random forest. The tuning grid must exist before train() is called and must
# contain `mtry`, the parameter caret tunes for method = "rf". The candidate
# values match the ones the galaxy random forest is tuned over.
rf_grid <- expand.grid(mtry = 1:11)
set.seed(122)
model_rf <- train(iphonesentiment ~ ., data = train,
                  method = "rf",
                  trControl = trctrl, importance = TRUE, tuneGrid = rf_grid)

# C5.0
set.seed(122)
model_cart <- train(iphonesentiment ~ ., data = train,
                    method = "C5.0",
                    trControl = trctrl)

# Neural net. size/decay are nnet tuning parameters; this grid used to be
# handed to the random forest by mistake. It is kept here for tuning, but as
# before the model below is trained with caret's default grid. Add
# `tuneGrid = nnet_grid` to the call to use it.
nnet_grid <- expand.grid(size = 1:10, decay = c(0.1, 0.2))
set.seed(122)
model_net <- train(iphonesentiment ~ ., data = train,
                   method = "nnet",
                   trControl = trctrl)

# Linear SVM
set.seed(122)
model_svm <- train(iphonesentiment ~ ., data = train,
                   method = "svmLinear3",
                   trControl = trctrl)
# Collect the resampling results of the four iPhone models under display
# names, then compare them in a box-and-whisker plot
results <- resamples(
  setNames(
    list(model_rf, model_net, model_cart, model_svm),
    c("Random Forest", "Neural Net", "C5.0", "SVM")
  )
)
bwplot(results)
# Predictions and confusion matrix on the held-out iPhone rows ----
pred <- predict(model_rf, newdata = test)
confusionMatrix(pred, test$iphonesentiment)
# The same confusion matrix as a ggplot2 heat map.
# as.data.frame(table(...)) yields the columns pred (predicted class),
# Var2 (actual class) and Freq (cell count).
confusion_matrix <- as.data.frame(table(pred, test$iphonesentiment))
# Columns are referenced by bare name: `confusion_matrix$Var2` inside aes()
# bypasses ggplot2's data handling and breaks as soon as the data changes.
ggplot(data = confusion_matrix,
       mapping = aes(x = pred, y = Var2)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = sprintf("%2.0f", Freq)), vjust = 2) +
  # log1p instead of log: empty cells (Freq == 0) stay finite and keep a
  # fill colour; a plain log scale turns them into -Inf
  scale_fill_gradient(low = "lightblue",
                      high = "orange",
                      trans = "log1p") +
  labs(x = "Predictions", y = "Actual Values", title = "iPhone") +
  theme(legend.position = "none")
#### Galaxy: outcome as factor, resampling scheme and train/test partition
# NOTE(review): no.redundant_galaxy is not created in this script; it is
# presumably prepared elsewhere - TODO confirm.
galaxy_outcome <- "galaxysentiment"
no.redundant_galaxy[[galaxy_outcome]] <-
  as.factor(no.redundant_galaxy[[galaxy_outcome]])
set.seed(579)
# Repeated cross-validation: 2 folds, repeated 2 times
trctrl <- trainControl(method = "repeatedcv", repeats = 2, number = 2)
# Row indices for a 75% training partition, returned as a matrix
a <- createDataPartition(
  y = no.redundant_galaxy[[galaxy_outcome]],
  p = 0.75,
  list = FALSE
)
training <- no.redundant_galaxy[a, ]
testing <- no.redundant_galaxy[-a, ]
# Fit the candidate galaxy models ----
# The same seed is set before every train() call so each model is resampled
# on the same folds and can be re-run on its own with the same partitioning.
# allowParallel is an argument of trainControl() (default TRUE), not of
# train(), so it is not passed here. `importance` is a randomForest argument
# and is kept for method = "rf" only.

# Random forest. The mtry grid is defined before train() uses it; it used to
# be created only after the call that needed it.
rf_grid_galaxy <- expand.grid(mtry = 1:11)
set.seed(1225)
model_galaxy_rf <- train(galaxysentiment ~ ., data = training,
                         method = "rf",
                         trControl = trctrl, importance = TRUE,
                         tuneGrid = rf_grid_galaxy)

# C5.0
set.seed(1225)
model_cart.galaxy <- train(galaxysentiment ~ ., data = training,
                           method = "C5.0",
                           trControl = trctrl)

# Weighted k-nearest neighbours. The object keeps its historical name
# model_svm.galaxy even though the method is "kknn".
set.seed(1225)
model_svm.galaxy <- train(galaxysentiment ~ ., data = training,
                          method = "kknn",
                          trControl = trctrl)

# Tuning grid for kknn (kmax, distance, kernel). As before, it is defined but
# not passed to the model above; add `tuneGrid = kknn_grid` to that call to
# use it.
kknn_grid <- expand.grid(kmax = 1:20, distance = 1:10, kernel = "optimal")
# Score the held-out galaxy rows with the C5.0 model, then cross-tabulate
# the predicted classes against the actual ones
pred <- predict(object = model_cart.galaxy, newdata = testing)
confusionMatrix(data = pred, reference = testing[["galaxysentiment"]])