## ----echo = FALSE-------------------------------------------------------------
# Figure defaults applied to the chunks of this document.
knitr::opts_chunk$set(fig.width = 5, fig.height = 3.5, fig.align = "center")
# Record the current "mar" and "mfrow" graphical parameters as a named list.
oldpar <- par(c("mar", "mfrow"))

## -----------------------------------------------------------------------------
library(randomForest)
library(classmap)

## -----------------------------------------------------------------------------
# Load the Instagram data and keep only the rows labelled as training data.
data("data_instagram")
train_rows <- which(data_instagram$dataType == "train")
# Column 13 is left out (presumably the dataType column -- TODO confirm).
traindata <- data_instagram[train_rows, -13]

str(traindata)
# The variable names and their interpretation are
colnames(traindata)
# profile.pic: binary, indicates whether profile has picture
# nums.length.username: ratio of number of numerical chars in username
#   to its length
# fullname.words: number of words in full name
# nums.length.fullname: ratio of number of numerical characters in full
#   name to its length
# name..username: binary, indicates whether name == username of the profile
# description.length: length of the description/biography of the profile
#   (in number of characters)
# external.URL: binary, indicates whether profile has external url
# private: binary, indicates whether profile is private or not
# X.posts: number of posts made by profile
# X.followers: number of followers
# X.follows: number of follows
# y: whether profile is fake or not.
# Separate the predictors from the response, which sits in column 12
# (the variable listed as y).
resp_col <- 12
x_train <- traindata[, -resp_col]
y_train <- traindata[, resp_col]

dim(traindata)
table(traindata$y) # 50/50 split of genuine/fake accounts:

## -----------------------------------------------------------------------------
# Seed the random number generator before fitting the forest.
set.seed(71)
rfout <- randomForest(y ~ ., data = traindata, keep.forest = TRUE)

## -----------------------------------------------------------------------------
# Columns 1, 5, 7 and 8 are the variables described as binary
# (profile.pic, name..username, external.URL, private).
mytype <- list(symm = c(1, 5, 7, 8))

## -----------------------------------------------------------------------------
vcrtrain <- vcr.forest.train(
  X = x_train,
  y = y_train,
  trainfit = rfout,
  type = mytype
)

names(vcrtrain)

# Inspect the first ten cases together with cases 301 to 310.
show_idx <- c(1:10, 301:310)
vcrtrain$predint[show_idx] # prediction as integer
vcrtrain$pred[show_idx]    # prediction as label
vcrtrain$altint[show_idx]  # alternative label as integer
vcrtrain$altlab[show_idx]  # alternative label

# Probability of Alternative Class (PAC) of each object:
vcrtrain$PAC[1:3]
# summary(vcrtrain$PAC)

# f(i, g) is the distance from case i to class g:
vcrtrain$fig[1:3, ] # for the first 3 objects:

# The farness of an object i is the f(i, g) to its own class:
vcrtrain$farness[1:3]
# summary(vcrtrain$farness)

# The "overall farness" of an object is defined as the
# lowest f(i, g) it has to any class g (including its own):
summary(vcrtrain$ofarness)

sum(vcrtrain$ofarness > 0.99, na.rm = TRUE)
# With the default cutoff = 0.99 we find 6 outliers,
# also shown in the last column of the confusion matrix:

confmat.vcr(vcrtrain)

# If we do not want to show the outliers:
confmat.vcr(vcrtrain, showOutliers = FALSE)

# Note that the accuracy is computed before any objects
# are flagged, so it does not depend on the cutoff.
# Here the accuracy is `perfect' due to overfitting.
# The out-of-bag prediction accuracy is about 92%.
# One plotting colour per class.
cols <- c("blue", "red3")

## -----------------------------------------------------------------------------
stackedplot(
  vcrtrain,
  classCols = cols,
  main = "Instagram training data"
)

# Silhouette plot:
silplot(vcrtrain, classCols = cols)
# Here all the s(i) are nonnegative (due to overfitting).

# Class maps:
classmap(vcrtrain, "genuine", classCols = cols) #, identify = TRUE)
# farness outliers from furthest to closer: 45, 25, 41
x_train[c(45, 25, 41), ]
# they have huge numbers of followers.

classmap(vcrtrain, "fake", classCols = cols) #, identify = TRUE)
# only case 261 is borderline far.

## -----------------------------------------------------------------------------
# Build the test set the same way as the training set: keep the rows
# labelled "test", leave out column 13, then split off column 12.
test_rows <- which(data_instagram$dataType == "test")
testdata <- data_instagram[test_rows, -13]
Xnew <- testdata[, -12]
ynew <- testdata[, 12]

## -----------------------------------------------------------------------------
vcrtest <- vcr.forest.newdata(Xnew, ynew, vcrtrain)
confmat.vcr(vcrtest)

## -----------------------------------------------------------------------------
stackedplot(
  vcrtest,
  classCols = cols,
  main = "RF on Instagram test data"
)

# Silhouette plot:
silplot(
  vcrtest,
  classCols = cols,
  main = "Silhouettes of RF on Instagram test data"
)
# now some s(i) are negative

## -----------------------------------------------------------------------------
## Class of genuine accounts:
classmap(vcrtest, "genuine", classCols = cols) #, identify = TRUE)
# NOTE(review): the case numbers below are used directly as row numbers
# of Xnew -- presumably the genuine cases come first in the test data;
# confirm.
# one farness outlier:
Xnew[30, ]
# has very lengthy bio/description
# has large number of X.posts
# has very large number of followers and follows

# genuine misclassified as fake: from highest PAC to lowest
Xnew[c(21, 29, 51), ] # and 2 more borderline cases
# They have some unusual characteristics for their class:
# * 21, 29 have a (very) high nums.length.username, i.e. the
#   percentage of numerical characters in the username.
# * 21, 29 have a full name of only 1 word.
# * 21, 29 and 51 have description.length = 0, i.e. no
#   description/biography of their profile.
# * they all have low X.posts (even 0 for case 21), i.e.
#   relatively few previous posts.
# All of these characteristics are more common for fake profiles
# than for genuine profiles, as we can see below:

# Colour of each training case, looked up from its class code
# (y_train is presumably a two-level factor -- confirm).
trcols <- cols[as.integer(y_train)]

plot(x_train[, 1], col = trcols, main = "profile.pic")
# fakes are less likely to have a profile picture

plot(x_train[, 2], col = trcols, main = "nums.length.username")
# is higher for fakes

plot(x_train[, 3], col = trcols, main = "fullname.words")
# is lower for fakes

plot(x_train[, 4], col = trcols, main = "nums.length.fullname")
# is a bit higher for fakes

plot(x_train[, 5], col = trcols, main = "name..username")
# mostly 0 for genuine; fakes have a few values 1

plot(x_train[, 6], col = trcols, main = "description.length")
# fakes are typically lower, and more often zero

plot(x_train[, 7], col = trcols, main = "external.URL")
# fakes never had them, genuines sometimes did

plot(x_train[, 8], col = trcols, main = "private")
# no visible difference

# The three count variables are plotted after raising them to the
# power 0.1.
plot((x_train[, 9])^0.1, col = trcols, main = "X.posts")
# fakes have fewer posts, and often none

plot((x_train[, 10])^0.1, col = trcols, main = "X.followers")
# fakes have fewer followers, sometimes none

plot((x_train[, 11])^0.1, col = trcols, main = "X.follows")
# fakes follow a bit fewer, but the difference is small.

## Class of fake accounts:
classmap(vcrtest, "fake", classCols = cols) #, identify = TRUE)
# Fake identified as genuine, from highest PAC to lower:
# c(27, 51, 34, 23, 58)
# NOTE(review): the code below lists cases 34 and 51 in the opposite
# order from the comment above; confirm which order is intended.
# The case numbers count within the fake class, so they are mapped to
# rows of Xnew through the positions where ynew equals "fake".
fake_rows <- which(ynew == "fake")
Xnew[fake_rows[c(27, 34, 51, 23, 58)], ]
# These have a number of characteristics which are more common
# for genuine profiles:
#
# all have profile pictures
# none have numerical characters in username
# none have numerical characters in fullname
# 27 has a lengthy bio description
# all have a relatively high number of followers
# all have a relatively high number of follows.