General outlier detection for univariate datasets

library(specleanr)

Introduction to general data outlier detection

Setting one variable of interest

1. Preparing data


# Start from the built-in iris data.
irisdata1 <- iris

# Four artificial setosa records carrying extreme values and missing entries.
# The columns are given in the same order as iris and take over its column
# names, so the records can be appended directly.
rowsOutNA1 <- setNames(
  data.frame(
    x = c(344, NA, NA, NA),
    x2 = c(34, 45, 544, NA),
    x3 = c(584, 5, 554, NA),
    x4 = c(575, 4554, 474, NA),
    x5 = c("setosa", "setosa", "setosa", "setosa")
  ),
  colnames(irisdata1)
)

# Append the artificial records below the original observations.
dfinal <- rbind(irisdata1, rowsOutNA1)

Detecting outliers in the modified iris dataset

We can use univariate methods to detect outliers in a single variable such as Sepal.Length, or we can exclude the species column and use multivariate methods such as isolation forest, the Mahalanobis outlier detection method, or one-class support vector machines. To list the methods available in this package, run extractMethods().

NOTE: Because we are considering univariate analysis, the parameter sdm is set to FALSE.

2. Filter out only setosa data before outlier detection


# Keep only the setosa records, retaining the variable of interest and the
# species label.
is_setosa <- dfinal$Species %in% "setosa"
setosadf <- dfinal[is_setosa, c("Sepal.Width", "Species")]

# Run the univariate detection methods on Sepal.Width for this single group.
# sdm is FALSE because this is a univariate analysis.
setosa_outlier_detection <- multidetect(
  data = setosadf,
  var = "Sepal.Width",
  multiple = FALSE,
  methods = c(
    "adjbox", "iqr", "hampel", "jknife",
    "seqfences", "mixediqr",
    "distboxplot", "semiqr",
    "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)
#> 1 (1.85%) NAs removed for parameter Sepal.Width.

# Uncomment to list the methods available in the package:
# extractMethods()

3. Visualize the number of outliers detected by each method


# Plot the number of outliers flagged by each detection method.
ggoutliers(setosa_outlier_detection)

4. Obtaining a quality-controlled dataset using the loess method or data labeling


# Extract the cleaned setosa records using the loess option.
setosa_qc_loess <- extract_clean_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection,
  loess = TRUE
)

# Number of rows in the cleaned dataset
nrow(setosa_qc_loess)
#> [1] 51

# Number of rows in the reference dataset
nrow(setosadf)
#> [1] 54

# Alternative to extraction: label the records of the reference data.
setosa_qc_labeled <- classify_data(
  refdata = setosadf,
  outliers = setosa_outlier_detection
)

5. Visualize the labelled quality-controlled dataset



# Plot the number of records per outlier label for the setosa data.
ggenvironmentalspace(
  setosa_qc_labeled,
  type = "1D",
  ggxangle = 45,
  scalecolor = "viridis",
  xhjust = 1,
  legend_position = "blank",
  ylab = "Number of records",
  xlab = "Outlier labels"
)

For multiple species but using only one variable of interest

NOTE

6. Outlier detection across the species groups in iris dataset


# Run the same univariate methods on Sepal.Width, this time across the groups
# identified by the Species column of the full dataset.
multspp_outlier_detection <- multidetect(
  data = dfinal,
  var = "Sepal.Width",
  multiple = TRUE,
  var_col = "Species",
  methods = c(
    "adjbox", "iqr", "hampel", "jknife",
    "seqfences", "mixediqr",
    "distboxplot", "semiqr",
    "zscore", "logboxplot", "medianrule"
  ),
  silence_true_errors = FALSE,
  missingness = 0.1,
  sdm = FALSE,
  na.inform = TRUE
)
#> 1 (1.85%) NAs removed for parameter Sepal.Width.
#> 0 (0%) NAs removed for parameter Sepal.Width.
#> 0 (0%) NAs removed for parameter Sepal.Width.

7. Visualise the number of outliers detected by each method


# Plot the number of outliers flagged by each detection method.
ggoutliers(multspp_outlier_detection)

8. Obtaining a quality-controlled dataset using the loess method or data labeling


# Extract the cleaned records for all species groups using the loess option.
multsp_qc_loess <- extract_clean_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species",
  loess = TRUE
)

# Number of rows in the cleaned dataset
nrow(multsp_qc_loess)
#> [1] 151

# Number of rows in the reference dataset
nrow(dfinal)
#> [1] 154

# Alternative to extraction: label the records of the reference data,
# grouped by the Species column.
multi_qc_labeled <- classify_data(
  refdata = dfinal,
  outliers = multspp_outlier_detection,
  var_col = "Species"
)

10. Visualise the labelled quality-controlled dataset


# Plot the number of records per outlier label for the multi-species data.
ggenvironmentalspace(
  multi_qc_labeled,
  type = "1D",
  ggxangle = 45,
  scalecolor = "viridis",
  xhjust = 1,
  legend_position = "blank",
  ylab = "Number of records",
  xlab = "Outlier labels"
)

The second approach is setting multiple variables of interest

11. Outlier detection


# Detect outliers in two variables at once, across the groups identified by
# the Species column.
multivariables <- multidetect(
  data = dfinal,
  multiple = TRUE,
  var = c("Sepal.Length", "Sepal.Width"),
  output = "outlier",
  var_col = "Species",
  methods = c(
    "zscore", "adjbox",
    "logboxplot", "distboxplot",
    "iqr", "semiqr", "seqfences", "hampel",
    "jknife"
  ),
  warn = FALSE,
  sdm = FALSE
)

12. Visualize the number of outliers detected by each method


# Plot the number of outliers flagged by each detection method.
ggoutliers(multivariables)

13. Data extraction

NOTE

# With outlier_to_NA = TRUE the outliers are set to NA for each variable,
# and the result has as many rows as the reference data.
lenwidth_clean <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = TRUE,
  threshold = 0.8
)
nrow(lenwidth_clean)
#> [1] 154

# With outlier_to_NA = FALSE the result has more rows than the reference data
# (presumably a long format with one record per variable value; confirm
# against the package documentation).
lenwidth_long <- extract_clean_data(
  dfinal,
  outliers = multivariables,
  var_col = "Species",
  outlier_to_NA = FALSE,
  threshold = 0.8
)
nrow(lenwidth_long)
#> [1] 305

The package is undergoing peer review for publication