Chapter 3 Preliminary Data Investigation

3.1 Exploratory Data Analysis

3.1.1 Cleaning Predictors

sapply(argus, class)
     Flgs   SrcAddr     Sport   DstAddr     Dport   SrcPkts   DstPkts 
 "factor"  "factor"  "factor"  "factor"  "factor" "integer" "integer" 
 SrcBytes  DstBytes     State 
"integer" "integer"  "factor" 
# Ensure the ports are treated as categorical labels rather than numeric codes
argus = transform(argus,
                  Sport = as.factor(Sport),
                  Dport = as.factor(Dport))
# Keep only the features of interest (drops Proto, StartTime, and Direction)
argus = subset(argus, select = c("Flgs", "SrcAddr", "Sport", "DstAddr", "Dport",
                                "SrcPkts", "DstPkts", "SrcBytes", "DstBytes", "State"))
attach(argus)
categorical = c("Flgs", "SrcAddr", "Sport", "DstAddr", "Dport", "State")
continuous = c("SrcPkts", "DstPkts", "SrcBytes", "DstBytes")

This code casts each feature to its appropriate class (integer or factor) and removes Proto, StartTime, and Direction from the dataset. The source and destination ports are stored as factors because port numbers are identifiers, not quantities.
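
An equivalent way to arrive at the same data frame is to drop the unused columns by name instead of listing the columns to keep; the sketch below assumes the raw export names them StartTime, Proto, and Dir, which should be verified against the actual file.

#hypothetical alternative: deselect the unused columns by name
drop_cols = c("StartTime", "Proto", "Dir")
argus = argus[, !(names(argus) %in% drop_cols)]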

3.1.2 Categorical Features: Unique Categories and Counts

sapply(argus, function(x) length(unique(x)))
    Flgs  SrcAddr    Sport  DstAddr    Dport  SrcPkts  DstPkts SrcBytes 
      70    65113    64486    41903    17537     2790     3633    44075 
DstBytes    State 
   64549        6 
library(data.table)
#returns the unique elements of a feature and their counts, in descending order
element_counts = function(x) {
  dt = data.table(x)[, .N, keyby = x]
  dt[order(dt$N, decreasing = TRUE),]
}
element_counts(Sport)
           x     N
    1: 33461 35683
    2:  4263  8541
    3:    80  8346
    4:  4165  4988
    5: 48468  3023
   ---            
64482: 57904     1
64483: 58242     1
64484: 59051     1
64485: 59315     1
64486: 60116     1
element_counts(Dport)
           x      N
    1:    23 235616
    2:    80 204128
    3:   443 137360
    4: 32819 102166
    5:    25  53167
   ---             
17533: 65515      1
17534: 65528      1
17535: 65529      1
17536: 65532      1
17537: 65533      1
element_counts(SrcAddr)
                  x     N
    1: 197.0.31.231 35440
    2:    1.0.11.96  8717
    3:  100.0.7.149  8526
    4:    197.0.9.1  5536
    5:   1.0.85.103  4976
   ---                   
65109:   197.0.55.5     1
65110:   197.0.55.6     1
65111:   197.0.55.7     1
65112:   197.0.55.8     1
65113:   197.0.55.9     1
element_counts(DstAddr)
                  x     N
    1:    100.0.1.9 64508
    2:    100.0.1.2 62681
    3:   100.0.1.28 25780
    4:   100.0.1.55 25641
    5:  100.0.18.93 20766
   ---                   
41899:  100.0.6.179     1
41900:  100.0.7.173     1
41901: 100.0.77.113     1
41902: 100.0.77.147     1
41903: 100.0.88.111     1
element_counts(State)
     x      N
1: REQ 571899
2: FIN 307337
3: RST 135280
4: CON  17146
5: ACC  16815
6: CLO     98

3.1.3 Continuous Features: Distributions and Relationships

par(mfrow=c(2,2))
hist(SrcBytes); hist(SrcPkts); hist(DstBytes); hist(DstPkts) #clearly some very large values

#returns the n largest values of a vector
largest_n = function(x, n){
  head(sort(x, decreasing=TRUE), n)
}
largest_n(SrcBytes, 10)
 [1] 118257047 116615879 112673526 108933442 105793666  73376579  72839115
 [8]  70001807  56206409  55359912
largest_n(SrcPkts, 10)
 [1] 1008233 1000971  771590  492361  458603  437296  408530  407973
 [9]  371976  371251
largest_n(DstBytes, 10)
 [1] 1850817751 1713055847 1690162763 1524781880 1491609296 1340922625
 [7] 1304668214 1206594243 1163954979 1145323438
largest_n(DstPkts, 10)
 [1] 1239611 1223485 1219276 1004471  982931  942827  883354  795120
 [9]  766776  754831

The histograms and the ten largest values of each continuous variable show that a relatively small number of very large observations skew the distributions. This explains why the summary statistics report means far larger than the medians. These large values cannot simply be removed as outliers, because they may be exactly the scanner observations we want to detect. In addition, destination bytes and destination packets are zero with high frequency (at least through the first quartile).
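
A quick way to quantify both observations is sketched below: the share of zero-valued destination flows and a few upper quantiles of the byte counts (the exact figures depend on the capture).

#proportion of flows with no return traffic, and the upper tail of the byte counts
mean(DstBytes == 0); mean(DstPkts == 0)
quantile(SrcBytes, c(0.5, 0.9, 0.99, 1))
quantile(DstBytes, c(0.5, 0.9, 0.99, 1))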

We now investigate whether the largest continuous predictor values correspond to any particular addresses or ports.

#the 20 rows with the largest value of each continuous feature
max.SrcBytes = argus[with(argus,order(-SrcBytes)),][1:20,]
max.SrcPkts = argus[with(argus,order(-SrcPkts)),][1:20,]
max.DstBytes = argus[with(argus,order(-DstBytes)),][1:20,]
max.DstPkts = argus[with(argus,order(-DstPkts)),][1:20,]
head(max.SrcBytes)
             Flgs   SrcAddr Sport    DstAddr Dport SrcPkts DstPkts
282859  * s        1.0.12.1 18086  100.0.1.8 31743  208339  104886
282841  * s        1.0.12.1 18086  100.0.1.8 31743  204912   99688
282832  * s        1.0.12.1 18086  100.0.1.8 31743  198007   94621
282853  * s        1.0.12.1 18086  100.0.1.8 31743  191443   95892
282823  * s        1.0.12.1 18086  100.0.1.8 31743  186228   84342
724162  * *       100.0.4.9 37901 100.0.2.67    22 1008233 1239611
        SrcBytes   DstBytes State
282859 118257047    7514403   CON
282841 116615879    7146182   CON
282832 112673526    6801146   CON
282853 108933442    6857053   CON
282823 105793666    6048529   CON
724162  73376579 1713055847   CON
head(max.DstBytes)
             Flgs   SrcAddr Sport    DstAddr Dport SrcPkts DstPkts
2162    * d       197.0.1.1 62030  100.0.1.1    80  371251 1219276
724162  * *       100.0.4.9 37901 100.0.2.67    22 1008233 1239611
724106  * *       100.0.4.9 37901 100.0.2.67    22 1000971 1223485
2212    * d       197.0.1.1 62034  100.0.1.1    80  280593 1004471
78245   * d         1.0.2.1 11210  100.0.1.2    80  158194  982931
2185    * d       197.0.1.1 62033  100.0.1.1    80  268402  883354
       SrcBytes   DstBytes State
2162   26430322 1850817751   CON
724162 73376579 1713055847   CON
724106 72839115 1690162763   CON
2212   20037592 1524781880   FIN
78245  11294111 1491609296   CON
2185   19137354 1340922625   FIN

Source addresses tend to repeat among the largest byte and packet counts, while ports vary. Several of the largest DstBytes rows correspond to SrcAddr 197.0.1.1 and DstAddr 100.0.1.1. The largest SrcBytes rows mostly carry the "* s" flag, whereas the largest DstBytes rows carry the "* d" or "* *" flags. The largest DstBytes values tend to flow to Dport 80, the standard HTTP port, while the largest SrcBytes rows go to Dport 31743. Because simply scanning the maximal rows is awkward, the next section investigates the relationships between features more systematically; as a stopgap, the tabulation sketched below counts which addresses and ports recur among the extreme rows.
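
This sketch reuses the element_counts helper from Section 3.1.2 on the max.* data frames defined above; as.character drops the unused factor levels.

#count address and port occurrences among the largest flows in each direction
element_counts(as.character(max.SrcBytes$SrcAddr))
element_counts(as.character(max.SrcBytes$Dport))
element_counts(as.character(max.DstBytes$SrcAddr))
element_counts(as.character(max.DstBytes$Dport))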

3.1.4 Correlation Between Features

cor(SrcBytes, SrcPkts)
[1] 0.5732968
cor(DstBytes, DstPkts)
[1] 0.996775
cor(SrcBytes, DstBytes)
[1] 0.3331221
cor(SrcPkts, DstPkts)
[1] 0.8448269

The plots of the predictors suggest strong linear trends between them, which makes intuitive sense for this domain: flows that carry more packets also tend to carry more bytes. Further investigation shows that DstPkts has a correlation of roughly 1 with DstBytes and roughly 0.85 with SrcPkts.
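
The pairwise plots referenced above can be reproduced with a single pairs() call (a sketch; a "." plotting character keeps the point cloud legible with this many rows).

pairs(argus[, continuous], pch = ".")

The rank-based correlations below revisit the strongest of these relationships.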

cor(DstBytes, DstPkts, method = "kendall")
cor(SrcPkts, DstPkts, method = "kendall")
cor(DstBytes, DstPkts, method = "spearman")
cor(SrcPkts, DstPkts, method = "spearman")

Because the original correlation tests relied on the Pearson method, which is susceptible to bias from a small number of very large values, further tests use rank-based measures to examine the relationships involving DstPkts. The Kendall and Spearman coefficients are still high, but they are less cause for concern than the Pearson estimates, which the skewed values inflate.
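
To illustrate that sensitivity, consider a hypothetical toy example (unrelated to the argus data): a single extreme pair is enough to push the Pearson coefficient toward 1, while the rank-based measures barely move.

#two independent skewed samples, then one scanner-sized observation in each
set.seed(1)
x = rexp(1000); y = rexp(1000)
x[1] = 1e6; y[1] = 1e6
cor(x, y)                      #Pearson: close to 1
cor(x, y, method = "spearman") #rank-based: still close to 0
cor(x, y, method = "kendall")  #rank-based: still close to 0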

3.2 Transformations on the Data

3.2.1 Removing Quantiles

To get a better sense of the distribution without the extreme values, the plots below visualize the continuous features with the smallest and largest 10% of observations removed. The removed values will be re-added to the dataset when investigating for anomalies.

#keeps only the values between the lower and upper quantile bounds
remove_quantiles = function(v, lowerbound, upperbound){
  v[v >= quantile(v, lowerbound) & v <= quantile(v, upperbound)]
}
SrcBytes.abrev = remove_quantiles(SrcBytes,0.10,0.9)
SrcPkts.abrev = remove_quantiles(SrcPkts,0.10,0.9)
DstBytes.abrev = remove_quantiles(DstBytes,0.10,0.9)
DstPkts.abrev = remove_quantiles(DstPkts,0.10,0.9)
par(mfrow=c(2,2))
hist(SrcBytes.abrev); hist(SrcPkts.abrev); hist(DstBytes.abrev); hist(DstPkts.abrev)

The continuous features remain unevenly distributed even with the 20% most extreme values (10% from each tail) removed.
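
One way to quantify the remaining skew is a sample skewness coefficient; the sketch below assumes the e1071 package (which provides skewness()) is available.

library(e1071)
#skewness before and after trimming the tails
skewness(SrcBytes); skewness(SrcBytes.abrev)
skewness(DstBytes); skewness(DstBytes.abrev)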

3.2.2 Log Transformation

par(mfrow=c(2,2))
hist(log(SrcBytes)); hist(log(SrcPkts)); hist(log(DstBytes)); hist(log(DstPkts))

plot(log(SrcPkts), log(SrcBytes)); plot(log(DstPkts), log(DstBytes))
plot(log(SrcBytes), log(DstBytes)); plot(log(SrcPkts), log(DstPkts))

A log transformation of each continuous feature still produces right-skewed histograms. Skewed features may affect the results of a kernel PCA, so we consider other transformation approaches.
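
Note also that DstBytes and DstPkts contain many zeros (Section 3.1.3), which log() maps to -Inf. A common workaround, sketched below as an alternative rather than as the transformation used above, is the shifted log log1p(x) = log(1 + x).

#shift by one before taking logs so zero-valued destination flows stay finite
par(mfrow=c(2,2))
hist(log1p(SrcBytes)); hist(log1p(SrcPkts)); hist(log1p(DstBytes)); hist(log1p(DstPkts))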

3.2.3 Normal Scores Transformation

nscore = function(x) {
   # Takes a vector of values x and calculates their normal scores. Returns 
   # a list with the scores and an ordered table of original values and
   # scores, which is useful as a back-transform table. See backtr().
   nscore = qqnorm(x, plot.it = FALSE)$x  # normal score 
   trn.table = data.frame(x=sort(x),nscore=sort(nscore))
   return (list(nscore=nscore, trn.table=trn.table))
}

backtr = function(scores, nscore, tails='none', draw=TRUE) {
   # Given a vector of normal scores and a normal score object 
   # (from nscore), the function returns a vector of back-transformed 
   # values
   # 'none' : No extrapolation; more extreme score values will revert 
   # to the original min and max values. 
   # 'equal' : Calculate magnitude in std deviations of the scores about 
   # initial data mean. Extrapolation is linear to these deviations. 
   # will be based upon deviations from the mean of the original 
   # hard data - possibly quite dangerous!
   # 'separate' :  This calculates a separate sd for values 
   # above and below the mean.
   if(tails=='separate') { 
      mean.x <- mean(nscore$trn.table$x)
      small.x <- nscore$trn.table$x < mean.x
      large.x <- nscore$trn.table$x > mean.x
      small.sd <- sqrt(sum((nscore$trn.table$x[small.x]-mean.x)^2)/
                       (length(nscore$trn.table$x[small.x])-1))
      large.sd <- sqrt(sum((nscore$trn.table$x[large.x]-mean.x)^2)/
                       (length(nscore$trn.table$x[large.x])-1))
      min.x <- mean(nscore$trn.table$x) + (min(scores) * small.sd)
      max.x <- mean(nscore$trn.table$x) + (max(scores) * large.sd)
      # check to see if these values are LESS extreme than the
      # initial data - if so, use the initial data.
      #print(paste('lg.sd is:',large.sd,'max.x is:',max.x,'max nsc.x
      #     is:',max(nscore$trn.table$x)))
      if(min.x > min(nscore$trn.table$x)) {min.x <- min(nscore$trn.table$x)}
      if(max.x < max(nscore$trn.table$x)) {max.x <- max(nscore$trn.table$x)}
   }
   if(tails=='equal') { # assumes symmetric distribution around the mean
      mean.x <- mean(nscore$trn.table$x)
      sd.x <- sd(nscore$trn.table$x)
      min.x <- mean(nscore$trn.table$x) + (min(scores) * sd.x)
      max.x <- mean(nscore$trn.table$x) + (max(scores) * sd.x)
      # check to see if these values are LESS extreme than the
      # initial data - if so, use the initial data.
      if(min.x > min(nscore$trn.table$x)) {min.x <- min(nscore$trn.table$x)}
      if(max.x < max(nscore$trn.table$x)) {max.x <- max(nscore$trn.table$x)}
   }
   if(tails=='none') {   # No extrapolation
      min.x <- min(nscore$trn.table$x)
      max.x <- max(nscore$trn.table$x)
   }
   min.sc <- min(scores)
   max.sc <- max(scores)
   x <- c(min.x, nscore$trn.table$x, max.x)
   nsc <- c(min.sc, nscore$trn.table$nscore, max.sc)
   
   if(draw) {plot(nsc,x, main='Transform Function')}
   back.xf <- approxfun(nsc,x) # Develop the back transform function
   val <- back.xf(scores)
   return(val)
}

SrcBytes_ns = nscore(SrcBytes)
SrcBytes_norm = SrcBytes_ns$nscore
SrcBytes_table = SrcBytes_ns$trn.table

SrcPkts_ns = nscore(SrcPkts)
SrcPkts_norm = SrcPkts_ns$nscore
SrcPkts_table = SrcPkts_ns$trn.table

DstBytes_ns = nscore(DstBytes)
DstBytes_norm = DstBytes_ns$nscore
DstBytes_table = DstBytes_ns$trn.table

DstPkts_ns = nscore(DstPkts)
DstPkts_norm = DstPkts_ns$nscore
DstPkts_table = DstPkts_ns$trn.table
par(mfrow=c(2,2))
hist(SrcBytes_norm); hist(SrcPkts_norm); hist(DstBytes_norm); hist(DstPkts_norm)

Finally, a normal scores transformation is applied to the dataset. The normal scores transformation reassigns each feature's values so that the feature appears to have been drawn from a standard normal distribution. This solves the skewness problem (each feature's histogram now follows a standard Gaussian density), but it may cause issues with other analysis methods, particularly methods that are sensitive to ties in the data.
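
The ties issue can be seen with a small hypothetical vector (not part of the argus data): tied values receive distinct normal scores, so the back-transform table is no longer one-to-one. A round trip through backtr() otherwise recovers the original values.

#the three tied zeros are assigned three different normal scores
toy = c(0, 0, 0, 5, 10)
nscore(toy)$trn.table

#sketch of a round-trip check: back-transforming the SrcBytes scores should
#reproduce the original values (approxfun may warn about collapsing tied knots)
recovered = backtr(SrcBytes_norm, SrcBytes_ns, tails = "none", draw = FALSE)
summary(recovered - SrcBytes)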