Chapter 4 Matrix Techniques for Anomaly Detection

MAYBE PUT LIT REVIEW WITH EACH SECTION?

4.1 Ports Combination Matrix/Tensor

#set counts
s_num = 25
d_num = 25
combo_num = 20

#get freqs
Sport_table = table(Sport)
Sport_table = as.data.frame(Sport_table)
Sport_table = Sport_table[order(-Sport_table$Freq),]
top_Sport = (head(Sport_table$Sport, s_num))

#get freqs
Dport_table = table(Dport)
Dport_table = as.data.frame(Dport_table)
Dport_table = Dport_table[order(-Dport_table$Freq),]
top_Dport = (head(Dport_table$Dport, d_num))

#subset data
argus_maxes = argus[is.element(Sport, top_Sport) & is.element(Dport, top_Dport), ]
argus_maxes = transform(argus_maxes,
                        Sport = as.numeric(as.character(Sport)),
                        Dport = as.numeric(as.character(Dport)))
max_combinations = as.data.frame(table(argus_maxes$Sport, argus_maxes$Dport))

top_combinations = head(max_combinations[order(-max_combinations$Freq),], combo_num)
top_combinations$Sport = top_combinations$Var1
top_combinations$Dport = top_combinations$Var2
top_combinations$Var1 = NULL
top_combinations$Var2 = NULL
top_combinations = transform(top_combinations,
                             Sport = as.numeric(as.character(Sport)),
                             Dport = as.numeric(as.character(Dport)))

extract_intersection = function(sport, dport){
  argus_subset = argus[Sport == sport & Dport == dport,]
  return (argus_subset)
}

generate_combinations_matrix = function(top_combinations){
  n = dim(top_combinations)[1]
  combinations = c()
  for (i in 1:n){
    sport = as.numeric(top_combinations[i,]$Sport)
    dport = as.numeric(top_combinations[i,]$Dport)
    combo = extract_intersection(sport, dport)
    combinations = c(combinations, list(combo))
  }
  return (combinations)
}
combinations = generate_combinations_matrix(top_combinations)

We now have a matrix built with the most common Sport and Dport combinations. Each entry in the matrix contains all of the observations that correspond to that Sport and Dport combination. We can now perform testing on each group using normal transformations and principal component analysis to see if there are any trends between groups.

4.2 Principal Component Analysis

pca_analysis = function(SrcBytes, SrcPkts, DstBytes, DstPkts){
  pca_cont_vars = cbind(SrcBytes, SrcPkts, DstBytes, DstPkts)
  pca = prcomp(pca_cont_vars, center = TRUE, scale. = TRUE)
  print(pca$rotation)
  print((summary(pca)))
  #screeplot(pca, type="lines",col=3)
  g = ggbiplot(pca, obs.scale = 1, var.scale = 1,
                ellipse = TRUE,
                circle = TRUE)
  g = g + scale_color_discrete(name = '')
  g = g + theme(legend.direction = 'horizontal',
                 legend.position = 'top')
  print(g)
  return(pca$rotation)
}

4.2.1 Investigating Combinations

combo_num = 10
for (i in 1:combo_num){
  combo_table = combinations[i]
  combo_table = transform(combo_table,
                          SrcBytes = as.numeric(SrcBytes),
                          SrcPkts = as.numeric(SrcPkts),
                          DstBytes = as.numeric(DstBytes),
                          DstPkts = as.numeric(DstPkts))
  cat("Sport:", combo_table$Sport[1],"\t")
  cat("Dport:", combo_table$Dport[1],"\n")
  SrcBytes_norm =  nscore(combo_table$SrcBytes)$nscore
  SrcPkts_norm =  nscore(combo_table$SrcPkts)$nscore
  DstBytes_norm =  nscore(combo_table$DstBytes)$nscore
  DstPkts_norm =  nscore(combo_table$DstPkts)$nscore
  pca_analysis(SrcBytes_norm, SrcPkts_norm, DstBytes_norm, DstPkts_norm)
}

Sport: 32416    Dport: 9163 
               PC1        PC2        PC3          PC4
SrcBytes 0.4113308 -0.7246604 -0.5528760  0.001575730
SrcPkts  0.5054262 -0.3234250  0.7999554  0.003459252
DstBytes 0.5357591  0.4327433 -0.1665939  0.705649994
DstPkts  0.5369483  0.4277813 -0.1632360 -0.708550377
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.7118 0.8991 0.51065 0.02309
Proportion of Variance 0.7326 0.2021 0.06519 0.00013
Cumulative Proportion  0.7326 0.9347 0.99987 1.00000

Sport: 4145     Dport: 9119 
               PC1        PC2        PC3          PC4
SrcBytes 0.4333309  0.8902089 -0.1405308  0.001886624
SrcPkts  0.5213529 -0.1206221  0.8439979  0.036180840
DstBytes 0.5190742 -0.3165495 -0.3953966  0.688490995
DstPkts  0.5205549 -0.3045896 -0.3340362 -0.724339380
Importance of components:
                         PC1    PC2     PC3     PC4
Standard deviation     1.875 0.6538 0.23124 0.05551
Proportion of Variance 0.879 0.1069 0.01337 0.00077
Cumulative Proportion  0.879 0.9859 0.99923 1.00000

Sport: 19239    Dport: 9153 
               PC1        PC2        PC3          PC4
SrcBytes 0.4400190  0.8125062 -0.3823817 -0.001104223
SrcPkts  0.5189497  0.1174251  0.8466938 -0.003489750
DstBytes 0.5179403 -0.4057384 -0.2640886 -0.705245607
DstPkts  0.5184712 -0.4017728 -0.2591352  0.708953621
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.8418 0.7037 0.33316 0.03695
Proportion of Variance 0.8481 0.1238 0.02775 0.00034
Cumulative Proportion  0.8481 0.9719 0.99966 1.00000

Sport: 4243     Dport: 27 
               PC1        PC2        PC3          PC4
SrcBytes 0.3986539  0.8262979 -0.3978739  0.001762539
SrcPkts  0.5268775  0.1487193  0.8367993  0.007045781
DstBytes 0.5300998 -0.3876467 -0.2708012  0.703840168
DstPkts  0.5314785 -0.3805842 -0.2610173 -0.710321243
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.7727 0.8343 0.40033 0.03406
Proportion of Variance 0.7856 0.1740 0.04007 0.00029
Cumulative Proportion  0.7856 0.9596 0.99971 1.00000

Sport: 4243     Dport: 10290 
                PC1         PC2        PC3          PC4
SrcBytes -0.4146349  0.86007431 -0.2972268 -0.002501828
SrcPkts  -0.5225073  0.04229265  0.8514240 -0.016572825
DstBytes -0.5257892 -0.36545098 -0.3181251 -0.699133556
DstPkts  -0.5278350 -0.35345310 -0.2924547  0.714794623
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.8125 0.7560 0.37528 0.05022
Proportion of Variance 0.8213 0.1429 0.03521 0.00063
Cumulative Proportion  0.8213 0.9642 0.99937 1.00000

Sport: 4243     Dport: 26 
               PC1         PC2        PC3          PC4
SrcBytes 0.4320683  0.87332152 -0.2249932  0.002131619
SrcPkts  0.5202875 -0.03769203  0.8529953  0.016712305
DstBytes 0.5202102 -0.34831316 -0.3463831  0.698625865
DstPkts  0.5215355 -0.33847715 -0.3190547 -0.715288791
Importance of components:
                          PC1    PC2     PC3    PC4
Standard deviation     1.8559 0.6797 0.30326 0.0398
Proportion of Variance 0.8611 0.1155 0.02299 0.0004
Cumulative Proportion  0.8611 0.9766 0.99960 1.0000

Sport: 4243     Dport: 25 
               PC1        PC2        PC3           PC4
SrcBytes 0.4278586  0.8931432 -0.1386797 -0.0005117289
SrcPkts  0.5214493 -0.1186010  0.8449812 -0.0055942005
DstBytes 0.5218835 -0.3078511 -0.3699338 -0.7042828114
DstPkts  0.5221736 -0.3057071 -0.3604494  0.7098972916
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.8651 0.6736 0.25860 0.02853
Proportion of Variance 0.8697 0.1134 0.01672 0.00020
Cumulative Proportion  0.8697 0.9831 0.99980 1.00000

Sport: 4243     Dport: 10282 
               PC1        PC2        PC3          PC4
SrcBytes 0.3952127  0.8082469 -0.4365116  0.001226653
SrcPkts  0.5292182  0.1880550  0.8273666  0.005281099
DstBytes 0.5303470 -0.3972687 -0.2534339  0.704755906
DstPkts  0.5314763 -0.3918544 -0.2463602 -0.709429149
Importance of components:
                          PC1    PC2     PC3     PC4
Standard deviation     1.7608 0.8629 0.39223 0.03300
Proportion of Variance 0.7751 0.1862 0.03846 0.00027
Cumulative Proportion  0.7751 0.9613 0.99973 1.00000

Sport: 19581    Dport: 118 
               PC1        PC2         PC3           PC4
SrcBytes 0.4978768  0.5190781 -0.69474927  0.0003069068
SrcPkts  0.5010460  0.4817043  0.71896568 -0.0014778396
DstBytes 0.5004716 -0.4999998 -0.01523225 -0.7066090712
DstPkts  0.5005994 -0.4985169 -0.01340796  0.7076025312
Importance of components:
                          PC1     PC2     PC3     PC4
Standard deviation     1.9145 0.56579 0.10402 0.06249
Proportion of Variance 0.9163 0.08003 0.00271 0.00098
Cumulative Proportion  0.9163 0.99632 0.99902 1.00000

Sport: 35506    Dport: 12 
               PC1        PC2        PC3         PC4
SrcBytes 0.4432713  0.7265363  0.5248198 -0.01482290
SrcPkts  0.5148990  0.2730712 -0.8122575  0.02343025
DstBytes 0.5185367 -0.4458439  0.1990667  0.70193688
DstPkts  0.5191429 -0.4458702  0.1586646 -0.71169932
Importance of components:
                        PC1    PC2     PC3     PC4
Standard deviation     1.80 0.7830 0.38006 0.05046
Proportion of Variance 0.81 0.1533 0.03611 0.00064
Cumulative Proportion  0.81 0.9633 0.99936 1.00000

Zeroes in the dataset causing the patterns in the 2nd principal component. Normal scores dont work very well if there are ties in the data.

4.3 Matrix Completion via Singular Value Decomposition

There are \(m\) source ports and \(n\) destination ports. \(Y \in {\rm I\!R}^{m \times n}\), is the matrix that stores the means of the combinations of source ports and destination ports. \(Y\) has a lot of missingness because not every source port interacts with every destination port. \(F \in {\rm I\!R}^{m \times n}\) is a sparse matrix that represents the frequencies of combinations, i.e \(F[32242,12312]\) represents the number of observations for the 32242 12312 port interaction. \(M \in {\rm I\!R}^{m \times n}\) represents a boolean matrix of whether the corresponding \(Y\) values are missing. \(Y[M]\) represents all of the missing values of \(Y\).

There are multiple steps to the matrix completion process: Impute the initial values for the missing \(y_{i,j}\) observations \(1 \leq i \leq m, 1 \leq j \leq n\): In general an additive model is applicable: \[y_{i,j} = \mu + a_i + b_j + \epsilon_{i,j}\] where \(\epsilon \in N(0,\sigma^2)\), \(\mu\) is the overall mean, \(a_i\) is the row mean, and \(b_j\) is column mean. An analysis of variance (ANOVA) imputation is used to fill in the initial values, \(y_{i,j}\). Ignoring the missing values for now, let \(y_{..}\) denote the empirical overall mean, \(y_{i.}\) denote the empirical row mean, and \(y_{.j}\) denote the column mean. \[y_{i,j} = y_{..} + (y_{i.}-y{..}) + (y_{.j}-y_{..}) = y_{i.} + y_{.j} - y{..}\]

The repeated imputation procedure solves \(Y^{(s)}[M] = R_k(Y^{(s-1)})[M]\) where \(R_k\) is the best rank-k approximation for the \(s\)-th step. For each step \((s)\) use singular value decomposition to decompose \[Y^{(s)} = U^{(s)}DV^{T(s)}\] where \(D\) is a diagonal matrix of the singular values, \(U\) is the left singular vectors of \(Y\) and \(V\) is the right singular vectors of \(Y\).

The Eckart-Young-Mirsky (EYM) Theorem provides the best rank-k approximation for the missing values in \(Y^{(s+1)}\). Recall \(Y[M]\) represents all of the missing values of \(Y\). Applying the EYM theorem: \[Y^{(s+1)}[M] = (U[,1:k]^{(s)}D[,1:k]V[,1:k]^{T(s)})[M]\]. Eliminate (s), (s+1)

Where \(U[,1:k]\) represents the first \(k\) columns of \(U\).

The EYM rank approximation is repeated until the relative difference between \(Y^{(s+1)}\) and \(Y^{(s)}\) falls below a set threshold, \(T\). The relative difference threshold is expressed: \[\|Y^{(s+1)}-Y^{(s)}\|_2 < T\] divide by current value of Y^({s}). Make criteria invariate to a scale change, multiplyng by 20 doesnt change convergence criteria. Frobenius norm? L2 norm?

To assess the quality of the imputation, Leave-One-Out Cross Validation (LOOCV) is used to generate a prediction error. LOOCV requires taking an observed value, setting it to NA (missing), and then performing the described imputation process. The prediction error can then be calculated as some function of the difference between the imputed value and the true value, \(\hat y_{i,j} - y_{i,j}\).

Least squares estimate with gaussian error model, for the same variance assumption between cells.

What would a good statistical model be for variable frequency

AMMI Model, \[y_{ijk}`sim .mu + a_i + b_j + u_i^TDv_j+\sigma_{ij}\epsilon_{ij}\] epsilon is standard normal so it turns the whole term to having variance i,j

#matrix parameters
n_Sport = 20
n_Dport = 20

#get freqs
Sport_table = as.data.frame(table(argus$Sport))
Sport_table = Sport_table[order(-Sport_table$Freq),]
top_Sport = (head(Sport_table$Var1, n_Sport))

#get freqs
Dport_table = as.data.frame(table(argus$Dport))
Dport_table = Dport_table[order(-Dport_table$Freq),]
top_Dport = (head(Dport_table$Var1, n_Dport))

#create starting matrices
ports_combo_matrix = matrix(list(), nrow = n_Sport, ncol = n_Dport)
dimnames(ports_combo_matrix) = list(top_Sport, top_Dport)

ports_freq_matrix = matrix(0, nrow = n_Sport, ncol = n_Dport)
dimnames(ports_freq_matrix) = list(top_Sport, top_Dport)

nscore = function(x) {
  nscore = qqnorm(x, plot.it = FALSE)$x  # normal score 
  trn.table = data.frame(x=sort(x),nscore=sort(nscore))
  return (list(nscore=nscore, trn.table=trn.table))
}

#fill the ports_combo_matrix and ports_freq_matrix
for (s in 1:n_Sport){
  for (d in 1:n_Dport){
    combination = argus[is.element(argus$Sport, top_Sport[s])
                        & is.element(argus$Dport, top_Dport[d]),]
    obs = combination$SrcBytes
    n_obs = length(obs) #ignores NA values
    if (n_obs > 0){
      #obs = nscore(obs)$nscore #normal transformation
      for (i in 1:n_obs){
        ports_combo_matrix[[s,d]] = c(ports_combo_matrix[[s,d]],obs[i]) 
        #O(1) time to append values to a list?
        ports_freq_matrix[s,d] = ports_freq_matrix[s,d] + 1
      }
    }
  }
}

#create mean and variance matrix
ports_mean_matrix = matrix(NA, nrow = n_Sport, ncol = n_Dport)
dimnames(ports_mean_matrix) = list(top_Sport, top_Dport)

ports_variance_matrix = matrix(NA, nrow = n_Sport, ncol = n_Dport)
dimnames(ports_variance_matrix) = list(top_Sport, top_Dport)

#fill mean and variance matrix
for (s in 1:n_Sport){
  for (d in 1:n_Dport){
    if (ports_freq_matrix[s,d] == 1){
      ports_mean_matrix[s,d] = ports_combo_matrix[[s,d]]
      ports_variance_matrix[s,d] = 0
    }
    else if (ports_freq_matrix[s,d] > 1){
      ports_mean_matrix[s,d] = mean(ports_combo_matrix[[s,d]])
      ports_variance_matrix[s,d] = var(ports_combo_matrix[[s,d]])
    }
  }
}

#untuned ALS using softimpute
fit = softImpute(ports_mean_matrix,rank.max=2,lambda=0.9,trace=TRUE,type="als")
fit$d
filled = complete(ports_mean_matrix, fit)
plot(ports_mean_matrix[!is.na(ports_mean_matrix)], filled[!is.na(ports_mean_matrix)])
plot(ports_mean_matrix[!is.na(ports_mean_matrix)], filled[!is.na(ports_mean_matrix)], 
     xlim = c(0,5000), ylim = c(0,5000))


####Eckhart Young Theorem Implementation, Best Rank k Approximation####
S = 1000
k = 2
Y = ports_mean_matrix

#calculate overall mean
n = 0
sum = 0
for (s in 1:n_Sport){
  for (d in 1:n_Dport){
    if (ports_freq_matrix[s,d] != 0){
      sum = sum + ports_mean_matrix[s,d]
      n = n + ports_freq_matrix[s,d]
    }
  }
}
overall_mean = sum/n
#calculate row means and col means
row_means = rowMeans(ports_mean_matrix, na.rm = TRUE)
col_means = colMeans(ports_mean_matrix, na.rm = TRUE)

#Fill in missing values in Y with ANOVA
for (s in 1:n_Sport){
  for (d in 1:n_Dport){
    if (ports_freq_matrix[s,d] == 0){
      Y[s,d] = row_means[s] + col_means[d] - overall_mean
    }
  }
}

for (i in 1:S){
  #extract SVD
  svd_Y = svd(Y)
  D = diag(svd_Y$d)
  U = svd_Y$u
  V = svd_Y$v
  #EYM theorem
  EYM = U[,1:k] %*% D[,1:k] %*% V[,1:k]
  #replace imputed values with new values 
  for (s in 1:n_Sport){
    for (d in 1:n_Dport){
      if (ports_freq_matrix[s,d] == 0){
        Y[s,d] = EYM[s,d]
      }
    }
  }
}

–>