MTSpak.R

library(MASS)            # for ginv function
library(qualityTools)    # for taguchiDesign function
library(tidyverse)       # for data handling, pipes, and plotting
library(highcharter)     # for an alternative to ggplot
library(cowplot)         # for formatting plots in diffMTS

computeMDs <- function(good, bad) {
# This function computes Mahalanobis Distances (MDs) for "good" and "bad" groups.
# Takes two data frames as inputs and produces a list of MD.good and MD.bad -- the object
# returned by this function can be ingested by plotMDs
   if (!is.data.frame(good) || !is.data.frame(bad)) { 
      stop("At least one input is not a data frame")
   } else {
      # Compute means and sd's for "good" group
      xbars <- colMeans(good)
      sds <- apply(good, 2, sd)
      pinv <- 1/ncol(good)
      Rinv <- ginv(cor(good))

      # Compute MDs for bad group by using means and sds of "good" group as a baseline
      z0s <- apply(bad, 1, function(x) (x-xbars)/sds)  
      MD.bad <- pinv * diag( (t(z0s) %*% Rinv %*% z0s) )
   
      # Compute MDs for good group
      z0s.good <- apply(good, 1, function(x) (x-xbars)/sds)
      MD.good <- pinv * diag( (t(z0s.good) %*% Rinv %*% z0s.good) )
   
   return(list(MD.bad=MD.bad, MD.good=MD.good))
   }
}

plotMDs <- function(computeMDs_obj, type="bars") {
      bar.df <- data.frame( 
          index=seq(1, length(computeMDs_obj$MD.good) + length(computeMDs_obj$MD.bad), 1), 
          md=c(computeMDs_obj$MD.good, computeMDs_obj$MD.bad),
          group=c(rep("normal", length(computeMDs_obj$MD.good)), rep("abnormal",length(computeMDs_obj$MD.bad) ))
      )
      if (type == "bars") {
          bar.df %>% ggplot() + geom_bar(aes(x=index, y=md), stat="identity") + 
              ggtitle("Comparison of MDs between Normal and Abnormal Groups")
      } else if (type == "hc_scatter") {
          hchart(bar.df, "scatter", hcaes(x = index, y = md, group = group)) %>% hc_colors(c("red","blue"))
      } else if (type == "hc_column") {
          highchart() %>% hc_chart(type = "column") %>% hc_add_series(data = bar.df$md, name = "MD")
      }
}
                        
generateTDO <- function(good) {
# This function can take as input either the good or bad data frame, since both have same ncol.
# Alternatively, you can just give it the number of IVs directly and it will still work.
   if (!is.data.frame(good)) { 
          ivs <- good
   } else {
          ivs <- ncol(good)
   }
      if (ivs %in% c(2:3)) {
          tdo <- taguchiDesign("L4_2")@design[,1:ivs] %>% replace(.=="2",0)
      } else if (ivs %in% c(4:7)) {
          tdo <- taguchiDesign("L8_2")@design[,1:ivs] %>% replace(.=="2",0)
      } else if (ivs %in% c(8:11)) { 
          tdo <- taguchiDesign("L12_2")@design[,1:ivs] %>% replace(.=="2",0)
      } else if (ivs %in% c(12:15)) {
          tdo <- taguchiDesign("L16_2")@design[,1:ivs] %>% replace(.=="2",0)
      } else if (ivs==1 || ivs > 15) {
          stop(paste0("Number of independent variables ",ivs," is not supported"))
      }
   return(tdo=tdo)
}

# Functions to compute Taguchi Signal to Noise (SN)
ltb <- function(x) { 
    -10*log10( (1/(length(x))) * sum(1/(x^2) )) 
}   # Larger-the-better

stb <- function(x) { 
    -10*log10( (1/(length(x))) * sum(  (x^2) )) 
}   # Smaller-the-better

dyn.sn <- function(x, y) { 
   stb(x) - ltb(y) 
}

runTaguchi <- function(good, bad, tdo) {
# This function takes the data frames of good and bad observations + the Taguchi Orthogonal Array design (tdo)
# created by the generateTDO function and runs all signal-to-noise experiments on the MDs.
# It returns a matrix of experimental results, with one row per Taguchi experiment, and one column
# per predictor/independent variable.
  
   all.results <- NULL
   
   for (i in 1:(nrow(tdo)) ) {
      exp.bad <- 0; exp.good <- 0
      
      exp.bad  <- as.data.frame(mapply(`*`, bad, tdo[i,]))
      exp.bad  <- exp.bad[, colSums(exp.bad != 0) > 0]        # drop all columns that contain all zeroes
      exp.good <- as.data.frame(mapply(`*`, good, tdo[i,]))
      exp.good <- exp.good[, colSums(exp.good != 0) > 0]      # drop all columns that contain all zeroes

      if (is.vector(exp.good)) { # this indicates there is only ONE active column in the Taguchi array
         xbars <- mean(exp.good)
         sds <- sd(exp.good)
         # pinv will be 1/1 because only one predictor
         Rinv <- ginv(cor(as.matrix(exp.good))) 
         z0s <- (exp.bad-xbars)/sds
         as.vector(t(z0s)*z0s) -> results
         all.results <- rbind(all.results, results)

      } else if (is.data.frame(exp.good)) {                  # there are MULTIPLE active columns to process in the Taguchi array
         xbars <- colMeans(exp.good)
         sds <- apply(exp.good, 2, sd)
         pinv <- 1/ncol(exp.good)
         Rinv <- ginv(cor(exp.good))
         z0s <- apply(exp.bad, 1, function(x) (x-xbars)/sds)  # scale bad group based on xbar/sd of good group
         as.vector(pinv * diag( (t(z0s) %*% Rinv %*% z0s) )) -> results
         all.results <- rbind(all.results, results)
 
      }                 
   }
      rownames(all.results) <- NULL
      return(all.results)
}
                   
addSN <- function(taguchiResults, method="ltb") {
    # This function takes the results from runTaguchi, and appends a SN column to it 
    # based on whether you want larger-the-better, smaller-the-better, or dynamic (stb-ltb) SN.
    if (is.data.frame(taguchiResults)) { 
       if (method == "stb") {
          taguchiResults %>% mutate(sn=apply(taguchiResults,1,stb)) -> sn.df
       } else if (method == "dyn") {
          taguchiResults %>% mutate(sn=apply(taguchiResults,1,dyn)) -> sn.df
       } else {
          taguchiResults %>% mutate(sn=apply(taguchiResults,1,ltb)) -> sn.df
       }
    } else {
          stop("The addSN function requires a data frame containing Taguchi experiment results")
    }
   return(sn.df)
}
                      
calcGains <- function(tdo, df) {
# This function calculates average SN for each predictor/independent variable when it was used (Level 1)
# and also when it was not used (Level 2) in each of the Taguchi experiments. The difference between
# these two averages as you move from Level 1 to Level 2 is called the gain.
# For example, if the first predictor/independent variable was used in 3 of 8 Taguchi experiments, the
# Level 1 SN for X1 would be the average of these three SNs, while the Level 2 SN would be the average of
# SNs for the 5 experiments where X1 was not used.
   
    level1 <- colMeans(apply(tdo, 2, function(x) x*df$sn))
    level2s <- as.data.frame(matrix(as.numeric(!tdo), nrow=nrow(tdo)))
    level2 <- colMeans(apply(level2s, 2, function(x) x*df$sn))

    gains.df <- as.data.frame(cbind(level1, level2, gain=level1-level2), rownames=colnames(x))
    gains.df$var <- colnames(good)

    return(gains.df)
}
                             
plotGains <- function(tdo, gains.df, type="bars") {
# This function plots the output of calcGains. Negative gain indicates that the predictor does not
# enhance the discriminatory power of the MTS. On the main effects plot, downward sloping lines 
# indicate predictors that do not enhance the discriminatory power.

     if (type == "bars") {
       gains.df %>% ggplot() + geom_bar(aes(x=var, y=gain), stat="identity") +
         ggtitle("Gain in SN Ratios for Each Variable")
     } else if (type == "maineffects") {
       gains.df %>% select(-gain) %>% 
           pivot_longer(cols=starts_with("level"), names_to="level", values_to="value") %>% 
           ggplot(aes(level, value, group=var)) + geom_line(lwd=2) +
           facet_wrap(~var,ncol=ncol(good))
     } else if (type == "hc_scatter") {
         hchart(gains.df, "scatter", hcaes(x = var, y = gain, group = sign(gain))) %>% 
              hc_colors(c("red","blue")) %>% hc_add_series(showInLegend = FALSE)
     } else if (type == "hc_column") {
         highchart() %>% hc_chart(type = "column") %>% hc_add_series(data = gains.df$gain, name = "gain")
     }
}     
                             
                             
recommend <- function(gains.df, type="print") {
# This function tells you what variables to use for discriminating between "good" and "bad" groups.
# If argument "pretty" is used, it generates a flextable containing the recommended variables.
   
   if (type == "print") {
       gains.df %>% filter(gain > 0) %>% select(var) %>% as.list()

     } else if (type == "pretty") {
       gains.df %>% filter(gain > 0) %>% select(var) %>% regulartable() %>% autofit()
     } 
}     

diffMTS <- function(mts1, mts2) {
# This function takes two objects from compareMDs: your INITIAL set of MDs from good and bad groups with all potential
# predictor variables included, and your FINAL set of MDs after feature selection.
   
   compare <- data.frame(index = seq(1, length(unlist(mts1)), 1),
              md.old = c(mts1$MD.good, mts1$MD.bad),
              md.new = c(mts2$MD.good, mts2$MD.bad))
   
   compare %>% mutate(diff = md.new - md.old) -> compare

   g1 <- compare %>% ggplot() + geom_line(aes(x = index, y = md.old), color="blue") +
      geom_line(aes(x = index, y = md.new), color="red", lwd=2)+ 
      ggtitle("Discrimination Power After Feature Selection (Red)")
   
   g2 <- compare %>% ggplot() + geom_histogram(aes(x=diff)) + geom_vline(xintercept=2, col="red") +
      ggtitle("Differences in Mahalanobis Distance (MD) after Feature Selection")
   
   cowplot::plot_grid(g1, g2)
}