#CREDIT: ADAPTED FROM CODE BY JAMES CRAMPTON
#Canonical Variates Analysis
#Useful items are: projection (data projected onto the discriminant function axes, centred on the overall mean)
#                  Mahalan.dist (distances between group means) -- distances in the upper triangle, and permutation probabilities in the lower triangle (probability of that distance arising by chance alone)
#                  jackCV (jackknifed cross-validation percentage correct)
#                  MANOVA components - use "str(object)" to see the structure of the output and, e.g., "object$stats[5]" to extract the F statistic
#                  std.coef = within-group standardized canonical coefficients - i.e., coefficients that allow us to interpret the loadings of variables on the canonical/discriminant axes
#                  proportion of between-group variation explained by each axis (where there are multiple groups/axes) - reported as "Proportion of trace" in the lda output
#Some of this code adapted from http://www.statmethods.net/advstats/discriminant.html
#other parts taken from Tamsin's course notes (esp. generating pooled, within-group standardized canonical coefficients)
#IMPORTANT NOTE ABOUT INPUT: CANNOT HAVE SYMBOLS LIKE '+' IN COLUMN NAMES; CANNOT HAVE LABEL FOR FIRST COLUMN (WHICH WILL BECOME CASE NAMES)
#clear the workspace
rm(list=ls())
#set working directory
setwd("C:/Users/fvaux/Documents/R/rdata")
#load library
library(MASS)
#Infile expected to have case names in column 1 and the grouping variable (named 'group') in column 2
infile <- "INFILE.csv"
n.iter <- 1000 # number iterations for permutation test
In.data <- read.csv(file=infile, header=TRUE, row.names=1)
In.data$group <- as.factor(In.data$group)
CVA.data <- as.matrix(In.data[,-1]) #remove grouping variable from data (lda requires this)
n.groups <- nlevels(In.data$group)
group.names <- levels(In.data$group)
CVA <- lda(CVA.data, In.data$group) #performs discriminant analysis by group; variables must be separated from grouping variable
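#print(CVA) #shows the "Proportion of trace" mentioned in the header (between-group variation explained by each axis)
#The MANOVA components mentioned in the header are not computed elsewhere in this script;
#a minimal sketch using stats::manova with Wilks' lambda (an assumption - substitute
#another test statistic if preferred, and use str() on the result to see its structure):
#MANOVA.sum <- summary(manova(CVA.data ~ In.data$group), test="Wilks")
#MANOVA.sum$stats[5] #approximate F statistic for the group term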
meanproj <- CVA$means%*%CVA$scaling #projected group means
overall.mean <- apply(meanproj,MARGIN=2,FUN=mean)
projection <- (CVA.data%*%CVA$scaling)
projection <- sweep(projection, 2, overall.mean) # projects data and centres on (i.e., subtracts) mean for each axis
#to project and centre, one can also use "predict" and "scale" - see the sketch below
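#A minimal sketch of that alternative (note: it centres the scores on the overall case mean,
#whereas the code above centres on the unweighted mean of the projected group means, so the
#two only coincide when group sizes are equal):
#projection.alt <- scale(predict(CVA)$x, center=TRUE, scale=FALSE)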
projection <- data.frame(group = In.data$group, projection) #stick the grouping variable back in; data.frame() keeps the score columns numeric, so no factor-to-number conversion is needed
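##########Pooled, within-group standardized canonical coefficients
#The "std.coef" item promised in the header is not produced by lda directly; a minimal
#sketch, assuming the pooled within-group covariance is estimated with n - n.groups
#degrees of freedom:
n.cases <- nrow(CVA.data)
W <- Reduce("+", lapply(split(as.data.frame(CVA.data), In.data$group), function(g) cov(g)*(nrow(g)-1))) #pooled within-group sums of squares and cross-products
pooled.sd <- sqrt(diag(W)/(n.cases-n.groups)) #pooled within-group standard deviations
std.coef <- CVA$scaling*pooled.sd #scale each raw coefficient by its variable's pooled within-group SD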
#plot(CVA) #produces a summary plot of projections by CV axis; the format changes depending on the number of groups/axes
########## compute Mahalanobis distances between group means
mahalan.dist <- as.matrix(dist(meanproj, upper=TRUE))
#Note: 'dist' defaults to Euclidean distance, but if one is using the full dimensionality, then Euclidean distance is equal to Mahalanobis distance (Krzanowski 1988: 301, 376)
#Alternative ways of computing Mahal.dist - see the long-hand sketch at the end, or use:
#'dist(predict(CVA, CVA$means)$x)'
#where x is one of the quantities produced by predict; see '?predict.lda'
########## Permutation test for Mahalanobis distances
# Note: this generates random membership permutation for just two focal groups,
# but recomputes Mahalanobis distances within space of full dataset and all groups
for (j in 1:(n.groups-1)) { # then loop through each pair-wise group comparison (skipping the meaningless self-comparison j == k)
for (k in (j+1):n.groups) {
mahalan.dist.perm <- vector(mode="numeric")
perm.group <- as.factor(as.numeric(In.data$group)) # create a new grouping factor, integer, for use in permutations
rows.j <- which(In.data$group == levels(In.data$group)[j]) # exact match; grep-style pattern matching can confuse group labels that are substrings of one another
rows.k <- which(In.data$group == levels(In.data$group)[k])
samp.size.j <- length(rows.j)
samp.size.k <- length(rows.k)
samp.jk <- c(rows.j, rows.k)
for (m in 1:n.iter) {
# create permutation for groups jk
perm <- sample(samp.jk) # take a random permutation of the row indices in samp.jk
perm.group[perm[1:samp.size.j]] <- j # recodes 'perm.group' for the two focal groups according to the permutation
perm.group[perm[(samp.size.j+1):length(perm)]] <- k
CVA.perm <- lda(CVA.data, perm.group) # performs discriminant analysis by group; variables must be separated from grouping variable
meanproj.perm <- CVA.perm$mean%*%CVA.perm$scaling # projected means
mahalan.dist.perm[m] <- as.matrix(dist(meanproj.perm))[j,k]
}
# compute the probability of the observed distance against the permutation distribution
perm.dist <- ecdf(mahalan.dist.perm) # note: 'perm.dist' is a function
mahalan.prob <- perm.dist(mahalan.dist[j,k])
mahalan.dist[k,j] <- 1-mahalan.prob # store the permutation p-value in the lower triangle
}
}
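#Print the combined matrix for inspection (distances above the diagonal, permutation probabilities below - see header)
print(round(mahalan.dist, 4))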
##########Undertake jackknifed (leave-one-out) predictions and print the percent correctly assigned
CVA2 <- lda(CVA.data, In.data$group, CV=TRUE) # CV=TRUE requests jackknifed (leave-one-out) cross-validation
ct <- table(In.data$group, CVA2$class) # assess accuracy of the prediction
jackCV <- diag(prop.table(ct,1)*100) # NOTE: the margin given to prop.table matters - margin 1 gives row proportions (rows are the original classification); margin 2 would give column proportions (jackknife classification)
print("jackknifed assignments; rows are original classification; columns are jackknife classification")
# to read this table, look along rows to see how originally classified cases are reclassified in the jackknife;
# look down columns to see how jackknife-classified cases were classified originally.
print(ct)
print("jackknifed cross-validation % correct ")
print(jackCV)
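##########Long-hand Mahalanobis distance (the long-hand computation referenced above)
#A minimal sketch: D = sqrt(d' S^-1 d), where d is the difference between two group mean
#vectors and S is the pooled within-group covariance (reusing W and n.cases computed for
#std.coef above); in full dimensionality this should match mahalan.dist (see the Krzanowski note)
pooled.cov <- W/(n.cases-n.groups)
mahalanobis.longhand <- function(g1, g2) {
d <- CVA$means[g1,] - CVA$means[g2,] #difference between the two group mean vectors
sqrt(as.numeric(t(d)%*%solve(pooled.cov)%*%d))
}
#e.g. mahalanobis.longhand(1, 2) #should match mahalan.dist[1,2]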