-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_differential_methylation.r
166 lines (123 loc) · 7.26 KB
/
feature_differential_methylation.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#Determine whether or not case status is associated with average values across different features (e.g. are CpG islands differentially methylated)
library('CpGassoc')
library(minfi)
#get number of rows in file (reads file faster)
nr <- as.numeric(system('wc -l cleaned_v3a.methylation | awk \'{print $1}\' ',intern=T))
#get list of numeric column classes (reads file faster)
dat_temp <- read.table("cleaned_v3a.methylation", header=T,nrows=50)
classes_to_use <- sapply(dat_temp , class)
#read file
dat0 <- read.table("cleaned_v3a.methylation", header=T,nrows=nr,colClasses=classes_to_use)
######## we want to filter out the cross hybridizing and SNP probes #######
############# load in the other stuff like cross hybridizing probes or whatever #########
snp_probes <- read.table('PCA/cpgs_within_10bp_of_SNP.table', header=F,stringsAsFactors=F)
names(snp_probes)[1] <- "CpG"
cross_hybridizing <- read.table('PCA/cpg-non-specific-probes-Illumina450k.txt',header=T,stringsAsFactors=F)
names(cross_hybridizing)[1] <- "CpG"
badprobes <- c(snp_probes$CpG, cross_hybridizing$CpG)
#also logit 2 transform the data to help make valid inferences
dat <- logit2(dat0[-which(row.names(dat0) %in% badprobes),])
#make a dataframe of subject names that goes in the order they appear in the file
dat_order <- as.data.frame(names(dat))
#name the subjects the same as found in the matching column in the batch file
names(dat_order) <- c('methylome_id_fixed')
#establish a note of what the ordering is
dat_order$order <- c(1:(dim(dat)[2]))
batch <- read.csv("data_analysis/demo2_wholedata_methylation_x3c.csv", header=T,na.strings=c("NA","#N/A","N/A"))
#merge batch data with subject name order data
covariates_ordered0 <- merge(batch,dat_order,by='methylome_id_fixed')
#sort the data by order so the phenotype now lines up
covariates_ordered <- covariates_ordered0[order(covariates_ordered0$order),]
#read in studymax covariate data, to get the studymax ids (if doing studymax)
covars_studymax <- read.csv('data_analysis/demo2_studymaxnov0_methylation_x1.csv',header=T,na.strings=c("NA","#N/A","N/A"),stringsAsFactors=F)
##### load in probe annotations #####
probe_annotations <- read.csv('misc/humanmethylation450_15017482_v1-2.csv',header=T,skip=7,stringsAsFactors=F)
##### order this file by the probe order in the data #####
all_qced_probes <- data.frame(row.names(dat))
names(all_qced_probes)[1] <- "IlmnID"
all_qced_probes$order <- 1:dim(all_qced_probes)[1]
merged_probe_types0 <- merge(all_qced_probes,probe_annotations,by="IlmnID",all.x=TRUE)
probe_annotations_use <- merged_probe_types0[order(merged_probe_types0$order),]
n_shelf <- probe_annotations_use[probe_annotations_use$Relation_to_UCSC_CpG_Island == "N_Shelf",]$IlmnID
n_shore <- probe_annotations_use[probe_annotations_use$Relation_to_UCSC_CpG_Island == "N_Shore",]$IlmnID
island <- probe_annotations_use[probe_annotations_use$Relation_to_UCSC_CpG_Island == "Island",]$IlmnID
opensea <- probe_annotations_use[is.na(probe_annotations_use$Relation_to_UCSC_CpG_Island),]$IlmnID
s_shelf <- probe_annotations_use[probe_annotations_use$Relation_to_UCSC_CpG_Island == "S_Shelf",]$IlmnID
s_shore <- probe_annotations_use[probe_annotations_use$Relation_to_UCSC_CpG_Island == "S_Shore",]$IlmnID
#the gene grouping stuff depends on what gene it annotates to, i'll just take the first piece
unlist_split <- function(x, ...)
{
toret <- unlist(strsplit(x, ...) )[1]
return(t(toret))
}
probe_annotations_use$UCSC_RefGene_Group_simple <- sapply(probe_annotations_use$UCSC_RefGene_Group, unlist_split, split = c(";"))
f_utr <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "5'UTR",]$IlmnID
t_utr <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "3'UTR",]$IlmnID
tss1500 <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "TSS1500",]$IlmnID
tss200 <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "TSS200",]$IlmnID
body <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "Body",]$IlmnID
exon <- probe_annotations_use[probe_annotations_use$UCSC_RefGene_Group_simple == "1stExon",]$IlmnID
intergenic <- probe_annotations_use[is.na(probe_annotations_use$UCSC_RefGene_Group_simple),]$IlmnID
############### get average methylation for each subject at each location ############
covariates_ordered$mean_overall <- apply(dat,2,mean,na.rm=T)
#gene features means
covariates_ordered$f_utr <- apply(dat[f_utr,],2,mean,na.rm=T)
covariates_ordered$t_utr <- apply(dat[t_utr,],2,mean,na.rm=T)
covariates_ordered$tss1500 <- apply(dat[tss1500,],2,mean,na.rm=T)
covariates_ordered$tss200 <- apply(dat[tss200,],2,mean,na.rm=T)
covariates_ordered$body <- apply(dat[body,],2,mean,na.rm=T)
covariates_ordered$exon <- apply(dat[exon,],2,mean,na.rm=T)
covariates_ordered$intergenic <- apply(dat[intergenic,],2,mean,na.rm=T)
#genome regions means
covariates_ordered$n_shelf <- apply(dat[n_shelf,],2,mean,na.rm=T)
covariates_ordered$n_shore <- apply(dat[n_shore,],2,mean,na.rm=T)
covariates_ordered$island <- apply(dat[island,], 2,mean,na.rm=T)
covariates_ordered$opensea <- apply(dat[opensea,],2,mean,na.rm=T)
covariates_ordered$s_shelf <- apply(dat[s_shelf,],2,mean,na.rm=T)
covariates_ordered$s_shore <- apply(dat[s_shore,],2,mean,na.rm=T)
###### take differences between cases and controls in refgene groups and by cpg island relation
######## at v3
covariates_ordered_subset <- subset(covariates_ordered, visit == 2 & (PTSDbroad == 1 | PTSDbroad_highest == 0))
dat_subset <- dat[, names(dat) %in% covariates_ordered_subset$methylome_id_fixed]
summary(
lm(mean_overall ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(f_utr ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(t_utr ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(tss1500 ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(tss200 ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(body ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(exon ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(intergenic ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(n_shore ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(n_shelf ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(island ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(opensea ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(s_shelf ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)
summary(
lm(s_shore ~ PTSDbroad + CD8T+CD4T+NK+Bcell+Mono + PC1_HGDP + PC2_HGDP + PC3_HGDP, data= covariates_ordered_subset)
)