forked from DrZiruiJ/R_Code
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDEseq2
148 lines (87 loc) · 4.87 KB
/
DEseq2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
######################################################################
################### DEseq2 #####################
######################################################################
##用SUBIO数据做DESeq2差异分析##
BiocManager::install("DESeq2")
install.packages("BiocManager")
library(BiocManager)
library(tidyverse)
library(dplyr)
library(DESeq2)
library(DESeq2)
library(limma)
##数据清洗##
data1<-read.table("./Protein1.txt",sep="\t",header=T) #protein1为用subio处理出的测序数据
avereps(data1[,-1],ID=data1$id) #重复序列取均值,前半部分为只保留矩阵,后半部分设id(基因名)列为被查找的重复列
datagene<-data1[!duplicated(data1$id),] #合并删除重复序列
row.names(datagene)<-datagene[,1]
datagene<-datagene[,-1]
datagene[is.na(datagene)] <- 0 #na全赋值为0
any(is.na(data1))
datagene<-as.matrix(datagene)
any(is.na(datagene))
summary(datagene)
datagene<-as.numeric(datagene)
datatest<-datagene[!rowMeans(datagene)==0,] #去除全为0的行
datagene<-datatest
##转化为DESeq2要求的数据格式##
condition <-factor(c(rep("case",5),rep("control",5)),levels=c("case","control"))
colData<-data.frame(row.names =c("A1","A2","A3","A4","A5", "C1","C2",
"C3", "C4","C5") ,condition)
dds<-DESeqDataSetFromMatrix(countData = datagene, colData = colData,
design = ~condition)
dds
##数据过滤##
# dds <- dds[ rowSums(counts(dds)) > 1 ,]
dds<-DESeq(dds)
sizeFactors(dds) #出现小数点证明前面的操作成功,estimates size factors to account for differences in sequencing
#depth. So the value are typically centered around 1.
res <- results(dds, contrast=c("condition","case","control")) ########注意case组在前,control在后,输出结果
rld <- rlogTransformation(dds) #转化为rlog形式的标准化数据矩阵,得到DESeq2包normlization的表达矩阵!
exprMatrix_rlog=assay(rld)
write.csv(exprMatrix_rlog,'exprMatrix.rlog.csv' ) #保存rlog数据到csv格式
normalizedCounts1 <- t( t(counts(dds)) / sizeFactors(dds) ) # 生成TPM格式数据
normalizedCounts2 <- counts(dds, normalized=T) # it's the same for the tpm value
exprMatrix_rpm=as.data.frame(normalizedCounts1)
head(exprMatrix_rpm)
write.csv(exprMatrix_rpm,'exprMatrix.rpm.csv' ) #保存TPM数据到csv格式
#as.data.frame(exprMatrix_rpm)
#write.table(exprMatrix_rpm,"exprMatrix_rpm.xls",sep = "\t",col.names = TRUE,
row.names = T,quote = F,na="") #quote = F 意思是双引号不输出,另一种保存方法
#下面图看哪些差异基因(差异有意义)
png("qc_dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()
##下方代码为绘制DEseq_RAWvsNORM对比图##
## 下面的代码如果你不感兴趣不需要运行,免得误导你
png("DEseq_RAWvsNORM.png",height = 800,width = 800) #jimmy的代码
par(cex = 0.7)
n.sample=ncol(exprMatrix_rlog)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(datagene, col = cols,main="expression value",las=2) #datagene是原始的count数据
boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2) #exprMatrix_rlog为新的标准化后数据
hist(as.matrix(datagene)) #下面两个是数据分布柱状图
hist(exprMatrix_rlog)
dev.off()
## 上面的代码如果你不感兴趣不需要运行,免得误导你
##生成差异结果文件##
res
class(res) #应该是两种
res<-as.data.frame(res)
head(res)
resOrdered <- res[order(res$padj),] #生成一个升序排列文件
resOrdered<-cbind(rownames(resOrdered),resOrdered)
colnames(resOrdered)<-c("gene_id","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj")
head(resOrdered)
write.csv(resOrdered,'case-control-all-DESeq2.gene.csv',row.names = F,col.names = T))
##生成FDR<0.05文件##
resSig<-resOrdered[which(resOrdered$padj<0.05 & abs(resOrdered$log2FoldChange)>1),]
resSig[which(resSig$log2FoldChange>0),"up_down"]<-"up"
resSig[which(resSig$log2FoldChange<0),"up_down"]<-"down"
resSig <- resSig[order(resSig$padj),]
head(resSig)
write.csv(resSig,'case-control-differ-pvalue-0.05-FC-2.gene.csv',row.names = F,col.names = T))
#write.table(resSig,"case-control-differ-pvalue-0.05-FC-2.gene.csv",sep = "\t",
col.names = TRUE,row.names = F,quote = F,na="")