-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabcmax.R
1131 lines (966 loc) · 76.7 KB
/
abcmax.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -----------------------------------------------------------------------------
# Project: snMultiome
# Date: August 2024
# Script name: abcmax.R
# Description: This script is designed for processing genomic data in the context of
# different GWAS traits and analyzing their linkage disequilibrium (LD)
# buddies. It loads various data sources, processes genomic information,
# and generates useful output for further analysis.
# This script describes ABC-MAX analysis (connection between peaks
# overlapping GWAS SNPs and causal genes)
#
# outputs: Plots for Fig. 3-4, Fig. S8-S17, Table S8-S10
#
# -----------------------------------------------------------------------------
# -------------------------------
# Setup and Library Loading
# -------------------------------
library(plyr) # Data manipulation and aggregation
library(data.table) # Fast data manipulation for large datasets
library(GenomicRanges) # Handling genomic intervals
library(bedr) # Working with BED files and genomic ranges
library(liftOver) # Converting genomic coordinates between assemblies
library(ggplot2) # Creating customizable plots
library(edgeR) # Differential expression analysis for count data
library(scales) # Formatting and scaling for plots
library(ggsci) # Color palettes for ggplot2
########################################################################################
##### CONFIG ###########################################################################
{
ROOT = "~/" # !!! FIXME: SET TO YOUR CUSTOM DIRECTORY !!!
ABC_THRESHOLD = 0.015 # Minimal ABC score to consider it peak-gene as valid EP
MAX_LINKS_PER_PEAK = 1 # Maximum number of E-P links detected by ABC per peak in GWAS loci (per each cell type separately)
}
########################################################################################
##### PROLOGUE: PATHS AND SETTING ANALYSIS TYPE ########################################
{
# List of paths to table with index SNPs for analyzed GWAS traits
GWAS_LDBUDDIES_ROOT = file.path(ROOT, "inputs", "abcmax_gwas")
GWAS = list("SCZ" = list(trait="SCZ", path=file.path(GWAS_LDBUDDIES_ROOT, "pgc3_scz_table.csv"), chr="Chromosome", left="range.left", right="range.right", index="top.pos", index_rs="top.index", p="top.P"),
"BD" = list(trait="BD", path=file.path(GWAS_LDBUDDIES_ROOT, "BD_GWAS_mullins_2020.tsv"), chr="CHR", left="range.left", right="range.right", index="BP", index_rs="SNP", p="P"),
"MDD" = list(trait="MDD", path=file.path(GWAS_LDBUDDIES_ROOT, "mdd_GWAS_2023.csv"), chr="CHR", left="NONE", right="NONE", index="BP.index.SNP", index_rs="leadSNP", p="P.x.index.SNP"),
"AN" = list(trait="AN", path=file.path(GWAS_LDBUDDIES_ROOT, "anorexia_GWAS_2019.csv"), chr="CHR", left="left", right="right", index="BP", index_rs="SNP", p="P"),
"ASD" = list(trait="ASD", path=file.path(GWAS_LDBUDDIES_ROOT, "ASD_GWAS_grove_2019.csv"), chr="CHR", left="left", right="right", index="BP", index_rs="SNP", p="P"),
"ADHD" = list(trait="ADHD", path=file.path(GWAS_LDBUDDIES_ROOT, "adhd_GWAS_2022.csv"), chr="Chr", left="NONE", right="NONE", index="Position_bp", index_rs="rsID", p="P_value"),
"AD" = list(trait="AD", path=file.path(GWAS_LDBUDDIES_ROOT, "ad_GWAS_meta_shea_2023_bellenguezSubset.csv"), chr="CHR", left="left", right="right", index="BP", index_rs="SNP", p="P"),
"ALS" = list(trait="ALS", path=file.path(GWAS_LDBUDDIES_ROOT, "als_GWAS_2021_vanRheenen.csv"), chr="CHR", left="NONE", right="NONE", index="BP", index_rs="SNP", p="P"), # https://www.nature.com/articles/s41588-021-00973-1
"PD" = list(trait="PD", path=file.path(GWAS_LDBUDDIES_ROOT, "pd_GWAS_2023_kim.csv"), chr="CHR", left="NONE", right="NONE", index="BP", index_rs="rsID", p="P_RE"), # https://www.medrxiv.org/content/10.1101/2022.08.04.22278432v1.full-text
"MS" = list(trait="MS", path=file.path(GWAS_LDBUDDIES_ROOT, "ms_GWAS_2019.csv"), chr="CHR", left="NONE", right="NONE", index="POS", index_rs="SNP", p="EUR") # https://www-science-org.eresources.mssm.edu/doi/10.1126/science.aav7188#supplementary-materials
)
# Gene expression matrices from cell type psedobulks
COUNT_MATRIX_SNRNASEQ_TO_BULK = list(
"Astrocyte" = file.path(ROOT, "inputs", "abcmax_materials", "Astrocyte_all.exon.geneID.txt"),
"Endothelial" = file.path(ROOT, "inputs", "abcmax_materials", "Endothelial_all.exon.geneID.txt"),
"GABA" = file.path(ROOT, "inputs", "abcmax_materials", "GABA_all.exon.geneID.txt"),
"GLU" = file.path(ROOT, "inputs", "abcmax_materials", "GLU_all.exon.geneID.txt"),
"Microglia" = file.path(ROOT, "inputs", "abcmax_materials", "Microglia_all.exon.geneID.txt"),
"OPC" = file.path(ROOT, "inputs", "abcmax_materials", "OPC_all.exon.geneID.txt"),
"Oligo" = file.path(ROOT, "inputs", "abcmax_materials", "Oligo_all.exon.geneID.txt")
)
LD_BUDDIES = file.path(ROOT, "inputs", "abcmax_materials", "EUR_geno.txt.gz") # LD buddies # from https://zenodo.org/record/3404275#.YWL0eVPMJdB
TRUBETSKOY_PRIORITIZED_GENES = file.path(ROOT, "inputs", "Trubetskoy_Supplementary_Table_12.csv") # Table S12 from PMID:35396580
TRUBETSKOY_PRIORITIZED_GENES2 = file.path(ROOT, "inputs", "Trubetskoy_Supplementary_Table_12b.csv") # Table S12 from PMID:35396580
CELLTYPE_MARKERS_GENERAL = file.path(ROOT, "inputs", "abcmax_materials", "TopALL_3dg_ALL_Celltype_SCT_logfc0.25_minpct0.25.csv") # Cell type gene markers
COUNTMATRIX_PEAKS_GENERAL = file.path(ROOT, "inputs", "abcmax_materials", "readCountMatrices.RData") # Cell type peak markers
COUNTMATRIX_GENES_GENERAL = file.path(ROOT, "inputs", "abcmax_materials", "ps_3dg_RNA_celltype_brain.region_gene.csv") # Count matrix on cell type pseudobulks
CYTOGENIC_LOCATIONS = file.path(ROOT, "inputs", "abcmax_materials", "gen.location.RDS") # Convertor of genes to cytogenic locations
ENSEMBL_INFO_hg19 = file.path(ROOT, "inputs", "abcmax_materials", "muchEnsemblInfo_hg19.tsv.gz") # Downloaded Ensembl hg19
ENSEMBL_INFO_hg38 = file.path(ROOT, "inputs", "abcmax_materials", "muchEnsemblInfo_hg38.tsv.gz") # Downloaded Ensembl hg38
ENSEMBL_INFO = file.path(ROOT, "inputs", "abcmax_materials", "muchEnsemblInfo_hg38.tsv.gz") # muchEnsemblInfo_hg38_v101.gz
GENCODE = file.path(ROOT, "inputs", "abcmax_materials", "gencode.v30.annotation.gtf.gz") # REFERENCE_HG38_GENCODE30/gencode.v30.annotation.gtf.gz
allCells_env = new.env(); load(file.path(ROOT, "inputs", "abcmax_materials", "abc_and_peaks.RData"), envir=allCells_env) # Pre-calculated ABC E-P links
outDir = file.path(ROOT, "outputs", "abcmax") # Set output dir
dir.create(outDir, recursive=T)
TX = list()
CELL_TYPES = c("GABA", "GLU", "Oligo", "OPC", "Astrocyte", "Endothelial", "Microglia") # Vector of cell analyzed cell types
TRAITS = c("AD", "ADHD", "ALS", "AN", "ASD", "BD", "MDD", "MS", "PD", "SCZ") # Vector of analyzed GWAS traits
}
########################################################################################
##### HELPER FUNCTIONS #################################################################
{
# Get cytogenic location
get_location <- function(gene.name) {
cyt.location <- readRDS(CYTOGENIC_LOCATIONS)
test <- lapply(cyt.location, FUN = function(x){as.logical(which(x==gene.name))})
ifelse (
length(as.character(na.omit(names(test)[test==T]))) !=0 ,
stringr::str_replace_all(
as.character(na.omit(names(test)[test==T])),
"\\d+",
function(m) sprintf("%02d", as.integer(m)) ),
NA )
}
# Convert Ensembl gene name to Ensembl gene ID
findEnsemblNames = function(inputNames, keep_NA=F) {
ensemblIds = sapply(inputNames, function(x)
ifelse(!is.na(which(ensemblInfo$Associated.Gene.Name==x)[1]), ensemblInfo[which(ensemblInfo$Associated.Gene.Name==x)[1], "Ensembl.Gene.ID"],
ifelse((which(ensemblInfo$HGNC.symbol==x)[1]), ensemblInfo[which(ensemblInfo$HGNC.symbol==x)[1], "Ensembl.Gene.ID"],
ifelse((which(ensemblInfo$Associated.Transcript.Name==x)[1]), ensemblInfo[which(ensemblInfo$Associated.Transcript.Name==x)[1], "Ensembl.Gene.ID"], NA)))
)
if(keep_NA) {
return(ensemblIds)
} else {
return(ensemblIds[!is.na(ensemblIds)])
}
}
untransform = function(x) {
return(2^as.numeric(x))
}
# Plotting/CSV-exporting functions
mpdf = function(x, width=7,height=7, outDir=outDir, onefile=T)eval.parent(substitute({ pdf(paste0(outDir,"/plot_",make.names(x),".pdf"),useDingbats=F,width=width,height=height,onefile=onefile) })) #outDir must be defined as a global var
qplotly = function(x,command,outDir=outDir, width=700,height=700)eval.parent(substitute({
if(!getOption("disablePlotly",default=F)){
plotlyDir=paste0(outDir,"/plotly/")
dir.create(plotlyDir,showWarnings=F,recursive=T)
htmlwidgets::saveWidget(as_widget(ggplotly(command, width=width, height=height)), paste0(plotlyDir, x, ".html"), selfcontained=F)
}
}))
}
########################################################################################
##### PROLOGUE :: PREPARE ENVIRONMENT ##################################################
{
# Load Ensembl hg38
ensemblInfo = read.delim(ENSEMBL_INFO_hg38, stringsAsFactors=F)
ensemblGeneInfo = unique(ensemblInfo[,c("Ensembl.Gene.ID", "Strand", "Transcript.start..bp.", "Transcript.end..bp.", "Associated.Gene.Name", "Gene.type", "Status..gene.", "Source..gene.", "Transcript.count", "Description", "Chromosome.Name","HGNC.symbol", "Transcription.Start.Site..TSS."),])
z = ddply(ensemblGeneInfo,c("Ensembl.Gene.ID"),summarize,HGNC.symbol=paste(HGNC.symbol,collapse=";"))
ensemblGeneInfo$HGNC.symbol = z$HGNC.symbol[match(ensemblGeneInfo$Ensembl.Gene.ID,z$Ensembl.Gene.ID)]
ensemblGeneInfo = unique(ensemblGeneInfo)
rownames(ensemblGeneInfo) = ensemblGeneInfo$Ensembl.Gene.ID
#####
# Load Ensembl hg19
ensemblInfo_hg19 = read.delim(ENSEMBL_INFO_hg19, stringsAsFactors=F)
ensemblGeneInfo_hg19 = unique(ensemblInfo_hg19[,c("Ensembl.Gene.ID", "Strand", "Transcript.Start..bp.", "Transcript.End..bp.", "Associated.Gene.Name", "Gene.type", "Status..gene.", "Source..gene.", "Transcript.count", "Description", "Chromosome.Name","HGNC.symbol"),])
z = ddply(ensemblGeneInfo_hg19,c("Ensembl.Gene.ID"),summarize,HGNC.symbol=paste(HGNC.symbol,collapse=";"))
ensemblGeneInfo_hg19$HGNC.symbol = z$HGNC.symbol[match(ensemblGeneInfo_hg19$Ensembl.Gene.ID,z$Ensembl.Gene.ID)]
ensemblGeneInfo_hg19 = unique(ensemblGeneInfo_hg19)
ensemblGeneInfoShort = ensemblGeneInfo[,c("Chromosome.Name", "Transcript.start..bp.", "Transcript.end..bp.", "Transcription.Start.Site..TSS.", "Ensembl.Gene.ID", "Strand")]
colnames(ensemblGeneInfoShort) = c("seqnames", "start", "end", "tss", "name", "strand")
ensemblGeneInfoShort$seqnames = paste0("chr", ensemblGeneInfoShort$seqnames)
ensemblGeneInfoShort = ensemblGeneInfoShort[ensemblGeneInfoShort$seqnames %in% paste0("chr", c(1:22, "X")),]
ensemblGeneInfoShort$strand = ifelse(ensemblGeneInfoShort$strand == "1", "+", "-")
rownames(ensemblGeneInfoShort) = ensemblGeneInfoShort$name
#####
# Load cell type marker genes
ctypeDiff = read.csv(CELLTYPE_MARKERS_GENERAL)
ctypeDiff$gene_id = findEnsemblNames(ctypeDiff$gene, keep_NA = T)
# Load gene count matrix (pseudobulk)
df = data.frame(t(read.csv(COUNTMATRIX_GENES_GENERAL)))
celltypes = c("Astrocyte", "Endothelial", "GABAergic", "Glutamatergic", "Microglia", "OPC", "Oligodendrocyte")
colnames(df) = df[1,]
df = df[-1,]
geneNames = rownames(df)
df = data.frame(apply(df, 2, untransform))
rownames(df) = geneNames
geneMatrix = do.call("cbind.data.frame", lapply(celltypes, function(ctype) { rowMeans(df[,startsWith(colnames(df), prefix=ctype)]) }))
best_cellType = sapply(1:nrow(geneMatrix), function(i) { names(which.max(geneMatrix[i,])) })
best_cellType_ratio = sapply(1:nrow(geneMatrix), function(i) { mx = which.max(geneMatrix[i,]); geneMatrix[i,mx] / sum(geneMatrix[i,]) })
qcGeneAnno = cbind.data.frame(rownames(geneMatrix), best_cellType, best_cellType_ratio)
colnames(qcGeneAnno) = c("GeneID", "best_cellType", "best_cellType_ratio")
}
########################################################################################
##### LOAD PEAK COUNT MATRIX (PSEUDOBULK) ##############################################
{
tmpEnv = new.env(); load(COUNTMATRIX_PEAKS_GENERAL, envir=tmpEnv)
peakMatrix = data.frame(apply(tmpEnv$initialVoomObj$E, 2, untransform))
qcPeakAnno = allCells_env$mergedPeaksDf
colnames(peakMatrix) = gsub("_all$", "", colnames(peakMatrix))
tmp = allCells_env$barcodes[!duplicated(allCells_env$barcodes$sampleID),c("sampleID", "age.group")]
rownames(tmp) = tmp$sampleID
for(ctype in names(allCells_env$abcResults_cell)) {
allCells_env$abcResults_cell[[ctype]]$CellType_general = ctype
allCells_env$abcResults_cell[[ctype]]$SampleID = sapply(allCells_env$abcResults_cell[[ctype]]$CellType, function(x) strsplit(x, "_")[[1]][1])
allCells_env$abcResults_cell[[ctype]]$AgePeriod = tmp[allCells_env$abcResults_cell[[ctype]]$SampleID, "age.group"]
}
rownames(peakMatrix) = rownames(tmpEnv$initialVoomObj)
qcPeakAnno$best_cellType = sapply(1:nrow(qcPeakAnno), function(i) { names(which.max(peakMatrix[i,])) }) # For each peak, find cell type in which it it the most expressed (~accessible)
qcPeakAnno$best_cellType_ratio = sapply(1:nrow(qcPeakAnno), function(i) { mx = which.max(peakMatrix[i,]); peakMatrix[i,mx] / sum(peakMatrix[i,]) }) # ... and proportion of expression for that most expressed cell type (compared to the other cell types)
cellTypeRatiosDf = do.call("cbind.data.frame", lapply(colnames(peakMatrix), function(ctype) {
sapply(1:nrow(peakMatrix), function(i) {
peakMatrix[i,ctype] / sum(peakMatrix[i,])
})
}))
colnames(cellTypeRatiosDf) = colnames(peakMatrix)
qcPeakAnno = cbind.data.frame(qcPeakAnno, cellTypeRatiosDf)
}
########################################################################################
##### LOAD GWAS INDEX SNPS AND LD BUDDIES ##############################################
{
ld_buddies_list = list()
lociTable_list = list()
ld_main_and_buddies_list = list()
for(gwas in GWAS) {
print(paste0("*** PROCESSING ", gwas$trait, " GWAS"))
# Load GWAS info
lociTable = read.csv(gwas$path, sep="\t")
lociTable_list[[gwas$trait]] = lociTable[,c(gwas$index_rs, gwas$chr, gwas$index, gwas$index, gwas$p)]
colnames(lociTable_list[[gwas$trait]]) = c("rs", "chr", "start", "end", "p")
lociTable_list[[gwas$trait]]$start = lociTable_list[[gwas$trait]]$start - 1
lociTable_list[[gwas$trait]]$trait = gwas$trait
lociTableTmp = lociTable_list[[gwas$trait]][,c("chr", "start", "end", "rs", "chr", "start", "end", "rs", "rs", "p")]
colnames(lociTableTmp) = c("chr", "start", "end", "rs", "buddy_chr", "buddy_start", "buddy_end", "buddy_rs", "r2", "p")
lociTableTmp$r2 = NA
lociTableTmp$trait = gwas$trait
lociTableTmp$is_index_snp = T
# Find LD buddies for GWAS index SNP locations
ld_buddies = tabix(region=c(paste0(lociTable[,gwas$chr], ":", lociTable[,gwas$index]-1, "-", lociTable[,gwas$index])), LD_BUDDIES)
if(!is.null(ld_buddies) == 0) {
next
}
colnames(ld_buddies) = c("chr", "start", "end", "rs", "buddy_chr", "buddy_start", "buddy_end", "buddy_rs", "r2")
TX[[paste0("ld_buddy_search_", gwas$trait, "_1")]] = paste0("GWAS ", gwas$trait, ": We found at least 1 LD buddy for ", sum(unique(lociTable[,gwas$index_rs]) %in% unique(ld_buddies$rs)), " out of ", length(unique(lociTable[,gwas$index])), " GWAS index SNPs.")
TX[[paste0("ld_buddy_search_", gwas$trait, "_2")]] = paste0("GWAS ", gwas$trait, ": We found ", min(table(ld_buddies$rs)), "-", max(table(ld_buddies$rs)), " LD buddies (avg/median=", round(mean(nrow(ld_buddies)) / nrow(lociTable)), "/", median(table(ld_buddies$rs)), ") for GWAS index SNPs; total buddies = ", length(unique(ld_buddies$buddy_rs)))
print(TX[[paste0("ld_buddy_search_", gwas$trait, "_1")]])
print(TX[[paste0("ld_buddy_search_", gwas$trait, "_2")]])
ld_buddies$trait = gwas$trait
ld_buddies = ld_buddies[(ld_buddies$rs %in% lociTableTmp$rs),]
ld_buddies$p = lociTableTmp[match(ld_buddies$rs, lociTableTmp$rs),"p"]
ld_buddies$is_index_snp = F
ld_buddies_list[[gwas$trait]] = ld_buddies
ld_main_and_buddies_list[[gwas$trait]] = rbind.data.frame(ld_buddies, lociTableTmp)
}
ld_buddies_df = data.frame(do.call("rbind", ld_buddies_list))
lociTable_df = data.frame(do.call("rbind", lociTable_list))
ld_main_and_buddies_df = data.frame(do.call("rbind", ld_main_and_buddies_list))
ld_main_and_buddies_df$combo = paste0(ld_main_and_buddies_df$buddy_rs, "_", ld_main_and_buddies_df$chr, "_", ld_main_and_buddies_df$start, "_", ld_main_and_buddies_df$end)
ld_main_and_buddies_df$rsAlt = paste0(ld_main_and_buddies_df$chr, "_", ld_main_and_buddies_df$start) # that's because some buddies don't have rsID so they wouldn't be counted
ld_main_and_buddies_df$buddy_rsAlt = paste0(ld_main_and_buddies_df$buddy_chr, "_", ld_main_and_buddies_df$buddy_start) # that's because some buddies don't have rsID so they wouldn't be counted
ld_main_and_buddies_df$rs = sapply(1:nrow(ld_main_and_buddies_df), function(i) {
ifelse(startsWith(ld_main_and_buddies_df[i,"rs"], "rs"), ld_main_and_buddies_df[i,"rs"], paste0(ld_main_and_buddies_df[i,"chr"], "_", ld_main_and_buddies_df[i,"end"]))
})
ld_main_and_buddies_df$buddy_rs = sapply(1:nrow(ld_main_and_buddies_df), function(i) {
ifelse(startsWith(ld_main_and_buddies_df[i,"buddy_rs"], "rs"), ld_main_and_buddies_df[i,"buddy_rs"], paste0(ld_main_and_buddies_df[i,"chr"], "_", ld_main_and_buddies_df[i,"buddy_end"]))
})
uniqIndexSnps = length(unique(ld_main_and_buddies_df$rsAlt))
uniqIndexBuddies = length(unique(ld_main_and_buddies_df$buddy_rsAlt))
write.table(ld_main_and_buddies_df, file="./ld_buddies.tsv", quote=F, row.names=F, sep="\t")
TX[["buddy_search_global"]] = paste0("Initially, we assembled a set of ", uniqIndexSnps, " genome-wide significant variants linked to select neuropsychiatric and neurodegenerative traits (with a significance threshold of P<5×10-8) and expanded this set to include ", uniqIndexBuddies, " variants based on their high linkage disequilibrium (LD; R2≥0.8). [[total of ", length(c(unique(ld_main_and_buddies_df$rsAlt), unique(ld_main_and_buddies_df$buddy_rsAlt))), "]]")
print(TX[["buddy_search_global"]])
}
########################################################################################
##### FUNCTIONS FOR CONVERTING PEAKS (FROM ABC-LINKS) TO HG19 ##########################
{
# Loading liftover and chains for conversion
chPath = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
ch = import.chain(chPath)
chPath2 = system.file(package="liftOver", "extdata", "hg19ToHg38.over.chain")
ch2 = import.chain(chPath2)
# Helper function for converting hg38 cell-type to hg19 (lifted peak cannot have size shorter or larger than 10% compared to original one)
toHg19 = function(x) {
x$name = paste0(x@seqnames, "_", x@ranges@start, "_", x@ranges@width, "_", x$TargetGene)
x@ranges@NAMES = x$name
hg19 = liftOver(x, ch)@unlistData
hg19 = unlist(range(split(hg19, ~name)))
idsInBothChrVersions = as.character(intersect(names(hg19), x$name))
differences = (hg19[idsInBothChrVersions,]@ranges@width - x[idsInBothChrVersions,]@ranges@width) / x[idsInBothChrVersions,]@ranges@width
idsTooDifferent = idsInBothChrVersions[which(abs(differences) > 0.10)]
hg19 = hg19[!hg19@ranges@NAMES %in% idsTooDifferent,]
hg19$gene = x[hg19@ranges@NAMES,]@elementMetadata$TargetGene
hg19$gene_name = x[hg19@ranges@NAMES,]@elementMetadata$TargetGene_name
hg19$peak = x[hg19@ranges@NAMES,]@elementMetadata$PeakID
hg19$ABC.Score = x[hg19@ranges@NAMES,]@elementMetadata$ABC.Score
hg19$AgePeriod = x[hg19@ranges@NAMES,]@elementMetadata$AgePeriod
hg19$SampleID = x[hg19@ranges@NAMES,]@elementMetadata$SampleID
hg19$CellType_general = x[hg19@ranges@NAMES,]@elementMetadata$CellType_general
hg19$TargetGeneTSS = x[hg19@ranges@NAMES,]@elementMetadata$TargetGeneTSS
names(hg19) = NULL
return(hg19)
}
# Helper function for converting hg38 cell-type to hg19 (lifted peak cannot have size shorter or larger than 10% compared to original one)
toHg38 = function(x) {
x@ranges@NAMES = x$name
hg38 = liftOver(x, ch2)@unlistData
return(hg38)
}
}
########################################################################################
##### LOAD PEAK-TO-GENES LINKS #########################################################
{
# Convert ABC E-P links to GenomicRanges
epLinks = lapply(allCells_env$abcResults_cell, function(x) { x$start = x$PeakID_start; x$end = x$PeakID_end; gr = makeGRangesFromDataFrame(x, keep.extra.columns=T) })
names(epLinks) = names(allCells_env$abcResults_cell)
# Remove VLMC_Pericyte links
epLinks$VLMC_Pericyte = NULL
epLinks$VLMC = NULL
allCells_env$abcResults_cell_age_df = allCells_env$abcResults_cell_age_df[!(allCells_env$abcResults_cell_age_df$cell_type %in% c("VLMC", "VLMC_Pericyte")),]
allCells_env$abcResults_cell_df = allCells_env$abcResults_cell_df[!(allCells_env$abcResults_cell_df$cell_type %in% c("VLMC", "VLMC_Pericyte")),]
# Convert ABC E-P links from hg38 to hg19
epLinks_hg19 = lapply(epLinks, function(x) { toHg19(x) })
names(epLinks_hg19) = names(epLinks)
# Get summary counts of EP links
tx_linksUnique = unique(unlist(lapply(epLinks, function(x) unique(x$combo))))
tx_linksAll = sum(unlist(lapply(epLinks, function(x) length(unique(x$combo)))))
tx_genesLinkedUnique = length(unique(unlist(lapply(epLinks, function(x) unique(x$TargetGene)))))
tx_peaksLinkedUnique = length(unique(unlist(lapply(epLinks, function(x) unique(x$PeakID)))))
tx_totalEnhancers = nrow(allCells_env$mergedPeaksDf)
TX$maps_1 = paste0("Collectively, we identified ", length(tx_linksUnique), " enhancer–gene connections for ", tx_genesLinkedUnique, " expressed genes and ", tx_peaksLinkedUnique, " unique enhancers (note - total enhancers: ", tx_totalEnhancers , ").")
print(TX$maps_1)
# Find out how many ABC E-P links are present in at least two developmental periods
atLeastTwoDevelPeriods = sapply(CELL_TYPES, function(ctype) {
epLinksList = sapply(allCells_env$abcResults_cell_age[startsWith(names(allCells_env$abcResults_cell_age), prefix=ctype)], function(df) {
df$combo
})
freqTable = table(table(unlist(epLinksList)))
sum(freqTable[2:length(freqTable)]) / sum(freqTable)
})
tx_linksFreq = table(unlist(lapply(epLinks, function(x) unique(x$combo))))
tx_linksFreqSum = table(tx_linksFreq)
atLeastTwoCellTypes = sum(tx_linksFreqSum[2:length(tx_linksFreqSum)]) / sum(tx_linksFreqSum)
TX$maps_2 = paste0("On average, ", round(atLeastTwoCellTypes * 100), "% of the E–P links were shared in at least two cell types, whereas ", round(min(atLeastTwoDevelPeriods) * 100), "–", round(max(atLeastTwoDevelPeriods) * 100), "% of E–P links were shared across regions, when comparing within the same cell type, with a high correlation of ABC score (demanding-to-calculate).")
print(TX$maps_2)
peaksMelted = allCells_env$abcResults_cell_age_df
peaks_count_celltype_age = ggplot(data=peaksMelted, aes(x=category, y=abcLinks_count)) + geom_bar(stat="identity") + xlab("Cell type & Developmental period") + ylab("Number of E-P links") +
theme_classic() + theme(axis.text = element_text(colour = "black"), axis.text.x = element_text(angle = 45, hjust = 1))
mpdf(paste0("abc_celltype_age"), width=12, height=7); print(peaks_count_celltype_age); dev.off()
}
########################################################################################
##### FORMAT OUTPUT DATAFRAME (WITH PEAK-GENE LINKS) ###################################
{
# Get unique rsIDs (skip ".")
shared_rs = names(which(table(ld_buddies_df$buddy_rs) > 0))
shared_rs = shared_rs[shared_rs != "."]
# List of associations between "rsIDs - associated disease GWAS"
allcomb = list()
allcomb[unique(ld_main_and_buddies_df$rs)] = ld_main_and_buddies_df[match(unique(ld_main_and_buddies_df$rs), ld_main_and_buddies_df$rs), "trait"]
for(rsAlt in shared_rs) {
allcomb[[rsAlt]] = paste0(sort(ld_main_and_buddies_df[which(ld_main_and_buddies_df$buddy_rsAlt == rsAlt),"trait"]), collapse=",")
buddies_rsSource = ld_main_and_buddies_df[which(ld_main_and_buddies_df$buddy_rsAlt==rsAlt), "rsAlt"]
allcomb[buddies_rsSource] = paste0(sort(ld_main_and_buddies_df[which(ld_main_and_buddies_df$buddy_rs==rsAlt),"trait"]), collapse=",")
}
lociTable = ld_main_and_buddies_df[, c("buddy_chr", "buddy_start", "buddy_end", "buddy_rs", "trait", "rs")]
colnames(lociTable) = c("chr", "start", "end", "rs", "trait", "rsx")
lociTable$end = as.numeric(lociTable$start)
lociTable$start = as.numeric(lociTable$start) - 1
lociTable$chr = gsub("chr23", "chrX", lociTable$chr)
lociTableGr = makeGRangesFromDataFrame(lociTable, keep.extra.columns=T)
lociTable$id = ifelse(lociTable$rs != ".", lociTable$rs, paste0(lociTable$chr, "_", lociTable$start))
lociTable$rsAlt = paste0(lociTable$chr, "_", lociTable$start)
lociTable = lociTable[!duplicated(lociTable$id),]
rownames(lociTable) = lociTable$id
lociTable_hg38 = data.frame(toHg38(makeGRangesFromDataFrame(lociTable, keep.extra.columns=T)))
lociTable_hg38$desc = paste0(lociTable_hg38$seqnames, "_", lociTable_hg38$start, "_", lociTable_hg38$end)
rownames(lociTable_hg38) = lociTable_hg38$id
lociTable$pos_hg38 = lociTable_hg38[lociTable$id, "end"]
# Export locations of GWAS rsIDs and their buddies
exportLociTable = lociTable_hg38[,c("seqnames", "start", "end", "id","trait")]
exportLociTable$score = ifelse(exportLociTable$id %in% unique(ld_main_and_buddies_df$rs[(ld_main_and_buddies_df$rs==ld_main_and_buddies_df$buddy_rs)]), 1000, 500)
write.table(exportLociTable, row.names=F, sep="\t", quote=F, file=file.path(outDir, "rs_locations.bed"))
for(trait in unique(exportLociTable$trait)) {
df = exportLociTable[(exportLociTable$trait == trait) & (exportLociTable$score == 1000),]
write.table(df[,!colnames(df) %in% c("trait")], row.names=F, sep="\t", quote=F, file=file.path(outDir, paste0("rs_locations_", trait, "_index.bed")))
df = exportLociTable[(exportLociTable$trait == trait) & (exportLociTable$score == 500),]
write.table(df[,!colnames(df) %in% c("trait")], row.names=F, sep="\t", quote=F, file=file.path(outDir, paste0("rs_locations_", trait, "_buddy.bed")))
}
allResults = list()
for(ctype in names(epLinks_hg19)) {
print(ctype)
hits = findOverlaps(lociTableGr, epLinks_hg19[[ctype]])
results = cbind.data.frame(ld_main_and_buddies_df[hits@from,], epLinks_hg19[[ctype]][hits@to,])
colnames(results) = c("chr", "start", "end", "rs", "trait_buddy_chr", "trait_buddy_start", "trait_buddy_end", "trait_buddy_rs", "r2", "trait", "p", "is_index_snp", "combo", "rsAlt", "buddy_rsAlt", "linked_chr", "linked_start", "linked_end", "linked_width", "linked_strand", "gene", "gene_name", "peak", "ABC.Score", "AgePeriod", "SampleID", "CellType_general", "TargetGeneTSS_hg38")
results$linked_chr = qcPeakAnno[results$peak, "chr"]
results$linked_start = qcPeakAnno[results$peak, "start"]
results$linked_end = qcPeakAnno[results$peak, "end"]
results$p = as.numeric(results$p)
results$trait_gene = paste0(results$gene, "_", results$trait)
results$trait_gene_ldbuddy = paste0(results$gene, "_", results$trait, "_", results$trait_buddy_rs)
results$trait_gene_rs = paste0(results$gene, "_", results$trait, "_", results$rs)
results[which("AC018685.2" == results$gene), "cyt_loc"] = "chr02p25" # get_location("EIPR1"); get_location("MYT1L") #fixme
results[which("AC096570.1" == results$gene), "cyt_loc"] = "chr02p24"
results[which("AC156455.1" == results$gene), "cyt_loc"] = "chr12q24"
results[which("U91319.1" == results$gene), "cyt_loc"] = "chr16p13"
results[which("AC138647.1" == results$gene), "cyt_loc"] = "chr08q24"
results[which("HIST1H2AK" == results$gene), "cyt_loc"] = "chr08q24"
results = results[!duplicated(results$trait_gene_ldbuddy),]
dim(results[which(results$peak=="Peak_505316"),])
results$ctypeMarker = sapply(results$gene, function(geneName) {
subset = ctypeDiff[which(geneName==ctypeDiff$gene_id),]
if(nrow(subset) > 0) {
paste0(subset[order(subset$p_val, -subset$avg_log2FC),"cluster"], collapse=",")
} else {
NA
}
})
results = results[results$peak %in% rownames(qcPeakAnno),]
results$is_marker_gene = ifelse(is.na(results$ctypeMarker), F, T)
results$peak_best_cellType = qcPeakAnno[results$peak, "best_cellType"]
results$peak_best_cellType_ratio = qcPeakAnno[results$peak, "best_cellType_ratio"]
results$best_cellType = qcGeneAnno[results$gene_name, "best_cellType"]
results$best_cellType_ratio = qcGeneAnno[results$gene_name, "best_cellType_ratio"]
allResults[[ctype]] = results
}
final = data.frame(do.call("rbind.data.frame", allResults))
final$chr = gsub("chr23", "chrX", final$chr)
final = final[final$linked_chr == final$chr,] # this looks as a weird operation but hg38->hg19 can even change chromosome, those cases are rare so let's drop them
final$gene_biotype = ensemblGeneInfo[match(final$gene, ensemblGeneInfo$Ensembl.Gene.ID),"Gene.type"]
final$gene_name = sapply(final$gene, function(x) { ensemblInfo[match(x, ensemblInfo$Ensembl.Gene.ID),"Associated.Gene.Name"] })
final$cyt_loc = sapply(final$gene_name, function(x) get_location(x))
}
########################################################################################
##### FILTER OUT E-P LINKS THAT ARE NOT EXPRESSED IN THE CELL TYPE #####################
{
countMatrix = do.call("cbind.data.frame", lapply(names(COUNT_MATRIX_SNRNASEQ_TO_BULK), function(ctype) {
print(ctype)
cReadCount = read.csv(COUNT_MATRIX_SNRNASEQ_TO_BULK[[ctype]], sep="\t", header=T, skip=1, stringsAsFactors=F)[,7]
cReadCount
}))
colnames(countMatrix) = names(COUNT_MATRIX_SNRNASEQ_TO_BULK)
head(countMatrix)
df = read.csv(COUNT_MATRIX_SNRNASEQ_TO_BULK[[ctype]], sep="\t", header=T, skip=1, stringsAsFactors=F)
countMatrix$Geneid = sapply(df$Geneid, function(x) { strsplit(x, "\\.")[[1]][1] })
countMatrix = countMatrix[!duplicated(countMatrix$Geneid),]
rownames(countMatrix) = countMatrix$Geneid
countMatrix = countMatrix[,1:(length(countMatrix)-1)]
ensemblInfo = read.csv(ENSEMBL_INFO, sep="\t", header=T, stringsAsFactors=F)
ensemblInfo = ensemblInfo[,c("Ensembl.Gene.ID", "Transcript.length..including.UTRs.and.CDS.", "Gene.type", "Gene...GC.content")]
colnames(ensemblInfo) = c("Ensembl.Gene.ID", "Length", "transcript_biotype", "gcContent")
gtf = rtracklayer::import(GENCODE)
gtf_df = as.data.frame(gtf)
gtf_df = data.frame(gtf_df)
gtf_df_new = gtf_df[which(gtf_df$type=="gene"),]
gtf_df_new = gtf_df_new[,c("gene_id","seqnames","start","end","strand","gene_name","transcript_name","width","gene_type")]
gtf_df_new$exonLength = ensemblInfo[match(sapply(strsplit(as.character(gtf_df_new[,"gene_id"]), "\\."), "[[", 1), ensemblInfo[,"Ensembl.Gene.ID"]),]$Length
gtf_df_new$biotype = ensemblInfo[match(sapply(strsplit(as.character(gtf_df_new[,"gene_id"]), "\\."), "[[", 1), ensemblInfo[,"Ensembl.Gene.ID"]),]$transcript_biotype
gtf_df_new$gcContent = ensemblInfo[match(sapply(strsplit(as.character(gtf_df_new[,"gene_id"]), "\\."), "[[", 1), ensemblInfo[,"Ensembl.Gene.ID"]),]$gcContent
gtf_df_new = gtf_df_new[!is.na(match(sapply(strsplit(as.character(gtf_df_new[,"gene_id"]), "\\."), "[[", 1), ensemblInfo[,"Ensembl.Gene.ID"])),]
qcPeakAnnoGene = gtf_df_new
qcPeakAnnoGene$gene_id = sapply(qcPeakAnnoGene$gene_id, function(x) { strsplit(x, "\\.")[[1]][1] })
qcPeakAnnoGene = qcPeakAnnoGene[!duplicated(qcPeakAnnoGene$gene_id),]
rownames(qcPeakAnnoGene) = qcPeakAnnoGene$gene_id
datExpr = data.frame(countMatrix)
expObjAll = DGEList(counts=datExpr, genes=qcPeakAnnoGene[match(rownames(datExpr),qcPeakAnnoGene$PeakID),])
MIN_SAMPLE_PERCENT_WITH_MIN_GENE_CPM = 0.5
expressedGenes = lapply(names(COUNT_MATRIX_SNRNASEQ_TO_BULK), function(ctype) {
fracSamplesWithMinCPM = rowMeans(cpm(expObjAll[,ctype]) >= 1)
isNonLowExpr = fracSamplesWithMinCPM >= MIN_SAMPLE_PERCENT_WITH_MIN_GENE_CPM
expObjNonLow = expObjAll[isNonLowExpr, , keep.lib.sizes=F]
rownames(expObjNonLow$genes)
})
names(expressedGenes) = names(COUNT_MATRIX_SNRNASEQ_TO_BULK)
# Not necessary as these links were already filtered when ABC was calculated
flags = sapply(1:nrow(final), function(i) {
final[i,"gene"] %in% expressedGenes[[final$CellType_general[i]]]
})
final = final[flags,]
}
###################################################################################################################################################
##### ADD INDICATION WHETHER GENE (FROM PEAK-GENE LINKE) IS CELL TYPE MARKER (FROM EXTERNAL STUDIES) OR WHETHER PEAK IS OVERLAPPING FINEMAP SNP ###
{
#####
# Prioritized genes from PGC3 study (Supplementary Table 12)
pgcGenes = read.csv(TRUBETSKOY_PRIORITIZED_GENES, sep="\t")
pgcGenes2 = read.csv(TRUBETSKOY_PRIORITIZED_GENES2, sep="\t")
pgcGenes$rsGene = paste0(pgcGenes$Index.SNP, "_", pgcGenes$Ensembl.ID)
pgcGenes2$rsGene = paste0(pgcGenes2$Index.SNP, "_", pgcGenes2$Ensembl.ID)
final$pgc3_prioritized1 = sapply(1:nrow(final), function(i) {
ifelse(final$rs[i] %in% pgcGenes$Index.SNP, T, F)
})
table(final$pgc3_prioritized1)
final$pgc3_prioritized2 = sapply(1:nrow(final), function(i) {
ifelse(final$rs[i] %in% pgcGenes2$Index.SNP, T, F)
})
table(final$pgc3_prioritized2)
final$pgc3_prioritized1_genes = sapply(1:nrow(final), function(i) {
if(final$pgc3_prioritized1[i]) {
paste0(sort(unique(pgcGenes[(final$rs[i] == pgcGenes$Index.SNP),"Ensembl.ID"])), collapse=",")
} else {
NA
}
})
final$pgc3_prioritized2_genes = sapply(1:nrow(final), function(i) {
if(final$pgc3_prioritized2[i]) {
paste0(sort(unique(pgcGenes[(final$rs[i] == pgcGenes$Index.SNP),"Ensembl.ID"])), collapse=",")
} else {
NA
}
})
final$peak_gene = paste0(final$peak, "_", final$gene)
#####
# Distance between index SNPs or LD buddies and the distance from peak edge (it should be 0 if PEAK_PADDING == 0)
final$trait_gene_rs_ctype = paste0(final$trait_gene_rs, "_", final$CellType_general)
final$trait_gene_rs_ctype_agePeriod = paste0(final$trait_gene_rs, "_", final$CellType_general, "_", final$AgePeriod)
final$trait_gene_rs_ctype_sampleId = paste0(final$trait_gene_rs, "_", final$CellType_general, "_", final$SampleID)
final$trait_gene_rs_ctype_agePeriod_sampleId = paste0(final$trait_gene_rs, "_", final$CellType_general, "_", final$AgePeriod, "_", final$SampleID)
final$replications = sapply(final$trait_gene_rs_ctype_agePeriod, function(id) { sum(final$trait_gene_rs_ctype_agePeriod==id) })
final$trait_buddy_end_hg38 = lociTable_hg38[final$trait_buddy_rs,"end"]
final = final[order(final$rs == final$trait_buddy_rs, decreasing=T),]
#####
# Calculated distances between rs SNP and target gene & number of skipped genes
peakToGeneRegions = final[,c("chr", "trait_buddy_end_hg38", "TargetGeneTSS_hg38", "trait_gene_rs_ctype_agePeriod", "gene")]
colnames(peakToGeneRegions) = c("chr", "start", "end", "name", "geneID")
for(i in 1:nrow(peakToGeneRegions)) {
if(i %% 10000 == 0){
print(i)
}
if(peakToGeneRegions[i, "start"] > peakToGeneRegions[i, "end"]) {
x = peakToGeneRegions[i, "end"]
peakToGeneRegions[i, "end"] = peakToGeneRegions[i, "start"];
peakToGeneRegions[i, "start"] = x
}
if(peakToGeneRegions[i,"geneID"] %in% ensemblGeneInfo$Ensembl.Gene.ID) {
peakToGeneRegions[i, "end"] = max(c(peakToGeneRegions[i, "end"], ensemblGeneInfo[match(peakToGeneRegions[i, "geneID"], ensemblGeneInfo$Ensembl.Gene.ID),"Transcript.start..bp."],
ensemblGeneInfo[match(peakToGeneRegions[i, "geneID"], ensemblGeneInfo$Ensembl.Gene.ID),"Transcript.start..bp."] ))
}
}
ensemblGeneInfoOnlyExpressedGr = makeGRangesFromDataFrame(ensemblGeneInfoShort[rownames(ensemblGeneInfoShort) %in% unique(unlist(expressedGenes)),], keep.extra.columns=T)
peakToGeneRegionsGr = makeGRangesFromDataFrame(peakToGeneRegions, keep.extra.columns=T)
hits = data.frame(findOverlaps(peakToGeneRegionsGr, ensemblGeneInfoOnlyExpressedGr))
final$genesInBetween = sapply(1:nrow(final), function(i) { max(0, sum(hits$queryHits == i) - 1) })
final$distance_rsToGene = sapply(1:nrow(final), function(i) { # fixme now
snpPos = lociTable_hg38[final[i,"trait_buddy_rs"], "end"] # lociTable_hg38[final[i,"rs"], "end"]
tss = final$TargetGeneTSS_hg38[i]
abs(snpPos-tss)
})
TX$abcSummary = paste0("ABC filtering: ", nrow(final), " E-P links for ", length(unique(final$gene)), " expressed genes and ", length(unique(final$peak)), " unique enhancers; ", length(unique(final$trait_gene_rs_ctype)), " trait-gene-loci-cell combinations; ", length(unique(final$trait_gene_rs)), " trait-gene-loci combinations; ", length(unique(final$trait_buddy_rs)), " loci")
print(TX$abcSummary)
}
###################################################################################################################################################
##### FILTER OUT E-P LINKS USING ADDITINAL CRITERIA (ACCESSIBLE PEAKS, EXISTENCE OF E-P IN MIN. NUMBER OF SAMPLES, ETC) ##########################
{
# Some stats
print(paste0("> unique cytogenic locations = ", length(unique(final$trait_buddy_rs)) ))
print(paste0("> rsSNP -to- genes = ", nrow(final)))
print(paste0("> rsSNP -to- genes = ", length(unique(final$rs))))
print(paste0("> unique peaks -to- genes = ", length(unique(final$peak_gene))))
print(paste0("> unique mapped genes = ", length(table(final$gene))))
print(paste0("> All of EP links overlaping GWAS loci: ", nrow(final)))
final = final[!duplicated(final$trait_gene_rs_ctype_agePeriod_sampleId),]
print(paste0("> All of EP links with non-duplicated link for LD buddy (i.e. separately for index SNP and its buddy): ", nrow(final)))
final = final[sapply(1:nrow(final), function(i) {final[i,"gene"] %in% expressedGenes[[final[i,"CellType_general"]]] }),]
print(paste0("> All of EP links with gene that is sufficiently expressed in cell type in which EP link was identified: ", nrow(final)))
final = final[(final$peak %in% rownames(tmpEnv$initialVoomObj_filtered)),]
print(paste0("> All of EP links (dtto) with sufficiently accessible peak (existence of peak in filtered matrix): ", nrow(final)))
final = final[sapply(1:nrow(final), function(i) { qcPeakAnno[final$peak[i],final$CellType_general[i]] > 0.05 }),]
print(paste0("> All of EP links (dtto) with sufficiently accessible peak in the given cell type (needs to be at least 5% from the bulk): ", nrow(final)))
}
###################################################################################################################################################
##### ABC-MAX FILTER (Table S8, Fig. 4c, Fig. S8-S17) #############################################################################################
{
finalAbc = do.call("rbind.data.frame", lapply(setdiff(unique(final$CellType_general), "VLMC_Pericyte"), function(cellType) {
print(cellType)
df = final[which(cellType==final$CellType_general),]
dfx = do.call("rbind.data.frame", lapply(unique(df$peak), function(peakId) {
df3 = df[(df$peak == peakId),]
df3$gene_biotype = ordered(df3$gene_biotype, levels=c("protein_coding", setdiff(unique(df3$gene_biotype), "protein_coding")))
df3 = df3[order(df3$gene_biotype, -df3$ABC.Score),]
df3 = df3[1:(min(MAX_LINKS_PER_PEAK, nrow(df3))),] # make change here if you want to change the allowed amount of E-P links per peak
df3 = df3[!duplicated(df3$rs),]
df3$order = 1:nrow(df3)
df3
}))
dfx$combo2 = paste0(dfx$trait, "_", dfx$gene)
df = do.call("rbind.data.frame", lapply(unique(dfx$trait), function(trait) {
dfz = dfx[dfx$trait == trait,]
do.call("rbind.data.frame", lapply(unique(dfz$combo2), function(combo2) {
dfy = dfz[dfz$combo2 == combo2,]
xx = dfy[order(dfy$ABC.Score, decreasing=T),][1,]
xx$rs_all = paste0(unique(dfy$rs), collapse=",")
xx$trait_buddy_rs_all = paste0(unique(dfy$trait_buddy_rs), collapse=",")
xx$peak_all = paste0(unique(dfy$peak), collapse=",")
xx$AgePeriod_all = paste0(unique(dfy$trait_buddy_rs), collapse=",")
xx
}))
}))
df = df[sapply(1:nrow(df), function(i) { qcPeakAnno[df$peak[i],df$CellType_general[i]] > 0.05 }),]
df
}))
finalAbc$gene_name[finalAbc$gene_name==""] = unlist(sapply(finalAbc$gene[finalAbc$gene_name == ""], function(x) { ensemblInfo[match(x, ensemblInfo$Ensembl.Gene.ID),"Associated.Gene.Name"] }))
finalAbc$cyt_loc = sapply(finalAbc$gene_name, function(x) get_location(x))
finalAbc$start = as.numeric(finalAbc$start)
finalAbc$end = as.numeric(finalAbc$end)
finalAbc$trait_buddy_start = as.numeric(finalAbc$trait_buddy_start)
finalAbc$trait_buddy_end = as.numeric(finalAbc$trait_buddy_end)
finalAbc$plotRange = sapply(1:nrow(finalAbc), function(i) {
paste0(finalAbc[i,"chr"], ":", min(finalAbc[i,"linked_start"], ensemblGeneInfo[finalAbc[i,"gene"],"Transcript.start..bp."]), "-", max(finalAbc[i,"linked_end"], ensemblGeneInfo[finalAbc[i,"gene"],"Transcript.end..bp."]))
})
# Printing Table S8: Enhancer-promoter links connected to GWAS loci as predicted by the ABC-MAX approach.
print(paste0("ABC-MAX filtering: ", nrow(finalAbc), " E-P links for ", length(unique(finalAbc$gene)), " expressed genes and ", length(unique(finalAbc$peak)), " unique enhancers; ", length(unique(finalAbc$trait_gene_rs_ctype)), " trait-gene-loci-cell combinations; ", length(unique(finalAbc$trait_gene_rs)), " trait-gene-loci combinations; ", length(unique(finalAbc$trait_buddy_rs)), " loci"))
exportAbcMax = finalAbc[,c("cyt_loc", "rs", "trait_buddy_rs", "r2", "trait", "CellType_general", "ABC.Score", "gene", "gene_name", "peak", "linked_chr", "linked_start", "linked_end")]
colnames(exportAbcMax) = c("Cytogenic location", "rsID", "rsID (LD buddy)", "r2 (LD buddy)", "GWAS abbreviation", "Cell type", "ABC score", "Ensembl Gene ID", "Gene name", "Peak name", "Peak_chr", "Peak_start", "Peak_end")
write.csv(exportAbcMax, row.names=F, file=file.path(ROOT, "outputs", "Table_S8.csv"))
# Export all valid ABC EP links (not ABC-MAX!)
exportAbc = final[,c("cyt_loc", "rs", "trait_buddy_rs", "r2", "trait", "CellType_general", "ABC.Score", "gene", "gene_name", "peak", "linked_chr", "linked_start", "linked_end")]
colnames(exportAbc) = c("Cytogenic location", "rsID", "rsID (LD buddy)", "r2 (LD buddy)", "GWAS abbreviation", "Cell type", "ABC score", "Ensembl Gene ID", "Gene name", "Peak name", "Peak_chr", "Peak_start", "Peak_end")
write.csv(exportAbc, row.names=F, file=file.path(ROOT, "outputs", "abcmax", "abc_filtered.csv"))
abcResult = final
dir.create(file.path(ROOT, "outputs", "abcmax", "genes"))
for(trait in unique(abcResult$trait)) {
traitDf = abcResult[(abcResult$trait == trait) & (abcResult$gene_name != ""),]
genesDf = ensemblGeneInfo[match(traitDf$gene, rownames(ensemblGeneInfo)),]
genesDf = genesDf[(genesDf$Gene.type == "protein_coding"),]
traitDf = traitDf[traitDf$gene %in% genesDf$Ensembl.Gene.ID,]
# Export genes to the file & save it to the list for further exploration
for(ctype in unique(traitDf$CellType_general)) {
genes = unique(traitDf[traitDf$CellType_general == ctype, "gene"])
geneListForGsea[["ABC_MAX"]][[paste0(trait, "_", ctype)]] = genes
write.table(genes, file.path(ROOT, "outputs", "abcmax", "genes", paste0(trait, "_", ctype , ".csv")), quote=F, row.names=F)
}
write.table(unique(traitDf$gene), file.path(outDir, "genes", paste0(trait, "_ALL.cvs")), quote=F, row.names=F)
df = do.call("rbind.data.frame", lapply(unique(traitDf$gene_name), function(gene) {
df = (traitDf[traitDf$gene_name == gene,])
df = df[order(df$ABC.Score, decreasing=T),]
df = df[!duplicated(df$CellType_general),]
rownames(df) = df$CellType_general
xx = sapply(unique(traitDf$CellType_general), function(ctype) {
df[ctype,"ABC.Score"]
})
xx
}))
dim(df)
rownames(df) = unique(traitDf$gene_name)
colnames(df) = unique(traitDf$CellType_general)
df$gene_name = unique(traitDf$gene_name)
df[is.na(df)] = 0
df$dist = sapply(rownames(df), function(gene_name) {
mean(na.omit(traitDf[(traitDf$gene_name == gene_name),"distance_rsToGene"]))
})
df$closestGene = sapply(rownames(df), function(gene_name) {
ifelse(min(traitDf[(traitDf$gene_name == gene_name),"genesInBetween"]) == 0, T, F)
})
# Perform hierarchical clustering
dist_mat = dist(df[,1:length(unique(traitDf$CellType_general))])
hc = hclust(dist_mat)
ordered_df = df[order.dendrogram(as.dendrogram(hc)), ]
ordered_df$gene_name = ordered(ordered_df$gene_name, levels=ordered_df$gene_name)
ordered_df_melt = melt(ordered_df[,1:(length(unique(traitDf$CellType_general))+1)], id.vars = c("gene_name"))
ordered_df_melt$gene_name = ordered(ordered_df_melt$gene_name, levels=ordered_df$gene_name)
ordered_df_melt$variable = ordered(ordered_df_melt$variable, levels=CELL_TYPES)
ordered_df_melt$value = log(ordered_df_melt$value+1)
colors = c("white", colorRampPalette(c("lightsalmon", "salmon", "darkred"))(100))
hmap = ggplot(data = ordered_df_melt, aes(y=variable, x=gene_name, fill=value)) + geom_tile() + scale_x_discrete(position = "top", guide = guide_axis(angle = 90)) +
scale_fill_gradientn(colors = colors, values = rescale(c(0, 0.02, 0.2, 1))) + theme_bw() + labs(x="", y="", fill="ABC score", title=trait) +
theme(axis.ticks.x=element_blank(), axis.title.y=element_blank(), title=element_blank(), axis.ticks.y=element_blank()) # scale_fill_material(colors = colors, values = rescale(c(0, 0.02, 0.2, 1)))
hmap
mpdf(paste0("fig_S8-S17_", trait), outDir=file.path(ROOT, "outputs", "abcmax"), hmap, height=2, width=floor(0.2 * length(unique(traitDf$gene_name))+1.5)); print(hmap); dev.off()
colors = c("white", colorRampPalette(c("white", "salmon", "darkred"))(1000))
ordered_df$temp = "Distance"
ordered_df$distLog = log10(ordered_df$dist)
hmap2 = ggplot(data = ordered_df, aes(x=gene_name, y=temp, fill=distLog)) + geom_tile() + scale_x_discrete(guide = guide_axis(angle = 90)) +
theme_bw() + labs(x="", y="", fill="Distance", title=trait) + scale_fill_material("light-blue") +
theme(axis.ticks.x=element_blank(), axis.title.y=element_blank(), title=element_blank(), axis.ticks.y=element_blank())
mpdf(paste0("fig_S8-S17_", trait, "_distance"), outDir=file.path(ROOT, "outputs", "abcmax"), hmap, height=2, width=floor(0.2 * length(unique(traitDf$gene_name))+1.5)); print(hmap2); dev.off()
colors = c("white", "black")
ordered_df$temp = "ClosestGene"
hmap3 = ggplot(data = ordered_df, aes(x=gene_name, y=temp, fill=closestGene)) + geom_tile() + scale_x_discrete(guide = guide_axis(angle = 90)) +
scale_fill_manual(values=c("white", "black")) + theme_bw() + labs(x="", y="", fill="Distance", title=trait) +
theme(axis.ticks.x=element_blank(), axis.title.y=element_blank(), title=element_blank(), axis.ticks.y=element_blank())
mpdf(paste0("fig_S8-S17_", trait, "_closestGeneIndicator"), outDir=file.path(ROOT, "outputs", "abcmax"), hmap, height=2, width=floor(0.2 * length(unique(traitDf$gene_name))+1.5)); print(hmap3); dev.off()
}
TX[["abc_abcMax"]] = paste0("Then, to narrow down the focus on GWAS traits, we decided to keep only E-P links overlapping GWAS index SNPs or their LD buddies (R2≥0.8), resulting in ", nrow(final), "E-P links. To limit the number of causal genes per GWAS locus, we applied the ABC-MAX approach [33828297], allowing only one E-P link per peak, resulting in ", nrow(finalAbc), " E-Ps.")
print(TX[["abc_abcMax"]])
TX[["abcmax_results"]] = paste0("Across all cell types, we nominated a total of ", length(unique(finalAbc$gene)), " unique genes that were putatively associated with ", round(length(unique(finalAbc$rs)) / uniqIndexSnps * 100, 0), "% of analyzed loci.")
print(TX[["abcmax_results"]])
TX[["abcmax_ms"]] = paste0("Our approach yielded predictions for ", length(unique(finalAbc[(finalAbc$trait == "MS"),"rs"])), " loci associated with multiple sclerosis (MS), nominating a total of ", length(unique(finalAbc[(finalAbc$trait == "MS"),"gene"])), " unique genes (Table traitsByCtype).")
print(TX[["abcmax_ms"]])
}
###################################################################################################################################################
##### SUMMARIZE RESULTS (E-P OVERLAPPING GWAS) BY GENES ###########################################################################################
{
finalx = finalAbc[order(finalAbc$cyt_loc),]
dir.create(file.path(ROOT, "outputs", "abcmax", "indiv_gene_tables"))
finalx = finalAbc[order(finalAbc$cyt_loc),]
finalx = do.call("rbind", lapply(unique(finalx$gene), function(gene) {
old_row = finalx[(finalx$gene==gene),]
old_row = old_row[order(old_row$ABC.Score, decreasing=T),]
write.csv(old_row, file=file.path(ROOT, "outputs", "abcmax", "indiv_gene_tables", paste0(gene, ".csv")), row.names=F)
row = old_row[1,c("cyt_loc", "trait_buddy_rs", "trait", "gene", "gene_name")]
row$trait_buddy_rs = paste0(unique(old_row$trait_buddy_rs),collapse=",")
row$rs = paste0(unique(old_row$rs),collapse=",")
row$no_buddies = length(which(finalx$gene == gene))
row$no_traits = length(unique(old_row$trait))
row$no_peaks = length(unique(old_row$peak))
row$trait = paste0(sort(unique(old_row$trait)),collapse=",")
row$cyt_loc = paste0(unique(old_row$cyt_loc),collapse=",")
row$peak = paste0(unique(old_row$peak),collapse=",")
row$cellType_note = paste0(sapply(unique(old_row$CellType_general)[order(table(unique(old_row$CellType_general)), decreasing=T)], function(ctype) paste0(ctype, ":", length(old_row[old_row$CellType_general==ctype,"trait"]))), collapse="; ")
row$cellType_note2 = paste0(sapply(unique(old_row$trait), function(trt) {
paste0(trt, ": ", paste0(old_row[old_row$trait == trt,"CellType_general"], collapse=" > "))
}), collapse=" ;; ")
row
}))
finalByGenes = finalx
ld_main_and_buddies_df$combo = paste0(ld_main_and_buddies_df$buddy_rs, "_", ld_main_and_buddies_df$chr, "_", ld_main_and_buddies_df$start, "_", ld_main_and_buddies_df$end)
comboLength = length(unique(ld_main_and_buddies_df$combo))
exportByGenes = finalByGenes[,c("cyt_loc", "rs", "trait_buddy_rs", "trait", "gene", "gene_name", "no_traits", "no_peaks", "no_buddies", "peak")]
colnames(exportByGenes) = c("Cytogenic location", "rsIDs", "rsID (LD buddies)", "GWAS abbreviations", "Ensembl Gene ID", "Gene name", "Number of overlapped GWAS traits", "Number of overlapped peaks", "Number of overlapped rsIDs", "Overlapped peaks")
write.csv(exportByGenes, row.names=F, file=file.path(ROOT, "outputs", "abcmax", "table_exportByGenes.csv"))
TX$multiDiseases = paste0("Finally, our approach identified a set of ", sum(finalByGenes$no_traits>=2), " genes associated with multiple diseases, ...")
print(TX$multiDiseases)
}
###################################################################################################################################################
##### GSEA ANALYSIS FOR MS (Table S10) ############################################################################################################
{
gseaResultsDf = read.csv(file.path(ROOT, "inputs", "Table_S10.csv"), sep="\t")
gseaResultsDf$minusLogP = -log10(gseaResultsDf$pval)
gseaResultsDfOrig = gseaResultsDf
#####
# Generate Table S10, i.e. scatterplot for MS and all cell types (only those with at least one BH-significant in top 10 most enriched are shown)
dfx = gseaResultsDfOrig[(startsWith(gseaResultsDfOrig$Set, prefix="MS_")),]
dfx$BH_AdjP = p.adjust(dfx$pval, method="BH")
dfx = dfx[(dfx$name_full %in% unique(dfx$name_full)[1:10]),]
dfx$name_full = ordered(dfx$name_full, levels=unique(dfx$name_full))
dfx$size = ifelse(dfx$BH_AdjP < 0.05, 2, 1)
dfx$Set = gsub("^MS_", "", dfx$Set)
gseaMsDf = dfx[,c("name_full", "Set", "pval", "BH_AdjP", "odds.ratio", "Jaccard", "intersection", "union")]
colnames(gseaMsDf) = c("Pathway name", "Cell type", "P-value", "BH-corrected P-value", "Odds ratio", "Jaccard", "Intersection", "Union")
write.csv(gseaMsDf, row.names=F, file=file.path(ROOT, "outputs", "Table_S10.csv"))
}
###################################################################################################################################################
##### ALS-RELATED ANALYSIS ########################################################################################################################
{
alsNominated = unique(finalAbc[finalAbc$trait == "ALS","rs"])
alsTotal = unique(lociTable_df[lociTable_df$trait == "ALS","rs"])
TX$als = paste0("Our approach was able to nominate causal genes for ", length(alsNominated), " out of ", length(alsTotal), " loci (", round(length(alsNominated) / length(alsTotal) * 100, 1), "%)")
print(TX$als)
df = finalAbc[(finalAbc$trait == "ALS"),]
ALS_LIU_OMIM = file.path(ROOT, "inputs", "liu_als_omim.csv")
ALS_LIU_GWAS = file.path(ROOT, "inputs", "liu_als_gwas.csv")
ALS_VAN_RHEENEN = file.path(ROOT, "inputs", "als_van_Rheenen.csv")
als_omim = read.csv(ALS_LIU_OMIM, sep="\t")
als_gwas = read.csv(ALS_LIU_GWAS, sep="\t")
als_gwas_vanRheenen = read.csv(ALS_VAN_RHEENEN, sep="\t")
als_gwas_vanRheenenList = unique(unlist(lapply(als_gwas_vanRheenen$Prioritized_gene, function(genes) { as.character(sapply(unlist(strsplit(genes, ",")[[1]]), trimws)) })))
df$existSupport = df$gene_name %in% c(als_omim$Gene.symbol, als_gwas$Gene.symbol)
isect_omim = intersect(unique(df$gene_name), als_omim$Gene.symbol)
isect_gwas = intersect(unique(df$gene_name), als_gwas$Gene.symbol)
isect_singleGwas = intersect(unique(df$gene_name), als_gwas_vanRheenenList)
TX$als_omim = paste0("> Overlap between ALS OMIM (", nrow(als_omim) , " genes) and our predictions (", length(unique(df$gene)), " genes) is in ", length(isect_omim), " genes (", length(unique(df[df$gene_name %in% isect_omim,"rs"])), " loci), i.e. ", paste0(sort(isect_omim), collapse=","))
print(TX$als_omim)
TX$als_gwas = paste0("> Overlap between ALS GWAS (", nrow(als_gwas) , " genes) and our predictions (", length(unique(df$gene)), " genes) is in ", length(isect_gwas), " genes (", length(unique(df[df$gene_name %in% isect_gwas,"rs"])), " loci), i.e. ", paste0(sort(isect_gwas), collapse=","))
print(TX$als_gwas)
TX$als_gwas2 = paste0("> Overlap between ALS GWAS (", length(als_gwas_vanRheenen) , " genes) and our predictions (", length(unique(df$gene)), " genes) is in ", length(isect_singleGwas), " genes (", length(unique(df[df$gene_name %in% isect_singleGwas,"rs"])), " loci), i.e. ", paste0(sort(isect_singleGwas), collapse=","))
print(TX$als_gwas2)
}
###################################################################################################################################################
##### SCZ-RELATED ANALYSIS ########################################################################################################################
{
finalAbc$pgc3_prioritized1_matchOur = sapply(1:nrow(finalAbc), function(i) {
if(finalAbc$pgc3_prioritized1[i]) {
#ifelse(length(intersect(unique(pgcGenes[(final$rs[i] == pgcGenes$Index.SNP),"Ensembl.ID"]), unique(finalAbc[(finalAbc$trait == "SCZ") & (finalAbc$rs == final$rs[i]),"gene"]))) > 0, T, F)
ifelse(length(intersect(unique(pgcGenes[(finalAbc$rs[i] == pgcGenes$Index.SNP),"Ensembl.ID"]), unique(finalAbc[(finalAbc$trait == "SCZ") & (finalAbc$rs == finalAbc$rs[i]),"gene"]))) > 0, T, F)
} else {
NA
}
})
table(finalAbc$pgc3_prioritized1_matchOur)
finalAbc$pgc3_prioritized2_matchOur = sapply(1:nrow(finalAbc), function(i) {
if(finalAbc$pgc3_prioritized2[i]) {
ifelse(length(intersect(unique(pgcGenes2[(finalAbc$rs[i] == pgcGenes2$Index.SNP),"Ensembl.ID"]), unique(finalAbc[(finalAbc$trait == "SCZ") & (finalAbc$rs == finalAbc$rs[i]),"gene"]))) > 0, T, F)
} else {
NA
}
})
table(finalAbc$pgc3_prioritized2_matchOur)
#### Comparison with prioritized genes from SCZ PGC3 GWAS (there are two versions of prioritized genes from PGC and we consider using only protein coding-vs-all)
isectPgc3prioritized_oursSczPrioritized_pc = intersect(unique(finalAbc[(finalAbc$trait=="SCZ") & (finalAbc$gene_biotype == "protein_coding"),"rs"]), pgcGenes[pgcGenes$gene_biotype=="protein_coding", "Index.SNP"])
isectPgc3prioritized_oursSczPrioritized_sameOrNot_pc = sapply(isectPgc3prioritized_oursSczPrioritized_pc, function(rs) { any(finalAbc[finalAbc$rs == rs,"pgc3_prioritized1_matchOur"]) })
TX$pgc3_overlap1_pc = paste0("> Out of ", length(unique(ld_buddies_df[ld_buddies_df$trait=="SCZ","rs"])), " SCZ loci, PGC3(v1) made predictions of protein coding target gene for ", length(unique(pgcGenes[(pgcGenes$gene_biotype == "protein_coding"), "Index.SNP"])), " loci while our study made a prediction for ", length(unique(finalAbc[(finalAbc$trait == "SCZ") & (finalAbc$gene_biotype == "protein_coding"), "rs"])), " loci. For ", length(isectPgc3prioritized_oursSczPrioritized_pc), " loci with available predictions in both studies, the same gene is prioritized in ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot_pc), " cases.")
print(TX$pgc3_overlap1_pc)
isectPgc3prioritized_oursSczPrioritized = intersect(unique(finalAbc[(finalAbc$trait=="SCZ"),"rs"]), pgcGenes$Index.SNP)
isectPgc3prioritized_oursSczPrioritized_sameOrNot = sapply(isectPgc3prioritized_oursSczPrioritized, function(rs) { any(finalAbc[finalAbc$rs == rs,"pgc3_prioritized1_matchOur"]) })
TX$pgc3_overlap1 = paste0("> Out of ", length(unique(ld_buddies_df[ld_buddies_df$trait=="SCZ","rs"])), " SCZ loci, PGC3(v1) made predictions of all (not just protein coding) target gene for ", length(unique(pgcGenes$Index.SNP)), " loci while our study made a prediction for ", length(unique(finalAbc[(finalAbc$trait == "SCZ"), "rs"])), " loci. For ", length(isectPgc3prioritized_oursSczPrioritized), " loci with available predictions in both studies, the same gene is prioritized in ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot), " cases.")
print(TX$pgc3_overlap1)
isectPgc3prioritized_oursSczPrioritized_pc = intersect(unique(finalAbc[(finalAbc$trait=="SCZ") & (finalAbc$gene_biotype == "protein_coding"),"rs"]), pgcGenes2[pgcGenes2$gene_biotype=="protein_coding", "Index.SNP"])
isectPgc3prioritized_oursSczPrioritized_sameOrNot_pc = sapply(isectPgc3prioritized_oursSczPrioritized_pc, function(rs) { any(finalAbc[finalAbc$rs == rs,"pgc3_prioritized2_matchOur"]) })
TX$pgc3_overlap2_pc = paste0("> Out of ", length(unique(ld_buddies_df[ld_buddies_df$trait=="SCZ","rs"])), " SCZ loci, PGC3(v2) made predictions of protein coding target gene for ", length(unique(pgcGenes2[(pgcGenes2$gene_biotype == "protein_coding"), "Index.SNP"])), " loci while our study made a prediction for ", length(unique(finalAbc[(finalAbc$trait == "SCZ") & (finalAbc$gene_biotype == "protein_coding"), "rs"])), " loci. For ", length(isectPgc3prioritized_oursSczPrioritized_pc), " loci with available predictions in both studies, the same gene is prioritized in ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot_pc), " cases.")
print(TX$pgc3_overlap2_pc)
isectPgc3prioritized_oursSczPrioritized2 = intersect(unique(finalAbc[(finalAbc$trait=="SCZ"),"rs"]), pgcGenes2$Index.SNP)
isectPgc3prioritized_oursSczPrioritized_sameOrNot2 = sapply(isectPgc3prioritized_oursSczPrioritized2, function(rs) { any(finalAbc[finalAbc$rs == rs,"pgc3_prioritized2_matchOur"]) })
TX$pgc3_overlap2 = paste0("> Out of ", length(unique(ld_buddies_df[ld_buddies_df$trait=="SCZ","rs"])), " SCZ loci, PGC3(v2) made predictions of all (not just protein coding) target gene for ", length(unique(pgcGenes2$Index.SNP)), " loci while our study made a prediction for ", length(unique(finalAbc[(finalAbc$trait == "SCZ"), "rs"])), " loci. For ", length(isectPgc3prioritized_oursSczPrioritized2), " loci with available predictions in both studies, the same gene is prioritized in ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot2), " cases.")
print(TX$pgc3_overlap2)
TX$abcmax_scz_epDistance = paste0("For example, ABC-MAX made predictions for ", length(unique(finalAbc[(finalAbc$trait=="SCZ"),"rs"])), " SCZ loci, nominating ", length(unique(finalAbc[(finalAbc$trait=="SCZ"),"gene"])), " unique genes.")
print(TX$abcmax_scz_epDistance)
TX$abcmax_ms_epDistance = paste0("For example, ABC-MAX made predictions for ", length(unique(finalAbc[(finalAbc$trait=="MS"),"rs"])), " MS loci, nominating ", length(unique(finalAbc[(finalAbc$trait=="MS"),"gene"])), " unique genes.")
print(TX$abcmax_ms_epDistance)
nonConflictingRs = names(isectPgc3prioritized_oursSczPrioritized_sameOrNot)[isectPgc3prioritized_oursSczPrioritized_sameOrNot == T] # rsID with shared EP
conflictingRs = names(isectPgc3prioritized_oursSczPrioritized_sameOrNot)[isectPgc3prioritized_oursSczPrioritized_sameOrNot == F] # rsID with non-shared EP
conflictingGenesPgc = pgcGenes2[(pgcGenes2$Index.SNP %in% conflictingRs),]
conflictingGenesPgc$gene_rs = paste0(conflictingGenesPgc$Ensembl.ID, "_SCZ_", conflictingGenesPgc$Index.SNP)
existingInAbc = final[final$trait_gene_rs %in% conflictingGenesPgc$gene_rs,"trait_gene_rs"]
print(paste0("To further demonstrate the accuracy of our approach, we investigated ", length(unique(finalAbc[(finalAbc$trait=="SCZ"),"rs"])), " SCZ loci for which we have predicted E-P(ABC-MAX) links, covering ", round(length(unique(finalAbc[(finalAbc$trait=="SCZ"),"rs"])) / length(unique((ld_buddies_df[(ld_buddies_df$trait=="SCZ"),"rs"]))) * 100, 1), "% of all SCZ loci"))
print(paste0("When comparing our predicted causal genes with those prioritized by SMR/FINEMAP in the latest SCZ GWAS study, we found agreement for ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot2), " (", round(sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot2==T) / length(isectPgc3prioritized_oursSczPrioritized_sameOrNot2) * 100, 1), "%) of the ", length(isectPgc3prioritized_oursSczPrioritized_sameOrNot2), " loci that are covered by both studies. Notably, for ", length(unique(existingInAbc)), " out of ", sum(isectPgc3prioritized_oursSczPrioritized_sameOrNot2==F), " loci with different predicted causal gene, we confirmed the existence of E-P(ABC) (but not E-P(ABC-MAX)) link to the same gene as predicted by SMR/FINEMAP, indicating the existence of regulatory link in our data, even if it's not the top-ranked one."))
# For SynGO: Run online with inputs generated here (all SCZ genes) vs background of all genes as exported here ;; then, save ZIP with results, extract it and convert XLS to CSV
write.csv(unique(unique(unlist(expressedGenes))), file=file.path(ROOT, "inputs", "SynGO_background.txt"), row.names=F, quote=F)
SYNGO_SCZ_RESULTS = file.path(ROOT, "inputs", "syngo_scz_precalculated.tsv")
syngoDf = read.csv(SYNGO_SCZ_RESULTS, sep="\t")
syngoDf = syngoDf[(syngoDf$GO.domain == "BP"),]
syngoGO_terms = unique(syngoDf$GO.term.ID)[startsWith(unique(syngoDf$GO.term.ID), prefix="GO")]
TX$syngo = paste0("These prioritized genes are associated with ", length(syngoGO_terms), " unique biological terms within the synaptic hierarchy (Table SynGO), including several specific pathways and processes.")
print(TX$syngo)
}
###################################################################################################################################################
##### SUMMARY OF GENE IMPLICATIONS BY TRAIT :: TABLE S9, FIG. 3F ##################################################################################
{
outDir = "~/Desktop/3dg/abc_max/" # abcResultsFolder$ABC_MAX
traitsByCtype = do.call("rbind.data.frame", lapply(TRAITS, function(trait) {
table(finalAbc[finalAbc$trait == trait,"CellType_general"])[CELL_TYPES]
}))
colnames(traitsByCtype) = CELL_TYPES
rownames(traitsByCtype) = TRAITS
traitsByCtype$sumUnique = sapply(rownames(traitsByCtype), function(trait) {
length(unique(unlist(sapply(colnames(traitsByCtype), function(ctype) { finalAbc[(finalAbc$trait == trait) & (finalAbc$CellType_general == ctype),"gene"] }))))
})
traitsByCtypeDf = rbind.data.frame(traitsByCtype, sapply(colnames(traitsByCtype), function(ctype) { length(finalAbc[(finalAbc$CellType_general == ctype),"gene"]) }))
rownames(traitsByCtypeDf)[nrow(traitsByCtypeDf)] = "SumNonUnique"
write.csv(traitsByCtypeDf, file=file.path(ROOT, "outputs", "Table_S9_b.csv"))
####
# Summary of covered loci (loci can be represented by multiple genes as we allow 2 links per loci in each cell type) in each cell type & trait
traitsByCtypeAlt = do.call("rbind.data.frame", lapply(TRAITS, function(trait) {
sapply(CELL_TYPES, function(ctype) length(unique(finalAbc[(finalAbc$CellType_general == ctype) & (finalAbc$trait == trait),"rs"])) )
}))
colnames(traitsByCtypeAlt) = CELL_TYPES
rownames(traitsByCtypeAlt) = TRAITS
traitsByCtypeAlt$sumUnique = sapply(rownames(traitsByCtypeAlt), function(trait) {
length(unique(unlist(sapply(colnames(traitsByCtypeAlt), function(ctype) { finalAbc[(finalAbc$trait == trait) & (finalAbc$CellType_general == ctype),"rs"] }))))
})
traitsByCtypeAlt$lociCoverage = sapply(TRAITS, function(trait) (traitsByCtypeAlt[trait,"sumUnique"] / nrow(lociTable_df[(lociTable_df$trait == trait),])) )
traitsByCtypeDfAlt = rbind.data.frame(traitsByCtypeAlt, sapply(colnames(traitsByCtypeAlt), function(ctype) { length(unique(finalAbc[(finalAbc$CellType_general == ctype),"rs"])) }))
rownames(traitsByCtypeDfAlt)[nrow(traitsByCtypeDfAlt)] = "SumNonUnique"
write.csv(traitsByCtypeDfAlt, file=file.path(ROOT, "outputs", "Table_S9_a.csv"))
####
# Figure 3f: Number of explained loci per cell type across all investigated GWAS traits
df = data.frame(t(traitsByCtypeDfAlt[nrow(traitsByCtypeDfAlt),1:(length(traitsByCtypeDfAlt)-2)]))
df$`Cell type` = rownames(df)
colnames(df) = c("Explained loci", "Cell type")
df$`Cell type` = ordered(df$`Cell type`, levels=df$`Cell type`[order(df$`Explained loci`, decreasing=T)] )
explainedLociByCellPlot = ggplot(df, aes(x = `Cell type`, y = `Explained loci`)) + geom_bar(stat = "identity", color = "white", width=1) + theme_classic() + xlab("Cell type") +
ylab("A total number of explained loci") + theme(panel.spacing=unit(1, "lines")) + theme(strip.background=element_rect(fill="#eeeeee",color="#eeeeee")) +
theme(strip.text=element_text(face="bold"), axis.text.x = element_text(angle = 45)) + scale_y_continuous(expand = c(0, 0)) + scale_x_discrete(expand = c(0, 0.5)) #+ facet_wrap(~ CellType, ncol=3)
mpdf("Fig_3f", outDir=file.path(ROOT, "outputs"), height=4, width=4); print(explainedLociByCellPlot); dev.off()
}
###################################################################################################################################################
##### STATISTICS FOR E-P LINKS FROM ABC :: FIG. 3D-E ##############################################################################################
{
# Mean / median distance for EP links
medianEpDistance = median(na.omit(finalAbc$distance_rsToGene))
meanEpDistance = mean(na.omit(finalAbc$distance_rsToGene))
percUnder100kb = round(sum(na.omit(finalAbc$distance_rsToGene)<1e5) / length(na.omit(finalAbc$distance_rsToGene)), 2) * 100
#sapply(unique(finalAbc$CellType_general), function(ctype) { round(mean(na.omit(finalAbc[finalAbc$CellType_general == ctype, "distance_rsToGene"]) / 10^3, 0)) })
TX[["abcmax_epDistance"]] = paste0("The E-PABC-MAX distances varied significantly, with a median of ", round(medianEpDistance/1e3, 0), "kb, and the majority (", percUnder100kb, "%) were less than 100kb (Fig. X).")
print(TX[["abcmax_epDistance"]])
####
# Figure 3d: Histogram of the E-PABC-MAX distances between the GWAS risk variant and TSS of the putatively regulated gene
distRsPeakLogPlot = ggplot(finalAbc, aes(x = distance_rsToGene)) + geom_density() + labs(x="E-P link distance (log10 scale)") + scale_x_continuous(trans='log10') + theme_bw() +
geom_vline(xintercept=meanEpDistance, linetype="dashed", color = "blue") + geom_vline(xintercept=medianEpDistance, linetype="dashed", color = "red") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), aspect.ratio = 1)
mpdf("Fig_3d", outDir=file.path(ROOT, "outputs"), width=3, height=3); print(distRsPeakLogPlot); dev.off()
####
# Figure 3e: Histogram of the number of genes ‘skipped’ by E-PABC-MAX to reach the putatively regulated gene
medianGenesInBetween = median(na.omit(finalAbc$genesInBetween)) # Distribution of the number of "skipped" genes by E-P(ABC-MAX) links
meanGenesInBetween = mean(na.omit(finalAbc$genesInBetween))
df = finalAbc