-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmouse_cistrome2gmt.sh
executable file
·122 lines (106 loc) · 4.26 KB
/
mouse_cistrome2gmt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
#set -x
###############################################################
#
# Curate TFBS gene sets from cistrome ENCODE data
# by Mark Ziemann and Antony Kaspi
# 2017-2018
# This software is distributed with and may be used according
# to 2-clause FreeBSD License
#
###############################################################
# Description: Will download TFBS peak sets and will
# annotate TSS and enhancers bound by transcription factors
# and create a gene matrix that can be used in GSEA and other
# pathway analysis tools
###############################################################
echo "Testing dependancies"
###############################################################
PARALLEL=`which parallel | wc -l`
if [ $PARALLEL -eq "0" ] ; then
echo 'Error: gnu parallel was not found.
It can be downloaded from the following URL:
https://www.gnu.org/software/parallel/
Also available from the Ubuntu software centre:
sudo apt-get install parallel'
exit
fi
LIFTOVER=`which liftOver | wc -l`
if [ $LIFTOVER -eq "0" ] ; then
echo 'Error: liftOver was not found.
It can be downloaded from the following URL:
http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver'
exit
fi
BEDTOOLS=`which bedtools | wc -l`
if [ $BEDTOOLS -eq "0" ] ; then
echo 'Error: bedtools was not found.
It can be downloaded from the following URL:
http://bedtools.readthedocs.io/en/latest/
Older version is available from the Ubuntu software centre:
sudo apt-get install bedtools'
exit
fi
###############################################################
# Lets declare a bunch of vars for later
###############################################################
GTFURL=ftp://ftp.ensembl.org/pub/release-86/gtf/mus_musculus/Mus_musculus.GRCm38.86.gtf.gz
GTF=`basename $GTFURL`
TSSBED=`echo $GTF | sed 's/.gtf.gz/.tss.bed/'`
SIZE_RANGE='2500'
GNAMES=$(basename $GTF .gtf.gz).gnames.txt
#this encode stuff will have to be modified
METADATA="mouse_factor_full_QC.txt"
METADATA_SUMMARY=metadata_summary.tsv
URLLIST=files.txt
NUMRANGE='500'
UPPER_LIMIT='5000'
NRCPU=`nproc`
###############################################################
echo "Dependancies OK. Downloading required annotation files"
###############################################################
wget -N $GTFURL
###############################################################
echo "extracting gene names from GTF"
###############################################################
zcat $GTF | grep -w gene | cut -f9 | cut -d '"' -f2,6 \
| tr '"' '\t' | sort -uk 2b,2 > $GNAMES
###############################################################
echo "extracting TSS positions from ensembl GTF"
###############################################################
#Get coordinates of plus strand genes
zgrep -w 'exon_number "1"' $GTF | awk '$7=="+"' \
| cut -f1,4,5,7,9 | cut -d '"' -f-2,12 \
| sed 's!gene_id "!!' | tr '"' '_' \
| awk '{OFS="\t"} {print $1,$2,$2+1,$5,$4}' \
| awk '{OFS="\t"} { if ($2<1) print $1,"1",$3,$5,$4 ; else print $0 }' > $TSSBED
#Get coordinates of minus strand genes
zgrep -w 'exon_number "1"' $GTF | awk '$7=="-"' \
| cut -f1,4,5,7,9 | cut -d '"' -f-2,12 \
| sed 's!gene_id "!!' | tr '"' '_' \
| awk '{OFS="\t"} {print $1,$3-1,$3,$5,$4}' \
| awk '{OFS="\t"} { if ($2<1) print $1,"1",$3,$5,$4 ; else print $0 }' >> $TSSBED
###############################################################
echo "intersect peaks with TSS"
###############################################################
map_pk(){
BEDIN=$1
CODENUM=$(echo $(basename $BEDIN) | cut -d '_' -f1)
METADATA=$2
TSS=$3
DIST=$4
MAX_GENES=$5
OUT=$(echo $BEDIN | sed 's/narrowPeak.bed/narrowPeak.gmt/')
NAME=$(grep -w ^$CODENUM $METADATA | cut -f4)_$(grep -w ^$CODENUM $METADATA | cut -f5)_$(grep -w ^$CODENUM $METADATA | cut -f6)_$CODENUM
NAME=$(echo $NAME | sed 's#/#_#g')
bedtools intersect -wb \
-a <(sed 's/^chr//' $BEDIN ) \
-b <(awk -v d=$DIST '{OFS="\t"} {print $1,$2-d,$3+d,$4}' $TSS \
| awk '{OFS="\t"} { if ($2<1) print $1,0,$3+1000,$4 ; else print $0}') \
| sort -k8gr | awk '!arr[$14]++ {print $14}' \
| cut -d '_' -f2- | awk '!arr[$1]++ {print $1}' | head -$MAX_GENES \
| tr '\n' '\t' | sed "s#^#${NAME}\tCistromeDB\t#" | sed 's/$/\n/'
}
export -f map_pk
parallel map_pk ::: TF_mouse/*narrowPeak.bed ::: mouse_factor_full_QC.txt ::: $TSSBED ::: 1000 ::: 1000 \
| sed 's/ //g' > MouseTfPeaks.gmt