-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbed-gtf2
61 lines (54 loc) · 2.52 KB
/
bed-gtf2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/perl
use strict;
use warnings;
#This is a script to take a consensus peak bed file and convert it to gtf format for HTSeq-count
#K. Nixon September 14, 2019
my $usage = "usage: perl $0 filename\n";
#Accept a bed file from the command line:
my $bed = shift @ARGV or die $usage;
#Accept a gtf file name from the command line. If none is given, then use "newgtf"
my $gtf;
$gtf = shift @ARGV or $gtf = "newgtf.gtf";
#Open the given files (read bed, write gtf)
open(IN, "< $bed") or die "could not open $bed for reading";
open(OUT, "> $gtf") or die "could not open $gtf for writing";
#Set up a counter for the line number so each entry has its own id
my $id_count = 1;
#Scroll through the given bed file and parse out required information for the gtf file
while (my $line = <IN>){
#Remove carriage returns
$line =~ s/\cM//;
#Remove newLine characters
chomp $line;
##As a check, print the line in stdout
#print $line;
#Split the line up using whitespace as a delimiter
my @line_array = split ' ',$line;
#The first column is the chromosome name, which is also the first column in the gtf file. Print this as the first column in the gtf file
print OUT "$line_array[0]\t";
#The second column in the gtf file is the source name (not included in bed file, so make it up
print OUT "ATAC-peakset\t";
#The third column in the gtf file is the feature name (not included in the bed file). HTSeq-count defaults to 'exon' so that's what we'll use here:
print OUT "exon\t";
#the fourth column in the gtf file is the start coordinate of the region (the second column in the bed file), so put that here:
if($line_array[1] == 0){
print OUT "1\t";
} else {
print OUT "$line_array[1]\t";
}
#The fifth column in the gtf file is the end coordinate of the region (the third column in the bed file), so put that here:
print OUT "$line_array[2]\t";
#The sixth column in the gtf file is a score (not necessary, so put a '.'):
print OUT ".\t";
#The seventh column in the gtf file is the strand (forward or reverse) we don't know this, so I think we can put a ".":
print OUT ".\t";
#The eighth column is the frame (also not necessary, so put a '.'):
print OUT ".\t";
#The ninth column is the attribute (HTSeq-count uses 'gene_id' to identify the feature that has been aligned, so we'll set that as the peak ID number from the bed file (column 4):
print OUT "gene_id \"$line_array[3]\"; line_id \"$id_count\";\n";
#Add one to the id counter:
$id_count +=1;
}
#Now this should be everything, so we close both files and the script is done
close IN;
close OUT;