forked from Sunhh/NGS_data_processing
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathusing_subfunc.R.cmd
executable file
·92 lines (74 loc) · 4.47 KB
/
using_subfunc.R.cmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Perl : CMD:
# To comfirm the minimum adaptor: (PE)
# grep ATCTCGTATGCCGTCTTCTGCTTG NSP_10kb_ACAGTG_L00M_R1.ndupB | perl -e ' $n=40; $p="ATCTCGTATGCCGTCTTCTGCTTG"; while (<>) { m/(.{$n}$p)/o or next; $h{$1}++; $. >= 10000 and last; } for (sort {$h{$b} <=> $h{$a} } grep { $h{$_} > 1 } keys %h) { print "$_\t$h{$_}\n";} ' | head
# To comfirm the maximum adaptor: (PE)
# grep ATCTCGTATGCCGTCTTCTGCTTG NSP_10kb_ACAGTG_L00M_R1.ndupB | perl -e ' $n=41; $p="ATCTCGTATGCCGTCTTCTGCTTG"; while (<>) { m/(.{$n}$p)/o or next; $h{$1}++; $. >= 10000 and last; } for (sort {$h{$b} <=> $h{$a} } grep { $h{$_} > 1 } keys %h) { print "$_\t$h{$_}\n";} ' | head
# Record:
# I found in P1_200a_R1.fastq.gz and P1_300_ATCACG_R1.fastq.gz, there are R1 reads end with 'ATCTCGTATGCCGTCTTCTGCTTG' without barcode or 'AGATCGGAAGAGCACACGTCTGAACTiCCAGTCAC' sequences. So for these cases, I might need to run two times of function-clean.pe.fq.file with different "adaptor1" option.
# And ~10bp head of 'ATCTCGTATGCCGTCTTCTGCTTG' is similar to some chloroplast sequence or rRNA sequence. Like this 'ATGTGTGTCGGTTGCTTGCA';
# I can also find 'ATGTGTGTCGGTTGCTTGCA' in P1_300 library, maybe it is common in P1 libraries. So I plan to trim twice and see how many reads changed.
# Not sure if it is common to other species.
# Need to see the read infor:
# [Sunhh@Penguin SeqData]$ gzip -cd P1_500b_R1.fastq.gz | grep ATCACGATCTCGTATGCCGTCTTCTGCTTG
# CAATCTTAACATCAAGGTTGAGATCGGAAGAGCACACGCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAACAGAATTAGAATCAT
# For PE adaptor cleaning.
barcod <- 'AAATTT'
patternR1 <- paste0( "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", barcod, "ATCTCGTATGCCGTCTTCTGCTTG",sep="")
patternR2 <- "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
barcode_lis <- read.table("barcode_lis", header=T, stringsAsFactors=F)
pattern_lis <- NULL
for ( i in 1:nrow(barcode_lis) ) {
t.prefix <- barcode_lis$Prefix[i]
t.R1pattern <- paste0( "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", barcode_lis$Barcode[i], "ATCTCGTATGCCGTCTTCTGCTTG",sep="" )
t.R2pattern <- "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
pattern_lis <- rbind( pattern_lis, c(t.prefix, barcode_lis$Barcode[i], t.R1pattern, t.R2pattern) )
}
colnames(pattern_lis) <- c("Prefix", "Barcode", "R1pattern", "R2pattern")
# write.table( pattern_lis, file="pattern_lis", quote=F, sep="\t", col.names=T, row.names=F )
source("/home/Sunhh/tools/github/data_proc/using_subfunc.R")
pattern_lis <- read.table("pattern_lis", header=T, stringsAsFactors=F)
for ( i in 1:nrow(pattern_lis) ) {
inFq1 <- paste0(pattern_lis$Prefix[i], "_R1.ndupB", sep="")
inFq2 <- paste0(pattern_lis$Prefix[i], "_R2.ndupB", sep="")
oFq1 <- paste0(pattern_lis$Prefix[i], "_R1", sep="")
oFq2 <- paste0(pattern_lis$Prefix[i], "_R2", sep="")
adp1 <- pattern_lis$R1pattern[i]
adp2 <- pattern_lis$R2pattern[i]
clean.pe.fq.file( inFqName1=inFq1, outFqName1=oFq1, adaptor1=adp1, inFqName2=inFq2, outFqName2=oFq2, adaptor2=adp2, RdPerYield=40e6 )
system(paste( "perl /home/Sunhh/tools/fq_rdNum.pl ",
oFq1, ".paired.hq ",
oFq2, ".paired.hq ",
oFq1, ".single.hq ",
oFq2, ".single.hq ",
oFq1, ".paired ",
oFq2, ".paired ",
oFq1, ".single ",
oFq2, ".single ",
" >> test_tbl ; ",
"rm -f ",
oFq1, ".paired.hq ",
oFq2, ".paired.hq ",
oFq1, ".single.hq ",
oFq2, ".single.hq ",
sep="")
)
}
clean.mp.fq.file( inFqName1=inFq1, outFqName1=oFq1, inFqName2=inFq2, junction.seq=junc_seq, RdPerYield=40e5 )
# [Sunhh@Penguin Q20]$ more MP_junc_pattern
Prefix JuncPattern
P1_cre5k_CACTCA_time1 CGTATAACTTCGTATAATGTATGCTATACGAAGTTATACA
P1_cre5k_CACTCA_time2 CGTATAACTTCGTATAATGTATGCTATACGAAGTTATACA
P1_ec5k_ATGAGC_time1 CGTATAACTTCGTATAATGTATGCTATACGAAGTTATACA
P1_ec5k_ATGAGC_time2 CGTATAACTTCGTATAATGTATGCTATACGAAGTTATACA
P1_MP2k GGTCGATAACTTCGTATAATGTATGCTATACGAAGTTATACA
P1_MP5k GGTCGATAACTTCGTATAATGTATGCTATACGAAGTTATACA
source("/home/Sunhh/tools/clean_reads/using_subfunc.R") # Penguin server.
pattern_lis <- read.table("MP_junc_pattern", header=T, stringsAsFactors=F)
for ( i in 1:nrow(pattern_lis) ) {
gc()
inFq1 <- paste0(pattern_lis$Prefix[i], "_R1.paired", sep="")
inFq2 <- paste0(pattern_lis$Prefix[i], "_R2.paired", sep="")
oFq1 <- paste0(pattern_lis$Prefix[i], sep="")
junc_seq <- c(pattern_lis$JuncPattern[i], as.character( reverseComplement(DNAString( pattern_lis$JuncPattern[i] )) ))
clean.mp.fq.file ( inFqName1=inFq1, outFqName1=oFq1, inFqName2=inFq2, junction.seq=junc_seq, RdPerYield=40e6 )
}