-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHex_reads_prep.sh
80 lines (69 loc) · 5.47 KB
/
Hex_reads_prep.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# This script downloads all 1st instar H. limbata reads from the 2014 Illumina sequencing run,
# fixes their file extensions, concatenates the R1 files and the R3 files into one file each,
# and prepares the R3 file for the header-fixing script.
# Create the working directory for the raw downloads and step into it.
# -p lets the script be re-run when raw_reads already exists; the cd is
# checked so later steps never run in the wrong directory.
echo "making new directory ..."
mkdir -p raw_reads
cd ./raw_reads || { echo "error: cannot enter ./raw_reads" >&2; exit 1; }
# Download all R1 and R3 reads from NCBI - SRA database.
# Each read direction is split into nine parts (001-009); the remote objects
# carry a trailing ".1" in their names.
# -O pins the local file name, so a leftover file from an interrupted run is
# overwritten instead of wget saving the new copy under a "<name>.1" variant.
# The script stops on the first failed download so an incomplete set of
# parts is never processed further.
echo "downloading SRA fastq files ..."
sra_base_url=https://sra-pub-src-1.s3.amazonaws.com/SRR9732743
for read_dir in R1 R3; do
  for part in 00{1..9}; do
    fastq="A155_GCCAATAT_L001_${read_dir}_${part}.fastq.gz.1"
    wget -O "${fastq}" "${sra_base_url}/${fastq}" || {
      echo "error: download failed for ${fastq}" >&2
      exit 1
    }
  done
done
# Fix the file names (not the FASTQ headers) by stripping the trailing ".1",
# so every download ends in ".fastq.gz".
# The brace expansion lists R1 parts 001-009 first, then R3 parts 001-009.
echo "removing unneccesary file extenion ..."
for downloaded in A155_GCCAATAT_L001_R{1,3}_00{1..9}.fastq.gz.1; do
  mv "${downloaded}" "${downloaded%.1}"
done
# Decompress the nine R1 parts, in order, into one uncompressed FASTQ file
# (total_R1.fq), then do the same for the R3 parts (total_R3.fq).
# gzip -dc is used instead of zcat: it writes the decompressed data to stdout
# in the same way, but some zcat implementations (e.g. on macOS) only look
# for ".Z" files and fail on ".gz" input.
# The script stops if decompression fails so a truncated total file is never
# passed on to the next step.
echo "concatenating R1 files ..."
gzip -dc A155_GCCAATAT_L001_R1_00{1..9}.fastq.gz > total_R1.fq || {
  echo "error: could not build total_R1.fq" >&2
  exit 1
}
echo "concatenating R3 files ..."
gzip -dc A155_GCCAATAT_L001_R3_00{1..9}.fastq.gz > total_R3.fq || {
  echo "error: could not build total_R3.fq" >&2
  exit 1
}
# If an error with concatenation occurs, the files must be unzipped
# beforehand. If no error occurs, ignore these steps.
# unzip all files in current directory
#echo unzipping all fastq.gz files ...
#gzip -d *fastq.gz
#echo concatenating R1 files ...
#cat A155_GCCAATAT_L001_R1_001.fastq A155_GCCAATAT_L001_R1_002.fastq A155_GCCAATAT_L001_R1_003.fastq A155_GCCAATAT_L001_R1_004.fastq A155_GCCAATAT_L001_R1_005.fastq A155_GCCAATAT_L001_R1_006.fastq A155_GCCAATAT_L001_R1_007.fastq A155_GCCAATAT_L001_R1_008.fastq A155_GCCAATAT_L001_R1_009.fastq > total_R1.fq
#echo concatenating R3 files ...
#cat A155_GCCAATAT_L001_R3_001.fastq A155_GCCAATAT_L001_R3_002.fastq A155_GCCAATAT_L001_R3_003.fastq A155_GCCAATAT_L001_R3_004.fastq A155_GCCAATAT_L001_R3_005.fastq A155_GCCAATAT_L001_R3_006.fastq A155_GCCAATAT_L001_R3_007.fastq A155_GCCAATAT_L001_R3_008.fastq A155_GCCAATAT_L001_R3_009.fastq > total_R3.fq
# Move the total R1 & R3 files to a new directory and change into it.
# -p lets the script be re-run when total_reads already exists. The trailing
# slash on the destination makes mv fail if total_reads is not a directory,
# rather than renaming a file onto it.
# NOTE: the final cd only affects this script's own shell; when the script is
# executed (not sourced) the caller stays in its original directory.
echo "moving completed reads to total_reads directory ..."
mkdir -p ../total_reads
mv total_R1.fq total_R3.fq ../total_reads/ || {
  echo "error: could not move total_R1.fq / total_R3.fq to ../total_reads" >&2
  exit 1
}
cd ../total_reads || { echo "error: cannot enter ../total_reads" >&2; exit 1; }
echo 'reads have been prepped!'
echo 'NEXT STEPS: Run the python script R3_reads_HeaderFix.py by executing with the -d option. For example, ./R3_reads_HeaderFix.py -d ./total_R3.fq'