-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgenerate_seqs_from_VCF.py
executable file
·232 lines (188 loc) · 9.19 KB
/
generate_seqs_from_VCF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#! /usr/bin/env python
###############################################################################
## LICENSE
##
## Copyright (C) 2020
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
###############################################################################
# DATE: 2020
# AUTHOR: Chase W. Nelson
# AFFILIATION: Biodiversity Research Center, Academia Sinica, Taipei, Taiwan
# CONTACT: cnelson@gate.sinica.edu.tw
# FUNDING: Academia Sinica Postdoctoral Research Fellowship, P.I. Wen-Hsiung Li
# CITATION: Nelson CW, Evolutionary Bioinformatics Toolkit, https://github.com/chasewnelson/EBT
from Bio import SeqIO
import os
import random
import re
import sys
# Usage = "Script to generate a FASTA with randomly interspersed variants (from VCF)"
# Example:
# generate_seqs_from_VCF.py reference.fasta variants.vcf 1000
# Check argument number
if not len(sys.argv) == 4:
raise SystemExit("\n### TERMINATED: need 3 unnamed arguments:\n" + \
" # (1) FASTA reference (1 sequence)\n" + \
" # (2) .VCF SNP report\n" + \
" # (3) number of sequences to generate\n\n")
infile_FASTA = sys.argv[1]
infile_VCF = sys.argv[2]
num_seqs = int(sys.argv[3])
### Gather VCF files in working directory
#VCF_filenames = [name for name in os.listdir(".") if os.path.isfile(name) and name.endswith(".vcf")]
print("FASTA reference: " + str(infile_FASTA))
print("VCF with variants: " + str(infile_VCF))
print("Num sequences to generate: " + str(num_seqs))
#print("VCF files to examine, gathered from working directory: " + str(VCF_filenames))
# Check infiles exist
if not os.path.isfile(infile_FASTA):
raise SystemExit("\n### TERMINATED: infile 1 (FASTA reference) does not exist\n")
if not os.path.isfile(infile_VCF):
raise SystemExit("\n### TERMINATED: infile 2 (.VCF SNP report) does not exist\n")
# Create outfile_prefix name
outfile_prefix = str(infile_VCF)
outfile_prefix = outfile_prefix.replace(".vcf", "") # TODO: learn regex
outfile_prefix = outfile_prefix.replace(".fasta", "") # TODO: learn regex
outfile_prefix = outfile_prefix.replace(".txt", "") # TODO: learn regex
outfile_prefix = outfile_prefix.replace(".tsv", "") # TODO: learn regex
outfile_name = outfile_prefix + "_nSeqs" + str(num_seqs) + ".fasta"
print("Will write output to: " + outfile_name + "\n")
###############################################################################
### Extract SNP data from VCF
# Build a dictionary of site(int)->nt(str)->freqs(list)
# This will later be used to find mean frequency across all samples
site_nt_freqs = {}
site_nt_counts = {}
# Prepare regex
#VCF_line_pattern = r"([\w\_\.]+)\t(\d+)\t([^\s]+)\t([acgtuACGTU]+)\t([acgtuACGTU]+)\t(\d+)\t(\w+)\t([\w\=\;\.\,]+)"
#VCF_INFO_regex = re.compile(VCF_INFO_pattern)
#VCF_DP_pattern = r"DP=\d+" # DP=266
#VCF_DP_regex = re.compile(VCF_DP_pattern)
VCF_AF_pattern = r"AF=[\d\.]+" # AF=0.015038
VCF_AF_regex = re.compile(VCF_AF_pattern)
#VCF_SB_pattern = r"SB=\d+" # SB=0
#VCF_SB_regex = re.compile(VCF_SB_pattern)
#VCF_DP4_grppattern = r"DP4=(\d+),(\d+),(\d+),(\d+)" # DP4=27,235,0,4
#VCF_DP4_grpregex = re.compile(VCF_DP4_pattern)
#sample_count = 0
#for this_VCF in VCF_filenames:
if os.path.isfile(infile_VCF): # this_VCF
with open(infile_VCF, "r") as f:
lines = (line.rstrip() for line in f)
this_VCF_ID = str(infile_VCF).rstrip(".vcf")
#sample_count += 1
variant_count = 0
for line in lines:
if not line.startswith("#"): # skip headers
variant_count += 1
line_list = line.split("\t")
# Extract information for this SNP
# this_CHROM = line_list[0]
this_POS = int(line_list[1])
# this_ID = line_list[2]
# this_REF = line_list[3]
this_ALT = str(line_list[4])
# this_QUAL = line_list[5]
# this_FILTER = line_list[6]
this_INFO = line_list[7]
# this_INFO_regexgroups = VCF_INFO_regex.match(this_INFO)
#this_DP = VCF_DP_regex.match(this_INFO)
#this_DP.replace("DP=", "")
this_AF = VCF_AF_regex.search(this_INFO)
this_AF = this_INFO[this_AF.start():this_AF.end()] # AF=0.367003
this_AF = float(this_AF.replace("AF=", ""))
# this_SB = VCF_SB_regex.match(this_INFO)
# this_DP4_grps = VCF_DP4_grpregex.match(this_INFO)
# Add data for this record
if this_POS not in site_nt_freqs.keys(): # this_POS not in site_nt_freqs:
site_nt_freqs[this_POS] = {}
site_nt_counts[this_POS] = {}
site_nt_freqs[this_POS][this_ALT] = this_AF
site_nt_counts[this_POS][this_ALT] = 1
elif this_ALT not in site_nt_freqs[this_POS]:
site_nt_freqs[this_POS][this_ALT] = {}
site_nt_counts[this_POS][this_ALT] = {}
site_nt_freqs[this_POS][this_ALT] = this_AF
site_nt_counts[this_POS][this_ALT] = 1
else:
site_nt_freqs[this_POS][this_ALT] += this_AF
site_nt_counts[this_POS][this_ALT] += 1
else:
raise SystemExit("\n### TERMINATED: file doesn't exist: " + str(infile_VCF) + "\n")
###############################################################################
### Record reference sequence
# Open FASTA for reading
rec = SeqIO.read(infile_FASTA, "fasta") # will throw error if more than one record!
ref_seq = str(rec.seq)
ref_seq_len = len(ref_seq)
###############################################################################
### Build lists of nucleotides to observe at each site, from which we will draw without replacement
site_nt_lists = {}
for this_site in site_nt_freqs.keys():
this_nt_REF = ref_seq[this_site - 1]
for this_nt_ALT in site_nt_freqs[this_site].keys():
this_nt_ALT_freq = float(site_nt_freqs[this_site][this_nt_ALT]) / float(site_nt_counts[this_site][this_nt_ALT])
num_ALT = int(round(this_nt_ALT_freq * num_seqs)) # rounded to nearest int
num_REF = num_seqs - num_ALT
if num_REF < 0:
raise SystemExit("\n### TERMINATED: negative number of nucleotides implied at site: " + str(this_site) + "\n")
else:
# Add reference nucleotides
for ref_i in range(num_REF):
if this_site in site_nt_lists.keys():
site_nt_lists[this_site].append(str(this_nt_REF))
else:
site_nt_lists[this_site] = [str(this_nt_REF)]
# Add alternate nucleotides
for alt_i in range(num_ALT):
if this_site in site_nt_lists:
site_nt_lists[this_site].append(str(this_nt_ALT))
else:
site_nt_lists[this_site] = [str(this_nt_ALT)]
###############################################################################
### Randomize the order of nucleotides at each site
this_seed = random.randrange(sys.maxsize)
random.seed(this_seed)
print("\nRandom number seed for nucleotide incorporation: " + str(this_seed) + "\n")
for this_site in site_nt_lists.keys():
random.shuffle(site_nt_lists[this_site])
###############################################################################
### Build FASTA alignment with variants!
# Open FASTA for writing
outfile_hdl = open(outfile_name, "w")
for this_seq_idx in range(num_seqs):
this_seq = str(ref_seq)
this_seq_ID = str(outfile_prefix) + "_pseudoseq_" + str(this_seq_idx + 1)
# Make sure we used all the nucleotides
for this_site_idx in range(len(this_seq)):
this_site_num = this_site_idx + 1
if this_site_num in site_nt_lists:
this_seq = this_seq[:this_site_idx] + site_nt_lists[this_site_num].pop() + this_seq[(this_site_idx + 1):]
#this_seq[this_site_idx] = site_nt_lists[this_site_num].pop() # strings immutable
# else it's the REF for sure; leave it
# Write to FASTA file
outfile_hdl.write(">" + this_seq_ID + "\n")
outfile_hdl.write(str(this_seq) + "\n")
# Close FASTA output file
outfile_hdl.close()
# Make sure we used all the nucleotides were used
for this_seq_idx in range(num_seqs):
this_site_num = this_seq_idx + 1
if this_site_num in site_nt_lists and len(site_nt_lists[this_site_num]) > 0:
raise SystemExit("\n### WARNING: sequences remain at site " + str(this_site_num) + ". BIG PROBLEM.\n")
###############################################################################
### FINISH
print("\n")
raise SystemExit("\n### All done; we stopped here, dear researcher.\n")