-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwbbc.py
280 lines (251 loc) · 11 KB
/
wbbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# -*- coding: utf-8 -*-
import os
import statistics
from concurrent.futures import ThreadPoolExecutor, as_completed
VCF_FILENAME = "wbbc_vcf/WBBC.chr{}.GRCh37_PhaseI.vcf"
# https://wbbc.westlake.edu.cn/
# 根据西湖中国样本库,生成祖源模型文件
# The VCF is annotated with rsIDs from dbSNP151, and the following INFO fields:
# AC:Allele count in called genotypes in WBBC
# AF:Allele frequency in called genotypes in WBBC
# AN:Total number of alleles in called genotypes in WBBC
# NS:Total number of samples in called genotypes in WBBC
# North_AF:Allele frequency in North Han Chinese
# North_AN:Total number of alleles in North Han Chinese
# Central_AF:Allele frequency in Central Han Chinese
# Central_AN:Total number of alleles in Central Han Chinese
# South_AF:Allele frequency in South Han Chinese
# South_AN:Total number of alleles in South Han Chinese
# Lingnan_AF:Allele frequency in Lingnan Han Chinese
# Lingnan_AN:Total number of alleles in Lingnan Han Chinese
# DP:Raw read depth
# VQSLOD:Variant Recalibration Score from GATK
def make_allele_frq(
tsv_files: list,
high_ld_filename: str = "",
model_path: str = ".",
allele_frq_file: str = "wbbc",
af_digits: int = 6,
std_dev_threshold: float = 0.03,
max_workers: int = 4,
) -> None:
try:
# 从TSV文件获取RSID集合
rsid_set = get_rsid(tsv_files, high_ld_filename)
# SNP总计数
snp_total_count = 0
with open(
"{}/{}.alleles".format(model_path, allele_frq_file), "w", encoding="utf-8"
) as allele_file:
with open(
"{}/{}.F".format(model_path, allele_frq_file), "w", encoding="utf-8"
) as frq_file:
# 多线程遍历所有VCF文件
with ThreadPoolExecutor(max_workers=max_workers) as t:
task_list = []
for chr in range(1, 23):
print("\tTask {} is launching...".format(chr))
task_list.append(
t.submit(
match_snp_from_vcf,
rsid_set,
VCF_FILENAME.format(chr),
af_digits,
std_dev_threshold,
)
)
# 异步等待线程,每个VCF文件处理结果
for task in as_completed(task_list):
allele_list, frq_list = task.result()
if len(allele_list) == len(frq_list):
snp_total_count += len(allele_list)
# 结果写入alleles文件
allele_file.writelines(allele_list)
allele_file.flush()
# 结果写入frequency文件
frq_file.writelines(frq_list)
frq_file.flush()
print(
"\tOne task success, {:,} SNPs have been saved in {}.alleles and {}.F respectively".format(
len(allele_list), allele_frq_file, allele_frq_file
)
)
else:
print(
"\tOne task failed and not saved to file due to not matched amount between alleles and frequency. The rows of alleles is {}, while the rows of frequency is {}".format(
len(allele_list), len(frq_list)
)
)
print(
"All {} tasks finished. Totally {:,} SNPs have been saved in {}.alleles and {}.F respectively.".format(
len(task_list), snp_total_count, allele_frq_file, allele_frq_file
)
)
except FileNotFoundError:
print("文件不存在!")
except PermissionError:
print("无权限访问文件!")
except Exception as e:
print(f"发生未知错误:{e}")
# 在指定的VCF文件中筛选符合条件的SNP
def match_snp_from_vcf(
rsid_set: set, vcf_filename: str, af_digits: int, std_dev_threshold: float
):
# 当前VCF文件中筛选出的alleles和frequency列表
allele_list = []
frq_list = []
# 遍历指定的VCF文件
with open(
vcf_filename,
"r",
encoding="utf-8",
) as vcf_file:
print("\nProcessing WBBC vcf file: " + vcf_filename)
# 构造vcf字典,以RSID作为键,REF,ALT和INFO作为键值,只需要SNP,不包括Indel,生成vcf字典集
vcf_dict = {}
for vcf_line in vcf_file.readlines():
if len(vcf_line) > 0 and not vcf_line.startswith("#"):
vcf_line_list = vcf_line.split("\t")
if (
len(vcf_line_list) == 8
and vcf_line_list[2] != "."
and len(vcf_line_list[3]) == 1
and len(vcf_line_list[4]) == 1
):
vcf_dict[vcf_line_list[2]] = {
"ref": vcf_line_list[3],
"alt": vcf_line_list[4],
"info": vcf_line_list[7],
}
# 遍历rsid模板集合,在vcf字典集查询对应的rsid,如有则加入
for rsid in rsid_set:
if rsid in vcf_dict:
info_list = vcf_dict[rsid]["info"].split(";")
if len(info_list) == 15:
af = float(info_list[1].split("=")[1])
if af != 0 and af != 1:
# 解析每个人群的alt allele频率
north_af = float(info_list[4].split("=")[1])
central_af = float(info_list[6].split("=")[1])
south_af = float(info_list[8].split("=")[1])
lingnan_af = float(info_list[10].split("=")[1])
# 过滤allele频率标准差大于阈值的SNP
if (
statistics.pstdev(
[
north_af,
central_af,
south_af,
lingnan_af,
]
)
< std_dev_threshold
):
continue
# 构造allele数据行,A1是ref allele, A2是alt allele
allele_line = "{} {} {}\n".format(
rsid,
vcf_dict[rsid]["ref"],
vcf_dict[rsid]["alt"],
)
allele_list.append(allele_line)
# print("\tCollecting allele data: " + allele_line)
# frequency数据行,每个祖源成分的ref allele频率
frq_line = "{} {} {} {}\n".format(
round(
1 - north_af,
af_digits,
),
round(
1 - central_af,
af_digits,
),
round(
1 - south_af,
af_digits,
),
round(
1 - lingnan_af,
af_digits,
),
)
frq_list.append(frq_line)
# print("\tCollecting frequency data: " + frq_line)
return allele_list, frq_list
# 从TSV文件获取RSID集合
def get_rsid(tsv_files: list, high_ld_filename: str) -> set:
# TSV数据源的RSID集合
rsid_set = set()
# 获取高连锁不平衡数据
hld_list = (
get_high_ld(high_ld_filename)
if high_ld_filename != "" and high_ld_filename != None
else []
)
rsid_in_hld_count = 0
# 常染色体集合
chr_set = {str(chr) for chr in range(1, 23)}
if tsv_files != None:
for tsv_filename in tsv_files:
print("Collecting RSID in TSV file {0}...".format(tsv_filename))
with open(tsv_filename, "r", encoding="utf-8") as tsv_file:
for tsv_line in tsv_file.readlines():
if len(tsv_line) > 0 and not tsv_line.startswith(
("#", "\n", "\t", '"')
):
tsv_line_list = tsv_line.split("\t")
if (
len(tsv_line_list) >= 2
and tsv_line_list[0] not in rsid_set
and tsv_line_list[1] in chr_set
):
# 去除连锁不平衡区域的SNP
inHLD = False
for i in range(len(hld_list)):
if (
tsv_line_list[1] == hld_list[i]["chr"]
and int(tsv_line_list[2])
>= hld_list[i]["start_pos"]
and int(tsv_line_list[2]) <= hld_list[i]["end_pos"]
):
rsid_in_hld_count += 1
inHLD = True
break
if not inHLD:
rsid_set.add(tsv_line_list[0])
if len(rsid_set) > 0:
print(
"Finish collecting {:,} RSID in {}\nApart from {:,} RSID in high LD region.".format(
len(rsid_set), " ".join(tsv_files), rsid_in_hld_count
)
)
else:
raise Exception("There is no available SNP in TSV files.")
return rsid_set
# 获取连锁不平衡数据high linkage disequilibrium
# https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)
def get_high_ld(ld_filename: str) -> list:
if not os.access(ld_filename, os.F_OK):
raise Exception(
"High Linkage Disequilibrium file '{}' is not available.".format(
ld_filename
)
)
ld_list = []
with open(
ld_filename,
"r",
encoding="utf-8",
) as hd_file:
for hd_line in hd_file.readlines():
if len(hd_line) > 0 and not hd_line.startswith("#"):
hd_line_list = hd_line.split("\t")
if len(hd_line_list) == 3:
ld_list.append(
{
"chr": hd_line_list[0],
"start_pos": int(hd_line_list[1]),
"end_pos": int(hd_line_list[2]),
}
)
return ld_list