forked from jeromekelleher/sc2ts-paper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpaper.bib
2459 lines (2335 loc) · 126 KB
/
paper.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Generated by Paperpile. Check out https://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.
% Otto et al. (2021): review of SARS-CoV-2 variants of concern, Current Biology.
% Acronyms in the title are brace-protected so sentence-casing styles keep them.
@article{otto2021origins,
  author    = {Otto, Sarah P and Day, Troy and Arino, Julien and Colijn, Caroline and Dushoff, Jonathan and Li, Michael and Mechai, Samir and Van Domselaar, Gary and Wu, Jianhong and Earn, David J. D. and others},
  title     = {The origins and potential future of {SARS-CoV-2} variants of concern in the evolving {COVID-19} pandemic},
  journal   = {Current Biology},
  volume    = {31},
  number    = {14},
  pages     = {R918--R929},
  year      = {2021},
  publisher = {Elsevier}
}
% Amoutzias et al. (2022): coronavirus mutation and recombination review, Viruses.
% Acronyms in the title are brace-protected so sentence-casing styles keep them.
@article{amoutzias2022remarkable,
  author    = {Amoutzias, Grigorios D and Nikolaidis, Marios and Tryfonopoulou, Eleni and Chlichlia, Katerina and Markoulatos, Panayotis and Oliver, Stephen G},
  title     = {The remarkable evolutionary plasticity of coronaviruses by mutation and recombination: insights for the {COVID-19} pandemic and the future evolutionary paths of {SARS-CoV-2}},
  journal   = {Viruses},
  volume    = {14},
  number    = {1},
  pages     = {78},
  year      = {2022},
  publisher = {MDPI}
}
% Attwood et al. (2022): phylogenetic and phylodynamic review, Nature Reviews Genetics.
% The serial identifier is stored as an ISSN (journals have no ISBN); month follows the `da` field.
@article{Attwood2022-ab,
  author        = {Attwood, Stephen W. and Hill, Sarah C. and Aanensen, David M. and Connor, Thomas R. and Pybus, Oliver G.},
  title         = {Phylogenetic and phylodynamic approaches to understanding and combating the early {SARS-CoV-2} pandemic},
  journal       = {Nature Reviews Genetics},
  volume        = {23},
  number        = {9},
  pages         = {547--562},
  month         = sep,
  year          = {2022},
  doi           = {10.1038/s41576-022-00483-8},
  issn          = {1471-0064},
  abstract      = {Determining the transmissibility, prevalence and patterns of movement of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infections is central to our understanding of the impact of the pandemic and to the design of effective control strategies. Phylogenies (evolutionary trees) have provided key insights into the international spread of SARS-CoV-2 and enabled investigation of individual outbreaks and transmission chains in specific settings. Phylodynamic approaches combine evolutionary, demographic and epidemiological concepts and have helped track virus genetic changes, identify emerging variants and inform public health strategy. Here, we review and synthesize studies that illustrate how phylogenetic and phylodynamic techniques were applied during the first year of the pandemic, and summarize their contributions to our understanding of SARS-CoV-2 transmission and control.},
  da            = {2022/09/01},
  date-added    = {2023-06-01 15:31:08 +0100},
  date-modified = {2023-06-01 15:31:08 +0100},
  ty            = {JOUR},
  Bdsk-Url-1    = {https://doi.org/10.1038/s41576-022-00483-8}
}
% Chen et al. (2022): the CoV-Spectrum variant-tracking website, Bioinformatics.
@article{Chen2022-pz,
  author   = {Chen, Chaoran and Nadeau, Sarah and Yared, Michael and Voinov,
              Philippe and Xie, Ning and Roemer, Cornelius and Stadler, Tanja},
  title    = {{CoV-Spectrum}: analysis of globally shared {SARS-CoV-2} data to
              identify and characterize new variants},
  journal  = {Bioinformatics},
  volume   = {38},
  number   = {6},
  pages    = {1735--1737},
  month    = mar,
  year     = {2022},
  language = {en},
  abstract = {SUMMARY: The CoV-Spectrum website supports the identification of
              new SARS-CoV-2 variants of concern and the tracking of known
              variants. Its flexible amino acid and nucleotide mutation search
              allows querying of variants before they are designated by a
              lineage nomenclature system. The platform brings together
              SARS-CoV-2 data from different sources and applies analyses.
              Results include the proportion of different variants over time,
              their demographic and geographic distributions, common mutations,
              hospitalization and mortality probabilities, estimates for
              transmission fitness advantage and insights obtained from
              wastewater samples. AVAILABILITY AND IMPLEMENTATION: CoV-Spectrum
              is available at https://cov-spectrum.org. The code is released
              under the GPL-3.0 license at
              https://github.com/cevo-public/cov-spectrum-website.}
}
% Shriner et al. (2003): effect of recombination on sitewise selection tests.
@article{Shriner2003-kb,
  author   = {Shriner, Daniel and Nickle, David C and Jensen, Mark A and
              Mullins, James I},
  title    = {Potential impact of recombination on sitewise approaches for
              detecting positive natural selection},
  journal  = {Genetics Research},
  volume   = {81},
  number   = {2},
  pages    = {115--121},
  month    = apr,
  year     = {2003},
  language = {en},
  abstract = {Current sitewise methods for detecting positive selection on gene
              sequences (the de facto standard being the CODEML method (Yang et
              al., 2000)) assume no recombination. This paper presents
              simulation results indicating that violation of this assumption
              can lead to false positive detection of sites undergoing positive
              selection. Through the use of population-scaled mutation and
              recombination rates, simulations can be performed that permit the
              generation of appropriate null distributions corresponding to
              neutral expectations in the presence of recombination, thereby
              allowing for a more accurate estimation of positive selection.}
}
% Kelleher et al. (2019): whole-genome history inference at scale, Nature Genetics.
@article{Kelleher2019-ba,
  author   = {Kelleher, Jerome and Wong, Yan and Wohns, Anthony W and Fadil,
              Chaimaa and Albers, Patrick K and McVean, Gil},
  title    = {Inferring whole-genome histories in large population datasets},
  journal  = {Nature Genetics},
  volume   = {51},
  number   = {9},
  pages    = {1330--1338},
  month    = sep,
  year     = {2019},
  language = {en},
  abstract = {Inferring the full genealogical history of a set of DNA sequences
              is a core problem in evolutionary biology, because this history
              encodes information about the events and forces that have
              influenced a species. However, current methods are limited, and
              the most accurate techniques are able to process no more than a
              hundred samples. As datasets that consist of millions of genomes
              are now being collected, there is a need for scalable and
              efficient inference methods to fully utilize these resources.
              Here we introduce an algorithm that is able to not only infer
              whole-genome histories with comparable accuracy to the
              state-of-the-art but also process four orders of magnitude more
              sequences. The approach also provides an 'evolutionary encoding'
              of the data, enabling efficient calculation of relevant
              statistics. We apply the method to human data from the 1000
              Genomes Project, Simons Genome Diversity Project and UK Biobank,
              showing that the inferred genealogies are rich in biological
              signal and efficient to process.}
}
% Wohns et al. (2022): unified genealogy of modern and ancient genomes, Science.
@article{Wohns2022-th,
  author   = {Wohns, Anthony Wilder and Wong, Yan and Jeffery, Ben and Akbari,
              Ali and Mallick, Swapan and Pinhasi, Ron and Patterson, Nick and
              Reich, David and Kelleher, Jerome and McVean, Gil},
  title    = {A unified genealogy of modern and ancient genomes},
  journal  = {Science},
  volume   = {375},
  number   = {6583},
  pages    = {eabi8264},
  month    = feb,
  year     = {2022},
  language = {en},
  abstract = {The sequencing of modern and ancient genomes from around the
              world has revolutionized our understanding of human history and
              evolution. However, the problem of how best to characterize
              ancestral relationships from the totality of human genomic
              variation remains unsolved. Here, we address this challenge with
              nonparametric methods that enable us to infer a unified genealogy
              of modern and ancient humans. This compact representation of
              multiple datasets explores the challenges of missing and
              erroneous data and uses ancient samples to constrain and date
              relationships. We demonstrate the power of the method to recover
              relationships between individuals and populations as well as to
              identify descendants of ancient samples. Finally, we introduce a
              simple nonparametric estimator of the geographical location of
              ancestors that recapitulates key events in human history.}
}
% Schaefer et al. (2021): ARG of human and archaic hominin genomes, Science Advances.
% NOTE(review): pages holds the article number (no page range); confirm against the publisher record.
@article{Schaefer2021-yg,
  author   = {Schaefer, Nathan K and Shapiro, Beth and Green, Richard E},
  title    = {An ancestral recombination graph of human, {Neanderthal}, and
              {Denisovan} genomes},
  journal  = {Science Advances},
  volume   = {7},
  number   = {29},
  pages    = {eabc0776},
  month    = jul,
  year     = {2021},
  language = {en},
  abstract = {Many humans carry genes from Neanderthals, a legacy of past
              admixture. Existing methods detect this archaic hominin ancestry
              within human genomes using patterns of linkage disequilibrium or
              direct comparison to Neanderthal genomes. Each of these methods
              is limited in sensitivity and scalability. We describe a new
              ancestral recombination graph inference algorithm that scales to
              large genome-wide datasets and demonstrate its accuracy on real
              and simulated data. We then generate a genome-wide ancestral
              recombination graph including human and archaic hominin genomes.
              From this, we generate a map within human genomes of archaic
              ancestry and of genomic regions not shared with archaic hominins
              either by admixture or incomplete lineage sorting. We find that
              only 1.5 to 7\% of the modern human genome is uniquely human. We
              also find evidence of multiple bursts of adaptive changes
              specific to modern humans within the past 600,000 years involving
              genes related to brain development and function.}
}
% Carabelli et al. (2023): SARS-CoV-2 variant biology review, Nature Reviews Microbiology.
% NOTE(review): volume/number/pages replace the online-first placeholder "1--16"; confirm against the publisher record.
@article{Carabelli2023-tb,
  author   = {Carabelli, Alessandro M and Peacock, Thomas P and Thorne, Lucy G
              and Harvey, William T and Hughes, Joseph and {COVID-19 Genomics
              UK Consortium} and Peacock, Sharon J and Barclay, Wendy S and de
              Silva, Thushan I and Towers, Greg J and Robertson, David L},
  title    = {{SARS-CoV-2} variant biology: immune escape, transmission and
              fitness},
  journal  = {Nature Reviews Microbiology},
  volume   = {21},
  number   = {3},
  pages    = {162--177},
  month    = mar,
  year     = {2023},
  language = {en},
  abstract = {In late 2020, after circulating for almost a year in the human
              population, severe acute respiratory syndrome coronavirus 2
              (SARS-CoV-2) exhibited a major step change in its adaptation to
              humans. These highly mutated forms of SARS-CoV-2 had enhanced
              rates of transmission relative to previous variants and were
              termed 'variants of concern' (VOCs). Designated Alpha, Beta,
              Gamma, Delta and Omicron, the VOCs emerged independently from one
              another, and in turn each rapidly became dominant, regionally or
              globally, outcompeting previous variants. The success of each VOC
              relative to the previously dominant variant was enabled by
              altered intrinsic functional properties of the virus and, to
              various degrees, changes to virus antigenicity conferring the
              ability to evade a primed immune response. The increased virus
              fitness associated with VOCs is the result of a complex interplay
              of virus biology in the context of changing human immunity due to
              both vaccination and prior infection. In this Review, we
              summarize the literature on the relative transmissibility and
              antigenicity of SARS-CoV-2 variants, the role of mutations at the
              furin spike cleavage site and of non-spike proteins, the
              potential importance of recombination to virus success, and
              SARS-CoV-2 evolution in the context of T cells, innate immunity
              and population immunity. SARS-CoV-2 shows a complicated
              relationship among virus antigenicity, transmission and
              virulence, which has unpredictable implications for the future
              trajectory and disease burden of COVID-19.}
}
% Michener and Sokal (1957): quantitative classification paper in Evolution.
@article{Michener1957-tr,
  author  = {Michener, Charles D and Sokal, Robert R},
  title   = {A Quantitative Approach To A Problem In Classification},
  journal = {Evolution},
  volume  = {11},
  number  = {2},
  pages   = {130--162},
  month   = jun,
  year    = {1957}
}
% Tamura et al. (2023): virological characteristics of the XBB recombinant, Nature Communications.
% The consortium is wrapped in its own brace group so BibTeX treats it as one
% corporate name instead of splitting it into given names plus surname "Consortium".
@article{Tamura2023-ab,
  author  = {Tamura, Tomokazu and Ito, Jumpei and Uriu, Keiya and Zahradnik,
             Jiri and Kida, Izumi and Anraku, Yuki and Nasser, Hesham and Shofa,
             Maya and Oda, Yoshitaka and Lytras, Spyros and Nao, Naganori and
             Itakura, Yukari and Deguchi, Sayaka and Suzuki, Rigel and Wang, Lei
             and Begum, MST Monira and Kita, Shunsuke and Yajima, Hisano and
             Sasaki, Jiei and Sasaki-Tabata, Kaori and Shimizu, Ryo and Tsuda,
             Masumi and Kosugi, Yusuke and Fujita, Shigeru and Pan, Lin and Sauter,
             Daniel and Yoshimatsu, Kumiko and Suzuki, Saori and Asakura, Hiroyuki
             and Nagashima, Mami and Sadamasu, Kenji and Yoshimura, Kazuhisa and
             Yamamoto, Yuki and Nagamoto, Tetsuharu and Schreiber, Gideon and
             Maenaka, Katsumi and Ito, Hayato and Misawa, Naoko and Kimura,
             Izumi and Suganami, Mai and Chiba, Mika and Yoshimura, Ryo and Yasuda,
             Kyoko and Iida, Keiko and Ohsumi, Naomi and Strange, Adam P. and
             Takahashi, Otowa and Ichihara, Kimiko and Shibatani, Yuki and
             Nishiuchi, Tomoko and Kato, Marie and Ferdous, Zannatul and Mouri,
             Hiromi and Shishido, Kenji and Sawa, Hirofumi and Hashimoto, Rina and
             Watanabe, Yukio and Sakamoto, Ayaka and Yasuhara, Naoko and Suzuki,
             Tateki and Kimura, Kanako and Nakajima, Yukari and Nakagawa, So and Wu,
             Jiaqi and Shirakawa, Kotaro and Takaori-Kondo, Akifumi and Nagata,
             Kayoko and Kazuma, Yasuhiro and Nomura, Ryosuke and Horisawa, Yoshihito
             and Tashiro, Yusuke and Kawai, Yugo and Irie, Takashi and Kawabata,
             Ryoko and Motozono, Chihiro and Toyoda, Mako and Ueno, Takamasa and
             Hashiguchi, Takao and Ikeda, Terumasa and Fukuhara, Takasuke and
             Saito, Akatsuki and Tanaka, Shinya and Matsuno, Keita and Takayama,
             Kazuo and Sato, Kei and {The Genotype to Phenotype Japan (G2P-Japan)
             Consortium}},
  title   = {Virological characteristics of the {SARS-CoV-2} {XBB} variant
             derived from recombination of two {Omicron} subvariants},
  journal = {Nature Communications},
  volume  = {14},
  number  = {1},
  pages   = {2800},
  year    = {2023},
}
% Turakhia et al. (2022): pandemic-scale search for SARS-CoV-2 recombinants, Nature.
@article{Turakhia2022-it,
  author   = {Turakhia, Yatish and Thornlow, Bryan and Hinrichs, Angie and
              McBroome, Jakob and Ayala, Nicolas and Ye, Cheng and Smith, Kyle
              and De Maio, Nicola and Haussler, David and Lanfear, Robert and
              Corbett-Detig, Russell},
  title    = {Pandemic-scale phylogenomics reveals the {SARS-CoV-2}
              recombination landscape},
  journal  = {Nature},
  volume   = {609},
  number   = {7929},
  pages    = {994--997},
  month    = sep,
  year     = {2022},
  language = {en},
  abstract = {Accurate and timely detection of recombinant lineages is crucial
              for interpreting genetic variation, reconstructing epidemic
              spread, identifying selection and variants of interest, and
              accurately performing phylogenetic analyses. During the
              SARS-CoV-2 pandemic, genomic data generation has exceeded the
              capacities of existing analysis platforms, thereby crippling
              real-time analysis of viral evolution. Here, we use a new
              phylogenomic method to search a nearly comprehensive SARS-CoV-2
              phylogeny for recombinant lineages. In a 1.6 million sample tree
              from May 2021, we identify 589 recombination events, which
              indicate that around 2.7\% of sequenced SARS-CoV-2 genomes have
              detectable recombinant ancestry. Recombination breakpoints are
              inferred to occur disproportionately in the 3' portion of the
              genome that contains the spike protein. Our results highlight the
              need for timely analyses of recombination for pinpointing the
              emergence of recombinant lineages with the potential to increase
              transmissibility or virulence of the virus. We anticipate that
              this approach will empower comprehensive real-time tracking of
              viral recombination during the SARS-CoV-2 pandemic and beyond.}
}
% Scornavacca et al. (2011): tanglegrams for rooted trees and networks, Bioinformatics.
% NOTE(review): the abstract text ends mid-sentence in this record.
@article{Scornavacca2011-mg,
  author    = {Scornavacca, Celine and Zickmann, Franziska and Huson, Daniel H},
  title     = {Tanglegrams for rooted phylogenetic trees and networks},
  journal   = {Bioinformatics},
  publisher = {Oxford Academic},
  volume    = {27},
  number    = {13},
  pages     = {i248--i256},
  month     = jun,
  year      = {2011},
  language  = {en},
  abstract  = {Abstract. Motivation: In systematic biology, one is often faced
               with the task of comparing different phylogenetic trees, in
               particular in multi-gene analysis or}
}
% Shu and McCauley (2017): the GISAID initiative, Eurosurveillance.
% NOTE(review): pages holds the article number (no page range); confirm against the publisher record.
@article{Shu2017-hp,
  author  = {Shu, Yuelong and McCauley, John},
  title   = {{GISAID}: Global initiative on sharing all influenza data -- from
             vision to reality},
  journal = {Eurosurveillance},
  volume  = {22},
  number  = {13},
  pages   = {30494},
  year    = {2017}
}
% Delaneau et al. (2019): the SHAPEIT4 haplotype estimation method, Nature Communications.
@article{Delaneau2019-wl,
  author   = {Delaneau, Olivier and Zagury, Jean-Fran{\c c}ois and Robinson,
              Matthew R and Marchini, Jonathan L and Dermitzakis, Emmanouil T},
  title    = {Accurate, scalable and integrative haplotype estimation},
  journal  = {Nature Communications},
  volume   = {10},
  number   = {1},
  pages    = {5436},
  month    = nov,
  year     = {2019},
  language = {en},
  abstract = {The number of human genomes being genotyped or sequenced
              increases exponentially and efficient haplotype estimation
              methods able to handle this amount of data are now required. Here
              we present a method, SHAPEIT4, which substantially improves upon
              other methods to process large genotype and high coverage
              sequencing datasets. It notably exhibits sub-linear running times
              with sample size, provides highly accurate haplotypes and allows
              integrating external phasing information such as large reference
              panels of haplotypes, collections of pre-phased variants and long
              sequencing reads. We provide SHAPEIT4 in an open source format
              and demonstrate its performance in terms of accuracy and running
              times on two gold standard datasets: the UK Biobank data and the
              Genome In A Bottle.}
}
% Browning et al. (2021): two-stage phasing as implemented in Beagle 5.2, AJHG.
@article{Browning2021-cg,
  author   = {Browning, Brian L and Tian, Xiaowen and Zhou, Ying and Browning,
              Sharon R},
  title    = {Fast two-stage phasing of large-scale sequence data},
  journal  = {American Journal of Human Genetics},
  volume   = {108},
  number   = {10},
  pages    = {1880--1890},
  month    = oct,
  year     = {2021},
  keywords = {TOPMed; UK Biobank; genotype phasing; haplotype phasing; phasing},
  language = {en},
  abstract = {Haplotype phasing is the estimation of haplotypes from genotype
              data. We present a fast, accurate, and memory-efficient haplotype
              phasing method that scales to large-scale SNP array and sequence
              data. The method uses marker windowing and composite reference
              haplotypes to reduce memory usage and computation time. It
              incorporates a progressive phasing algorithm that identifies
              confidently phased heterozygotes in each iteration and fixes the
              phase of these heterozygotes in subsequent iterations. For data
              with many low-frequency variants, such as whole-genome sequence
              data, the method employs a two-stage phasing algorithm that
              phases high-frequency markers via progressive phasing in the
              first stage and phases low-frequency markers via genotype
              imputation in the second stage. This haplotype phasing method is
              implemented in the open-source Beagle 5.2 software package. We
              compare Beagle 5.2 and SHAPEIT 4.2.1 by using expanding subsets
              of 485,301 UK Biobank samples and 38,387 TOPMed samples. Both
              methods have very similar accuracy and computation time for UK
              Biobank SNP array data. However, for TOPMed sequence data, Beagle
              is more than 20 times faster than SHAPEIT, achieves similar
              accuracy, and scales to larger sample sizes.}
}
% Rambaut et al. (2020): dynamic lineage nomenclature for SARS-CoV-2, Nature Microbiology.
@article{Rambaut2020-dw,
  author   = {Rambaut, Andrew and Holmes, Edward C and O'Toole, {\'A}ine and
              Hill, Verity and McCrone, John T and Ruis, Christopher and du
              Plessis, Louis and Pybus, Oliver G},
  title    = {A dynamic nomenclature proposal for {SARS-CoV-2} lineages to
              assist genomic epidemiology},
  journal  = {Nature Microbiology},
  volume   = {5},
  number   = {11},
  pages    = {1403--1407},
  month    = nov,
  year     = {2020},
  language = {en},
  abstract = {The ongoing pandemic spread of a new human coronavirus,
              SARS-CoV-2, which is associated with severe pneumonia/disease
              (COVID-19), has resulted in the generation of tens of thousands
              of virus genome sequences. The rate of genome generation is
              unprecedented, yet there is currently no coherent nor accepted
              scheme for naming the expanding phylogenetic diversity of
              SARS-CoV-2. Here, we present a rational and dynamic virus
              nomenclature that uses a phylogenetic framework to identify those
              lineages that contribute most to active spread. Our system is
              made tractable by constraining the number and depth of
              hierarchical lineage labels and by flagging and delabelling virus
              lineages that become unobserved and hence are probably inactive.
              By focusing on active virus lineages and those spreading to new
              locations, this nomenclature will assist in tracking and
              understanding the patterns and determinants of the global spread
              of SARS-CoV-2.}
}
% Baumdicker et al. (2022): the msprime 1.0 simulator, Genetics.
% NOTE(review): pages holds the article number (no page range); confirm against the publisher record.
@article{Baumdicker2022-ep,
  author   = {Baumdicker, Franz and Bisschop, Gertjan and Goldstein, Daniel and
              Gower, Graham and Ragsdale, Aaron P and Tsambos, Georgia and Zhu,
              Sha and Eldon, Bjarki and Ellerman, E Castedo and Galloway, Jared
              G and Gladstein, Ariella L and Gorjanc, Gregor and Guo, Bing and
              Jeffery, Ben and Kretzschumar, Warren W and Lohse, Konrad and
              Matschiner, Michael and Nelson, Dominic and Pope, Nathaniel S and
              Quinto-Cort{\'e}s, Consuelo D and Rodrigues, Murillo F and
              Saunack, Kumar and Sellinger, Thibaut and Thornton, Kevin and van
              Kemenade, Hugo and Wohns, Anthony W and Wong, Yan and Gravel,
              Simon and Kern, Andrew D and Koskela, Jere and Ralph, Peter L and
              Kelleher, Jerome},
  title    = {Efficient ancestry and mutation simulation with msprime 1.0},
  journal  = {Genetics},
  volume   = {220},
  number   = {3},
  pages    = {iyab229},
  month    = mar,
  year     = {2022},
  keywords = {Ancestral Recombination Graphs; coalescent; mutations; simulation},
  language = {en},
  abstract = {Stochastic simulation is a key tool in population genetics, since
              the models involved are often analytically intractable and
              simulation is usually the only way of obtaining ground-truth data
              to evaluate inferences. Because of this, a large number of
              specialized simulation programs have been developed, each filling
              a particular niche, but with largely overlapping functionality
              and a substantial duplication of effort. Here, we introduce
              msprime version 1.0, which efficiently implements ancestry and
              mutation simulations based on the succinct tree sequence data
              structure and the tskit library. We summarize msprime's many
              features, and show that its performance is excellent, often many
              times faster and more memory efficient than specialized
              alternatives. These high-performance features have been
              thoroughly tested and validated, and built using a collaborative,
              open source development model, which reduces duplication of
              effort and promotes software quality via community engagement.}
}
% Virological.org post (2021): provisional Pango rules for naming recombinant lineages.
% The record has no author field; the access date in `note` is written in ISO form.
@misc{noauthor_2021-kd,
  title        = {Pango Lineage Nomenclature: {Provisional} rules for naming
                  recombinant lineages},
  booktitle    = {Virological},
  howpublished = {\url{https://virological.org/t/pango-lineage-nomenclature-provisional-rules-for-naming-recombinant-lineages/657}},
  month        = mar,
  year         = {2021},
  note         = {Accessed: 2023-04-04},
  language     = {en},
  abstract     = {The rules outlined below constitute a provisional naming
                  convention for identifiable SARS-CoV-2 recombinant lineages
                  within the Pango dynamic nomenclature system
                  (https://doi.org/10.1038/s41564-020-0770-5). These rules are
                  being considered for ratification by the Pango nomenclature
                  committee. NOTE: New lineage names are designated by the
                  Pango committee and not by individual researchers or groups.
                  If you would like to make a lineage suggestion, please read
                  the nomenclature documentation and sub...}
}
% Chen et al. (2021): the COVID-19 CG mutation and lineage tracking resource, eLife.
@article{Chen2021-zc,
  author   = {Chen, Albert Tian and Altschuler, Kevin and Zhan, Shing Hei and
              Chan, Yujia Alina and Deverman, Benjamin E},
  title    = {{COVID-19} {CG} enables {SARS-CoV-2} mutation and lineage
              tracking by locations and dates of interest},
  journal  = {eLife},
  volume   = {10},
  pages    = {e63409},
  month    = feb,
  year     = {2021},
  keywords = {COVID-19; SARS-CoV-2; browser; epidemiology; evolutionary
              biology; global health; mutation tracking; pandemic; resource;
              virus},
  language = {en},
  abstract = {COVID-19 CG (covidcg.org) is an open resource for tracking
              SARS-CoV-2 single-nucleotide variations (SNVs), lineages, and
              clades using the virus genomes on the GISAID database while
              filtering by location, date, gene, and mutation of interest.
              COVID-19 CG provides significant time, labor, and cost-saving
              utility to projects on SARS-CoV-2 transmission, evolution,
              diagnostics, therapeutics, vaccines, and intervention tracking.
              Here, we describe case studies in which users can interrogate (1)
              SNVs in the SARS-CoV-2 spike receptor binding domain (RBD) across
              different geographical regions to inform the design and testing
              of therapeutics, (2) SNVs that may impact the sensitivity of
              commonly used diagnostic primers, and (3) the emergence of a
              dominant lineage harboring an S477N RBD mutation in Australia in
              2020. To accelerate COVID-19 efforts, COVID-19 CG will be
              upgraded with new features for users to rapidly pinpoint
              mutations as the virus evolves throughout the pandemic and in
              response to therapeutic and public health interventions.}
}
% Kim et al. (2020): the SARS-CoV-2 transcriptome and epitranscriptome map, Cell.
@article{Kim2020-gt,
  author   = {Kim, Dongwan and Lee, Joo-Yeon and Yang, Jeong-Sun and Kim, Jun
              Won and Kim, V Narry and Chang, Hyeshik},
  title    = {The Architecture of {SARS-CoV-2} Transcriptome},
  journal  = {Cell},
  volume   = {181},
  number   = {4},
  pages    = {914--921},
  month    = may,
  year     = {2020},
  keywords = {COVID-19; RNA modification; SARS-CoV-2; coronavirus; direct RNA
              sequencing; discontinuous transcription; epitranscriptome;
              nanopore; poly(A) tail; transcriptome},
  language = {en},
  abstract = {SARS-CoV-2 is a betacoronavirus responsible for the COVID-19
              pandemic. Although the SARS-CoV-2 genome was reported recently,
              its transcriptomic architecture is unknown. Utilizing two
              complementary sequencing techniques, we present a high-resolution
              map of the SARS-CoV-2 transcriptome and epitranscriptome. DNA
              nanoball sequencing shows that the transcriptome is highly
              complex owing to numerous discontinuous transcription events. In
              addition to the canonical genomic and 9 subgenomic RNAs,
              SARS-CoV-2 produces transcripts encoding unknown ORFs with
              fusion, deletion, and/or frameshift. Using nanopore direct RNA
              sequencing, we further find at least 41 RNA modification sites on
              viral transcripts, with the most frequent motif, AAGAA. Modified
              RNAs have shorter poly(A) tails than unmodified RNAs, suggesting
              a link between the modification and the 3' tail. Functional
              investigation of the unknown transcripts and RNA modifications
              discovered in this study will open new directions to our
              understanding of the life cycle and pathogenicity of SARS-CoV-2.}
}
% Gusfield (2014): ReCombinatorics, MIT Press book on ancestral recombination graphs.
% No abstract is stored: the exported abstract described an unrelated book and was dropped.
@book{Gusfield2014-qw,
  author    = {Gusfield, Dan},
  title     = {{ReCombinatorics}: {The} Algorithmics of Ancestral Recombination
               Graphs and Explicit Phylogenetic Networks},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  month     = jul,
  year      = {2014},
  language  = {en}
}
% McBroome et al. (2021): daily-updated SARS-CoV-2 mutation-annotated trees and matUtils, MBE.
@article{McBroome2021-zm,
  author   = {McBroome, Jakob and Thornlow, Bryan and Hinrichs, Angie S and
              Kramer, Alexander and De Maio, Nicola and Goldman, Nick and
              Haussler, David and Corbett-Detig, Russell and Turakhia, Yatish},
  title    = {A {Daily-Updated} Database and Tools for Comprehensive
              {SARS-CoV-2} {Mutation-Annotated} Trees},
  journal  = {Molecular Biology and Evolution},
  volume   = {38},
  number   = {12},
  pages    = {5819--5824},
  month    = dec,
  year     = {2021},
  keywords = {COVID-19; SARS-CoV-2 phylogenetics; genomic surveillance},
  language = {en},
  abstract = {The vast scale of SARS-CoV-2 sequencing data has made it
              increasingly challenging to comprehensively analyze all available
              data using existing tools and file formats. To address this, we
              present a database of SARS-CoV-2 phylogenetic trees inferred with
              unrestricted public sequences, which we update daily to
              incorporate new sequences. Our database uses the recently
              proposed mutation-annotated tree (MAT) format to efficiently
              encode the tree with branches labeled with parsimony-inferred
              mutations, as well as Nextstrain clade and Pango lineage labels
              at clade roots. As of June 9, 2021, our SARS-CoV-2 MAT consists
              of 834,521 sequences and provides a comprehensive view of the
              virus' evolutionary history using public data. We also present
              matUtils-a command-line utility for rapidly querying,
              interpreting, and manipulating the MATs. Our daily-updated
              SARS-CoV-2 MAT database and matUtils software are available at
              http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER\_SARS-CoV-2/
              and https://github.com/yatisht/usher, respectively.}
}
% Varabyou et al. (2021): Bolotie, a rapid inter-clade recombination detector for SARS-CoV-2.
% The pages field carries the article identifier; this article has no page range.
@ARTICLE{Varabyou2021-rw,
  title    = "Rapid detection of inter-clade recombination in {SARS-CoV-2} with
              {Bolotie}",
  author   = "Varabyou, Ales and Pockrandt, Christopher and Salzberg, Steven L
              and Pertea, Mihaela",
  abstract = "The ability to detect recombination in pathogen genomes is
              crucial to the accuracy of phylogenetic analysis and consequently
              to forecasting the spread of infectious diseases and to
              developing therapeutics and public health policies. However, in
              case of the SARS-CoV-2, the low divergence of near-identical
              genomes sequenced over a short period of time makes conventional
              analysis infeasible. Using a novel method, we identified 225
              anomalous SARS-CoV-2 genomes of likely recombinant origins out of
              the first 87,695 genomes to be released, several of which have
              persisted in the population. Bolotie is specifically designed to
              perform a rapid search for inter-clade recombination events over
              extremely large datasets, facilitating analysis of novel isolates
              in seconds. In cases where raw sequencing data were available, we
              were able to rule out the possibility that these samples
              represented co-infections by analyzing the underlying sequence
              reads. The Bolotie software and other data from our study are
              available at https://github.com/salzberg-lab/bolotie.",
  journal  = "Genetics",
  volume   = 218,
  number   = 3,
  pages    = "iyab074",
  month    = jul,
  year     = 2021,
  keywords = "COVID-19; SARS-CoV-2; coronavirus; recombination",
  language = "en"
}
% Jungreis, Sealfon and Kellis (2021): comparative-genomics gene set for SARS-CoV-2.
% Nature Communications has no page ranges, so pages holds the article number.
% Sarbecovirus is a taxon name and is brace-protected against sentence-casing styles.
@ARTICLE{Jungreis2021-dh,
  title     = "{SARS-CoV-2} gene content and {COVID-19} mutation impact by
               comparing 44 {Sarbecovirus} genomes",
  author    = "Jungreis, Irwin and Sealfon, Rachel and Kellis, Manolis",
  abstract  = "Despite its clinical importance, the SARS-CoV-2 gene set remains
               unresolved, hindering dissection of COVID-19 biology. We use
               comparative genomics to provide a high-confidence protein-coding
               gene set, characterize evolutionary constraint, and prioritize
               functional mutations. We select 44 Sarbecovirus genomes at
               ideally-suited evolutionary distances, and quantify
               protein-coding evolutionary signatures and overlapping
               constraint. We find strong protein-coding signatures for ORFs
               3a, 6, 7a, 7b, 8, 9b, and a novel alternate-frame gene, ORF3c,
               whereas ORFs 2b, 3d/3d-2, 3b, 9c, and 10 lack protein-coding
               signatures or convincing experimental evidence of protein-coding
               function. Furthermore, we show no other conserved protein-coding
               genes remain to be discovered. Mutation analysis suggests ORF8
               contributes to within-individual fitness but not
               person-to-person transmission. Cross-strain and within-strain
               evolutionary pressures agree, except for fewer-than-expected
               within-strain mutations in nsp3 and S1, and more-than-expected
               in nucleocapsid, which shows a cluster of mutations in a
               predicted B-cell epitope, suggesting immune-avoidance selection.
               Evolutionary histories of residues disrupted by spike-protein
               substitutions D614G, N501Y, E484K, and K417N/T provide clues
               about their biology, and we catalog likely-functional
               co-inherited mutations. Previously reported RNA-modification
               sites show no enrichment for conservation. Here we report a
               high-confidence gene set and evolutionary-history annotations
               providing valuable resources and insights on SARS-CoV-2 biology,
               mutations, and evolution. The SARS-CoV-2 gene set remains
               unresolved, hindering dissection of COVID-19 biology. Comparing
               44 Sarbecovirus genomes provides a high-confidence
               protein-coding gene set. The study characterizes protein-level
               and nucleotide-level evolutionary constraints, and prioritizes
               functional mutations from the ongoing COVID-19 pandemic.",
  journal   = "Nature Communications",
  publisher = "Nature Publishing Group",
  volume    = 12,
  number    = 1,
  pages     = "2642",
  month     = may,
  year      = 2021,
  language  = "en"
}
% Zhang et al. (2023): biobank-scale ARG inference for complex-trait analysis.
% Laid out like the neighbouring entries; publisher and location are separate fields.
@ARTICLE{Zhang2023-lf,
  title     = "Biobank-scale inference of ancestral recombination graphs
               enables genealogical analysis of complex traits",
  author    = "Zhang, Brian C and Biddanda, Arjun and Gunnarsson, {\'A}rni
               Freyr and Cooper, Fergus and Palamara, Pier Francesco",
  journal   = "Nature Genetics",
  publisher = "Nature Publishing Group",
  address   = "New York, NY",
  volume    = 55,
  number    = 5,
  pages     = "768--776",
  month     = may,
  year      = 2023,
  language  = "en"
}
% Minichiello and Durbin (2006): heuristic ARG inference applied to trait-locus mapping.
@article{Minichiello2006-dp,
  author   = {Minichiello, Mark J and Durbin, Richard},
  title    = {Mapping trait loci by use of inferred ancestral recombination
              graphs},
  journal  = {American Journal of Human Genetics},
  year     = {2006},
  month    = nov,
  volume   = {79},
  number   = {5},
  pages    = {910--922},
  language = {en},
  abstract = {Large-scale association studies are being undertaken with the
              hope of uncovering the genetic determinants of complex disease.
              We describe a computationally efficient method for inferring
              genealogies from population genotype data and show how these
              genealogies can be used to fine map disease loci and interpret
              association signals. These genealogies take the form of the
              ancestral recombination graph (ARG). The ARG defines a
              genealogical tree for each locus, and, as one moves along the
              chromosome, the topologies of consecutive trees shift according
              to the impact of historical recombination events. There are two
              stages to our analysis. First, we infer plausible ARGs, using a
              heuristic algorithm, which can handle unphased and missing data
              and is fast enough to be applied to large-scale studies. Second,
              we test the genealogical tree at each locus for a clustering of
              the disease cases beneath a branch, suggesting that a causative
              mutation occurred on that branch. Since the true ARG is unknown,
              we average this analysis over an ensemble of inferred ARGs. We
              have characterized the performance of our method across a wide
              range of simulated disease models. Compared with simpler tests,
              our method gives increased accuracy in positioning untyped
              causative loci and can also be used to estimate the frequencies
              of untyped causative alleles. We have applied our method to Ueda
              et al.'s association study of CTLA4 and Graves disease, showing
              how it can be used to dissect the association signal, giving
              potentially interesting results of allelic heterogeneity and
              interaction. Similar approaches analyzing an ensemble of ARGs
              inferred using our method may be applicable to many other
              problems of inference from population genotype data.}
}
% Kelleher et al. (2018): tree-sequence recording and simplification for forward-time simulation.
% The pages field holds the PLoS article identifier.
@article{Kelleher2018-xc,
  author   = {Kelleher, Jerome and Thornton, Kevin R and Ashander, Jaime and
              Ralph, Peter L},
  title    = {Efficient pedigree recording for fast population genetics
              simulation},
  journal  = {PLoS Computational Biology},
  year     = {2018},
  month    = nov,
  volume   = {14},
  number   = {11},
  pages    = {e1006581},
  language = {en},
  abstract = {In this paper we describe how to efficiently record the entire
              genetic history of a population in forwards-time,
              individual-based population genetics simulations with arbitrary
              breeding models, population structure and demography. This
              approach dramatically reduces the computational burden of
              tracking individual genomes by allowing us to simulate only those
              loci that may affect reproduction (those having non-neutral
              variants). The genetic history of the population is recorded as a
              succinct tree sequence as introduced in the software package
              msprime, on which neutral mutations can be quickly placed
              afterwards. Recording the results of each breeding event requires
              storage that grows linearly with time, but there is a great deal
              of redundancy in this information. We solve this storage problem
              by providing an algorithm to quickly 'simplify' a tree sequence
              by removing this irrelevant history for a given set of genomes.
              By periodically simplifying the history with respect to the
              extant population, we show that the total storage space required
              is modest and overall large efficiency gains can be made over
              classical forward-time simulations. We implement a
              general-purpose framework for recording and simplifying
              genealogical data, which can be used to make simulations of any
              population model more efficient. We modify two popular
              forwards-time simulation frameworks to use this new approach and
              observe efficiency gains in large, whole-genome simulations of
              one to two orders of magnitude. In addition to speed, our method
              for recording pedigrees has several advantages: (1) All marginal
              genealogies of the simulated individuals are recorded, rather
              than just genotypes. (2) A population of N individuals with M
              polymorphic sites can be stored in O(N log N + M) space, making
              it feasible to store a simulation's entire final generation as
              well as its history. (3) A simulation can easily be initialized
              with a more efficient coalescent simulation of deep history. The
              software for recording and processing tree sequences is named
              tskit.}
}
% Turakhia et al. (2021): UShER, fast placement of new samples onto an existing phylogeny.
% The title capitals that spell out the acronym are brace-protected so that
% sentence-casing bibliography styles do not flatten them.
@ARTICLE{Turakhia2021-ur,
  title    = "Ultrafast {Sample} placement on {Existing} {tRees} ({UShER}) enables
              real-time phylogenetics for the {SARS-CoV-2} pandemic",
  author   = "Turakhia, Yatish and Thornlow, Bryan and Hinrichs, Angie S and De
              Maio, Nicola and Gozashti, Landen and Lanfear, Robert and
              Haussler, David and Corbett-Detig, Russell",
  abstract = "As the SARS-CoV-2 virus spreads through human populations, the
              unprecedented accumulation of viral genome sequences is ushering
              in a new era of 'genomic contact tracing'---that is, using viral
              genomes to trace local transmission dynamics. However, because
              the viral phylogeny is already so large---and will undoubtedly grow
              many fold---placing new sequences onto the tree has emerged as a
              barrier to real-time genomic contact tracing. Here, we resolve
              this challenge by building an efficient tree-based data structure
              encoding the inferred evolutionary history of the virus. We
              demonstrate that our approach greatly improves the speed of
              phylogenetic placement of new samples and data visualization,
              making it possible to complete the placements under the
              constraints of real-time contact tracing. Thus, our method
              addresses an important need for maintaining a fully updated
              reference phylogeny. We make these tools available to the
              research community through the University of California Santa
              Cruz SARS-CoV-2 Genome Browser to enable rapid cross-referencing
              of information in new virus sequences with an ever-expanding
              array of molecular and structural biology data. The methods
              described here will empower research and genomic contact tracing
              for SARS-CoV-2 specifically for laboratories worldwide.",
  journal  = "Nature Genetics",
  volume   = 53,
  number   = 6,
  pages    = "809--816",
  month    = jun,
  year     = 2021,
  language = "en"
}
% De Klerk et al. (2022): recombination break-point patterns shared across coronavirus subgenera.
% The pages field holds the Virus Evolution article identifier.
@article{De_Klerk2022-tt,
  author   = {De Klerk, Arn{\'e} and Swanepoel, Phillip and Lourens, Rentia and
              Zondo, Mpumelelo and Abodunran, Isaac and Lytras, Spyros and
              MacLean, Oscar A and Robertson, David and Kosakovsky Pond, Sergei
              L and Zehr, Jordan D and Kumar, Venkatesh and Stanhope, Michael J
              and Harkins, Gordon and Murrell, Ben and Martin, Darren P},
  title    = {Conserved recombination patterns across coronavirus subgenera},
  journal  = {Virus Evolution},
  year     = {2022},
  month    = jun,
  volume   = {8},
  number   = {2},
  pages    = {veac054},
  keywords = {Coronavirus; Evolution; Phylogenetics; Recombination; Selection},
  language = {en},
  abstract = {Recombination contributes to the genetic diversity found in
              coronaviruses and is known to be a prominent mechanism whereby
              they evolve. It is apparent, both from controlled experiments and
              in genome sequences sampled from nature, that patterns of
              recombination in coronaviruses are non-random and that this is
              likely attributable to a combination of sequence features that
              favour the occurrence of recombination break points at specific
              genomic sites, and selection disfavouring the survival of
              recombinants within which favourable intra-genome interactions
              have been disrupted. Here we leverage available whole-genome
              sequence data for six coronavirus subgenera to identify specific
              patterns of recombination that are conserved between multiple
              subgenera and then identify the likely factors that underlie
              these conserved patterns. Specifically, we confirm the
              non-randomness of recombination break points across all six
              tested coronavirus subgenera, locate conserved recombination hot-
              and cold-spots, and determine that the locations of
              transcriptional regulatory sequences are likely major
              determinants of conserved recombination break-point hotspot
              locations. We find that while the locations of recombination
              break points are not uniformly associated with degrees of
              nucleotide sequence conservation, they display significant
              tendencies in multiple coronavirus subgenera to occur in low
              guanine-cytosine content genome regions, in non-coding regions,
              at the edges of genes, and at sites within the Spike gene that
              are predicted to be minimally disruptive of Spike protein
              folding. While it is apparent that sequence features such as
              transcriptional regulatory sequences are likely major
              determinants of where the template-switching events that yield
              recombination break points most commonly occur, it is evident
              that selection against misfolded recombinant proteins also
              strongly impacts observable recombination break-point
              distributions in coronavirus genomes sampled from nature.}
}
% Wang and Rannala (2008): full-likelihood MCMC estimation of fine-scale recombination rates.
@ARTICLE{Wang2008-eq,
  title    = "Bayesian inference of fine-scale recombination rates using
              population genomic data",
  author   = "Wang, Ying and Rannala, Bruce",
  abstract = "Recently, several statistical methods for estimating fine-scale
              recombination rates using population samples have been developed.
              However, currently available methods that can be applied to
              large-scale data are limited to approximated likelihoods. Here,
              we developed a full-likelihood Markov chain Monte Carlo method
              for estimating recombination rate under a Bayesian framework.
              Genealogies underlying a sampling of chromosomes are effectively
              modelled by using marginal individual single nucleotide
              polymorphism genealogies related through an ancestral
              recombination graph. The method is compared with two existing
              composite-likelihood methods using simulated data. Simulation
              studies show that our method performs well for different
              simulation scenarios. The method is applied to two human
              population genetic variation datasets that have been studied by
              sperm typing. Our results are consistent with the estimates from
              sperm crossover analysis.",
  journal  = "Philosophical Transactions of the Royal Society of London B: Biological Sciences",
  volume   = 363,
  number   = 1512,
  pages    = "3921--3930",
  month    = dec,
  year     = 2008,
  language = "en"
}
% Palmer et al. (2019): Bayesian detection of host influences on within-host pathogen evolution.
% The pages field holds the Nature Communications article number.
@article{Palmer2019-wa,
  author   = {Palmer, Duncan S and Turner, Isaac and Fidler, Sarah and Frater,
              John and Goedhals, Dominique and Goulder, Philip and Huang,
              Kuan-Hsiang Gary and Oxenius, Annette and Phillips, Rodney and
              Shapiro, Roger and van Vuuren, Cloete and McLean, Angela R and
              McVean, Gil},
  title    = {Mapping the drivers of within-host pathogen evolution using
              massive data sets},
  journal  = {Nature Communications},
  year     = {2019},
  month    = jul,
  volume   = {10},
  number   = {1},
  pages    = {3017},
  language = {en},
  abstract = {Differences among hosts, resulting from genetic variation in the
              immune system or heterogeneity in drug treatment, can impact
              within-host pathogen evolution. Genetic association studies can
              potentially identify such interactions. However, extensive and
              correlated genetic population structure in hosts and pathogens
              presents a substantial risk of confounding analyses. Moreover,
              the multiple testing burden of interaction scanning can
              potentially limit power. We present a Bayesian approach for
              detecting host influences on pathogen evolution that exploits
              vast existing data sets of pathogen diversity to improve power
              and control for stratification. The approach models key
              processes, including recombination and selection, and identifies
              regions of the pathogen genome affected by host factors. Our
              simulations and empirical analysis of drug-induced selection on
              the HIV-1 genome show that the method recovers known associations
              and has superior precision-recall characteristics compared to
              other approaches. We build a high-resolution map of HLA-induced
              selection in the HIV-1 genome, identifying novel epitope-allele
              combinations.}
}
% Schierup and Hein (2000): biases in single-tree phylogenetics when recombination is ignored.
% Author given names are stored in full; the bibliography style abbreviates them if required.
@ARTICLE{Schierup2000-fg,
  title    = "Consequences of recombination on traditional phylogenetic
              analysis",
  author   = "Schierup, Mikkel H and Hein, Jotun",
  abstract = "We investigate the shape of a phylogenetic tree reconstructed
              from sequences evolving under the coalescent with recombination.
              The motivation is that evolutionary inferences are often made
              from phylogenetic trees reconstructed from population data even
              though recombination may well occur (mtDNA or viral sequences) or
              does occur (nuclear sequences). We investigate the size and
              direction of biases when a single tree is reconstructed ignoring
              recombination. Standard software (PHYLIP) was used to construct
              the best phylogenetic tree from sequences simulated under the
              coalescent with recombination. With recombination present, the
              length of terminal branches and the total branch length are
              larger, and the time to the most recent common ancestor smaller,
              than for a tree reconstructed from sequences evolving with no
              recombination. The effects are pronounced even for small levels
              of recombination that may not be immediately detectable in a data
              set. The phylogenies when recombination is present superficially
              resemble phylogenies for sequences from an exponentially growing
              population. However, exponential growth has a different effect on
              statistics such as Tajima's D. Furthermore, ignoring
              recombination leads to a large overestimation of the substitution
              rate heterogeneity and the loss of the molecular clock. These
              results are discussed in relation to viral and mtDNA data sets.",
  journal  = "Genetics",
  volume   = 156,
  number   = 2,
  pages    = "879--891",
  month    = oct,
  year     = 2000,
  language = "en"
}
@ARTICLE{Hinch2011-tz,
title = "The landscape of recombination in {A}frican {A}mericans",
author = "Hinch, Anjali G and Tandon, Arti and Patterson, Nick and Song,
Yunli and Rohland, Nadin and Palmer, Cameron D and Chen, Gary K
and Wang, Kai and Buxbaum, Sarah G and Akylbekova, Ermeg L and
Aldrich, Melinda C and Ambrosone, Christine B and Amos,
Christopher and Bandera, Elisa V and Berndt, Sonja I and
Bernstein, Leslie and Blot, William J and Bock, Cathryn H and
Boerwinkle, Eric and Cai, Qiuyin and Caporaso, Neil and Casey,
Graham and Cupples, L Adrienne and Deming, Sandra L and Diver, W
Ryan and Divers, Jasmin and Fornage, Myriam and Gillanders,
Elizabeth M and Glessner, Joseph and Harris, Curtis C and Hu,
Jennifer J and Ingles, Sue A and Isaacs, William and John, Esther
M and Kao, W H Linda and Keating, Brendan and Kittles, Rick A and
Kolonel, Laurence N and Larkin, Emma and Le Marchand, Loic and
McNeill, Lorna H and Millikan, Robert C and Murphy, Adam and
Musani, Solomon and Neslund-Dudas, Christine and Nyante, Sarah
and Papanicolaou, George J and Press, Michael F and Psaty, Bruce
M and Reiner, Alex P and Rich, Stephen S and Rodriguez-Gil, Jorge
L and Rotter, Jerome I and Rybicki, Benjamin A and Schwartz, Ann
G and Signorello, Lisa B and Spitz, Margaret and Strom, Sara S
and Thun, Michael J and Tucker, Margaret A and Wang, Zhaoming and
Wiencke, John K and Witte, John S and Wrensch, Margaret and Wu,
Xifeng and Yamamura, Yuko and Zanetti, Krista A and Zheng, Wei
and Ziegler, Regina G and Zhu, Xiaofeng and Redline, Susan and
Hirschhorn, Joel N and Henderson, Brian E and Taylor, Jr, Herman
A and Price, Alkes L and Hakonarson, Hakon and Chanock, Stephen J
and Haiman, Christopher A and Wilson, James G and Reich, David
and Myers, Simon R",
abstract = "Recombination, together with mutation, gives rise to genetic
variation in populations. Here we leverage the recent mixture of
people of African and European ancestry in the Americas to build
a genetic map measuring the probability of crossing over at each
position in the genome, based on about 2.1 million crossovers in
30,000 unrelated African Americans. At intervals of more than
three megabases it is nearly identical to a map built in
Europeans. At finer scales it differs significantly, and we
identify about 2,500 recombination hotspots that are active in
people of West African ancestry but nearly inactive in Europeans.
The probability of a crossover at these hotspots is almost fully
controlled by the alleles an individual carries at PRDM9 (P value
< 10(-245)). We identify a 17-base-pair DNA sequence motif that
is enriched in these hotspots, and is an excellent match to the