paper.bib

% Generated by Paperpile. Check out https://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.

% Otto et al. (2021), Current Biology 31(14).
% Acronyms in the title are brace-protected so that sentence-casing styles
% keep their capitalisation; initials are separated so they abbreviate correctly.
@article{otto2021origins,
  title={The origins and potential future of {SARS-CoV-2} variants of concern in the evolving {COVID-19} pandemic},
  author={Otto, Sarah P and Day, Troy and Arino, Julien and Colijn, Caroline and Dushoff, Jonathan and Li, Michael and Mechai, Samir and Van Domselaar, Gary and Wu, Jianhong and Earn, David J D and others},
  journal={Current Biology},
  volume={31},
  number={14},
  pages={R918--R929},
  year={2021},
  publisher={Elsevier}
}

% Amoutzias et al. (2022), Viruses 14(1), article 78.
% Acronyms in the title are brace-protected so that sentence-casing styles
% keep their capitalisation.
@article{amoutzias2022remarkable,
  title={The remarkable evolutionary plasticity of coronaviruses by mutation and recombination: insights for the {COVID-19} pandemic and the future evolutionary paths of {SARS-CoV-2}},
  author={Amoutzias, Grigorios D and Nikolaidis, Marios and Tryfonopoulou, Eleni and Chlichlia, Katerina and Markoulatos, Panayotis and Oliver, Stephen G},
  journal={Viruses},
  volume={14},
  number={1},
  pages={78},
  year={2022},
  publisher={MDPI}
}

% Attwood et al. (2022), Nature Reviews Genetics 23(9).
% The serial identifier is stored as an ISSN (it was exported under "isbn");
% "da" and "ty" are leftover RIS fields that BibTeX styles ignore.
@article{Attwood2022-ab,
	abstract = {Determining the transmissibility, prevalence and patterns of movement of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infections is central to our understanding of the impact of the pandemic and to the design of effective control strategies. Phylogenies (evolutionary trees) have provided key insights into the international spread of SARS-CoV-2 and enabled investigation of individual outbreaks and transmission chains in specific settings. Phylodynamic approaches combine evolutionary, demographic and epidemiological concepts and have helped track virus genetic changes, identify emerging variants and inform public health strategy. Here, we review and synthesize studies that illustrate how phylogenetic and phylodynamic techniques were applied during the first year of the pandemic, and summarize their contributions to our understanding of SARS-CoV-2 transmission and control.},
	author = {Attwood, Stephen W. and Hill, Sarah C. and Aanensen, David M. and Connor, Thomas R. and Pybus, Oliver G.},
	da = {2022/09/01},
	date-added = {2023-06-01 15:31:08 +0100},
	date-modified = {2023-06-01 15:31:08 +0100},
	doi = {10.1038/s41576-022-00483-8},
	issn = {1471-0064},
	journal = {Nature Reviews Genetics},
	month = sep,
	number = {9},
	pages = {547--562},
	title = {Phylogenetic and phylodynamic approaches to understanding and combating the early {SARS-CoV-2} pandemic},
	ty = {JOUR},
	volume = {23},
	year = {2022},
	Bdsk-Url-1 = {https://doi.org/10.1038/s41576-022-00483-8}
}


% Chen et al. (2022): the CoV-Spectrum website, Bioinformatics 38(6).
@article{Chen2022-pz,
  author   = {Chen, Chaoran and Nadeau, Sarah and Yared, Michael and
    Voinov, Philippe and Xie, Ning and Roemer, Cornelius and
    Stadler, Tanja},
  title    = {{CoV-Spectrum}: analysis of globally shared {SARS-CoV-2} data
    to identify and characterize new variants},
  journal  = {Bioinformatics},
  volume   = {38},
  number   = {6},
  pages    = {1735--1737},
  month    = mar,
  year     = {2022},
  language = {en},
  abstract = {SUMMARY: The CoV-Spectrum website supports the identification of
    new SARS-CoV-2 variants of concern and the tracking of known
    variants. Its flexible amino acid and nucleotide mutation search
    allows querying of variants before they are designated by a
    lineage nomenclature system. The platform brings together
    SARS-CoV-2 data from different sources and applies analyses.
    Results include the proportion of different variants over time,
    their demographic and geographic distributions, common mutations,
    hospitalization and mortality probabilities, estimates for
    transmission fitness advantage and insights obtained from
    wastewater samples. AVAILABILITY AND IMPLEMENTATION: CoV-Spectrum
    is available at https://cov-spectrum.org. The code is released
    under the GPL-3.0 license at
    https://github.com/cevo-public/cov-spectrum-website.}
}

% Shriner et al. (2003): recombination and sitewise tests for positive
% selection, Genetics Research 81(2).
@article{Shriner2003-kb,
  author   = {Shriner, Daniel and Nickle, David C and Jensen, Mark A and
    Mullins, James I},
  title    = {Potential impact of recombination on sitewise approaches for
    detecting positive natural selection},
  journal  = {Genetics Research},
  volume   = {81},
  number   = {2},
  pages    = {115--121},
  month    = apr,
  year     = {2003},
  language = {en},
  abstract = {Current sitewise methods for detecting positive selection on gene
    sequences (the de facto standard being the CODEML method (Yang et
    al., 2000)) assume no recombination. This paper presents
    simulation results indicating that violation of this assumption
    can lead to false positive detection of sites undergoing positive
    selection. Through the use of population-scaled mutation and
    recombination rates, simulations can be performed that permit the
    generation of appropriate null distributions corresponding to
    neutral expectations in the presence of recombination, thereby
    allowing for a more accurate estimation of positive selection.}
}

% Kelleher et al. (2019): whole-genome history inference at scale,
% Nature Genetics 51(9).
@article{Kelleher2019-ba,
  author   = {Kelleher, Jerome and Wong, Yan and Wohns, Anthony W and
    Fadil, Chaimaa and Albers, Patrick K and McVean, Gil},
  title    = {Inferring whole-genome histories in large population datasets},
  journal  = {Nature Genetics},
  volume   = {51},
  number   = {9},
  pages    = {1330--1338},
  month    = sep,
  year     = {2019},
  language = {en},
  abstract = {Inferring the full genealogical history of a set of DNA sequences
    is a core problem in evolutionary biology, because this history
    encodes information about the events and forces that have
    influenced a species. However, current methods are limited, and
    the most accurate techniques are able to process no more than a
    hundred samples. As datasets that consist of millions of genomes
    are now being collected, there is a need for scalable and
    efficient inference methods to fully utilize these resources.
    Here we introduce an algorithm that is able to not only infer
    whole-genome histories with comparable accuracy to the
    state-of-the-art but also process four orders of magnitude more
    sequences. The approach also provides an 'evolutionary encoding'
    of the data, enabling efficient calculation of relevant
    statistics. We apply the method to human data from the 1000
    Genomes Project, Simons Genome Diversity Project and UK Biobank,
    showing that the inferred genealogies are rich in biological
    signal and efficient to process.}
}

% Wohns et al. (2022): unified genealogy of modern and ancient genomes,
% Science 375(6583).
@article{Wohns2022-th,
  author   = {Wohns, Anthony Wilder and Wong, Yan and Jeffery, Ben and
    Akbari, Ali and Mallick, Swapan and Pinhasi, Ron and
    Patterson, Nick and Reich, David and Kelleher, Jerome and
    McVean, Gil},
  title    = {A unified genealogy of modern and ancient genomes},
  journal  = {Science},
  volume   = {375},
  number   = {6583},
  pages    = {eabi8264},
  month    = feb,
  year     = {2022},
  language = {en},
  abstract = {The sequencing of modern and ancient genomes from around the
    world has revolutionized our understanding of human history and
    evolution. However, the problem of how best to characterize
    ancestral relationships from the totality of human genomic
    variation remains unsolved. Here, we address this challenge with
    nonparametric methods that enable us to infer a unified genealogy
    of modern and ancient humans. This compact representation of
    multiple datasets explores the challenges of missing and
    erroneous data and uses ancient samples to constrain and date
    relationships. We demonstrate the power of the method to recover
    relationships between individuals and populations as well as to
    identify descendants of ancient samples. Finally, we introduce a
    simple nonparametric estimator of the geographical location of
    ancestors that recapitulates key events in human history.}
}

% Schaefer, Shapiro and Green (2021), Science Advances 7(29).
% NOTE(review): this export carries no pages/article number -- confirm and add.
@article{Schaefer2021-yg,
  author   = {Schaefer, Nathan K and Shapiro, Beth and Green, Richard E},
  title    = {An ancestral recombination graph of human, {Neanderthal}, and
    {Denisovan} genomes},
  journal  = {Science Advances},
  volume   = {7},
  number   = {29},
  month    = jul,
  year     = {2021},
  language = {en},
  abstract = {Many humans carry genes from Neanderthals, a legacy of past
    admixture. Existing methods detect this archaic hominin ancestry
    within human genomes using patterns of linkage disequilibrium or
    direct comparison to Neanderthal genomes. Each of these methods
    is limited in sensitivity and scalability. We describe a new
    ancestral recombination graph inference algorithm that scales to
    large genome-wide datasets and demonstrate its accuracy on real
    and simulated data. We then generate a genome-wide ancestral
    recombination graph including human and archaic hominin genomes.
    From this, we generate a map within human genomes of archaic
    ancestry and of genomic regions not shared with archaic hominins
    either by admixture or incomplete lineage sorting. We find that
    only 1.5 to 7\% of the modern human genome is uniquely human. We
    also find evidence of multiple bursts of adaptive changes
    specific to modern humans within the past 600,000 years involving
    genes related to brain development and function.}
}

% Carabelli et al. (2023), Nature Reviews Microbiology.
% NOTE(review): pages 1--16 with no volume/number looks like an online-first
% placeholder -- confirm the final volume, issue and page range.
@article{Carabelli2023-tb,
  author   = {Carabelli, Alessandro M and Peacock, Thomas P and
    Thorne, Lucy G and Harvey, William T and Hughes, Joseph and
    {COVID-19 Genomics UK Consortium} and Peacock, Sharon J and
    Barclay, Wendy S and de Silva, Thushan I and Towers, Greg J and
    Robertson, David L},
  title    = {{SARS-CoV-2} variant biology: immune escape, transmission and
    fitness},
  journal  = {Nature Reviews Microbiology},
  pages    = {1--16},
  month    = jan,
  year     = {2023},
  language = {en},
  abstract = {In late 2020, after circulating for almost a year in the human
    population, severe acute respiratory syndrome coronavirus 2
    (SARS-CoV-2) exhibited a major step change in its adaptation to
    humans. These highly mutated forms of SARS-CoV-2 had enhanced
    rates of transmission relative to previous variants and were
    termed 'variants of concern' (VOCs). Designated Alpha, Beta,
    Gamma, Delta and Omicron, the VOCs emerged independently from one
    another, and in turn each rapidly became dominant, regionally or
    globally, outcompeting previous variants. The success of each VOC
    relative to the previously dominant variant was enabled by
    altered intrinsic functional properties of the virus and, to
    various degrees, changes to virus antigenicity conferring the
    ability to evade a primed immune response. The increased virus
    fitness associated with VOCs is the result of a complex interplay
    of virus biology in the context of changing human immunity due to
    both vaccination and prior infection. In this Review, we
    summarize the literature on the relative transmissibility and
    antigenicity of SARS-CoV-2 variants, the role of mutations at the
    furin spike cleavage site and of non-spike proteins, the
    potential importance of recombination to virus success, and
    SARS-CoV-2 evolution in the context of T cells, innate immunity
    and population immunity. SARS-CoV-2 shows a complicated
    relationship among virus antigenicity, transmission and
    virulence, which has unpredictable implications for the future
    trajectory and disease burden of COVID-19.}
}

% Michener and Sokal (1957), Evolution 11(2).
% Title is stored in conventional title case (articles and prepositions
% lowercase) so that case-preserving styles render it correctly.
@ARTICLE{Michener1957-tr,
  title   = "A Quantitative Approach to a Problem in Classification",
  author  = "Michener, Charles D and Sokal, Robert R",
  journal = "Evolution",
  volume  =  11,
  number  =  2,
  pages   = "130--162",
  month   =  jun,
  year    =  1957
}


% Tamura et al. (2023), Nature Communications 14(1), article 2800.
% The consortium is a corporate author: it is wrapped in its own brace group
% so BibTeX treats it as one indivisible name instead of splitting it into
% given names plus the surname "Consortium".
@article{Tamura2023-ab,
  author  = {Tamura, Tomokazu and Ito, Jumpei and Uriu, Keiya and Zahradnik,
    Jiri and Kida, Izumi and Anraku, Yuki and Nasser, Hesham and Shofa,
    Maya and Oda, Yoshitaka and Lytras, Spyros and Nao, Naganori and
    Itakura, Yukari and Deguchi, Sayaka and Suzuki, Rigel and Wang, Lei
    and Begum, MST Monira and Kita, Shunsuke and Yajima, Hisano and
    Sasaki, Jiei and Sasaki-Tabata, Kaori and Shimizu, Ryo and Tsuda,
    Masumi and Kosugi, Yusuke and Fujita, Shigeru and Pan, Lin and Sauter,
    Daniel and Yoshimatsu, Kumiko and Suzuki, Saori and Asakura, Hiroyuki
    and Nagashima, Mami and Sadamasu, Kenji and Yoshimura, Kazuhisa and
    Yamamoto, Yuki and Nagamoto, Tetsuharu and Schreiber, Gideon and
    Maenaka, Katsumi and Ito, Hayato and Misawa, Naoko and Kimura,
    Izumi and Suganami, Mai and Chiba, Mika and Yoshimura, Ryo and Yasuda,
    Kyoko and Iida, Keiko and Ohsumi, Naomi and Strange, Adam P. and
    Takahashi, Otowa and Ichihara, Kimiko and Shibatani, Yuki and
    Nishiuchi, Tomoko and Kato, Marie and Ferdous, Zannatul and Mouri,
    Hiromi and Shishido, Kenji and Sawa, Hirofumi and Hashimoto, Rina and
    Watanabe, Yukio and Sakamoto, Ayaka and Yasuhara, Naoko and Suzuki,
    Tateki and Kimura, Kanako and Nakajima, Yukari and Nakagawa, So and Wu,
    Jiaqi and Shirakawa, Kotaro and Takaori-Kondo, Akifumi and Nagata,
    Kayoko and Kazuma, Yasuhiro and Nomura, Ryosuke and Horisawa, Yoshihito
    and Tashiro, Yusuke and Kawai, Yugo and Irie, Takashi and Kawabata,
    Ryoko and Motozono, Chihiro and Toyoda, Mako and Ueno, Takamasa and
    Hashiguchi, Takao and Ikeda, Terumasa and Fukuhara, Takasuke and
    Saito, Akatsuki and Tanaka, Shinya and Matsuno, Keita and Takayama,
    Kazuo and Sato, Kei and
    {The Genotype to Phenotype Japan (G2P-Japan) Consortium}},
  journal = {Nature Communications},
  number  = {1},
  pages   = {2800},
  title   = {Virological characteristics of the {SARS-CoV-2} {XBB} variant
    derived from recombination of two {Omicron} subvariants},
  volume  = {14},
  year    = {2023}
}


% Turakhia et al. (2022): SARS-CoV-2 recombination landscape,
% Nature 609(7929).
@article{Turakhia2022-it,
  author   = {Turakhia, Yatish and Thornlow, Bryan and Hinrichs, Angie and
    McBroome, Jakob and Ayala, Nicolas and Ye, Cheng and
    Smith, Kyle and De Maio, Nicola and Haussler, David and
    Lanfear, Robert and Corbett-Detig, Russell},
  title    = {Pandemic-scale phylogenomics reveals the {SARS-CoV-2}
    recombination landscape},
  journal  = {Nature},
  volume   = {609},
  number   = {7929},
  pages    = {994--997},
  month    = sep,
  year     = {2022},
  language = {en},
  abstract = {Accurate and timely detection of recombinant lineages is crucial
    for interpreting genetic variation, reconstructing epidemic
    spread, identifying selection and variants of interest, and
    accurately performing phylogenetic analyses. During the
    SARS-CoV-2 pandemic, genomic data generation has exceeded the
    capacities of existing analysis platforms, thereby crippling
    real-time analysis of viral evolution. Here, we use a new
    phylogenomic method to search a nearly comprehensive SARS-CoV-2
    phylogeny for recombinant lineages. In a 1.6 million sample tree
    from May 2021, we identify 589 recombination events, which
    indicate that around 2.7\% of sequenced SARS-CoV-2 genomes have
    detectable recombinant ancestry. Recombination breakpoints are
    inferred to occur disproportionately in the 3' portion of the
    genome that contains the spike protein. Our results highlight the
    need for timely analyses of recombination for pinpointing the
    emergence of recombinant lineages with the potential to increase
    transmissibility or virulence of the virus. We anticipate that
    this approach will empower comprehensive real-time tracking of
    viral recombination during the SARS-CoV-2 pandemic and beyond.}
}

% Scornavacca, Zickmann and Huson (2011): tanglegrams, Bioinformatics 27(13).
% NOTE(review): the exported abstract stops mid-sentence; it is kept as-is.
@article{Scornavacca2011-mg,
  author    = {Scornavacca, Celine and Zickmann, Franziska and
    Huson, Daniel H},
  title     = {Tanglegrams for rooted phylogenetic trees and networks},
  journal   = {Bioinformatics},
  publisher = {Oxford Academic},
  volume    = {27},
  number    = {13},
  pages     = {i248--i256},
  month     = jun,
  year      = {2011},
  language  = {en},
  abstract  = {Abstract. Motivation: In systematic biology, one is often faced
    with the task of comparing different phylogenetic trees, in
    particular in multi-gene analysis or}
}

% Shu and McCauley (2017): GISAID, Eurosurveillance 22(13).
% NOTE(review): no pages/article number in this export -- confirm and add.
@article{Shu2017-hp,
  author  = {Shu, Yuelong and McCauley, John},
  title   = {{GISAID}: Global initiative on sharing all influenza data --
    from vision to reality},
  journal = {Eurosurveillance},
  volume  = {22},
  number  = {13},
  year    = {2017}
}

% Delaneau et al. (2019): SHAPEIT4 haplotype estimation,
% Nature Communications 10(1), article 5436.
@article{Delaneau2019-wl,
  author   = {Delaneau, Olivier and Zagury, Jean-Fran{\c c}ois and
    Robinson, Matthew R and Marchini, Jonathan L and
    Dermitzakis, Emmanouil T},
  title    = {Accurate, scalable and integrative haplotype estimation},
  journal  = {Nature Communications},
  volume   = {10},
  number   = {1},
  pages    = {5436},
  month    = nov,
  year     = {2019},
  language = {en},
  abstract = {The number of human genomes being genotyped or sequenced
    increases exponentially and efficient haplotype estimation
    methods able to handle this amount of data are now required. Here
    we present a method, SHAPEIT4, which substantially improves upon
    other methods to process large genotype and high coverage
    sequencing datasets. It notably exhibits sub-linear running times
    with sample size, provides highly accurate haplotypes and allows
    integrating external phasing information such as large reference
    panels of haplotypes, collections of pre-phased variants and long
    sequencing reads. We provide SHAPEIT4 in an open source format
    and demonstrate its performance in terms of accuracy and running
    times on two gold standard datasets: the UK Biobank data and the
    Genome In A Bottle.}
}

% Browning et al. (2021): two-stage phasing (Beagle 5.2),
% American Journal of Human Genetics 108(10).
@article{Browning2021-cg,
  author   = {Browning, Brian L and Tian, Xiaowen and Zhou, Ying and
    Browning, Sharon R},
  title    = {Fast two-stage phasing of large-scale sequence data},
  journal  = {American Journal of Human Genetics},
  volume   = {108},
  number   = {10},
  pages    = {1880--1890},
  month    = oct,
  year     = {2021},
  keywords = {TOPMed; UK Biobank; genotype phasing; haplotype phasing; phasing},
  language = {en},
  abstract = {Haplotype phasing is the estimation of haplotypes from genotype
    data. We present a fast, accurate, and memory-efficient haplotype
    phasing method that scales to large-scale SNP array and sequence
    data. The method uses marker windowing and composite reference
    haplotypes to reduce memory usage and computation time. It
    incorporates a progressive phasing algorithm that identifies
    confidently phased heterozygotes in each iteration and fixes the
    phase of these heterozygotes in subsequent iterations. For data
    with many low-frequency variants, such as whole-genome sequence
    data, the method employs a two-stage phasing algorithm that
    phases high-frequency markers via progressive phasing in the
    first stage and phases low-frequency markers via genotype
    imputation in the second stage. This haplotype phasing method is
    implemented in the open-source Beagle 5.2 software package. We
    compare Beagle 5.2 and SHAPEIT 4.2.1 by using expanding subsets
    of 485,301 UK Biobank samples and 38,387 TOPMed samples. Both
    methods have very similar accuracy and computation time for UK
    Biobank SNP array data. However, for TOPMed sequence data, Beagle
    is more than 20 times faster than SHAPEIT, achieves similar
    accuracy, and scales to larger sample sizes.}
}

% Rambaut et al. (2020): dynamic lineage nomenclature for SARS-CoV-2,
% Nature Microbiology 5(11).
@article{Rambaut2020-dw,
  author   = {Rambaut, Andrew and Holmes, Edward C and
    O'Toole, {\'A}ine and Hill, Verity and McCrone, John T and
    Ruis, Christopher and du Plessis, Louis and Pybus, Oliver G},
  title    = {A dynamic nomenclature proposal for {SARS-CoV-2} lineages to
    assist genomic epidemiology},
  journal  = {Nature Microbiology},
  volume   = {5},
  number   = {11},
  pages    = {1403--1407},
  month    = nov,
  year     = {2020},
  language = {en},
  abstract = {The ongoing pandemic spread of a new human coronavirus,
    SARS-CoV-2, which is associated with severe pneumonia/disease
    (COVID-19), has resulted in the generation of tens of thousands
    of virus genome sequences. The rate of genome generation is
    unprecedented, yet there is currently no coherent nor accepted
    scheme for naming the expanding phylogenetic diversity of
    SARS-CoV-2. Here, we present a rational and dynamic virus
    nomenclature that uses a phylogenetic framework to identify those
    lineages that contribute most to active spread. Our system is
    made tractable by constraining the number and depth of
    hierarchical lineage labels and by flagging and delabelling virus
    lineages that become unobserved and hence are probably inactive.
    By focusing on active virus lineages and those spreading to new
    locations, this nomenclature will assist in tracking and
    understanding the patterns and determinants of the global spread
    of SARS-CoV-2.}
}

% Baumdicker et al. (2022): msprime 1.0, Genetics 220(3).
% NOTE(review): no pages/article number in this export -- confirm and add.
@article{Baumdicker2022-ep,
  author   = {Baumdicker, Franz and Bisschop, Gertjan and
    Goldstein, Daniel and Gower, Graham and Ragsdale, Aaron P and
    Tsambos, Georgia and Zhu, Sha and Eldon, Bjarki and
    Ellerman, E Castedo and Galloway, Jared G and
    Gladstein, Ariella L and Gorjanc, Gregor and Guo, Bing and
    Jeffery, Ben and Kretzschumar, Warren W and Lohse, Konrad and
    Matschiner, Michael and Nelson, Dominic and Pope, Nathaniel S and
    Quinto-Cort{\'e}s, Consuelo D and Rodrigues, Murillo F and
    Saunack, Kumar and Sellinger, Thibaut and Thornton, Kevin and
    van Kemenade, Hugo and Wohns, Anthony W and Wong, Yan and
    Gravel, Simon and Kern, Andrew D and Koskela, Jere and
    Ralph, Peter L and Kelleher, Jerome},
  title    = {Efficient ancestry and mutation simulation with msprime 1.0},
  journal  = {Genetics},
  volume   = {220},
  number   = {3},
  month    = mar,
  year     = {2022},
  keywords = {Ancestral Recombination Graphs; coalescent; mutations; simulation},
  language = {en},
  abstract = {Stochastic simulation is a key tool in population genetics, since
    the models involved are often analytically intractable and
    simulation is usually the only way of obtaining ground-truth data
    to evaluate inferences. Because of this, a large number of
    specialized simulation programs have been developed, each filling
    a particular niche, but with largely overlapping functionality
    and a substantial duplication of effort. Here, we introduce
    msprime version 1.0, which efficiently implements ancestry and
    mutation simulations based on the succinct tree sequence data
    structure and the tskit library. We summarize msprime's many
    features, and show that its performance is excellent, often many
    times faster and more memory efficient than specialized
    alternatives. These high-performance features have been
    thoroughly tested and validated, and built using a collaborative,
    open source development model, which reduces duplication of
    effort and promotes software quality via community engagement.}
}

% Virological forum post on naming recombinant Pango lineages (March 2021).
% The entry has no author, so a "key" field is supplied: classic BibTeX styles
% use it for sorting and author-year styles use it as the citation label.
% NOTE(review): "Pango" is a placeholder label taken from the title -- replace
% it (or add a real author) if the post's author is known.
@MISC{noauthor_2021-kd,
  key          = "Pango",
  title        = "Pango Lineage Nomenclature: {Provisional} rules for naming
                  recombinant lineages",
  booktitle    = "Virological",
  abstract     = "The rules outlined below constitute a provisional naming
                  convention for identifiable SARS-CoV-2 recombinant lineages
                  within the Pango dynamic nomenclature system
                  (https://doi.org/10.1038/s41564-020-0770-5). These rules are
                  being considered for ratification by the Pango nomenclature
                  committee. NOTE: New lineage names are designated by the
                  Pango committee and not by individual researchers or groups.
                  If you would like to make a lineage suggestion, please read
                  the nomenclature documentation and sub...",
  month        =  mar,
  year         =  2021,
  howpublished = "\url{https://virological.org/t/pango-lineage-nomenclature-provisional-rules-for-naming-recombinant-lineages/657}",
  note         = "Accessed: 2023-04-04",
  language     = "en"
}

% Chen et al. (2021): COVID-19 CG mutation and lineage tracker,
% eLife 10, e63409.
@article{Chen2021-zc,
  author   = {Chen, Albert Tian and Altschuler, Kevin and
    Zhan, Shing Hei and Chan, Yujia Alina and Deverman, Benjamin E},
  title    = {{COVID-19} {CG} enables {SARS-CoV-2} mutation and lineage
    tracking by locations and dates of interest},
  journal  = {eLife},
  volume   = {10},
  pages    = {e63409},
  month    = feb,
  year     = {2021},
  keywords = {COVID-19; SARS-CoV-2; browser; epidemiology; evolutionary
    biology; global health; mutation tracking; pandemic; resource;
    virus},
  language = {en},
  abstract = {COVID-19 CG (covidcg.org) is an open resource for tracking
    SARS-CoV-2 single-nucleotide variations (SNVs), lineages, and
    clades using the virus genomes on the GISAID database while
    filtering by location, date, gene, and mutation of interest.
    COVID-19 CG provides significant time, labor, and cost-saving
    utility to projects on SARS-CoV-2 transmission, evolution,
    diagnostics, therapeutics, vaccines, and intervention tracking.
    Here, we describe case studies in which users can interrogate (1)
    SNVs in the SARS-CoV-2 spike receptor binding domain (RBD) across
    different geographical regions to inform the design and testing
    of therapeutics, (2) SNVs that may impact the sensitivity of
    commonly used diagnostic primers, and (3) the emergence of a
    dominant lineage harboring an S477N RBD mutation in Australia in
    2020. To accelerate COVID-19 efforts, COVID-19 CG will be
    upgraded with new features for users to rapidly pinpoint
    mutations as the virus evolves throughout the pandemic and in
    response to therapeutic and public health interventions.}
}

% Kim et al. (2020): SARS-CoV-2 transcriptome architecture, Cell 181(4).
@article{Kim2020-gt,
  author   = {Kim, Dongwan and Lee, Joo-Yeon and Yang, Jeong-Sun and
    Kim, Jun Won and Kim, V Narry and Chang, Hyeshik},
  title    = {The Architecture of {SARS-CoV-2} Transcriptome},
  journal  = {Cell},
  volume   = {181},
  number   = {4},
  pages    = {914--921},
  month    = may,
  year     = {2020},
  keywords = {COVID-19; RNA modification; SARS-CoV-2; coronavirus; direct RNA
    sequencing; discontinuous transcription; epitranscriptome;
    nanopore; poly(A) tail; transcriptome},
  language = {en},
  abstract = {SARS-CoV-2 is a betacoronavirus responsible for the COVID-19
    pandemic. Although the SARS-CoV-2 genome was reported recently,
    its transcriptomic architecture is unknown. Utilizing two
    complementary sequencing techniques, we present a high-resolution
    map of the SARS-CoV-2 transcriptome and epitranscriptome. DNA
    nanoball sequencing shows that the transcriptome is highly
    complex owing to numerous discontinuous transcription events. In
    addition to the canonical genomic and 9 subgenomic RNAs,
    SARS-CoV-2 produces transcripts encoding unknown ORFs with
    fusion, deletion, and/or frameshift. Using nanopore direct RNA
    sequencing, we further find at least 41 RNA modification sites on
    viral transcripts, with the most frequent motif, AAGAA. Modified
    RNAs have shorter poly(A) tails than unmodified RNAs, suggesting
    a link between the modification and the 3' tail. Functional
    investigation of the unknown transcripts and RNA modifications
    discovered in this study will open new directions to our
    understanding of the life cycle and pathogenicity of SARS-CoV-2.}
}

% Gusfield (2014), ReCombinatorics, MIT Press.
% NOTE: the exported abstract described an unrelated book (a Revolutionary War
% story) and has been dropped rather than kept as wrong metadata.
@BOOK{Gusfield2014-qw,
  title     = "{ReCombinatorics}: {The} Algorithmics of Ancestral Recombination
               Graphs and Explicit Phylogenetic Networks",
  author    = "Gusfield, Dan",
  publisher = "MIT Press",
  address   = "Cambridge, MA",
  month     =  jul,
  year      =  2014,
  language  = "en"
}

% McBroome et al. (2021): daily-updated SARS-CoV-2 mutation-annotated trees and
% matUtils, Molecular Biology and Evolution 38(12).
% Only the acronym is brace-protected in the title; ordinary hyphenated words
% are left to the bibliography style's casing rules.
@ARTICLE{McBroome2021-zm,
  title    = "A Daily-Updated Database and Tools for Comprehensive
              {SARS-CoV-2} Mutation-Annotated Trees",
  author   = "McBroome, Jakob and Thornlow, Bryan and Hinrichs, Angie S and
              Kramer, Alexander and De Maio, Nicola and Goldman, Nick and
              Haussler, David and Corbett-Detig, Russell and Turakhia, Yatish",
  abstract = "The vast scale of SARS-CoV-2 sequencing data has made it
              increasingly challenging to comprehensively analyze all available
              data using existing tools and file formats. To address this, we
              present a database of SARS-CoV-2 phylogenetic trees inferred with
              unrestricted public sequences, which we update daily to
              incorporate new sequences. Our database uses the recently
              proposed mutation-annotated tree (MAT) format to efficiently
              encode the tree with branches labeled with parsimony-inferred
              mutations, as well as Nextstrain clade and Pango lineage labels
              at clade roots. As of June 9, 2021, our SARS-CoV-2 MAT consists
              of 834,521 sequences and provides a comprehensive view of the
              virus' evolutionary history using public data. We also present
              matUtils---a command-line utility for rapidly querying,
              interpreting, and manipulating the MATs. Our daily-updated
              SARS-CoV-2 MAT database and matUtils software are available at
              http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER\_SARS-CoV-2/
              and https://github.com/yatisht/usher, respectively.",
  journal  = "Molecular Biology and Evolution",
  volume   =  38,
  number   =  12,
  pages    = "5819--5824",
  month    =  dec,
  year     =  2021,
  keywords = "COVID-19; SARS-CoV-2 phylogenetics; genomic surveillance",
  language = "en"
}

% Varabyou et al. (2021): Bolotie, a rapid detector of inter-clade
% recombination in SARS-CoV-2 genomes.
% The software name is protected as a whole word, and pages carries the
% article identifier because this article has no page range.
@ARTICLE{Varabyou2021-rw,
  title    = "Rapid detection of inter-clade recombination in {SARS-CoV-2} with
              {Bolotie}",
  author   = "Varabyou, Ales and Pockrandt, Christopher and Salzberg, Steven L
              and Pertea, Mihaela",
  abstract = "The ability to detect recombination in pathogen genomes is
              crucial to the accuracy of phylogenetic analysis and consequently
              to forecasting the spread of infectious diseases and to
              developing therapeutics and public health policies. However, in
              case of the SARS-CoV-2, the low divergence of near-identical
              genomes sequenced over a short period of time makes conventional
              analysis infeasible. Using a novel method, we identified 225
              anomalous SARS-CoV-2 genomes of likely recombinant origins out of
              the first 87,695 genomes to be released, several of which have
              persisted in the population. Bolotie is specifically designed to
              perform a rapid search for inter-clade recombination events over
              extremely large datasets, facilitating analysis of novel isolates
              in seconds. In cases where raw sequencing data were available, we
              were able to rule out the possibility that these samples
              represented co-infections by analyzing the underlying sequence
              reads. The Bolotie software and other data from our study are
              available at https://github.com/salzberg-lab/bolotie.",
  journal  = "Genetics",
  volume   =  218,
  number   =  3,
  pages    = "iyab074",
  month    =  jul,
  year     =  2021,
  keywords = "COVID-19; SARS-CoV-2; coronavirus; recombination",
  language = "en"
}

% Jungreis, Sealfon & Kellis (2021): comparative-genomics gene set for
% SARS-CoV-2 built from 44 Sarbecovirus genomes.
% pages holds the article number (not a PDF page count), matching the other
% Nature Communications entries in this file; the taxon name in the title is
% brace-protected so sentence-casing styles keep its capital.
@ARTICLE{Jungreis2021-dh,
  title     = "{SARS-CoV-2} gene content and {COVID-19} mutation impact by
               comparing 44 {Sarbecovirus} genomes",
  author    = "Jungreis, Irwin and Sealfon, Rachel and Kellis, Manolis",
  abstract  = "Despite its clinical importance, the SARS-CoV-2 gene set remains
               unresolved, hindering dissection of COVID-19 biology. We use
               comparative genomics to provide a high-confidence protein-coding
               gene set, characterize evolutionary constraint, and prioritize
               functional mutations. We select 44 Sarbecovirus genomes at
               ideally-suited evolutionary distances, and quantify
               protein-coding evolutionary signatures and overlapping
               constraint. We find strong protein-coding signatures for ORFs
               3a, 6, 7a, 7b, 8, 9b, and a novel alternate-frame gene, ORF3c,
               whereas ORFs 2b, 3d/3d-2, 3b, 9c, and 10 lack protein-coding
               signatures or convincing experimental evidence of protein-coding
               function. Furthermore, we show no other conserved protein-coding
               genes remain to be discovered. Mutation analysis suggests ORF8
               contributes to within-individual fitness but not
               person-to-person transmission. Cross-strain and within-strain
               evolutionary pressures agree, except for fewer-than-expected
               within-strain mutations in nsp3 and S1, and more-than-expected
               in nucleocapsid, which shows a cluster of mutations in a
               predicted B-cell epitope, suggesting immune-avoidance selection.
               Evolutionary histories of residues disrupted by spike-protein
               substitutions D614G, N501Y, E484K, and K417N/T provide clues
               about their biology, and we catalog likely-functional
               co-inherited mutations. Previously reported RNA-modification
               sites show no enrichment for conservation. Here we report a
               high-confidence gene set and evolutionary-history annotations
               providing valuable resources and insights on SARS-CoV-2 biology,
               mutations, and evolution. The SARS-CoV-2 gene set remains
               unresolved, hindering dissection of COVID-19 biology. Comparing
               44 Sarbecovirus genomes provides a high-confidence
               protein-coding gene set. The study characterizes protein-level
               and nucleotide-level evolutionary constraints, and prioritizes
               functional mutations from the ongoing COVID-19 pandemic.",
  journal   = "Nature Communications",
  publisher = "Nature Publishing Group",
  volume    =  12,
  number    =  1,
  pages     = "2642",
  month     =  may,
  year      =  2021,
  language  = "en"
}

% Zhang et al. (2023): biobank-scale inference of ancestral recombination
% graphs for genealogical analysis of complex traits.
% Publisher name and publisher city are kept in separate fields; the
% publisher spelling matches the other Nature Publishing Group entry here.
@article{Zhang2023-lf,
    title={Biobank-scale inference of ancestral recombination graphs
        enables genealogical analysis of complex traits},
    author={Zhang, Brian C and Biddanda, Arjun and Gunnarsson, {\'A}rni
        Freyr and Cooper, Fergus and Palamara, Pier Francesco},
    journal={Nature Genetics},
    volume={55},
    number={5},
    pages={768--776},
    year={2023},
    publisher={Nature Publishing Group},
    address={New York, NY}
}

% Minichiello & Durbin (2006): heuristic inference of ancestral
% recombination graphs from genotype data, applied to trait-locus mapping.
@article{Minichiello2006-dp,
  author   = {Minichiello, Mark J and Durbin, Richard},
  title    = {Mapping trait loci by use of inferred ancestral recombination
    graphs},
  journal  = {American Journal of Human Genetics},
  volume   = {79},
  number   = {5},
  pages    = {910--922},
  month    = nov,
  year     = {2006},
  language = {en},
  abstract = {Large-scale association studies are being undertaken with the
    hope of uncovering the genetic determinants of complex disease.
    We describe a computationally efficient method for inferring
    genealogies from population genotype data and show how these
    genealogies can be used to fine map disease loci and interpret
    association signals. These genealogies take the form of the
    ancestral recombination graph (ARG). The ARG defines a
    genealogical tree for each locus, and, as one moves along the
    chromosome, the topologies of consecutive trees shift according
    to the impact of historical recombination events. There are two
    stages to our analysis. First, we infer plausible ARGs, using a
    heuristic algorithm, which can handle unphased and missing data
    and is fast enough to be applied to large-scale studies. Second,
    we test the genealogical tree at each locus for a clustering of
    the disease cases beneath a branch, suggesting that a causative
    mutation occurred on that branch. Since the true ARG is unknown,
    we average this analysis over an ensemble of inferred ARGs. We
    have characterized the performance of our method across a wide
    range of simulated disease models. Compared with simpler tests,
    our method gives increased accuracy in positioning untyped
    causative loci and can also be used to estimate the frequencies
    of untyped causative alleles. We have applied our method to Ueda
    et al.'s association study of CTLA4 and Graves disease, showing
    how it can be used to dissect the association signal, giving
    potentially interesting results of allelic heterogeneity and
    interaction. Similar approaches analyzing an ensemble of ARGs
    inferred using our method may be applicable to many other
    problems of inference from population genotype data.}
}

% Kelleher et al. (2018): recording and simplifying tree sequences during
% forwards-time population genetics simulations (the tskit software).
@article{Kelleher2018-xc,
  author   = {Kelleher, Jerome and Thornton, Kevin R and Ashander, Jaime and
    Ralph, Peter L},
  title    = {Efficient pedigree recording for fast population genetics
    simulation},
  journal  = {PLoS Computational Biology},
  volume   = {14},
  number   = {11},
  pages    = {e1006581},
  month    = nov,
  year     = {2018},
  language = {en},
  abstract = {In this paper we describe how to efficiently record the entire
    genetic history of a population in forwards-time,
    individual-based population genetics simulations with arbitrary
    breeding models, population structure and demography. This
    approach dramatically reduces the computational burden of
    tracking individual genomes by allowing us to simulate only those
    loci that may affect reproduction (those having non-neutral
    variants). The genetic history of the population is recorded as a
    succinct tree sequence as introduced in the software package
    msprime, on which neutral mutations can be quickly placed
    afterwards. Recording the results of each breeding event requires
    storage that grows linearly with time, but there is a great deal
    of redundancy in this information. We solve this storage problem
    by providing an algorithm to quickly 'simplify' a tree sequence
    by removing this irrelevant history for a given set of genomes.
    By periodically simplifying the history with respect to the
    extant population, we show that the total storage space required
    is modest and overall large efficiency gains can be made over
    classical forward-time simulations. We implement a
    general-purpose framework for recording and simplifying
    genealogical data, which can be used to make simulations of any
    population model more efficient. We modify two popular
    forwards-time simulation frameworks to use this new approach and
    observe efficiency gains in large, whole-genome simulations of
    one to two orders of magnitude. In addition to speed, our method
    for recording pedigrees has several advantages: (1) All marginal
    genealogies of the simulated individuals are recorded, rather
    than just genotypes. (2) A population of N individuals with M
    polymorphic sites can be stored in O(N log N + M) space, making
    it feasible to store a simulation's entire final generation as
    well as its history. (3) A simulation can easily be initialized
    with a more efficient coalescent simulation of deep history. The
    software for recording and processing tree sequences is named
    tskit.}
}

% Turakhia et al. (2021): UShER, fast placement of new SARS-CoV-2 samples
% onto an existing reference phylogeny.
% The braced title words carry the capitals that spell out the acronym
% (Sample, Existing, tRees); without protection a sentence-casing style
% would lowercase them and the derivation of the name would be lost.
@ARTICLE{Turakhia2021-ur,
  title    = "Ultrafast {Sample} placement on {Existing} {tRees} ({UShER})
              enables real-time phylogenetics for the {SARS-CoV-2} pandemic",
  author   = "Turakhia, Yatish and Thornlow, Bryan and Hinrichs, Angie S and De
              Maio, Nicola and Gozashti, Landen and Lanfear, Robert and
              Haussler, David and Corbett-Detig, Russell",
  abstract = "As the SARS-CoV-2 virus spreads through human populations, the
              unprecedented accumulation of viral genome sequences is ushering
              in a new era of 'genomic contact tracing'---that is, using viral
              genomes to trace local transmission dynamics. However, because
              the viral phylogeny is already so large---and will undoubtedly
              grow many fold---placing new sequences onto the tree has emerged
              as a barrier to real-time genomic contact tracing. Here, we resolve
              this challenge by building an efficient tree-based data structure
              encoding the inferred evolutionary history of the virus. We
              demonstrate that our approach greatly improves the speed of
              phylogenetic placement of new samples and data visualization,
              making it possible to complete the placements under the
              constraints of real-time contact tracing. Thus, our method
              addresses an important need for maintaining a fully updated
              reference phylogeny. We make these tools available to the
              research community through the University of California Santa
              Cruz SARS-CoV-2 Genome Browser to enable rapid cross-referencing
              of information in new virus sequences with an ever-expanding
              array of molecular and structural biology data. The methods
              described here will empower research and genomic contact tracing
              for SARS-CoV-2 specifically for laboratories worldwide.",
  journal  = "Nature Genetics",
  volume   =  53,
  number   =  6,
  pages    = "809--816",
  month    =  jun,
  year     =  2021,
  language = "en"
}

% De Klerk et al. (2022): recombination break-point patterns conserved
% across six coronavirus subgenera and the factors behind them.
@article{De_Klerk2022-tt,
  author   = {De Klerk, Arn{\'e} and Swanepoel, Phillip and Lourens, Rentia and
    Zondo, Mpumelelo and Abodunran, Isaac and Lytras, Spyros and
    MacLean, Oscar A and Robertson, David and Kosakovsky Pond, Sergei
    L and Zehr, Jordan D and Kumar, Venkatesh and Stanhope, Michael J
    and Harkins, Gordon and Murrell, Ben and Martin, Darren P},
  title    = {Conserved recombination patterns across coronavirus subgenera},
  journal  = {Virus Evolution},
  volume   = {8},
  number   = {2},
  pages    = {veac054},
  month    = jun,
  year     = {2022},
  keywords = {Coronavirus; Evolution; Phylogenetics; Recombination; Selection},
  language = {en},
  abstract = {Recombination contributes to the genetic diversity found in
    coronaviruses and is known to be a prominent mechanism whereby
    they evolve. It is apparent, both from controlled experiments and
    in genome sequences sampled from nature, that patterns of
    recombination in coronaviruses are non-random and that this is
    likely attributable to a combination of sequence features that
    favour the occurrence of recombination break points at specific
    genomic sites, and selection disfavouring the survival of
    recombinants within which favourable intra-genome interactions
    have been disrupted. Here we leverage available whole-genome
    sequence data for six coronavirus subgenera to identify specific
    patterns of recombination that are conserved between multiple
    subgenera and then identify the likely factors that underlie
    these conserved patterns. Specifically, we confirm the
    non-randomness of recombination break points across all six
    tested coronavirus subgenera, locate conserved recombination hot-
    and cold-spots, and determine that the locations of
    transcriptional regulatory sequences are likely major
    determinants of conserved recombination break-point hotspot
    locations. We find that while the locations of recombination
    break points are not uniformly associated with degrees of
    nucleotide sequence conservation, they display significant
    tendencies in multiple coronavirus subgenera to occur in low
    guanine-cytosine content genome regions, in non-coding regions,
    at the edges of genes, and at sites within the Spike gene that
    are predicted to be minimally disruptive of Spike protein
    folding. While it is apparent that sequence features such as
    transcriptional regulatory sequences are likely major
    determinants of where the template-switching events that yield
    recombination break points most commonly occur, it is evident
    that selection against misfolded recombinant proteins also
    strongly impacts observable recombination break-point
    distributions in coronavirus genomes sampled from nature.}
}

% Wang & Rannala (2008): full-likelihood MCMC estimation of fine-scale
% recombination rates under a Bayesian framework.
@ARTICLE{Wang2008-eq,
  title    = "Bayesian inference of fine-scale recombination rates using
              population genomic data",
  author   = "Wang, Ying and Rannala, Bruce",
  abstract = "Recently, several statistical methods for estimating fine-scale
              recombination rates using population samples have been developed.
              However, currently available methods that can be applied to
              large-scale data are limited to approximated likelihoods. Here,
              we developed a full-likelihood Markov chain Monte Carlo method
              for estimating recombination rate under a Bayesian framework.
              Genealogies underlying a sampling of chromosomes are effectively
              modelled by using marginal individual single nucleotide
              polymorphism genealogies related through an ancestral
              recombination graph. The method is compared with two existing
              composite-likelihood methods using simulated data. Simulation
              studies show that our method performs well for different
              simulation scenarios. The method is applied to two human
              population genetic variation datasets that have been studied by
              sperm typing. Our results are consistent with the estimates from
              sperm crossover analysis.",
  journal  = "Philosophical Transactions of the Royal Society of London B: Biological Sciences",
  volume   =  363,
  number   =  1512,
  pages    = "3921--3930",
  month    =  dec,
  year     =  2008,
  language = "en"
}

% Palmer et al. (2019): Bayesian detection of host influences on within-host
% pathogen evolution, demonstrated on the HIV-1 genome.
@article{Palmer2019-wa,
  author   = {Palmer, Duncan S and Turner, Isaac and Fidler, Sarah and Frater,
    John and Goedhals, Dominique and Goulder, Philip and Huang,
    Kuan-Hsiang Gary and Oxenius, Annette and Phillips, Rodney and
    Shapiro, Roger and van Vuuren, Cloete and McLean, Angela R and
    McVean, Gil},
  title    = {Mapping the drivers of within-host pathogen evolution using
    massive data sets},
  journal  = {Nature Communications},
  volume   = {10},
  number   = {1},
  pages    = {3017},
  month    = jul,
  year     = {2019},
  language = {en},
  abstract = {Differences among hosts, resulting from genetic variation in the
    immune system or heterogeneity in drug treatment, can impact
    within-host pathogen evolution. Genetic association studies can
    potentially identify such interactions. However, extensive and
    correlated genetic population structure in hosts and pathogens
    presents a substantial risk of confounding analyses. Moreover,
    the multiple testing burden of interaction scanning can
    potentially limit power. We present a Bayesian approach for
    detecting host influences on pathogen evolution that exploits
    vast existing data sets of pathogen diversity to improve power
    and control for stratification. The approach models key
    processes, including recombination and selection, and identifies
    regions of the pathogen genome affected by host factors. Our
    simulations and empirical analysis of drug-induced selection on
    the HIV-1 genome show that the method recovers known associations
    and has superior precision-recall characteristics compared to
    other approaches. We build a high-resolution map of HLA-induced
    selection in the HIV-1 genome, identifying novel epitope-allele
    combinations.}
}

% Schierup & Hein (2000): biases introduced when a single phylogenetic tree
% is reconstructed from sequences that have undergone recombination.
% Given names are stored in full so the bibliography style, not the
% database, decides how to abbreviate them.
@ARTICLE{Schierup2000-fg,
  title    = "Consequences of recombination on traditional phylogenetic
              analysis",
  author   = "Schierup, Mikkel H and Hein, Jotun",
  abstract = "We investigate the shape of a phylogenetic tree reconstructed
              from sequences evolving under the coalescent with recombination.
              The motivation is that evolutionary inferences are often made
              from phylogenetic trees reconstructed from population data even
              though recombination may well occur (mtDNA or viral sequences) or
              does occur (nuclear sequences). We investigate the size and
              direction of biases when a single tree is reconstructed ignoring
              recombination. Standard software (PHYLIP) was used to construct
              the best phylogenetic tree from sequences simulated under the
              coalescent with recombination. With recombination present, the
              length of terminal branches and the total branch length are
              larger, and the time to the most recent common ancestor smaller,
              than for a tree reconstructed from sequences evolving with no
              recombination. The effects are pronounced even for small levels
              of recombination that may not be immediately detectable in a data
              set. The phylogenies when recombination is present superficially
              resemble phylogenies for sequences from an exponentially growing
              population. However, exponential growth has a different effect on
              statistics such as Tajima's D. Furthermore, ignoring
              recombination leads to a large overestimation of the substitution
              rate heterogeneity and the loss of the molecular clock. These
              results are discussed in relation to viral and mtDNA data sets.",
  journal  = "Genetics",
  volume   =  156,
  number   =  2,
  pages    = "879--891",
  month    =  oct,
  year     =  2000,
  language = "en"
}

% Hinch et al. (2011): genetic recombination map built from about 2.1
% million crossovers in 30,000 unrelated African Americans.
% Proper nouns in the title are brace-protected as whole words. The author
% written as "Taylor, Jr, Herman A" uses the three-part "Last, Jr, First"
% name form on purpose.
@ARTICLE{Hinch2011-tz,
  title    = "The landscape of recombination in {African} {Americans}",
  author   = "Hinch, Anjali G and Tandon, Arti and Patterson, Nick and Song,
              Yunli and Rohland, Nadin and Palmer, Cameron D and Chen, Gary K
              and Wang, Kai and Buxbaum, Sarah G and Akylbekova, Ermeg L and
              Aldrich, Melinda C and Ambrosone, Christine B and Amos,
              Christopher and Bandera, Elisa V and Berndt, Sonja I and
              Bernstein, Leslie and Blot, William J and Bock, Cathryn H and
              Boerwinkle, Eric and Cai, Qiuyin and Caporaso, Neil and Casey,
              Graham and Cupples, L Adrienne and Deming, Sandra L and Diver, W
              Ryan and Divers, Jasmin and Fornage, Myriam and Gillanders,
              Elizabeth M and Glessner, Joseph and Harris, Curtis C and Hu,
              Jennifer J and Ingles, Sue A and Isaacs, William and John, Esther
              M and Kao, W H Linda and Keating, Brendan and Kittles, Rick A and
              Kolonel, Laurence N and Larkin, Emma and Le Marchand, Loic and
              McNeill, Lorna H and Millikan, Robert C and Murphy, Adam and
              Musani, Solomon and Neslund-Dudas, Christine and Nyante, Sarah
              and Papanicolaou, George J and Press, Michael F and Psaty, Bruce
              M and Reiner, Alex P and Rich, Stephen S and Rodriguez-Gil, Jorge
              L and Rotter, Jerome I and Rybicki, Benjamin A and Schwartz, Ann
              G and Signorello, Lisa B and Spitz, Margaret and Strom, Sara S
              and Thun, Michael J and Tucker, Margaret A and Wang, Zhaoming and
              Wiencke, John K and Witte, John S and Wrensch, Margaret and Wu,
              Xifeng and Yamamura, Yuko and Zanetti, Krista A and Zheng, Wei
              and Ziegler, Regina G and Zhu, Xiaofeng and Redline, Susan and
              Hirschhorn, Joel N and Henderson, Brian E and Taylor, Jr, Herman
              A and Price, Alkes L and Hakonarson, Hakon and Chanock, Stephen J
              and Haiman, Christopher A and Wilson, James G and Reich, David
              and Myers, Simon R",
  abstract = "Recombination, together with mutation, gives rise to genetic
              variation in populations. Here we leverage the recent mixture of
              people of African and European ancestry in the Americas to build
              a genetic map measuring the probability of crossing over at each
              position in the genome, based on about 2.1 million crossovers in
              30,000 unrelated African Americans. At intervals of more than
              three megabases it is nearly identical to a map built in
              Europeans. At finer scales it differs significantly, and we
              identify about 2,500 recombination hotspots that are active in
              people of West African ancestry but nearly inactive in Europeans.
              The probability of a crossover at these hotspots is almost fully
              controlled by the alleles an individual carries at PRDM9 (P value
              $< 10^{-245}$). We identify a 17-base-pair DNA sequence motif that
              is enriched in these hotspots, and is an excellent match to the
              predicted binding target of PRDM9 alleles common in West Africans
              and rare in Europeans. Sites of this motif are predicted to be
              risk loci for disease-causing genomic rearrangements in
              individuals carrying these alleles. More generally, this map
              provides a resource for research in human genetic variation and
              evolution.",
  journal  = "Nature",
  volume   =  476,
  number   =  7359,
  pages    = "170--175",
  month    =  jul,
  year     =  2011,
  language = "en"
}

% Kelleher, Etheridge & McVean (2016): exact coalescent simulation with
% recombination for very large samples, using sparse trees and coalescence
% records.
@article{Kelleher2016-wk,
  author   = {Kelleher, Jerome and Etheridge, Alison M and McVean, Gilean},
  title    = {Efficient Coalescent Simulation and Genealogical Analysis for
    Large Sample Sizes},
  journal  = {PLoS Computational Biology},
  volume   = {12},
  number   = {5},
  pages    = {e1004842},
  month    = may,
  year     = {2016},
  language = {en},
  abstract = {A central challenge in the analysis of genetic variation is to
    provide realistic genome simulation across millions of samples.
    Present day coalescent simulations do not scale well, or use
    approximations that fail to capture important long-range linkage
    properties. Analysing the results of simulations also presents a
    substantial challenge, as current methods to store genealogies
    consume a great deal of space, are slow to parse and do not take
    advantage of shared structure in correlated trees. We solve these
    problems by introducing sparse trees and coalescence records as
    the key units of genealogical analysis. Using these tools, exact
    simulation of the coalescent with recombination for
    chromosome-sized regions over hundreds of thousands of samples is
    possible, and substantially faster than present-day approximate
    methods. We can also analyse the results orders of magnitude more
    quickly than with existing methods.}
}

% Griffiths (1981): neutral two-locus multiple-allele models with
% recombination.
% The initials are written separately so name parsing sees two given-name
% tokens and abbreviating styles print both of them.
@article{Griffiths1981-lw,
  title={Neutral two-locus multiple allele models with recombination},
  author={Griffiths, R. C.},
  journal={Theoretical Population Biology},
  volume={19},
  number={2},
  pages={169--186},
  year={1981},
  publisher={Elsevier}
}


% Li & Stephens (2003): statistical model of linkage disequilibrium that
% relates LD patterns to the underlying recombination process.
@article{Li2003-ib,
  author   = {Li, Na and Stephens, Matthew},
  title    = {Modeling linkage disequilibrium and identifying recombination
    hotspots using single-nucleotide polymorphism data},
  journal  = {Genetics},
  volume   = {165},
  number   = {4},
  pages    = {2213--2233},
  month    = dec,
  year     = {2003},
  language = {en},
  abstract = {We introduce a new statistical model for patterns of linkage
    disequilibrium (LD) among multiple SNPs in a population sample.
    The model overcomes limitations of existing approaches to
    understanding, summarizing, and interpreting LD by (i) relating
    patterns of LD directly to the underlying recombination process;
    (ii) considering all loci simultaneously, rather than pairwise;
    (iii) avoiding the assumption that LD necessarily has a
    ``block-like'' structure; and (iv) being computationally
    tractable for huge genomic regions (up to complete chromosomes).
    We examine in detail one natural application of the model:
    estimation of underlying recombination rates from population
    data. Using simulation, we show that in the case where
    recombination is assumed constant across the region of interest,
    recombination rate estimates based on our model are competitive
    with the very best of current available methods. More
    importantly, we demonstrate, on real and simulated data, the
    potential of the model to help identify and quantify fine-scale
    variation in recombination rate from population data. We also
    outline how the model could be useful in other contexts, such as
    in the development of more efficient haplotype-based methods for
    LD mapping.}
}

% Minh et al. (2020): IQ-TREE version 2, a maximum-likelihood phylogenetic
% inference package.
% "New" follows the colon in the title and is brace-protected as a whole
% word so its capital is kept regardless of how a style recases the title.
@ARTICLE{Minh2020-lr,
  title    = "{IQ-TREE} 2: {New} Models and Efficient Methods for Phylogenetic
              Inference in the Genomic Era",
  author   = "Minh, Bui Quang and Schmidt, Heiko A and Chernomor, Olga and
              Schrempf, Dominik and Woodhams, Michael D and von Haeseler, Arndt
              and Lanfear, Robert",
  abstract = "IQ-TREE (http://www.iqtree.org, last accessed February 6, 2020)
              is a user-friendly and widely used software package for
              phylogenetic inference using maximum likelihood. Since the
              release of version 1 in 2014, we have continuously expanded
              IQ-TREE to integrate a plethora of new models of sequence
              evolution and efficient computational approaches of phylogenetic
              inference to deal with genomic data. Here, we describe notable
              features of IQ-TREE version 2 and highlight the key advantages
              over other software.",
  journal  = "Molecular Biology and Evolution",
  volume   =  37,
  number   =  5,
  pages    = "1530--1534",
  month    =  may,
  year     =  2020,
  keywords = "maximum likelihood; models of sequence evolution; phylogenetics;
              phylogenomics",
  language = "en"
}

% Bal et al. (2022): detection and prevalence of SARS-CoV-2 co-infections
% in France during Omicron circulation.
% Proper nouns in the title are brace-protected as whole words.
@ARTICLE{Bal2022-hq,
  title    = "Detection and prevalence of {SARS-CoV-2} co-infections during the
              {Omicron} variant circulation in {France}",
  author   = "Bal, Antonin and Simon, Bruno and Destras, Gregory and
              Chalvignac, Richard and Semanas, Quentin and Oblette, Antoine and
              Qu{\'e}rom{\`e}s, Gr{\'e}gory and Fanget, Remi and Regue, Hadrien
              and Morfin, Florence and Valette, Martine and Lina, Bruno and
              Josset, Laurence",
  abstract = "From December 2021-February 2022, an intense and unprecedented
              co-circulation of SARS-CoV-2 variants with high genetic diversity
              raised the question of possible co-infections between variants
              and how to detect them. Using 11 mixes of Delta:Omicron isolates
              at different ratios, we evaluated the performance of 4 different
              sets of primers used for whole-genome sequencing and developed an
              unbiased bioinformatics method for the detection of co-infections
              involving genetically distinct SARS-CoV-2 lineages. Applied on
              21,387 samples collected between December 6, 2021 to February 27,
              2022 from random genomic surveillance in France, we detected 53
              co-infections between different lineages. The prevalence of Delta
              and Omicron (BA.1) co-infections and Omicron lineages BA.1 and
              BA.2 co-infections were estimated at 0.18\% and 0.26\%,
              respectively. Among 6,242 hospitalized patients, the intensive
              care unit (ICU) admission rates were 1.64\%, 4.81\% and 15.38\%
              in Omicron, Delta and Delta/Omicron patients, respectively. No
              BA.1/BA.2 co-infections were reported among ICU admitted
              patients. Among the 53 co-infected patients, a total of 21
              patients (39.6\%) were not vaccinated. Although SARS-CoV-2
              co-infections were rare in this study, their proper detection is
              crucial to evaluate their clinical impact and the risk of the
              emergence of potential recombinants.",
  journal  = "Nature Communications",
  volume   =  13,
  number   =  1,
  pages    = "6316",
  month    =  oct,
  year     =  2022,
  language = "en"
}

@ARTICLE{Focosi2022-ni,
  title    = "Recombination in Coronaviruses, with a Focus on {SARS-CoV-2}",
  author   = "Focosi, Daniele and Maggi, Fabrizio",
  abstract = "Recombination is a common evolutionary tool for RNA viruses, and
              coronaviruses are no exception. We review here the evidence for
              recombination in SARS-CoV-2 and reconcile nomenclature for
              recombinants, discuss their origin and fitness, and speculate how
              recombinants could make a difference in the future of the
              COVID-19 pandemics.",
  journal  = "Viruses",
  volume   =  14,
  number   =  6,
  pages    = "1239",
  month    =  jun,
  year     =  2022,
  keywords = "COVID-19; Coronaviridae; PANGOLIN; SARS-CoV-2; coinfection;
              recombination; superinfection",
  language = "en"
}

% The non-ASCII tilde operator in the abstract of the entry below has been
% replaced by its LaTeX equivalent ($\sim$) so the entry is 7-bit clean.
@ARTICLE{Rubinacci2020-pa,
  title    = "Genotype imputation using the Positional {Burrows} {Wheeler}
              Transform",
  author   = "Rubinacci, Simone and Delaneau, Olivier and Marchini, Jonathan",
  abstract = "Genotype imputation is the process of predicting unobserved
              genotypes in a sample of individuals using a reference panel of
              haplotypes. In the last 10 years reference panels have increased
              in size by more than 100 fold. Increasing reference panel size
              improves accuracy of markers with low minor allele frequencies
              but poses ever increasing computational challenges for imputation
              methods. Here we present IMPUTE5, a genotype imputation method
              that can scale to reference panels with millions of samples. This
              method continues to refine the observation made in the IMPUTE2
              method, that accuracy is optimized via use of a custom subset of
              haplotypes when imputing each individual. It achieves fast,
              accurate, and memory-efficient imputation by selecting haplotypes
              using the Positional Burrows Wheeler Transform (PBWT). By using
              the PBWT data structure at genotyped markers, IMPUTE5 identifies
              locally best matching haplotypes and long identical by state
              segments. The method then uses the selected haplotypes as
              conditioning states within the IMPUTE model. Using the HRC
              reference panel, which has $\sim$65,000 haplotypes, we show that
              IMPUTE5 is up to 30x faster than MINIMAC4 and up to 3x faster
              than BEAGLE5.1, and uses less memory than both these methods.
              Using simulated reference panels we show that IMPUTE5 scales
              sub-linearly with reference panel size. For example, keeping the
              number of imputed markers constant, increasing the reference
              panel size from 10,000 to 1 million haplotypes requires less than
              twice the computation time. As the reference panel increases in
              size IMPUTE5 is able to utilize a smaller number of reference
              haplotypes, thus reducing computational cost.",
  journal  = "PLoS Genetics",
  volume   =  16,
  number   =  11,
  pages    = "e1009049",
  month    =  nov,
  year     =  2020,
  language = "en"
}

@ARTICLE{Ronquist2012-zw,
  title    = "{MrBayes} 3.2: {Efficient} {Bayesian} phylogenetic inference and
              model choice across a large model space",
  author   = "Ronquist, Fredrik and Teslenko, Maxim and van der Mark, Paul and
              Ayres, Daniel L and Darling, Aaron and H{\"o}hna, Sebastian and
              Larget, Bret and Liu, Liang and Suchard, Marc A and Huelsenbeck,
              John P",
  abstract = "Since its introduction in 2001, MrBayes has grown in popularity
              as a software package for Bayesian phylogenetic inference using
              Markov chain Monte Carlo (MCMC) methods. With this note, we
              announce the release of version 3.2, a major upgrade to the
              latest official release presented in 2003. The new version
              provides convergence diagnostics and allows multiple analyses to
              be run in parallel with convergence progress monitored on the
              fly. The introduction of new proposals and automatic optimization
              of tuning parameters has improved convergence for many problems.
              The new version also sports significantly faster likelihood
              calculations through streaming single-instruction-multiple-data
              extensions (SSE) and support of the BEAGLE library, allowing
              likelihood calculations to be delegated to graphics processing
              units (GPUs) on compatible hardware. Speedup factors range from
              around 2 with SSE code to more than 50 with BEAGLE for codon
              problems. Checkpointing across all models allows long runs to be
              completed even when an analysis is prematurely terminated. New
              models include relaxed clocks, dating, model averaging across
              time-reversible substitution models, and support for hard,
              negative, and partial (backbone) tree constraints. Inference of
              species trees from gene trees is supported by full incorporation
              of the Bayesian estimation of species trees (BEST) algorithms.
              Marginal model likelihoods for Bayes factor tests can be
              estimated accurately across the entire model space using the
              stepping stone method. The new version provides more output
              options than previously, including samples of ancestral states,
              site rates, site d(N)/d(S) rations, branch rates, and node dates.
              A wide range of statistics on tree parameters can also be output
              for visualization in FigTree and compatible software.",
  journal  = "Systematic Biology",
  volume   =  61,
  number   =  3,
  pages    = "539--542",
  month    =  may,
  year     =  2012,
  language = "en"
}

@ARTICLE{Ignatieva2022-st,
  title    = "Ongoing Recombination in {SARS-CoV-2} Revealed through
              Genealogical Reconstruction",
  author   = "Ignatieva, Anastasia and Hein, Jotun and Jenkins, Paul A",
  abstract = "The evolutionary process of genetic recombination has the
              potential to rapidly change the properties of a viral pathogen,
              and its presence is a crucial factor to consider in the
              development of treatments and vaccines. It can also significantly
              affect the results of phylogenetic analyses and the inference of
              evolutionary rates. The detection of recombination from samples
              of sequencing data is a very challenging problem and is further
              complicated for SARS-CoV-2 by its relatively slow accumulation of
              genetic diversity. The extent to which recombination is ongoing
              for SARS-CoV-2 is not yet resolved. To address this, we use a
              parsimony-based method to reconstruct possible genealogical
              histories for samples of SARS-CoV-2 sequences, which enables us
              to pinpoint specific recombination events that could have
              generated the data. We propose a statistical framework for
              disentangling the effects of recurrent mutation from
              recombination in the history of a sample, and hence provide a way
              of estimating the probability that ongoing recombination is
              present. We apply this to samples of sequencing data collected in
              England and South Africa and find evidence of ongoing
              recombination.",
  journal  = "Molecular Biology and Evolution",
  volume   =  39,
  number   =  2,
  pages    = "msac028",
  month    =  feb,
  year     =  2022,
  keywords = "SARS-CoV-2; genealogy; parsimony; recombination",
  language = "en"
}

@ARTICLE{Wertheim2022-hj,
  title    = "Detection of {SARS-CoV-2} intra-host recombination during
              superinfection with {Alpha} and {Epsilon} variants in
              {New York City}",
  author   = "Wertheim, Joel O and Wang, Jade C and Leelawong, Mindy and
              Martin, Darren P and Havens, Jennifer L and Chowdhury, Moinuddin
              A and Pekar, Jonathan E and Amin, Helly and Arroyo, Anthony and
              Awandare, Gordon A and Chow, Hoi Yan and Gonzalez, Edimarlyn and
              Luoma, Elizabeth and Morang'a, Collins M and Nekrutenko, Anton
              and Shank, Stephen D and Silver, Stefan and Quashie, Peter K and
              Rakeman, Jennifer L and Ruiz, Victoria and Torian, Lucia V and
              Vasylyeva, Tetyana I and Kosakovsky Pond, Sergei L and Hughes,
              Scott",
  abstract = "Recombination is an evolutionary process by which many pathogens
              generate diversity and acquire novel functions. Although a common
              occurrence during coronavirus replication, detection of
              recombination is only feasible when genetically distinct viruses
              contemporaneously infect the same host. Here, we identify an
              instance of SARS-CoV-2 superinfection, whereby an individual was
              infected with two distinct viral variants: Alpha (B.1.1.7) and
              Epsilon (B.1.429). This superinfection was first noted when an
              Alpha genome sequence failed to exhibit the classic S gene target
              failure behavior used to track this variant. Full genome
              sequencing from four independent extracts reveals that Alpha
              variant alleles comprise around 75\% of the genomes, whereas the
              Epsilon variant alleles comprise around 20\% of the sample.
              Further investigation reveals the presence of numerous
              recombinant haplotypes spanning the genome, specifically in the
              spike, nucleocapsid, and ORF 8 coding regions. These findings
              support the potential for recombination to reshape SARS-CoV-2
              genetic diversity.",
  journal  = "Nature Communications",
  volume   =  13,
  number   =  1,
  pages    = "3645",
  month    =  jun,
  year     =  2022,
  language = "en"
}

@ARTICLE{Sagulenko2018-nk,
  title    = "{TreeTime}: {Maximum-likelihood} phylodynamic analysis",
  author   = "Sagulenko, Pavel and Puller, Vadim and Neher, Richard A",
  abstract = "Mutations that accumulate in the genome of cells or viruses can
              be used to infer their evolutionary history. In the case of
              rapidly evolving organisms, genomes can reveal their detailed
              spatiotemporal spread. Such phylodynamic analyses are
              particularly useful to understand the epidemiology of rapidly
              evolving viral pathogens. As the number of genome sequences
              available for different pathogens has increased dramatically over
              the last years, phylodynamic analysis with traditional methods
              becomes challenging as these methods scale poorly with growing
              datasets. Here, we present TreeTime, a Python-based framework for
              phylodynamic analysis using an approximate Maximum Likelihood
              approach. TreeTime can estimate ancestral states, infer evolution
              models, reroot trees to maximize temporal signals, estimate
              molecular clock phylogenies and population size histories. The
              runtime of TreeTime scales linearly with dataset size.",
  journal  = "Virus Evolution",
  volume   =  4,
  number   =  1,
  pages    = "vex042",
  month    =  jan,
  year     =  2018,
  keywords = "molecular clock phylogenies; phylodynamics; python",
  language = "en"
}

@ARTICLE{Hadfield2018-ef,
  title    = "Nextstrain: {Real-time} tracking of pathogen evolution",
  author   = "Hadfield, James and Megill, Colin and Bell, Sidney M and
              Huddleston, John and Potter, Barney and Callender, Charlton and
              Sagulenko, Pavel and Bedford, Trevor and Neher, Richard A",
  abstract = "SUMMARY: Understanding the spread and evolution of pathogens is
              important for effective public health measures and surveillance.
              Nextstrain consists of a database of viral genomes, a
              bioinformatics pipeline for phylodynamics analysis, and an
              interactive visualization platform. Together these present a
              real-time view into the evolution and spread of a range of viral
              pathogens of high public health importance. The visualization
              integrates sequence data with other data types such as geographic
              information, serology, or host species. Nextstrain compiles our
              current understanding into a single accessible location, open to
              health professionals, epidemiologists, virologists and the public
              alike. AVAILABILITY AND IMPLEMENTATION: All code (predominantly
              JavaScript and Python) is freely available from
              github.com/nextstrain and the web-application is available at
              nextstrain.org.",
  journal  = "Bioinformatics",
  volume   =  34,
  number   =  23,
  pages    = "4121--4123",
  month    =  dec,
  year     =  2018,
  language = "en"
}

@ARTICLE{Sekizuka2022-xz,
  title    = "Genome Recombination between the {Delta} and {Alpha} Variants of
              Severe Acute Respiratory Syndrome Coronavirus 2 ({SARS-CoV-2})",
  author   = "Sekizuka, Tsuyoshi and Itokawa, Kentaro and Saito, Masumichi and
              Shimatani, Michitsugu and Matsuyama, Shutoku and Hasegawa, Hideki
              and Saito, Tomoya and Kuroda, Makoto",
  abstract = "Prominent genomic recombination has been observed between the
              Delta and Alpha variants of severe acute respiratory syndrome
              coronavirus 2 (SARS-CoV-2), isolated from clinical specimens in
              Japan. Interestingly, the recombination variant detected in this
              study carries a spike protein identical to that in the domestic
              Delta variant, thereby suggesting that further risks would not be
              associated with infectivity and immune escape. The recombinant
              was classified as an XC lineage in the PANGOLIN database. It is
              necessary to intensively study such marked genetic variations and
              characterize emerging variants after careful verification of
              their lineage and clade assignment.",
  journal  = "Japanese Journal of Infectious Diseases",
  volume   =  75,
  number   =  4,
  pages    = "415--418",
  month    =  jul,
  year     =  2022,
  keywords = "SARS-CoV-2; alpha variant; delta variant; genome recombination",
  language = "en"
}

@article{Gallaher2020-lb,
  author   = {Gallaher, William R},
  title    = {A palindromic {RNA} sequence as a common breakpoint contributor
              to copy-choice recombination in {SARS-COV-2}},
  journal  = {Archives of Virology},
  volume   = {165},
  number   = {10},
  pages    = {2341--2348},
  month    = oct,
  year     = {2020},
  abstract = {Much remains unknown concerning the origin of the novel pandemic
              coronavirus that has raged across the globe since emerging in
              Wuhan of Hubei province, near the center of the People's Republic
              of China, in December of 2019. All current members of the family
              Coronaviridae have arisen by a combination of incremental
              adaptive mutations, against the backdrop of many recombinational
              events throughout the past, rendering each a unique mosaic of RNA
              sequences from diverse sources. The consensus among virologists
              is that the base sequence of the novel coronavirus, designated
              SARS-CoV-2, was derived from a common ancestor of a bat
              coronavirus, represented by the strain RaTG13, isolated in Yunnan
              province in 2013. Into that ancestral genetic background, several
              recombination events have since occurred from other divergent
              bat-derived coronaviruses, resulting in localized discordance
              between the two. One such event left SARS-CoV-2 with a receptor
              binding domain (RBD) capable of binding the human ACE-2 receptor
              lacking in RaTG13, and a second event uniquely added to
              SARS-CoV-2 a site specific for furin, capable of efficient
              endoproteolytic cleavage and activation of the spike glycoprotein
              responsible for virus entry and cell fusion. This paper
              demonstrates by bioinformatic analysis that such recombinational
              events are facilitated by short oligonucleotide ``breakpoint
              sequences'', similar to CAGAC, that direct recombination
              naturally to certain positions in the genome at the boundaries
              between blocks of RNA code and potentially RNA structure. This
              ``breakpoint sequence hypothesis'' provides a natural explanation
              for the biogenesis of SARS-CoV-2 over time and in the wild.},
  language = {en}
}

@ARTICLE{VanInsberghe2021-eu,
  title     = "Recombinant {SARS-CoV-2} Genomes Circulated at Low Levels Over
               The First Year of The Pandemic",
  author    = "VanInsberghe, David and Neish, Andrew S and Lowen, Anice C and
               Koelle, Katia",
  journal   = "Virus Evolution",
  volume    =  7,
  number    =  2,
  pages     = "veab059",
  year      =  2021,
  publisher = "Oxford University Press UK"
}


@ARTICLE{Lam2018-dc,
  title    = "Improved Algorithmic Complexity for the {3SEQ} Recombination
              Detection Algorithm",
  author   = "Lam, Ha Minh and Ratmann, Oliver and Boni, Maciej F",
  abstract = "Identifying recombinant sequences in an era of large genomic
              databases is challenging as it requires an efficient algorithm to
              identify candidate recombinants and parents, as well as
              appropriate statistical methods to correct for the large number
              of comparisons performed. In 2007, a computation was introduced
              for an exact nonparametric mosaicism statistic that gave
              high-precision P values for putative recombinants. This exact
              computation meant that multiple-comparisons corrected P values
              also had high precision, which is crucial when performing
              millions or billions of tests in large databases. Here, we
              introduce an improvement to the algorithmic complexity of this
              computation from $O(mn^3)$ to $O(mn^2)$, where m and n are the
              numbers of recombination-informative sites in the candidate
              recombinant. This new computation allows for recombination
              analysis to be performed in alignments with thousands of
              polymorphic sites. Benchmark runs are presented on viral genome
              sequence alignments, new features are introduced, and
              applications outside recombination analysis are discussed.",
  journal  = "Molecular Biology and Evolution",
  volume   =  35,
  number   =  1,
  pages    = "247--251",
  month    =  jan,
  year     =  2018,
  keywords = "mosaic structure; nonparametric; recombination",
  language = "en"
}

@ARTICLE{Speidel2019-yh,
  title    = "A method for genome-wide genealogy estimation for thousands of
              samples",
  author   = "Speidel, Leo and Forest, Marie and Shi, Sinan and Myers, Simon R",
  abstract = "Knowledge of genome-wide genealogies for thousands of individuals
              would simplify most evolutionary analyses for humans and other
              species, but has remained computationally infeasible. We have
              developed a method, Relate, scaling to $>$10,000 sequences while
              simultaneously estimating branch lengths, mutational ages and
              variable historical population sizes, as well as allowing for
              data errors. Application to 1,000 Genomes Project haplotypes
              produces joint genealogical histories for 26 human populations.
              Highly diverged lineages are present in all groups, but most
              frequent in Africa. Outside Africa, these mainly reflect ancient
              introgression from groups related to Neanderthals and Denisovans,
              while African signals instead reflect unknown events unique to
              that continent. Our approach allows more powerful inferences of
              natural selection than has previously been possible. We identify
              multiple regions under strong positive selection, and
              multi-allelic traits including hair color, body mass index and
              blood pressure, showing strong evidence of directional selection,
              varying among human groups.",
  journal  = "Nature Genetics",
  volume   =  51,
  number   =  9,
  pages    = "1321--1329",
  month    =  sep,
  year     =  2019,
  language = "en"
}

@ARTICLE{Aksamentov2021-hj,
  title   = "Nextclade: {Clade} assignment, mutation calling and quality control
             for viral genomes",
  author  = "Aksamentov, Ivan and Roemer, Cornelius and Hodcroft, Emma B and
             Neher, Richard A",
  journal = "Journal of Open Source Software",
  volume  =  6,
  number  =  67,
  pages   = "3773",
  month   =  nov,
  year    =  2021
}

@article{Jackson2021-ik,
  author   = {Jackson, Ben and Boni, Maciej F and Bull, Matthew J and Colleran,
              Amy and Colquhoun, Rachel M and Darby, Alistair C and Haldenby,
              Sam and Hill, Verity and Lucaci, Anita and McCrone, John T and
              Nicholls, Samuel M and O'Toole, {\'A}ine and Pacchiarini, Nicole
              and Poplawski, Radoslaw and Scher, Emily and Todd, Flora and
              Webster, Hermione J and Whitehead, Mark and Wierzbicki, Claudia
              and {COVID-19 Genomics UK (COG-UK) Consortium} and Loman,
              Nicholas J and Connor, Thomas R and Robertson, David L and Pybus,
              Oliver G and Rambaut, Andrew},
  title    = {Generation and transmission of interlineage recombinants in the
              {SARS-CoV-2} pandemic},
  journal  = {Cell},
  volume   = {184},
  number   = {20},
  pages    = {5179--5188},
  month    = sep,
  year     = {2021},
  abstract = {We present evidence for multiple independent origins of
              recombinant SARS-CoV-2 viruses sampled from late 2020 and early
              2021 in the United Kingdom. Their genomes carry single-nucleotide
              polymorphisms and deletions that are characteristic of the
              B.1.1.7 variant of concern but lack the full complement of
              lineage-defining mutations. Instead, the remainder of their
              genomes share contiguous genetic variation with non-B.1.1.7
              viruses circulating in the same geographic area at the same time
              as the recombinants. In four instances, there was evidence for
              onward transmission of a recombinant-origin virus, including one
              transmission cluster of 45 sequenced cases over the course of 2
              months. The inferred genomic locations of recombination
              breakpoints suggest that every community-transmitted recombinant
              virus inherited its spike region from a B.1.1.7 parental virus,
              consistent with a transmission advantage for B.1.1.7's set of
              mutations.},
  keywords = {B.1.1.7; SARS-CoV-2; evolution; genomic epidemiology; genomics;
              recombination; variants},
  language = {en}
}

@ARTICLE{Graham2010-xe,
  title    = "Recombination, reservoirs, and the modular spike: {Mechanisms} of
              coronavirus cross-species transmission",
  author   = "Graham, Rachel L and Baric, Ralph S",
  abstract = "Over the past 30 years, several cross-species transmission
              events, as well as changes in virus tropism, have mediated
              significant animal and human diseases. Most notable is severe
              acute respiratory syndrome (SARS), a lower respiratory tract
              disease of humans that was first reported in late 2002 in
              Guangdong Province, China. The disease, which quickly spread
              worldwide over a period of 4 months spanning late 2002 and early
              2003, infected over 8,000 individuals and killed nearly 800
              before it was successfully contained by aggressive public health
              intervention strategies. A coronavirus (SARS-CoV) was identified
              as the etiological agent of SARS, and initial assessments
              determined that the virus crossed to human hosts from zoonotic
              reservoirs, including bats, Himalayan palm civets (Paguma
              larvata), and raccoon dogs (Nyctereutes procyonoides), sold in
              exotic animal markets in Guangdong Province. In this review, we
              discuss the molecular mechanisms that govern coronavirus
              cross-species transmission both in vitro and in vivo, using the
              emergence of SARS-CoV as a model. We pay particular attention to
              how changes in the Spike attachment protein, both within and
              outside of the receptor binding domain, mediate the emergence of
              coronaviruses in new host populations.",
  journal  = "Journal of Virology",
  volume   =  84,
  number   =  7,
  pages    = "3134--3146",
  month    =  apr,
  year     =  2010,
  language = "en"
}

% NOTE(review): this web-clipped record describes the same paper as the
% Viterbi1967-ol @ARTICLE entry in this file; prefer citing that entry. The key
% is kept so that any existing \cite{noauthor_undated-ai} still resolves.
@MISC{noauthor_undated-ai,
  title        = "Error bounds for convolutional codes and an asymptotically
                  optimum decoding algorithm",
  author       = "Viterbi, Andrew J",
  abstract     = "The probability of error in decoding an optimal convolutional
                  code transmitted over a memoryless channel is bounded from
                  above and below as a function of the constraint length of the
                  code. For all but pathological channels the bounds are
                  asymptotically (exponentially) tight for rates above $R_0$,
                  the computational cutoff rate of sequential decoding. As a
                  function of constraint length the performance of optimal
                  convolutional codes is shown to be superior to that of block
                  codes of the same length, the relative improvement increasing
                  with rate. The upper bound is obtained for a specific
                  probabilistic nonsequential decoding algorithm which is shown
                  to be asymptotically optimum for rates above $R_0$ and
                  whose performance bears certain similarities to that of
                  sequential decoding algorithms.",
  howpublished = "\url{https://ieeexplore.ieee.org/document/1054010}",
  year         =  1967,
  note         = "Accessed: 2023-3-6",
  language     = "en"
}

@ARTICLE{Viterbi1967-ol,
  title   = "Error bounds for convolutional codes and an asymptotically optimum
             decoding algorithm",
  author  = "Viterbi, Andrew J",
  journal = "IEEE Transactions on Information Theory",
  volume  =  13,
  number  =  2,
  pages   = "260--269",
  month   =  apr,
  year    =  1967
}

@ARTICLE{Huson2012-ys,
  title    = "Dendroscope 3: {An} interactive tool for rooted phylogenetic trees
              and networks",
  author   = "Huson, Daniel H and Scornavacca, Celine",
  abstract = "Dendroscope 3 is a new program for working with rooted
              phylogenetic trees and networks. It provides a number of methods
              for drawing and comparing rooted phylogenetic networks, and for
              computing them from rooted trees. The program can be used
              interactively or in command-line mode. The program is written in
              Java, use of the software is free, and installers for all 3 major
              operating systems can be downloaded from www.dendroscope.org.",
  journal  = "Systematic Biology",
  volume   =  61,
  number   =  6,
  pages    = "1061--1067",
  month    =  dec,
  year     =  2012,
  keywords = "Phylogenetic trees; phylogenetic networks; software",
  language = "en"
}

@ARTICLE{Guindon2003-zd,
  title    = "A simple, fast, and accurate algorithm to estimate large
              phylogenies by maximum likelihood",
  author   = "Guindon, St{\'e}phane and Gascuel, Olivier",
  abstract = "The increase in the number of large data sets and the complexity
              of current probabilistic sequence evolution models necessitates
              fast and reliable phylogeny reconstruction methods. We describe a
              new approach, based on the maximum-likelihood principle, which
              clearly satisfies these requirements. The core of this method is
              a simple hill-climbing algorithm that adjusts tree topology and
              branch lengths simultaneously. This algorithm starts from an
              initial tree built by a fast distance-based method and modifies
              this tree to improve its likelihood at each iteration. Due to
              this simultaneous adjustment of the topology and branch lengths,
              only a few iterations are sufficient to reach an optimum. We used
              extensive and realistic computer simulations to show that the
              topological accuracy of this new method is at least as high as
              that of the existing maximum-likelihood programs and much higher
              than the performance of distance-based and parsimony approaches.
              The reduction of computing time is dramatic in comparison with
              other maximum-likelihood packages, while the likelihood
              maximization ability tends to be higher. For example, only 12 min
              were required on a standard personal computer to analyze a data
              set consisting of 500 rbcL sequences with 1,428 base pairs from
              plant plastids, thus reaching a speed of the same order as some
              popular distance-based and parsimony algorithms. This new method
              is implemented in the PHYML program, which is freely available on
              our web page: http://www.lirmm.fr/w3ifa/MAAS/.",
  journal  = "Systematic Biology",
  volume   =  52,
  number   =  5,
  pages    = "696--704",
  month    =  oct,
  year     =  2003,
  language = "en"
}

% NOTE(review): this is the same article as the rasmussen2014genome entry
% later in this file; consolidate the two keys once citing documents are checked.
@ARTICLE{Rasmussen2014-el,
  title    = "Genome-wide inference of ancestral recombination graphs",
  author   = "Rasmussen, Matthew D and Hubisz, Melissa J and Gronau, Ilan and
              Siepel, Adam",
  abstract = "The complex correlation structure of a collection of orthologous
              DNA sequences is uniquely captured by the ``ancestral
              recombination graph'' (ARG), a complete record of coalescence and
              recombination events in the history of the sample. However,
              existing methods for ARG inference are computationally intensive,
              highly approximate, or limited to small numbers of sequences,
              and, as a consequence, explicit ARG inference is rarely used in
              applied population genomics. Here, we introduce a new algorithm
              for ARG inference that is efficient enough to apply to dozens of
              complete mammalian genomes. The key idea of our approach is to
              sample an ARG of $n$ chromosomes conditional on
              an ARG of $n-1$ chromosomes, an operation we call
              ``threading.'' Using techniques based on hidden Markov models, we
              can perform this threading operation exactly, up to the
              assumptions of the sequentially Markov coalescent and a
              discretization of time. An extension allows for threading of
              subtrees instead of individual sequences. Repeated application of
              these threading operations results in highly efficient Markov
              chain Monte Carlo samplers for ARGs. We have implemented these
              methods in a computer program called ARGweaver. Experiments with
              simulated data indicate that ARGweaver converges rapidly to the
              posterior distribution over ARGs and is effective in recovering
              various features of the ARG for dozens of sequences generated
              under realistic parameters for human populations. In applications
              of ARGweaver to 54 human genome sequences from Complete Genomics,
              we find clear signatures of natural selection, including regions
              of unusually ancient ancestry associated with balancing selection
              and reductions in allele age in sites under directional
              selection. The patterns we observe near protein-coding genes are
              consistent with a primary influence from background selection
              rather than hitchhiking, although we cannot rule out a
              contribution from recurrent selective sweeps.",
  journal  = "PLoS Genetics",
  volume   =  10,
  number   =  5,
  pages    = "e1004342",
  month    =  may,
  year     =  2014,
  language = "en"
}

@article{Anisimova2003-vr,
  author   = {Anisimova, Maria and Nielsen, Rasmus and Yang, Ziheng},
  title    = {Effect of recombination on the accuracy of the likelihood method
    for detecting positive selection at amino acid sites},
  journal  = {Genetics},
  volume   = {164},
  number   = {3},
  pages    = {1229--1236},
  year     = {2003},
  month    = jul,
  language = {en},
  abstract = {Maximum-likelihood methods based on models of codon substitution accounting for
    heterogeneous selective pressures across sites have proved to be powerful in detecting
    positive selection in protein-coding DNA sequences. Those methods are phylogeny based and
    do not account for the effects of recombination. When recombination occurs, such as in
    population data, no unique tree topology can describe the evolutionary history of the
    whole sequence. This violation of assumptions raises serious concerns about the
    likelihood method for detecting positive selection. Here we use computer simulation to
    evaluate the reliability of the likelihood-ratio test (LRT) for positive selection in the
    presence of recombination. We examine three tests based on different models of variable
    selective pressures among sites. Sequences are simulated using a coalescent model with
    recombination and analyzed using codon-based likelihood models ignoring recombination. We
    find that the LRT is robust to low levels of recombination (with fewer than three
    recombination events in the history of a sample of 10 sequences). However, at higher
    levels of recombination, the type I error rate can be as high as 90\%, especially when
    the null model in the LRT is unrealistic, and the test often mistakes recombination as
    evidence for positive selection. The test that compares the more realistic models M7
    (beta) against M8 (beta and omega) is more robust to recombination, where the null model
    M7 allows the positive selection pressure to vary between 0 and 1 (and so does not
    account for positive selection), and the alternative model M8 allows an additional
    discrete class with omega = d(N)/d(S) that could be estimated to be >1 (and thus accounts
    for positive selection). Identification of sites under positive selection by the
    empirical Bayes method appears to be less affected than the LRT by recombination.}
}

@article{Browning2018-nk,
  author   = {Browning, Brian L and Zhou, Ying and Browning, Sharon R},
  title    = {A {One-Penny} Imputed Genome from {Next-Generation} Reference Panels},
  journal  = {American Journal of Human Genetics},
  volume   = {103},
  number   = {3},
  pages    = {338--348},
  year     = {2018},
  month    = sep,
  keywords = {GWAS; genome-wide association study; genotype imputation},
  language = {en},
  abstract = {Genotype imputation is commonly performed in genome-wide association studies
    because it greatly increases the number of markers that can be tested for association
    with a trait. In general, one should perform genotype imputation using the largest
    reference panel that is available because the number of accurately imputed variants
    increases with reference panel size. However, one impediment to using larger reference
    panels is the increased computational cost of imputation. We present a new genotype
    imputation method, Beagle 5.0, which greatly reduces the computational cost of imputation
    from large reference panels. We compare Beagle 5.0 with Beagle 4.1, Impute4, Minimac3,
    and Minimac4 using 1000 Genomes Project data, Haplotype Reference Consortium data, and
    simulated data for 10k, 100k, 1M, and 10M reference samples. All methods produce nearly
    identical accuracy, but Beagle 5.0 has the lowest computation time and the best scaling
    of computation time with increasing reference panel size. For 10k, 100k, 1M, and 10M
    reference samples and 1,000 phased target samples, Beagle 5.0's computation time is
    3$\times$ (10k), 12$\times$ (100k), 43$\times$ (1M), and 533$\times$ (10M) faster than
    the fastest alternative method. Cost data from the Amazon Elastic Compute Cloud show that
    Beagle 5.0 can perform genome-wide imputation from 10M reference samples into 1,000
    phased target samples at a cost of less than one US cent per sample.}
}

@article{Yi2021-sc,
  author    = {Yi, Kijong and Kim, Su Yeon and Bleazard, Thomas and Kim, Taewoo and
    Youk, Jeonghwan and Ju, Young Seok},
  title     = {Mutational spectrum of {SARS-CoV-2} during the global pandemic},
  journal   = {Experimental \& Molecular Medicine},
  publisher = {Nature Publishing Group},
  volume    = {53},
  number    = {8},
  pages     = {1229--1237},
  year      = {2021},
  month     = aug,
  language  = {en},
  abstract  = {Viruses accumulate mutations under the influence of natural selection and
    host--virus interactions. Through a systematic comparison of 351,525 full viral genome
    sequences collected during the recent COVID-19 pandemic, we reveal the spectrum of
    SARS-CoV-2 mutations. Unlike those of other viruses, the mutational spectrum of
    SARS-CoV-2 exhibits extreme asymmetry, with a much higher rate of C>U than U>C
    substitutions, as well as a higher rate of G>U than U>G substitutions. This suggests
    directional genome sequence evolution during transmission. The substantial asymmetry and
    directionality of the mutational spectrum enable pseudotemporal tracing of SARS-CoV-2
    without prior information about the root sequence, collection time, and sampling region.
    This shows that the viral genome sequences collected in Asia are similar to the original
    genome sequence. Adjusted estimation of the dN/dS ratio accounting for the asymmetrical
    mutational spectrum also shows evidence of negative selection on viral genes, consistent
    with previous reports. Our findings provide deep insights into the mutational processes
    in SARS-CoV-2 viral infection and advance the understanding of the history and future
    evolution of the virus. Sequencing the genetic material of >350,000 samples of the
    SARS-CoV-2 virus responsible for the COVID-19 pandemic has revealed details of the
    spectrum of mutations occurring in the recent viral transmission. The analysis was
    performed by researchers in South Korea and the UK, led by Young Seok Ju at the Korea
    Advanced Institute for Science and Technology, Daejeon. The results confirm that
    mutations in the SARS-CoV-2 genome are being shaped in a specific pattern: the nucleic
    acid bases cytosine and guanine have been replaced by uracil much more often than its
    reverse. Overall, the genome diversity of SARS-CoV-2 has not yet reached equilibrium,
    suggesting that the virus entered the human population very recently and is currently
    evolving rapidly. The findings enhance understanding of the history of the virus and help
    predict possibilities for its future evolution.}
}

@article{Ignatieva2021-rg,
  author   = {Ignatieva, Anastasia and Lyngs{\o}, Rune B and Jenkins, Paul A and Hein, Jotun},
  title    = {{KwARG}: {P}arsimonious reconstruction of ancestral recombination graphs
    with recurrent mutation},
  journal  = {Bioinformatics},
  volume   = {37},
  number   = {19},
  pages    = {3277--3284},
  year     = {2021},
  month    = may,
  language = {en},
  abstract = {MOTIVATION: The reconstruction of possible histories given a sample of genetic
    data in the presence of recombination and recurrent mutation is a challenging problem,
    but can provide key insights into the evolution of a population. We present KwARG, which
    implements a parsimony-based greedy heuristic algorithm for finding plausible
    genealogical histories (ancestral recombination graphs) that are minimal or near-minimal
    in the number of posited recombination and mutation events. RESULTS: Given an input
    dataset of aligned sequences, KwARG outputs a list of possible candidate solutions, each
    comprising a list of mutation and recombination events that could have generated the
    dataset; the relative proportion of recombinations and recurrent mutations in a solution
    can be controlled via specifying a set of 'cost' parameters. We demonstrate that the
    algorithm performs well when compared against existing methods. AVAILABILITY: The
    software is available at https://github.com/a-ignatieva/kwarg. SUPPLEMENTARY INFORMATION:
    Supplementary materials are available at Bioinformatics online.}
}


@article{Ralph2020-efficiently,
  author    = {Ralph, Peter and Thornton, Kevin and Kelleher, Jerome},
  title     = {Efficiently summarizing relationships in large samples: {A} general duality between statistics of genealogies and genomes},
  journal   = {Genetics},
  volume    = {215},
  number    = {3},
  pages     = {779--797},
  year      = {2020},
  publisher = {Oxford University Press}
}

@InProceedings{Griffiths1991-two,
    author = {Griffiths, Robert C},
    title = {The two-locus ancestral graph},
    booktitle = {Selected Proceedings of the Sheffield Symposium on Applied Probability},
    series = {IMS Lecture Notes--Monograph Series},
    year = 1991,
    volume= 18,
    publisher = {IMS},
    address = {Hayward, CA},
    editor = {Basawa, I. V. and Taylor, R. L.},
    pages = {100--117},
}


% The key says 1998 while the year field records 1997; the key is left unchanged
% so that existing citations keep resolving.
@incollection{Griffiths1998-ancestral,
    author = {Griffiths, Robert C and Marjoram, Paul},
    title = {An ancestral recombination graph},
    editor = {Donnelly, P. and Tavar{\'e}, S.},
    booktitle = {Progress in Population Genetics and Human Evolution},
    series = {IMA Volumes in Mathematics and its Applications},
    volume = {87},
    publisher = {Springer-Verlag},
    address = {Berlin},
    year = {1997},
    pages = {257--270},
}

@article{Hudson1983-properties,
  author  = {Hudson, Richard R.},
  title   = {Properties of a neutral allele model with intragenic recombination},
  journal = {Theoretical Population Biology},
  volume  = {23},
  pages   = {183--201},
  year    = {1983}
}

@article{Minichiello2006-mapping,
  author    = {Minichiello, Mark J and Durbin, Richard},
  title     = {Mapping trait loci by use of inferred ancestral recombination graphs},
  journal   = {The American Journal of Human Genetics},
  volume    = {79},
  number    = {5},
  pages     = {910--922},
  year      = {2006},
  publisher = {Elsevier}
}

@misc{Wong2023-efficient,
  author = {Wong, Yan and Ignatieva, Anastasia and Koskela, Jere and
    Gorjanc, Gregor and Wohns, Anthony W and Kelleher, Jerome},
  title  = {A general and efficient representation of Ancestral Recombination Graphs},
  year   = {2023},
  note   = {In preparation}
}

@misc{Palmer2023-efficient,
  author = {Palmer, Duncan S and Wong, Yan and Kelleher, Jerome},
  title  = {Efficient {Li and Stephens} on Ancestral Recombination Graphs},
  year   = {2023},
  note   = {In preparation}
}

@article{Haller2018-tree,
  author    = {Haller, Benjamin C and Galloway, Jared and Kelleher, Jerome and Messer, Philipp W and Ralph, Peter L},
  title     = {Tree-sequence recording in {SLiM} opens new horizons for forward-time simulation of whole genomes},
  journal   = {Molecular Ecology Resources},
  volume    = {19},
  number    = {2},
  pages     = {552--566},
  year      = {2019},
  publisher = {Wiley Online Library}
}


@misc{Tskit2023-tskit,
  author = {{Tskit developers}},
  title  = {Tskit: {A} portable library for population scale genealogical analysis},
  year   = {2023},
  note   = {In preparation}
}

@article{Harris2020-array,
  author    = {Harris, Charles R and Millman, K Jarrod and van der Walt, St{\'e}fan J and
    Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and
    Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J and others},
  title     = {Array programming with {NumPy}},
  journal   = {Nature},
  volume    = {585},
  number    = {7825},
  pages     = {357--362},
  year      = {2020},
  publisher = {Nature Publishing Group}
}

@inproceedings{Kluyver2016-jupyter,
    author = {Kluyver, Thomas and Ragan-Kelley, Benjamin and P{\'e}rez, Fernando
        and Granger, Brian and Bussonnier, Matthias and Frederic, Jonathan and
        Kelley, Kyle and Hamrick, Jessica and Grout, Jason and Corlay, Sylvain
        and Ivanov, Paul and Avila, Dami{\'a}n and Abdalla, Safia and
        Willing, Carol},
    title = {{Jupyter Notebooks}---a publishing format for reproducible
        computational workflows},
    booktitle = {Positioning and Power in Academic Publishing: Players, Agents
        and Agendas},
    editor = {Loizides, F. and Schmidt, B.},
    publisher = {IOS Press},
    address = {Amsterdam},
    pages = {87--90},
    year = {2016}
}

@article{Haller2019-slim,
  author    = {Haller, Benjamin C and Messer, Philipp W},
  title     = {{SLiM} 3: {F}orward genetic simulations beyond the {Wright--Fisher} model},
  journal   = {Molecular Biology and Evolution},
  volume    = {36},
  number    = {3},
  pages     = {632--637},
  year      = {2019},
  publisher = {Oxford University Press}
}

@incollection{Mcvean2019-linkage,
author = {McVean, Gil and Kelleher, Jerome},
editor = {Balding, David and Moltke, Ida and Marioni, John},
publisher = {Wiley},
title = {Linkage Disequilibrium, Recombination and Haplotype Structure},
booktitle = {Handbook of Statistical Genomics},
chapter = {2},
pages = {51--86},
year = {2019},
address = {Hoboken, NJ},
}

@article{Korfmann2022-weak,
    title={Weak seed banks influence the signature and detectability of
        selective sweeps},
    author={Korfmann, Kevin and Awad, Diala Abu and Tellier, Aur{\'e}lien},
    journal={bioRxiv},
    year={2022},
    url={https://doi.org/10.1101/2022.04.26.489499},
    publisher={Cold Spring Harbor Laboratory}
}

@article{Mahmoudi2022-bayesian,
  author    = {Mahmoudi, Ali and Koskela, Jere and Kelleher, Jerome and Chan, Yao-ban and Balding, David},
  title     = {Bayesian inference of ancestral recombination graphs},
  journal   = {PLOS Computational Biology},
  volume    = {18},
  number    = {3},
  pages     = {e1009960},
  year      = {2022},
  publisher = {Public Library of Science San Francisco, CA USA}
}

@article{Petr2022-slendr,
    title={Slendr: {A} framework for spatio-temporal population genomic
        simulations on geographic landscapes},
    author={Petr, Martin and Haller, Benjamin C and Ralph, Peter L and
        Racimo, Fernando},
    journal={bioRxiv},
    url = {https://doi.org/10.1101/2022.03.20.485041},
    year={2022},
    publisher={Cold Spring Harbor Laboratory}
}

@article{Rasmussen2022-espalier,
    title={Espalier: {E}fficient tree reconciliation and {ARG} reconstruction
        using maximum agreement forests},
    author={Rasmussen, David A and Guo, Fangfang},
    journal={bioRxiv},
    year={2022},
    url = {https://doi.org/10.1101/2022.01.17.476639},
    publisher={Cold Spring Harbor Laboratory}
}

@article{Terasaki2021-geonomics,
  author    = {Terasaki Hart, Drew E and Bishop, Anusha P and Wang, Ian J},
  title     = {Geonomics: {F}orward-time, spatially explicit, and arbitrarily complex landscape genomic simulations},
  journal   = {Molecular Biology and Evolution},
  volume    = {38},
  number    = {10},
  pages     = {4634--4646},
  year      = {2021},
  publisher = {Oxford University Press}
}

@article{Fan2022-genealogical,
  author    = {Fan, Caoqi and Mancuso, Nicholas and Chiang, Charleston WK},
  title     = {A genealogical estimate of genetic relationships},
  journal   = {The American Journal of Human Genetics},
  volume    = {109},
  number    = {5},
  pages     = {812--824},
  year      = {2022},
  publisher = {Elsevier}
}

@article{OToole2021-assignment,
  title={Assignment of epidemiological lineages in an emerging pandemic using the pangolin tool},
  author={O'Toole, {\'A}ine and Scher, Emily and Underwood, Anthony and Jackson, Ben and Hill, Verity and McCrone, John T and Colquhoun, Rachel and Ruis, Chris and Abu-Dahab, Khalil and Taylor, Ben and others},
  journal={Virus Evolution},
  volume={7},
  number={2},
  pages={veab064},
  year={2021},
  publisher={Oxford University Press UK}
}

@article{Pauli2020-scipy,
    author  = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
        Haberland, Matt and Reddy, Tyler and Cournapeau, David and
        Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
        Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
        Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
        Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
        Kern, Robert and Larson, Eric and Carey, C J and
        Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
        {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
        Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
        Harris, Charles R. and Archibald, Anne M. and
        Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
        {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
    title   = {{SciPy} 1.0: {F}undamental Algorithms for Scientific
        Computing in {Python}},
    journal = {Nature Methods},
    year    = {2020},
    volume  = {17},
    pages   = {261--272},
}

@book{Felsenstein2004-inferring,
  author    = {Felsenstein, J.},
  title     = {Inferring Phylogenies},
  publisher = {Sinauer Associates},
  address   = {Sunderland, MA},
  year      = {2004}
}

@article{Su2016-epidemiology,
  author    = {Su, Shuo and Wong, Gary and Shi, Weifeng and Liu, Jun and
    Lai, Alexander CK and Zhou, Jiyong and Liu, Wenjun and Bi, Yuhai and
    Gao, George F},
  title     = {Epidemiology, genetic recombination, and pathogenesis of coronaviruses},
  journal   = {Trends in Microbiology},
  volume    = {24},
  number    = {6},
  pages     = {490--502},
  year      = {2016},
  publisher = {Elsevier}
}

@article{Simon2011-rna,
  author    = {Simon-Loriere, Etienne and Holmes, Edward C},
  title     = {Why do {RNA} viruses recombine?},
  journal   = {Nature Reviews Microbiology},
  volume    = {9},
  number    = {8},
  pages     = {617--626},
  year      = {2011},
  publisher = {Nature Publishing Group UK London}
}


@article{Gribble2021-coronavirus,
  author    = {Gribble, Jennifer and Stevens, Laura J and Agostini, Maria L and
    Anderson-Daniels, Jordan and Chappell, James D and Lu, Xiaotao and
    Pruijssers, Andrea J and Routh, Andrew L and Denison, Mark R},
  title     = {The coronavirus proofreading exoribonuclease mediates extensive viral recombination},
  journal   = {PLoS Pathogens},
  volume    = {17},
  number    = {1},
  pages     = {e1009226},
  year      = {2021},
  publisher = {Public Library of Science San Francisco, CA USA}
}

@article{Nie2020-phylogenetic,
  author    = {Nie, Qing and Li, Xingguang and Chen, Wei and Liu, Dehui and
    Chen, Yingying and Li, Haitao and Li, Dongying and Tian, Mengmeng and
    Tan, Wei and Zai, Junjie},
  title     = {Phylogenetic and phylodynamic analyses of {SARS-CoV-2}},
  journal   = {Virus Research},
  volume    = {287},
  pages     = {198098},
  year      = {2020},
  publisher = {Elsevier}
}

@article{Tang2020-origin,
  author    = {Tang, Xiaolu and Wu, Changcheng and Li, Xiang and Song, Yuhe and
    Yao, Xinmin and Wu, Xinkai and Duan, Yuange and Zhang, Hong and
    Wang, Yirong and Qian, Zhaohui and others},
  title     = {On the origin and continuing evolution of {SARS-CoV-2}},
  journal   = {National Science Review},
  volume    = {7},
  number    = {6},
  pages     = {1012--1023},
  year      = {2020},
  publisher = {Oxford University Press}
}

@article{de2023maximum,
  author    = {De Maio, Nicola and Kalaghatgi, Prabhav and Turakhia, Yatish and Corbett-Detig, Russell and Minh, Bui Quang and Goldman, Nick},
  title     = {Maximum likelihood pandemic-scale phylogenetics},
  journal   = {Nature Genetics},
  volume    = {55},
  pages     = {746--752},
  year      = {2023},
  publisher = {Springer Science and Business Media LLC}
}

@article{to2016fast,
  author    = {To, Thu-Hien and Jung, Matthieu and Lycett, Samantha and Gascuel, Olivier},
  title     = {Fast dating using least-squares criteria and algorithms},
  journal   = {Systematic Biology},
  volume    = {65},
  number    = {1},
  pages     = {82--97},
  year      = {2016},
  publisher = {Oxford University Press}
}

@article{rasmussen2014genome,
  author    = {Rasmussen, Matthew D and Hubisz, Melissa J and Gronau, Ilan and Siepel, Adam},
  title     = {Genome-wide inference of ancestral recombination graphs},
  journal   = {PLoS Genetics},
  volume    = {10},
  number    = {5},
  pages     = {e1004342},
  year      = {2014},
  publisher = {Public Library of Science San Francisco, USA}
}

@article{boni2007exact,
  author    = {Boni, Maciej F and Posada, David and Feldman, Marcus W},
  title     = {An exact nonparametric method for inferring mosaic structure in sequence triplets},
  journal   = {Genetics},
  volume    = {176},
  number    = {2},
  pages     = {1035--1047},
  year      = {2007},
  publisher = {Oxford University Press}
}

% NOTE(review): this appears to be the same article as the Yang2021-characterizing
% entry in this file (same authors, journal, volume, issue and pages); the year
% fields differ (2020 here, 2021 there) - confirm and consolidate.
@article{Yang2020-ct,
    author = {Yang, Yiyan and Yan, Wei and Hall, A Brantley and Jiang, Xiaofang},
    title = {Characterizing Transcriptional Regulatory Sequences in Coronaviruses and Their Role in Recombination},
    journal = {Molecular Biology and Evolution},
    volume = {38},
    number = {4},
    pages = {1241--1248},
    year = {2020},
    month = nov,
    abstract = {Novel coronaviruses, including SARS-CoV-2, SARS, and MERS, often originate from recombination events. The mechanism of recombination in RNA viruses is template switching. Coronavirus transcription also involves template switching at specific regions, called transcriptional regulatory sequences (TRS). It is hypothesized but not yet verified that TRS sites are prone to recombination events. Here, we developed a tool called SuPER to systematically identify TRS in coronavirus genomes and then investigated whether recombination is more common at TRS. We ran SuPER on 506 coronavirus genomes and identified 465 TRS-L and 3,509 TRS-B. We found that the TRS-L core sequence (CS) and the secondary structure of the leader sequence are generally conserved within coronavirus genera but different between genera. By examining the location of recombination breakpoints with respect to TRS-B CS, we observed that recombination hotspots are more frequently colocated with TRS-B sites than expected.},
    issn = {1537-1719},
    doi = {10.1093/molbev/msaa281},
    eprint = {https://academic.oup.com/mbe/article-pdf/38/4/1241/37042119/msaa281.pdf},
}


@article{Gallaher2020-mr,
	abstract = {Much remains unknown concerning the origin of the novel pandemic coronavirus that has raged across the globe since emerging in Wuhan of Hubei province, near the center of the People's Republic of China, in December of 2019. All current members of the family Coronaviridae have arisen by a combination of incremental adaptive mutations, against the backdrop of many recombinational events throughout the past, rendering each a unique mosaic of RNA sequences from diverse sources. The consensus among virologists is that the base sequence of the novel coronavirus, designated SARS-CoV-2, was derived from a common ancestor of a bat coronavirus, represented by the strain RaTG13, isolated in Yunnan province in 2013. Into that ancestral genetic background, several recombination events have since occurred from other divergent bat-derived coronaviruses, resulting in localized discordance between the two. One such event left SARS-CoV-2 with a receptor binding domain (RBD) capable of binding the human ACE-2 receptor lacking in RaTG13, and a second event uniquely added to SARS-CoV-2 a site specific for furin, capable of efficient endoproteolytic cleavage and activation of the spike glycoprotein responsible for virus entry and cell fusion. This paper demonstrates by bioinformatic analysis that such recombinational events are facilitated by short oligonucleotide ``breakpoint sequences'', similar to CAGAC, that direct recombination naturally to certain positions in the genome at the boundaries between blocks of RNA code and potentially RNA structure. This ``breakpoint sequence hypothesis'' provides a natural explanation for the biogenesis of SARS-CoV-2 over time and in the wild.},
	author = {Gallaher, William R.},
	da = {2020/10/01},
	date-added = {2023-06-01 15:39:42 +0100},
	date-modified = {2023-06-01 15:39:42 +0100},
	doi = {10.1007/s00705-020-04750-z},
	id = {Gallaher2020},
	issn = {1432-8798},
	journal = {Archives of Virology},
	number = {10},
	pages = {2341--2348},
	title = {A palindromic {RNA} sequence as a common breakpoint contributor to copy-choice recombination in {SARS-CoV-2}},
	ty = {JOUR},
	volume = {165},
	year = {2020},
	Bdsk-Url-1 = {https://doi.org/10.1007/s00705-020-04750-z}}

@article{Hodcroft2021-wt,
  author    = {Hodcroft, Emma B and De Maio, Nicola and Lanfear, Rob and
    MacCannell, Duncan R and Minh, Bui Quang and Schmidt, Heiko A and
    Stamatakis, Alexandros and Goldman, Nick and Dessimoz, Christophe},
  title     = {Want to track pandemic variants faster? {F}ix the bioinformatics bottleneck},
  journal   = {Nature},
  volume    = {591},
  number    = {7848},
  pages     = {30--33},
  year      = {2021},
  publisher = {Nature Publishing Group UK London}
}

@article{Yang2021-characterizing,
  author    = {Yang, Yiyan and Yan, Wei and Hall, A Brantley and Jiang, Xiaofang},
  title     = {Characterizing transcriptional regulatory sequences in coronaviruses and their role in recombination},
  journal   = {Molecular Biology and Evolution},
  volume    = {38},
  number    = {4},
  pages     = {1241--1248},
  year      = {2021},
  publisher = {Oxford University Press}
}

% The pages field holds the article identifier "e01006-21" (a single ID with a
% hyphen), not a page range.
@article{Zou2021-sars,
  title={The {SARS-CoV-2} transcriptome and the dynamics of the {S} gene furin cleavage site in primary human airway epithelia},
  author={Zou, Wei and Xiong, Min and Hao, Siyuan and Zhang, Elizabeth Yan and Baumlin, Nathalie and Kim, Michael D and Salathe, Matthias and Yan, Ziying and Qiu, Jianming},
  journal={mBio},
  volume={12},
  number={3},
  pages={e01006-21},
  year={2021},
  publisher={Am Soc Microbiol}
}

@article{Bloom2023-fitness,
  author    = {Bloom, Jesse D and Neher, Richard A},
  title     = {Fitness effects of mutations to {SARS-CoV-2} proteins},
  journal   = {bioRxiv},
  year      = {2023},
  publisher = {Cold Spring Harbor Laboratory},
  url       = {https://doi.org/10.1101/2023.01.30.526314}
}

@article{Abbas2022-reconstruction,
    title={Reconstruction of transmission chains of {SARS-CoV-2} amidst
        multiple outbreaks in a geriatric acute-care hospital: {A} combined
            retrospective epidemiological and genomic study},
    author={Abbas, Mohamed and Cori, Anne and Cordey, Samuel and Laubscher, Florian and Robalo Nunes, Tom{\'a}s and Myall, Ashleigh and Salamun, Julien and Huber, Philippe and Zekry, Dina and Prendki, Virginie and Iten, Anne and Vieux, Laure and Sauvan, Val{\'e}rie and Graf, Christophe E and Harbarth, Stephan},
    journal={eLife},
    volume={11},
    pages={e76854},
    year={2022},
    publisher={eLife Sciences Publications, Ltd}
}

% NOTE(review): the consortium author was exported as "COVID, Canadian"; it is
% restored here as a braced corporate name - confirm the exact wording against
% the published author list.
@article{Mclaughlin2022-genomic,
    title={Genomic epidemiology of the first two waves of {SARS-CoV-2} in
        {Canada}},
    author={McLaughlin, Angela and Montoya, Vincent and Miller, Rachel L
        and Mordecai, Gideon J and
        {Canadian COVID-19 Genomics Network (CanCOGeN) Consortium} and
        Worobey, Michael and Poon, Art FY and Joy, Jeffrey B and others},
    journal={eLife},
    volume={11},
    pages={e73896},
    year={2022},
    publisher={eLife Sciences Publications Limited}
}

@misc{Smith2023-identifying,
  author = {Smith, E and Wright, S and Libuit, K},
  title  = {Identifying {SARS-CoV-2} Recombinants},
  year   = {2023},
  url    = {https://pha4ge.org/resource/identifying-sars-cov-2-recombinants},
  note   = {Accessed: 2023-06-02}
}

% bioRxiv preprint. The url value has no surrounding whitespace (a stray
% space can end up in the generated link), and the DOI is also given bare in
% the doi field.
@article{deBernardiSchneider2023-sars,
  title={{SARS-CoV-2} lineage assignments using phylogenetic placement/{UShER} are superior to {pangoLEARN} machine learning method},
  author={De Bernardi Schneider, Adriano and Su, Michelle and Hinrichs, Angie S and Wang, Jade and Amin, Helly and Bell, John and Wadford, Debra A and O'Toole, {\'A}ine and Scher, Emily and Perry, Marc D and Turakhia, Yatish and De Maio, Nicola and Hughes, Scott and Corbett-Detig, Russ},
  journal={bioRxiv},
  year={2023},
  doi={10.1101/2023.05.26.542489},
  url={https://doi.org/10.1101/2023.05.26.542489},
  publisher={Cold Spring Harbor Laboratory}
}

% Publisher name and publisher city are kept in separate fields (publisher /
% address) so each style can place or omit the location itself.
@article{Nielsen2023-host,
    title={Host heterogeneity and epistasis explain punctuated evolution of {SARS-CoV-2}},
    author={Nielsen, Bjarke Frost and Saad-Roy, Chadi M and Li, Yimei and
        Sneppen, Kim and Simonsen, Lone and Viboud, C{\'e}cile and Levin,
        Simon A and Grenfell, Bryan T},
    journal={PLOS Computational Biology},
    volume={19},
    number={2},
    pages={e1010896},
    year={2023},
    publisher={Public Library of Science},
    address={San Francisco, CA, USA}
}

% Journal publication: typed as an article so that the journal, volume,
% number and pages fields are actually printed (misc-type styles ignore them).
@article{Corey2021-sars,
    title={{SARS-CoV-2} variants in patients with immunosuppression},
    author={Corey, Lawrence and Beyrer, Chris and Cohen, Myron S and
        Michael, Nelson L and Bedford, Trevor and Rolland, Morgane},
    journal={New England Journal of Medicine},
    volume={385},
    number={6},
    pages={562--566},
    year={2021},
    publisher={Massachusetts Medical Society}
}

% Note: the following preprint was subsequently published. arXiv comments field:
% 34 pages. To appear in Bingham, N. H., and Goldie, C. M. (eds), Probability and Mathematical Genetics: Papers in Honour of Sir John Kingman. London Math. Soc. Lecture Note Series vol. 378. Cambridge: Cambridge Univ. Press
% arXiv preprint. The identifier is given in machine-readable form
% (eprint / archiveprefix / url); the journal field is kept so that styles
% which ignore eprint data still print where the preprint lives.
@article{Donnelly2010-coalescent,
  title={The coalescent and its descendants},
  author={Donnelly, Peter and Leslie, Stephen},
  journal={arXiv preprint arXiv:1006.1514},
  year={2010},
  eprint={1006.1514},
  archiveprefix={arXiv},
  url={https://arxiv.org/abs/1006.1514}
}

% Acronyms and proper nouns in the title are brace-protected so styles that
% sentence-case titles keep their capitals. Publisher and city are separate
% fields. NOTE(review): the compound surname "Evans Pena" is entered in
% comma form; confirm against the published author list.
@article{McCrone2022-context,
  title={Context-specific emergence and growth of the {SARS-CoV-2} {Delta} variant},
  author={McCrone, John T and Hill, Verity and Bajaj, Sumali and Evans Pena, Rosario and Lambert, Ben C and Inward, Rhys and Bhatt, Samir and Volz, Erik and Ruis, Christopher and Dellicour, Simon and others},
  journal={Nature},
  volume={610},
  pages={154--160},
  year={2022},
  publisher={Nature Publishing Group UK},
  address={London}
}