Skip to content

Commit

Permalink
Merge branch 'tmorse-nt-compression' into phoenix/parrallel-split-by-…
Browse files Browse the repository at this point in the history
…taxid
  • Loading branch information
phoenixAja authored Jan 18, 2024
2 parents 83292a1 + 12bb2d6 commit e1cf511
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 162 deletions.
64 changes: 0 additions & 64 deletions workflows/index-generation/generate_loc_db.py

This file was deleted.

102 changes: 102 additions & 0 deletions workflows/index-generation/generate_ncbi_db_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""
Generate loc db
Generate Loc DB for NT/NR
"""
import sys
import logging
from collections import namedtuple

import marisa_trie


MILLION = 1_000_000


def _accession_id_and_name(header_line):
# Sometimes multiple accessions will be mapped to a single sequence.
# In this case, they will be separated by the \x01 char.
# To get the accession name, just use the string until the first \x01.
parts = header_line.split('\x01', 1)[0].split(' ', 1)
accession_id = parts[0][1:] # Remove the '>'
accession_name = parts[1] if len(parts) > 1 else ''
return accession_id, accession_name


Accession = namedtuple('Accession', [
'accession_id',
'accession_name',
'seq_offset',
'header_len',
'seq_len',
'seq_bp_len',
])


def _extract_accessions(db_file: str):
"""
Extracts ID, name, length, and offset information from an NCBI database fasta (NR/NT).
Biopython's fasta parser is not used here because it might remove line break and
spacing information that is critical to getting the correct offset.
"""
with open(db_file) as dbf:
seq_offset = 0
seq_len = 0
seq_bp_len = 0
header_len = 0
lines = 0
accession_id = ""
accession_name = ""
for line in dbf:
lines += 1
if lines % MILLION == 0:
logging.info(f"{lines/MILLION}M lines")
if line[0] == '>': # header line
if seq_len > 0 and len(accession_id) > 0:
yield Accession(accession_id, accession_name, seq_offset, header_len, seq_len, seq_bp_len)

seq_offset = seq_offset + header_len + seq_len
header_len = len(line)
seq_len = 0
seq_bp_len = 0
accession_id, accession_name = _accession_id_and_name(line)
else:
seq_len += len(line)
seq_bp_len += len(line.strip())
if seq_len > 0 and len(accession_id) > 0:
yield Accession(accession_id, accession_name, seq_offset, header_len, seq_len, seq_bp_len)


def generate_loc_db(db_file, loc_db_file):
loc_dict = marisa_trie.RecordTrie(
# unsigned long (8 bytes) followed by 2 unsigned integers (4 bytes)
# https://docs.python.org/3/library/struct.html#format-characters
"QII",
((a.accession_id, (a.seq_offset, a.header_len, a.seq_len)) for a in _extract_accessions(db_file)),
)
loc_dict.save(loc_db_file)


def generate_info_db(db_file, info_db_file):
info_dict = marisa_trie.RecordTrie(
# 256 character string followed by an unsigned integer (4 bytes)
# https://docs.python.org/3/library/struct.html#format-characters
"256pI",
((a.accession_id, (a.accession_name.rstrip().encode(), a.seq_bp_len)) for a in _extract_accessions(db_file)),
)
info_dict.save(info_db_file)


if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
command = sys.argv[1]
db_file = sys.argv[2]
index_file = sys.argv[3]

if command == 'loc':
generate_loc_db(db_file, index_file)
elif command == 'info':
generate_info_db(db_file, index_file)
else:
print(f"Unknown command {command}, expected " + " or ".join(['loc', 'info']))
exit(1)
Loading

0 comments on commit e1cf511

Please sign in to comment.