-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'tmorse-nt-compression' into phoenix/parrallel-split-by-…
…taxid
- Loading branch information
Showing
5 changed files
with
169 additions
and
162 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
""" | ||
Generate loc db | ||
Generate Loc DB for NT/NR | ||
""" | ||
import sys | ||
import logging | ||
from collections import namedtuple | ||
|
||
import marisa_trie | ||
|
||
|
||
MILLION = 1_000_000 | ||
|
||
|
||
def _accession_id_and_name(header_line): | ||
# Sometimes multiple accessions will be mapped to a single sequence. | ||
# In this case, they will be separated by the \x01 char. | ||
# To get the accession name, just use the string until the first \x01. | ||
parts = header_line.split('\x01', 1)[0].split(' ', 1) | ||
accession_id = parts[0][1:] # Remove the '>' | ||
accession_name = parts[1] if len(parts) > 1 else '' | ||
return accession_id, accession_name | ||
|
||
|
||
Accession = namedtuple('Accession', [ | ||
'accession_id', | ||
'accession_name', | ||
'seq_offset', | ||
'header_len', | ||
'seq_len', | ||
'seq_bp_len', | ||
]) | ||
|
||
|
||
def _extract_accessions(db_file: str): | ||
""" | ||
Extracts ID, name, length, and offset information from an NCBI database fasta (NR/NT). | ||
Biopython's fasta parser is not used here because it might remove line break and | ||
spacing information that is critical to getting the correct offset. | ||
""" | ||
with open(db_file) as dbf: | ||
seq_offset = 0 | ||
seq_len = 0 | ||
seq_bp_len = 0 | ||
header_len = 0 | ||
lines = 0 | ||
accession_id = "" | ||
accession_name = "" | ||
for line in dbf: | ||
lines += 1 | ||
if lines % MILLION == 0: | ||
logging.info(f"{lines/MILLION}M lines") | ||
if line[0] == '>': # header line | ||
if seq_len > 0 and len(accession_id) > 0: | ||
yield Accession(accession_id, accession_name, seq_offset, header_len, seq_len, seq_bp_len) | ||
|
||
seq_offset = seq_offset + header_len + seq_len | ||
header_len = len(line) | ||
seq_len = 0 | ||
seq_bp_len = 0 | ||
accession_id, accession_name = _accession_id_and_name(line) | ||
else: | ||
seq_len += len(line) | ||
seq_bp_len += len(line.strip()) | ||
if seq_len > 0 and len(accession_id) > 0: | ||
yield Accession(accession_id, accession_name, seq_offset, header_len, seq_len, seq_bp_len) | ||
|
||
|
||
def generate_loc_db(db_file, loc_db_file): | ||
loc_dict = marisa_trie.RecordTrie( | ||
# unsigned long (8 bytes) followed by 2 unsigned integers (4 bytes) | ||
# https://docs.python.org/3/library/struct.html#format-characters | ||
"QII", | ||
((a.accession_id, (a.seq_offset, a.header_len, a.seq_len)) for a in _extract_accessions(db_file)), | ||
) | ||
loc_dict.save(loc_db_file) | ||
|
||
|
||
def generate_info_db(db_file, info_db_file): | ||
info_dict = marisa_trie.RecordTrie( | ||
# 256 character string followed by an unsigned integer (4 bytes) | ||
# https://docs.python.org/3/library/struct.html#format-characters | ||
"256pI", | ||
((a.accession_id, (a.accession_name.rstrip().encode(), a.seq_bp_len)) for a in _extract_accessions(db_file)), | ||
) | ||
info_dict.save(info_db_file) | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(level=logging.INFO) | ||
command = sys.argv[1] | ||
db_file = sys.argv[2] | ||
index_file = sys.argv[3] | ||
|
||
if command == 'loc': | ||
generate_loc_db(db_file, index_file) | ||
elif command == 'info': | ||
generate_info_db(db_file, index_file) | ||
else: | ||
print(f"Unknown command {command}, expected " + " or ".join(['loc', 'info'])) | ||
exit(1) |
Oops, something went wrong.