Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add genbank parser to python #87

Merged
merged 2 commits into from
Aug 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Adds genbank parsing to python package. Release version 0.1.5 of dnadesign python. [#87](https://github.com/Koeng101/dnadesign/pull/87)
- Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86)
- Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
- Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)
Expand Down
15 changes: 14 additions & 1 deletion py/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
# DnaDesign (Python)
This directory contains code for allowing python users to use dnadesign through a shared C library.

This is a work-in-progress. Right now, we have only ported the fasta parser.
This is a work-in-progress. Right now, we have only ported the fasta, fastq, and genbank parsers.

For genbank files, we have this benchmark with bsub.gbk:
```
Running 10 iterations for each parser...

Results:
DNA design average time: 0.337005 seconds
BioPython average time: 0.332007 seconds

DNA design is 0.99x faster than BioPython
```

Turns out, we're just about as performant!

### Other platforms
If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.
Expand Down
41 changes: 41 additions & 0 deletions py/benchmark_against_biopython.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import timeit
import os
from Bio import SeqIO
from dnadesign.parsers import parse_genbank_from_c_file

def benchmark_dnadesign(file_path, num_iterations=10):
def parse_with_dnadesign():
records = parse_genbank_from_c_file(file_path)
return records

time_taken = timeit.timeit(parse_with_dnadesign, number=num_iterations)
return time_taken / num_iterations

def benchmark_biopython(file_path, num_iterations=10):
def parse_with_biopython():
with open(file_path, "r") as handle:
records = list(SeqIO.parse(handle, "genbank"))
return records

time_taken = timeit.timeit(parse_with_biopython, number=num_iterations)
return time_taken / num_iterations

def main():
current_dir = os.path.dirname(__file__)
example_path = os.path.join(current_dir, '../lib/bio/genbank/data/bsub.gbk')

print(f"Benchmarking GenBank parsing for file: {example_path}")
print("Running 10 iterations for each parser...")

dnadesign_time = benchmark_dnadesign(example_path)
biopython_time = benchmark_biopython(example_path)

print(f"\nResults:")
print(f"DNA design average time: {dnadesign_time:.6f} seconds")
print(f"BioPython average time: {biopython_time:.6f} seconds")

speedup = biopython_time / dnadesign_time
print(f"\nDNA design is {speedup:.2f}x faster than BioPython")

if __name__ == "__main__":
main()
98 changes: 98 additions & 0 deletions py/dnadesign/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,101 @@ typedef struct {

FastqResult ParseFastqFromCFile(void* cfile);
FastqResult ParseFastqFromCString(char* cstring);

// Genbank definitions
// GenbankLocation
typedef struct GenbankLocation {
int start;
int end;
int complement;
int join;
int five_prime_partial;
int three_prime_partial;
char* gbk_location_string;
struct GenbankLocation* sub_locations;
int sub_locations_count;
} GenbankLocation;

// GenbankFeature
typedef struct {
char* type_;
char* description;
char** attribute_keys;
char*** attribute_values;
int* attribute_value_counts;
int attribute_count;
char* sequence_hash;
char* sequence_hash_function;
char* sequence;
GenbankLocation location;
} GenbankFeature;

// GenbankReference
typedef struct {
char* authors;
char* title;
char* journal;
char* pub_med;
char* remark;
char* range_;
char* consortium;
} GenbankReference;

// GenbankLocus
typedef struct {
char* name;
char* sequence_length;
char* molecule_type;
char* genbank_division;
char* modification_date;
char* sequence_coding;
int circular;
} GenbankLocus;

// GenbankBaseCount
typedef struct {
char base;
int count;
} GenbankBaseCount;

// GenbankMeta
typedef struct {
char* date;
char* definition;
char* accession;
char* version;
char* keywords;
char* organism;
char* source;
char** taxonomy;
int taxonomy_count;
char* origin;
GenbankLocus locus;
GenbankReference* references;
int reference_count;
GenbankBaseCount* base_counts;
int base_count_count;
char** other_keys;
char** other_values;
int other_count;
char* name;
char* sequence_hash;
char* sequence_hash_function;
} GenbankMeta;

// Genbank
typedef struct {
GenbankMeta meta;
GenbankFeature* features;
int feature_count;
char* sequence;
} Genbank;

typedef struct {
Genbank* records;
int numRecords;
char* error;
} GenbankResult;

GenbankResult ParseGenbankFromCFile(void* cfile);
GenbankResult ParseGenbankFromCString(char* cstring);
209 changes: 209 additions & 0 deletions py/dnadesign/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,212 @@ def _process_fastq_result(result) -> List[FastqRecord]:
optionals
))
return fastq_records

# Genbank time
class GenbankLocus:
def __init__(self, name: str, sequence_length: str, molecule_type: str, genbank_division: str,
modification_date: str, sequence_coding: str, circular: bool):
self.name = name
self.sequence_length = sequence_length
self.molecule_type = molecule_type
self.genbank_division = genbank_division
self.modification_date = modification_date
self.sequence_coding = sequence_coding
self.circular = circular

class GenbankReference:
def __init__(self, authors: str, title: str, journal: str, pub_med: str, remark: str, range_: str, consortium: str):
self.authors = authors
self.title = title
self.journal = journal
self.pub_med = pub_med
self.remark = remark
self.range_ = range_
self.consortium = consortium

class GenbankBaseCount:
def __init__(self, base: str, count: int):
self.base = base
self.count = count

class GenbankMeta:
def __init__(self, date: str, definition: str, accession: str, version: str, keywords: str,
organism: str, source: str, taxonomy: List[str], origin: str, locus: GenbankLocus,
references: List[GenbankReference], base_counts: List[GenbankBaseCount], other: Dict[str, str],
name: str, sequence_hash: str, hash_function: str):
self.date = date
self.definition = definition
self.accession = accession
self.version = version
self.keywords = keywords
self.organism = organism
self.source = source
self.taxonomy = taxonomy
self.origin = origin
self.locus = locus
self.references = references
self.base_counts = base_counts
self.other = other
self.name = name
self.sequence_hash = sequence_hash
self.hash_function = hash_function

# Update the Location class to match the C struct
class GenbankLocation:
def __init__(self, start: int, end: int, complement: bool, join: bool, five_prime_partial: bool,
three_prime_partial: bool, gbk_location_string: str, sub_locations: List['GenbankLocation']):
self.start = start
self.end = end
self.complement = complement
self.join = join
self.five_prime_partial = five_prime_partial
self.three_prime_partial = three_prime_partial
self.gbk_location_string = gbk_location_string
self.sub_locations = sub_locations

class GenbankFeature:
def __init__(self, type_: str, description: str, attributes: Dict[str, List[str]],
sequence_hash: str, hash_function: str, sequence: str, location: GenbankLocation):
self.type_ = type_
self.description = description
self.attributes = attributes
self.sequence_hash = sequence_hash
self.hash_function = hash_function
self.sequence = sequence
self.location = location

class Genbank:
def __init__(self, meta: GenbankMeta, features: List[GenbankFeature], sequence: str):
self.meta = meta
self.features = features
self.sequence = sequence

def _safe_open_file(file_path: str):
if not os.path.exists(file_path):
raise FileNotFoundError(f"The file {file_path} does not exist.")
cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
if cfile == ffi.NULL:
raise IOError(f"Failed to open the file {file_path}.")
return cfile

def parse_genbank_from_c_file(file_path: str) -> List[Genbank]:
try:
cfile = _safe_open_file(file_path)
result = lib.ParseGenbankFromCFile(cfile)
return _process_genbank_result(result)
finally:
if 'cfile' in locals() and cfile != ffi.NULL:
lib.fclose(cfile)

def parse_genbank_from_c_string(cstring: str) -> List[Genbank]:
result = lib.ParseGenbankFromCString(cstring.encode('utf-8'))
return _process_genbank_result(result)

def _process_genbank_result(result) -> List[Genbank]:
if result.error != ffi.NULL:
error_str = ffi.string(result.error).decode('utf-8')
raise Exception("Error parsing Genbank: " + error_str)
num_records = result.numRecords
records = ffi.cast("Genbank*", result.records)
return [_convert_genbank_record(records[i]) for i in range(num_records)]

def _convert_genbank_record(record) -> Genbank:
meta = _convert_meta(record.meta)
features = [_convert_feature(record.features[i]) for i in range(record.feature_count)]
sequence = ffi.string(record.sequence).decode('utf-8')
return Genbank(meta, features, sequence)

def _convert_meta(meta) -> GenbankMeta:
locus = GenbankLocus(
ffi.string(meta.locus.name).decode('utf-8'),
ffi.string(meta.locus.sequence_length).decode('utf-8'),
ffi.string(meta.locus.molecule_type).decode('utf-8'),
ffi.string(meta.locus.genbank_division).decode('utf-8'),
ffi.string(meta.locus.modification_date).decode('utf-8'),
ffi.string(meta.locus.sequence_coding).decode('utf-8'),
meta.locus.circular
)

references = [
GenbankReference(
ffi.string(ref.authors).decode('utf-8'),
ffi.string(ref.title).decode('utf-8'),
ffi.string(ref.journal).decode('utf-8'),
ffi.string(ref.pub_med).decode('utf-8'),
ffi.string(ref.remark).decode('utf-8'),
ffi.string(ref.range_).decode('utf-8'),
ffi.string(ref.consortium).decode('utf-8')
) for ref in meta.references[0:meta.reference_count]
]

base_counts = [
GenbankBaseCount(
ffi.string(bc.base).decode('utf-8'),
bc.count
) for bc in meta.base_counts[0:meta.base_count_count]
]

other = {}
for i in range(meta.other_count):
key = ffi.string(meta.other_keys[i]).decode('utf-8')
value = ffi.string(meta.other_values[i]).decode('utf-8')
other[key] = value

return GenbankMeta(
ffi.string(meta.date).decode('utf-8'),
ffi.string(meta.definition).decode('utf-8'),
ffi.string(meta.accession).decode('utf-8'),
ffi.string(meta.version).decode('utf-8'),
ffi.string(meta.keywords).decode('utf-8'),
ffi.string(meta.organism).decode('utf-8'),
ffi.string(meta.source).decode('utf-8'),
[ffi.string(tax).decode('utf-8') for tax in meta.taxonomy[0:meta.taxonomy_count]],
ffi.string(meta.origin).decode('utf-8'),
locus,
references,
base_counts,
other,
ffi.string(meta.name).decode('utf-8'),
ffi.string(meta.sequence_hash).decode('utf-8'),
ffi.string(meta.sequence_hash_function).decode('utf-8')
)

def _convert_location(loc) -> GenbankLocation:
sub_locations = []

if loc.sub_locations_count > 0:
sub_locations_array = ffi.cast("GenbankLocation *", loc.sub_locations)
for i in range(loc.sub_locations_count):
sub_loc = sub_locations_array[i]
sub_locations.append(_convert_location(sub_loc))

return GenbankLocation(
loc.start,
loc.end,
bool(loc.complement),
bool(loc.join),
bool(loc.five_prime_partial),
bool(loc.three_prime_partial),
ffi.string(loc.gbk_location_string).decode('utf-8') if loc.gbk_location_string else "",
sub_locations
)

def _convert_feature(feature) -> GenbankFeature:
attributes = {}
for i in range(feature.attribute_count):
key = ffi.string(feature.attribute_keys[i]).decode('utf-8')
values = [ffi.string(feature.attribute_values[i][j]).decode('utf-8')
for j in range(feature.attribute_value_counts[i])]
attributes[key] = values

location = _convert_location(feature.location)

return GenbankFeature(
ffi.string(feature.type_).decode('utf-8'),
ffi.string(feature.description).decode('utf-8'),
attributes,
ffi.string(feature.sequence_hash).decode('utf-8'),
ffi.string(feature.sequence_hash_function).decode('utf-8'),
ffi.string(feature.sequence).decode('utf-8'),
location
)
Loading
Loading