From d150736ba3e22a88bdd74a0ad4519efd38f498a3 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Sat, 17 Aug 2024 03:12:27 +0000 Subject: [PATCH 1/2] add genbank parser to python --- py/dnadesign/definitions.h | 98 +++++++++++ py/dnadesign/parsers.py | 209 +++++++++++++++++++++++ py/lib.go | 282 ++++++++++++++++++++++++++++++++ py/setup.py | 2 +- py/tests/data/example.gb | 141 ++++++++++++++++ py/tests/test_genbank_parser.py | 64 ++++++++ 6 files changed, 795 insertions(+), 1 deletion(-) create mode 100644 py/tests/data/example.gb create mode 100644 py/tests/test_genbank_parser.py diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h index f1b5612..1602e30 100644 --- a/py/dnadesign/definitions.h +++ b/py/dnadesign/definitions.h @@ -39,3 +39,101 @@ typedef struct { FastqResult ParseFastqFromCFile(void* cfile); FastqResult ParseFastqFromCString(char* cstring); + +// Genbank definitions +// GenbankLocation +typedef struct GenbankLocation { + int start; + int end; + int complement; + int join; + int five_prime_partial; + int three_prime_partial; + char* gbk_location_string; + struct GenbankLocation* sub_locations; + int sub_locations_count; +} GenbankLocation; + +// GenbankFeature +typedef struct { + char* type_; + char* description; + char** attribute_keys; + char*** attribute_values; + int* attribute_value_counts; + int attribute_count; + char* sequence_hash; + char* sequence_hash_function; + char* sequence; + GenbankLocation location; +} GenbankFeature; + +// GenbankReference +typedef struct { + char* authors; + char* title; + char* journal; + char* pub_med; + char* remark; + char* range_; + char* consortium; +} GenbankReference; + +// GenbankLocus +typedef struct { + char* name; + char* sequence_length; + char* molecule_type; + char* genbank_division; + char* modification_date; + char* sequence_coding; + int circular; +} GenbankLocus; + +// GenbankBaseCount +typedef struct { + char base; + int count; +} GenbankBaseCount; + +// GenbankMeta +typedef struct { + char* date; + char* definition; + char* accession; + char* version; + char* keywords; + char* organism; + char* source; + char** taxonomy; + int taxonomy_count; + char* origin; + GenbankLocus locus; + GenbankReference* references; + int reference_count; + GenbankBaseCount* base_counts; + int base_count_count; + char** other_keys; + char** other_values; + int other_count; + char* name; + char* sequence_hash; + char* sequence_hash_function; +} GenbankMeta; + +// Genbank +typedef struct { + GenbankMeta meta; + GenbankFeature* features; + int feature_count; + char* sequence; +} Genbank; + +typedef struct { + Genbank* records; + int numRecords; + char* error; +} GenbankResult; + +GenbankResult ParseGenbankFromCFile(void* cfile); +GenbankResult ParseGenbankFromCString(char* cstring); diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py index 590fd71..454ab28 100644 --- a/py/dnadesign/parsers.py +++ b/py/dnadesign/parsers.py @@ -78,3 +78,212 @@ def _process_fastq_result(result) -> List[FastqRecord]: optionals )) return fastq_records + +# Genbank time +class GenbankLocus: + def __init__(self, name: str, sequence_length: str, molecule_type: str, genbank_division: str, + modification_date: str, sequence_coding: str, circular: bool): + self.name = name + self.sequence_length = sequence_length + self.molecule_type = molecule_type + self.genbank_division = genbank_division + self.modification_date = modification_date + self.sequence_coding = sequence_coding + self.circular = circular + +class GenbankReference: + def __init__(self, authors: str, title: str, journal: str, pub_med: str, remark: str, range_: str, consortium: str): + self.authors = authors + self.title = title + self.journal = journal + self.pub_med = pub_med + self.remark = remark + self.range_ = range_ + self.consortium = consortium + +class GenbankBaseCount: + def __init__(self, base: str, count: int): + self.base = base + self.count = count + +class GenbankMeta: + def __init__(self, date: str, definition: str, accession: str, version: str, keywords: str, + organism: str, source: str, taxonomy: List[str], origin: str, locus: GenbankLocus, + references: List[GenbankReference], base_counts: List[GenbankBaseCount], other: Dict[str, str], + name: str, sequence_hash: str, hash_function: str): + self.date = date + self.definition = definition + self.accession = accession + self.version = version + self.keywords = keywords + self.organism = organism + self.source = source + self.taxonomy = taxonomy + self.origin = origin + self.locus = locus + self.references = references + self.base_counts = base_counts + self.other = other + self.name = name + self.sequence_hash = sequence_hash + self.hash_function = hash_function + +# Update the Location class to match the C struct +class GenbankLocation: + def __init__(self, start: int, end: int, complement: bool, join: bool, five_prime_partial: bool, + three_prime_partial: bool, gbk_location_string: str, sub_locations: List['GenbankLocation']): + self.start = start + self.end = end + self.complement = complement + self.join = join + self.five_prime_partial = five_prime_partial + self.three_prime_partial = three_prime_partial + self.gbk_location_string = gbk_location_string + self.sub_locations = sub_locations + +class GenbankFeature: + def __init__(self, type_: str, description: str, attributes: Dict[str, List[str]], + sequence_hash: str, hash_function: str, sequence: str, location: GenbankLocation): + self.type_ = type_ + self.description = description + self.attributes = attributes + self.sequence_hash = sequence_hash + self.hash_function = hash_function + self.sequence = sequence + self.location = location + +class Genbank: + def __init__(self, meta: GenbankMeta, features: List[GenbankFeature], sequence: str): + self.meta = meta + self.features = features + self.sequence = sequence + +def _safe_open_file(file_path: str): + if not os.path.exists(file_path): + raise FileNotFoundError(f"The file {file_path} does not exist.") + cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8')) + if cfile == ffi.NULL: + raise IOError(f"Failed to open the file {file_path}.") + return cfile + +def parse_genbank_from_c_file(file_path: str) -> List[Genbank]: + try: + cfile = _safe_open_file(file_path) + result = lib.ParseGenbankFromCFile(cfile) + return _process_genbank_result(result) + finally: + if 'cfile' in locals() and cfile != ffi.NULL: + lib.fclose(cfile) + +def parse_genbank_from_c_string(cstring: str) -> List[Genbank]: + result = lib.ParseGenbankFromCString(cstring.encode('utf-8')) + return _process_genbank_result(result) + +def _process_genbank_result(result) -> List[Genbank]: + if result.error != ffi.NULL: + error_str = ffi.string(result.error).decode('utf-8') + raise Exception("Error parsing Genbank: " + error_str) + num_records = result.numRecords + records = ffi.cast("Genbank*", result.records) + return [_convert_genbank_record(records[i]) for i in range(num_records)] + +def _convert_genbank_record(record) -> Genbank: + meta = _convert_meta(record.meta) + features = [_convert_feature(record.features[i]) for i in range(record.feature_count)] + sequence = ffi.string(record.sequence).decode('utf-8') + return Genbank(meta, features, sequence) + +def _convert_meta(meta) -> GenbankMeta: + locus = GenbankLocus( + ffi.string(meta.locus.name).decode('utf-8'), + ffi.string(meta.locus.sequence_length).decode('utf-8'), + ffi.string(meta.locus.molecule_type).decode('utf-8'), + ffi.string(meta.locus.genbank_division).decode('utf-8'), + ffi.string(meta.locus.modification_date).decode('utf-8'), + ffi.string(meta.locus.sequence_coding).decode('utf-8'), + meta.locus.circular + ) + + references = [ + GenbankReference( + ffi.string(ref.authors).decode('utf-8'), + ffi.string(ref.title).decode('utf-8'), + ffi.string(ref.journal).decode('utf-8'), + ffi.string(ref.pub_med).decode('utf-8'), + ffi.string(ref.remark).decode('utf-8'), + ffi.string(ref.range_).decode('utf-8'), + ffi.string(ref.consortium).decode('utf-8') + ) for ref in meta.references[0:meta.reference_count] + ] + + base_counts = [ + GenbankBaseCount( + ffi.string(bc.base).decode('utf-8'), + bc.count + ) for bc in meta.base_counts[0:meta.base_count_count] + ] + + other = {} + for i in range(meta.other_count): + key = ffi.string(meta.other_keys[i]).decode('utf-8') + value = ffi.string(meta.other_values[i]).decode('utf-8') + other[key] = value + + return GenbankMeta( + ffi.string(meta.date).decode('utf-8'), + ffi.string(meta.definition).decode('utf-8'), + ffi.string(meta.accession).decode('utf-8'), + ffi.string(meta.version).decode('utf-8'), + ffi.string(meta.keywords).decode('utf-8'), + ffi.string(meta.organism).decode('utf-8'), + ffi.string(meta.source).decode('utf-8'), + [ffi.string(tax).decode('utf-8') for tax in meta.taxonomy[0:meta.taxonomy_count]], + ffi.string(meta.origin).decode('utf-8'), + locus, + references, + base_counts, + other, + ffi.string(meta.name).decode('utf-8'), + ffi.string(meta.sequence_hash).decode('utf-8'), + ffi.string(meta.sequence_hash_function).decode('utf-8') + ) + +def _convert_location(loc) -> GenbankLocation: + sub_locations = [] + + if loc.sub_locations_count > 0: + sub_locations_array = ffi.cast("Location *", loc.sub_locations) + for i in range(loc.sub_locations_count): + sub_loc = sub_locations_array[i] + sub_locations.append(_convert_location(sub_loc)) + + return GenbankLocation( + loc.start, + loc.end, + bool(loc.complement), + bool(loc.join), + bool(loc.five_prime_partial), + bool(loc.three_prime_partial), + ffi.string(loc.gbk_location_string).decode('utf-8') if loc.gbk_location_string else "", + sub_locations + ) + +def _convert_feature(feature) -> GenbankFeature: + attributes = {} + for i in range(feature.attribute_count): + key = ffi.string(feature.attribute_keys[i]).decode('utf-8') + values = [ffi.string(feature.attribute_values[i][j]).decode('utf-8') + for j in range(feature.attribute_value_counts[i])] + attributes[key] = values + + location = _convert_location(feature.location) + + return GenbankFeature( + ffi.string(feature.type_).decode('utf-8'), + ffi.string(feature.description).decode('utf-8'), + attributes, + ffi.string(feature.sequence_hash).decode('utf-8'), + ffi.string(feature.sequence_hash_function).decode('utf-8'), + ffi.string(feature.sequence).decode('utf-8'), + location + ) diff --git a/py/lib.go b/py/lib.go index 35982aa..1e9c458 100644 --- a/py/lib.go +++ b/py/lib.go @@ -24,14 +24,104 @@ typedef struct { char* sequence; char* quality; } FastqRecord; + +// GenbankLocation +typedef struct GenbankLocation { + int start; + int end; + int complement; + int join; + int five_prime_partial; + int three_prime_partial; + char* gbk_location_string; + struct GenbankLocation* sub_locations; + int sub_locations_count; +} GenbankLocation; + +// GenbankFeature +typedef struct { + char* type_; + char* description; + char** attribute_keys; + char*** attribute_values; + int* attribute_value_counts; + int attribute_count; + char* sequence_hash; + char* sequence_hash_function; + char* sequence; + GenbankLocation location; +} GenbankFeature; + +// GenbankReference +typedef struct { + char* authors; + char* title; + char* journal; + char* pub_med; + char* remark; + char* range_; + char* consortium; +} GenbankReference; + +// GenbankLocus +typedef struct { + char* name; + char* sequence_length; + char* molecule_type; + char* genbank_division; + char* modification_date; + char* sequence_coding; + int circular; +} GenbankLocus; + +// GenbankBaseCount +typedef struct { + char base; + int count; +} GenbankBaseCount; + +// GenbankMeta +typedef struct { + char* date; + char* definition; + char* accession; + char* version; + char* keywords; + char* organism; + char* source; + char** taxonomy; + int taxonomy_count; + char* origin; + GenbankLocus locus; + GenbankReference* references; + int reference_count; + GenbankBaseCount* base_counts; + int base_count_count; + char** other_keys; + char** other_values; + int other_count; + char* name; + char* sequence_hash; + char* sequence_hash_function; +} GenbankMeta; + +// Genbank +typedef struct { + GenbankMeta meta; + GenbankFeature* features; + int feature_count; + char* sequence; +} Genbank; */ import "C" import ( + "fmt" "io" "strings" "unsafe" "github.com/koeng101/dnadesign/lib/bio" + "github.com/koeng101/dnadesign/lib/bio/genbank" ) /****************************************************************************** @@ -155,6 +245,198 @@ func ParseFastqFromCString(cstring *C.char) (*C.FastqRecord, int, *C.char) { /****************************************************************************** +Genbank + +******************************************************************************/ + +// goGenbankToCGenbank converts a slice of genbank.Genbank to a C.Genbank array +func goGenbankToCGenbank(gbs []genbank.Genbank) (*C.Genbank, int, *C.char) { + if len(gbs) == 0 { + return nil, 0, C.CString("No genbank records provided") + } + + cGenbanks := (*C.Genbank)(C.malloc(C.size_t(len(gbs)) * C.size_t(unsafe.Sizeof(C.Genbank{})))) + slice := (*[1<<30 - 1]C.Genbank)(unsafe.Pointer(cGenbanks))[:len(gbs):len(gbs)] + + for i, gb := range gbs { + // Convert Meta + slice[i].meta = C.GenbankMeta{ + date: C.CString(gb.Meta.Date), + definition: C.CString(gb.Meta.Definition), + accession: C.CString(gb.Meta.Accession), + version: C.CString(gb.Meta.Version), + keywords: C.CString(gb.Meta.Keywords), + organism: C.CString(gb.Meta.Organism), + source: C.CString(gb.Meta.Source), + taxonomy: (**C.char)(C.malloc(C.size_t(len(gb.Meta.Taxonomy)) * C.size_t(unsafe.Sizeof(uintptr(0))))), + taxonomy_count: C.int(len(gb.Meta.Taxonomy)), + origin: C.CString(gb.Meta.Origin), + name: C.CString(gb.Meta.Name), + sequence_hash: C.CString(gb.Meta.SequenceHash), + sequence_hash_function: C.CString(gb.Meta.SequenceHashFunction), + } + + // Convert Taxonomy + taxonomySlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.taxonomy))[:len(gb.Meta.Taxonomy):len(gb.Meta.Taxonomy)] + for j, taxon := range gb.Meta.Taxonomy { + taxonomySlice[j] = C.CString(taxon) + } + + // Convert Locus + slice[i].meta.locus = C.GenbankLocus{ + name: C.CString(gb.Meta.Locus.Name), + sequence_length: C.CString(gb.Meta.Locus.SequenceLength), + molecule_type: C.CString(gb.Meta.Locus.MoleculeType), + genbank_division: C.CString(gb.Meta.Locus.GenbankDivision), + modification_date: C.CString(gb.Meta.Locus.ModificationDate), + sequence_coding: C.CString(gb.Meta.Locus.SequenceCoding), + circular: C.int(boolToInt(gb.Meta.Locus.Circular)), + } + + // Convert References + slice[i].meta.references = (*C.GenbankReference)(C.malloc(C.size_t(len(gb.Meta.References)) * C.size_t(unsafe.Sizeof(C.GenbankReference{})))) + slice[i].meta.reference_count = C.int(len(gb.Meta.References)) + refSlice := (*[1 << 30]C.GenbankReference)(unsafe.Pointer(slice[i].meta.references))[:len(gb.Meta.References):len(gb.Meta.References)] + for j, ref := range gb.Meta.References { + refSlice[j] = C.GenbankReference{ + authors: C.CString(ref.Authors), + title: C.CString(ref.Title), + journal: C.CString(ref.Journal), + pub_med: C.CString(ref.PubMed), + remark: C.CString(ref.Remark), + range_: C.CString(ref.Range), + consortium: C.CString(ref.Consortium), + } + } + + // Convert BaseCount + slice[i].meta.base_counts = (*C.GenbankBaseCount)(C.malloc(C.size_t(len(gb.Meta.BaseCount)) * C.size_t(unsafe.Sizeof(C.GenbankBaseCount{})))) + slice[i].meta.base_count_count = C.int(len(gb.Meta.BaseCount)) + baseCountSlice := (*[1 << 30]C.GenbankBaseCount)(unsafe.Pointer(slice[i].meta.base_counts))[:len(gb.Meta.BaseCount):len(gb.Meta.BaseCount)] + for j, bc := range gb.Meta.BaseCount { + baseCountSlice[j] = C.GenbankBaseCount{ + base: C.char(bc.Base[0]), + count: C.int(bc.Count), + } + } + + // Convert Other + slice[i].meta.other_keys = (**C.char)(C.malloc(C.size_t(len(gb.Meta.Other)) * C.size_t(unsafe.Sizeof(uintptr(0))))) + slice[i].meta.other_values = (**C.char)(C.malloc(C.size_t(len(gb.Meta.Other)) * C.size_t(unsafe.Sizeof(uintptr(0))))) + slice[i].meta.other_count = C.int(len(gb.Meta.Other)) + otherKeysSlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.other_keys))[:len(gb.Meta.Other):len(gb.Meta.Other)] + otherValuesSlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.other_values))[:len(gb.Meta.Other):len(gb.Meta.Other)] + j := 0 + for k, v := range gb.Meta.Other { + otherKeysSlice[j] = C.CString(k) + otherValuesSlice[j] = C.CString(v) + j++ + } + + // Convert Features + slice[i].features = (*C.GenbankFeature)(C.malloc(C.size_t(len(gb.Features)) * C.size_t(unsafe.Sizeof(C.GenbankFeature{})))) + slice[i].feature_count = C.int(len(gb.Features)) + featureSlice := (*[1 << 30]C.GenbankFeature)(unsafe.Pointer(slice[i].features))[:len(gb.Features):len(gb.Features)] + for j, feature := range gb.Features { + fmt.Println(feature.Type) + featureSlice[j] = C.GenbankFeature{ + type_: C.CString(feature.Type), + description: C.CString(feature.Description), + sequence_hash: C.CString(feature.SequenceHash), + sequence_hash_function: C.CString(feature.SequenceHashFunction), + sequence: C.CString(feature.Sequence), + } + + // Convert Attributes + featureSlice[j].attribute_keys = (**C.char)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(uintptr(0))))) + featureSlice[j].attribute_values = (***C.char)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(uintptr(0))))) + featureSlice[j].attribute_value_counts = (*C.int)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(C.int(0))))) + featureSlice[j].attribute_count = C.int(len(feature.Attributes)) + + attrKeysSlice := (*[1 << 30]*C.char)(unsafe.Pointer(featureSlice[j].attribute_keys))[:len(feature.Attributes):len(feature.Attributes)] + attrValuesSlice := (*[1 << 30]**C.char)(unsafe.Pointer(featureSlice[j].attribute_values))[:len(feature.Attributes):len(feature.Attributes)] + attrValueCountsSlice := (*[1 << 30]C.int)(unsafe.Pointer(featureSlice[j].attribute_value_counts))[:len(feature.Attributes):len(feature.Attributes)] + + k := 0 + for key, values := range feature.Attributes { + attrKeysSlice[k] = C.CString(key) + attrValuesSlice[k] = (**C.char)(C.malloc(C.size_t(len(values)) * C.size_t(unsafe.Sizeof(uintptr(0))))) + attrValueCountsSlice[k] = C.int(len(values)) + valueSlice := (*[1 << 30]*C.char)(unsafe.Pointer(attrValuesSlice[k]))[:len(values):len(values)] + for l, val := range values { + valueSlice[l] = C.CString(val) + } + k++ + } + + // Convert Location + featureSlice[j].location = convertLocation(feature.Location) + } + + // Convert Sequence + slice[i].sequence = C.CString(gb.Sequence) + } + + return cGenbanks, len(gbs), nil +} + +// Convert Location +func convertLocation(location genbank.Location) C.GenbankLocation { + cLocation := C.GenbankLocation{ + start: C.int(location.Start), + end: C.int(location.End), + complement: C.int(boolToInt(location.Complement)), + join: C.int(boolToInt(location.Join)), + five_prime_partial: C.int(boolToInt(location.FivePrimePartial)), + three_prime_partial: C.int(boolToInt(location.ThreePrimePartial)), + gbk_location_string: C.CString(location.GbkLocationString), + sub_locations: nil, + sub_locations_count: 0, + } + + if len(location.SubLocations) > 0 { + cSubLocations := make([]C.GenbankLocation, len(location.SubLocations)) + for i, subLocation := range location.SubLocations { + cSubLocations[i] = convertLocation(subLocation) + } + cLocation.sub_locations = &cSubLocations[0] + cLocation.sub_locations_count = C.int(len(cSubLocations)) + } + + return cLocation +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} + +//export ParseGenbankFromCFile +func ParseGenbankFromCFile(cfile *C.FILE) (*C.Genbank, int, *C.char) { + reader := readerFromCFile(cfile) + parser := bio.NewGenbankParser(reader) + genbanks, err := parser.Parse() + if err != nil { + return nil, len(genbanks), C.CString(err.Error()) + } + return goGenbankToCGenbank(genbanks) +} + +//export ParseGenbankFromCString +func ParseGenbankFromCString(cstring *C.char) (*C.Genbank, int, *C.char) { + reader := strings.NewReader(C.GoString(cstring)) + parser := bio.NewGenbankParser(reader) + genbanks, err := parser.Parse() + if err != nil { + return nil, len(genbanks), C.CString(err.Error()) + } + return goGenbankToCGenbank(genbanks) +} + +/****************************************************************************** + main.go ******************************************************************************/ diff --git a/py/setup.py b/py/setup.py index c065e4d..55a6320 100644 --- a/py/setup.py +++ b/py/setup.py @@ -13,7 +13,7 @@ def get_shared_lib_ext(): setup( name='dnadesign', - version='0.1.4', + version='0.1.5', packages=find_packages(), package_data={'dnadesign': ['definitions.h', 'libdnadesign.h', "libdnadesign" + get_shared_lib_ext()]}, install_requires=[ diff --git a/py/tests/data/example.gb b/py/tests/data/example.gb new file mode 100644 index 0000000..11a7de5 --- /dev/null +++ b/py/tests/data/example.gb @@ -0,0 +1,141 @@ +LOCUS puc19.gbk 2686 bp DNA circular 22-OCT-2019 +DEFINITION pUC cloning vector. +ACCESSION . +VERSION . +KEYWORDS pUC19 +SOURCE synthetic DNA construct + ORGANISM synthetic DNA construct +REFERENCE 1 (bases 1 to 2686) + AUTHORS Norrander J, Kempe T, Messing J + TITLE Construction of improved M13 vectors using + oligodeoxynucleotide-directed mutagenesis. + JOURNAL Gene. 1983 Dec;26(1):101-6. + PUBMED 6323249 +REFERENCE 2 (bases 1 to 2686) + AUTHORS . + TITLE Direct Submission + JOURNAL Exported Sep 13, 2018 from SnapGene Server 1.1.58 + http://www.snapgene.com +COMMENT description: pUC cloning vector. +FEATURES Location/Qualifiers + source 1..2686 + /label="synthetic DNA construct" + /organism="synthetic DNA construct" + /mol_type="other DNA" + primer_bind 118..137 + /label="pBR322ori-F" + /note="pBR322 origin, forward primer" + primer_bind 371..388 + /label="L4440" + /note="L4440 vector, forward primer" + protein_bind 505..526 + /label="CAP binding site" + /bound_moiety="E. coli catabolite activator protein" + /note="CAP binding activates transcription in the presenceof cAMP." + promoter 541..571 + /label="lac promoter" + /note="promoter for the E. coli lac operon" + protein_bind 579..595 + /label="lac operator" + /bound_moiety="lac repressor encoded by lacI" + /note="The lac repressor binds to the lac operator toinhibit transcription in E. coli. This inhibition can berelieved by adding lactose orisopropyl-beta-D-thiogalactopyranoside (IPTG)." + primer_bind 584..606 + /label="M13/pUC Reverse" + /note="In lacZ gene" + primer_bind 603..619 + /label="M13 rev" + /note="common sequencing primer, one of multiple similarvariants" + primer_bind 603..619 + /label="M13 Reverse" + /note="In lacZ gene. Also called M13-rev" + CDS 615..938 + /label="lacZ-alpha" + /codon_start="1" + /gene="lacZ fragment" + /product="LacZ-alpha fragment of beta-galactosidase" + /translation="MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICSDAA" + misc_feature 632..688 + /label="MCS" + /note="pUC18/19 multiple cloning site" + primer_bind complement(689..706) + /label="M13 Forward" + /note="In lacZ gene. Also called M13-F20 or M13 (-21)Forward" + primer_bind complement(689..705) + /label="M13 fwd" + /note="common sequencing primer, one of multiple similarvariants" + primer_bind complement(698..720) + /label="M13/pUC Forward" + /note="In lacZ gene" + primer_bind complement(914..933) + /label="pRS-marker" + /note="pRS vectors, use to sequence yeast selectablemarker" + primer_bind 1033..1055 + /label="pGEX 3'" + /note="pGEX vectors, reverse primer" + primer_bind complement(1093..1111) + /label="pBRforEco" + /note="pBR322 vectors, upsteam of EcoRI site, forwardprimer" + promoter 1179..1283 + /label="AmpR promoter" + /gene="bla" + CDS 1284..2144 + /label="AmpR" + /codon_start="1" + /gene="bla" + /product="beta-lactamase" + /note="confers resistance to ampicillin, carbenicillin, andrelated antibiotics" + /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW" + primer_bind complement(1502..1521) + /label="Amp-R" + /note="Ampicillin resistance gene, reverse primer" + rep_origin 2315..217 + /label="ori" + /direction="RIGHT" + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin ofreplication" +ORIGIN + 1 gagataccta cagcgtgagc tatgagaaag cgccacgctt cccgaaggga gaaaggcgga + 61 caggtatccg gtaagcggca gggtcggaac aggagagcgc acgagggagc ttccaggggg + 121 aaacgcctgg tatctttata gtcctgtcgg gtttcgccac ctctgacttg agcgtcgatt + 181 tttgtgatgc tcgtcagggg ggcggagcct atggaaaaac gccagcaacg cggccttttt + 241 acggttcctg gccttttgct ggccttttgc tcacatgttc tttcctgcgt tatcccctga + 301 ttctgtggat aaccgtatta ccgcctttga gtgagctgat accgctcgcc gcagccgaac + 361 gaccgagcgc agcgagtcag tgagcgagga agcggaagag cgcccaatac gcaaaccgcc + 421 tctccccgcg cgttggccga ttcattaatg cagctggcac gacaggtttc ccgactggaa + 481 agcgggcagt gagcgcaacg caattaatgt gagttagctc actcattagg caccccaggc + 541 tttacacttt atgcttccgg ctcgtatgtt gtgtggaatt gtgagcggat aacaatttca + 601 cacaggaaac agctatgacc atgattacgc caagcttgca tgcctgcagg tcgactctag + 661 aggatccccg ggtaccgagc tcgaattcac tggccgtcgt tttacaacgt cgtgactggg + 721 aaaaccctgg cgttacccaa cttaatcgcc ttgcagcaca tccccctttc gccagctggc + 781 gtaatagcga agaggcccgc accgatcgcc cttcccaaca gttgcgcagc ctgaatggcg + 841 aatggcgcct gatgcggtat tttctcctta cgcatctgtg cggtatttca caccgcatat + 901 ggtgcactct cagtacaatc tgctctgatg ccgcatagtt aagccagccc cgacacccgc + 961 caacacccgc tgacgcgccc tgacgggctt gtctgctccc ggcatccgct tacagacaag + 1021 ctgtgaccgt ctccgggagc tgcatgtgtc agaggttttc accgtcatca ccgaaacgcg + 1081 cgagacgaaa gggcctcgtg atacgcctat ttttataggt taatgtcatg ataataatgg + 1141 tttcttagac gtcaggtggc acttttcggg gaaatgtgcg cggaacccct atttgtttat + 1201 ttttctaaat acattcaaat atgtatccgc tcatgagaca ataaccctga taaatgcttc + 1261 aataatattg aaaaaggaag agtatgagta ttcaacattt ccgtgtcgcc cttattccct + 1321 tttttgcggc attttgcctt cctgtttttg ctcacccaga aacgctggtg aaagtaaaag + 1381 atgctgaaga tcagttgggt gcacgagtgg gttacatcga actggatctc aacagcggta + 1441 agatccttga gagttttcgc cccgaagaac gttttccaat gatgagcact tttaaagttc + 1501 tgctatgtgg cgcggtatta tcccgtattg acgccgggca agagcaactc ggtcgccgca + 1561 tacactattc tcagaatgac ttggttgagt actcaccagt cacagaaaag catcttacgg + 1621 atggcatgac agtaagagaa ttatgcagtg ctgccataac catgagtgat aacactgcgg + 1681 ccaacttact tctgacaacg atcggaggac cgaaggagct aaccgctttt ttgcacaaca + 1741 tgggggatca tgtaactcgc cttgatcgtt gggaaccgga gctgaatgaa gccataccaa + 1801 acgacgagcg tgacaccacg atgcctgtag caatggcaac aacgttgcgc aaactattaa + 1861 ctggcgaact acttactcta gcttcccggc aacaattaat agactggatg gaggcggata + 1921 aagttgcagg accacttctg cgctcggccc ttccggctgg ctggtttatt gctgataaat + 1981 ctggagccgg tgagcgtggg tctcgcggta tcattgcagc actggggcca gatggtaagc + 2041 cctcccgtat cgtagttatc tacacgacgg ggagtcaggc aactatggat gaacgaaata + 2101 gacagatcgc tgagataggt gcctcactga ttaagcattg gtaactgtca gaccaagttt + 2161 actcatatat actttagatt gatttaaaac ttcattttta atttaaaagg atctaggtga + 2221 agatcctttt tgataatctc atgaccaaaa tcccttaacg tgagttttcg ttccactgag + 2281 cgtcagaccc cgtagaaaag atcaaaggat cttcttgaga tccttttttt ctgcgcgtaa + 2341 tctgctgctt gcaaacaaaa aaaccaccgc taccagcggt ggtttgtttg ccggatcaag + 2401 agctaccaac tctttttccg aaggtaactg gcttcagcag agcgcagata ccaaatactg + 2461 ttcttctagt gtagccgtag ttaggccacc acttcaagaa ctctgtagca ccgcctacat + 2521 acctcgctct gctaatcctg ttaccagtgg ctgctgccag tggcgataag tcgtgtctta + 2581 ccgggttgga ctcaagacga tagttaccgg ataaggcgca gcggtcgggc tgaacggggg + 2641 gttcgtgcac acagcccagc ttggagcgaa cgacctacac cgaact +// diff --git a/py/tests/test_genbank_parser.py b/py/tests/test_genbank_parser.py new file mode 100644 index 0000000..4a80faf --- /dev/null +++ b/py/tests/test_genbank_parser.py @@ -0,0 +1,64 @@ +import pytest +import os +from dnadesign.parsers import parse_genbank_from_c_file, parse_genbank_from_c_string, Genbank, GenbankMeta, GenbankFeature + +def test_parse_genbank_from_c_file(): + current_dir = os.path.dirname(__file__) + example_path = os.path.join(current_dir, 'data/example.gb') + records = parse_genbank_from_c_file(example_path) + assert len(records) > 0 + assert all(isinstance(r, Genbank) for r in records) + + # Test the first record + first_record = records[0] + assert isinstance(first_record.meta, GenbankMeta) + assert isinstance(first_record.features, list) + assert all(isinstance(f, GenbankFeature) for f in first_record.features) + assert isinstance(first_record.sequence, str) + + # Test some fields of the first record + assert first_record.meta.accession + assert first_record.meta.version + assert first_record.meta.organism + assert len(first_record.features) > 0 + assert first_record.sequence + +def test_parse_genbank_from_c_string(): + genbank_data = """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 +DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p + (AXL2) and Rev7p (REV7) genes, complete cds. +ACCESSION U49845 +VERSION U49845.1 GI:1293613 +KEYWORDS . +SOURCE Saccharomyces cerevisiae (baker's yeast) + ORGANISM Saccharomyces cerevisiae + Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; + Saccharomycetales; Saccharomycetaceae; Saccharomyces. +REFERENCE 1 (bases 1 to 5028) + AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. + TITLE Cloning and sequence of REV7, a gene whose function is required for + DNA damage-induced mutagenesis in Saccharomyces cerevisiae + JOURNAL Yeast 10 (11), 1503-1509 (1994) + PUBMED 7871890 +FEATURES Location/Qualifiers + source 1..5028 + /organism="Saccharomyces cerevisiae" + /db_xref="taxon:4932" + /chromosome="IX" + /map="9" +ORIGIN + 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg + 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct + 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa +// +""" + records = parse_genbank_from_c_string(genbank_data) + assert len(records) == 1 + record = records[0] + assert record.meta.locus.name == "SCU49845" + assert record.meta.accession == "U49845" + assert record.meta.organism == "Saccharomyces cerevisiae" + assert len(record.features) == 1 + assert record.features[0].type_ == "source" + assert record.sequence.startswith("gatcctccat") + assert len(record.sequence) == 180 # Based on the ORIGIN section in the example From 8a6441bb3d6948afb69522e003718d6577de775e Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Sat, 17 Aug 2024 03:38:15 +0000 Subject: [PATCH 2/2] add benchmark --- README.md | 1 + py/README.md | 15 ++++++++++- py/benchmark_against_biopython.py | 41 +++++++++++++++++++++++++++++++ py/dnadesign/parsers.py | 2 +- py/lib.go | 2 -- 5 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 py/benchmark_against_biopython.py diff --git a/README.md b/README.md index 2d60916..538de42 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Adds genbank parsing to python package. Release version 0.1.5 of dnadesign python. [#87](https://github.com/Koeng101/dnadesign/pull/87) - Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86) - Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84) - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83) diff --git a/py/README.md b/py/README.md index 0b7b365..ac114b3 100644 --- a/py/README.md +++ b/py/README.md @@ -1,7 +1,20 @@ # DnaDesign (Python) This directory contains code for allowing python users to use dnadesign through a shared C library. -This is a work-in-progress. Right now, we have only ported the fasta parser. +This is a work-in-progress. Right now, we have only ported the fasta, fastq, and genbank parsers. + +For genbank files, we have this benchmark with bsub.gbk: +``` +Running 10 iterations for each parser... + +Results: +DNA design average time: 0.337005 seconds +BioPython average time: 0.332007 seconds + +DNA design is 0.99x faster than BioPython +``` + +Turns out, we're just about as performant! ### Other platforms If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them. diff --git a/py/benchmark_against_biopython.py b/py/benchmark_against_biopython.py new file mode 100644 index 0000000..a05b363 --- /dev/null +++ b/py/benchmark_against_biopython.py @@ -0,0 +1,41 @@ +import timeit +import os +from Bio import SeqIO +from dnadesign.parsers import parse_genbank_from_c_file + +def benchmark_dnadesign(file_path, num_iterations=10): + def parse_with_dnadesign(): + records = parse_genbank_from_c_file(file_path) + return records + + time_taken = timeit.timeit(parse_with_dnadesign, number=num_iterations) + return time_taken / num_iterations + +def benchmark_biopython(file_path, num_iterations=10): + def parse_with_biopython(): + with open(file_path, "r") as handle: + records = list(SeqIO.parse(handle, "genbank")) + return records + + time_taken = timeit.timeit(parse_with_biopython, number=num_iterations) + return time_taken / num_iterations + +def main(): + current_dir = os.path.dirname(__file__) + example_path = os.path.join(current_dir, '../lib/bio/genbank/data/bsub.gbk') + + print(f"Benchmarking GenBank parsing for file: {example_path}") + print("Running 10 iterations for each parser...") + + dnadesign_time = benchmark_dnadesign(example_path) + biopython_time = benchmark_biopython(example_path) + + print(f"\nResults:") + print(f"DNA design average time: {dnadesign_time:.6f} seconds") + print(f"BioPython average time: {biopython_time:.6f} seconds") + + speedup = biopython_time / dnadesign_time + print(f"\nDNA design is {speedup:.2f}x faster than BioPython") + +if __name__ == "__main__": + main() diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py index 454ab28..295a7da 100644 --- a/py/dnadesign/parsers.py +++ b/py/dnadesign/parsers.py @@ -252,7 +252,7 @@ def _convert_location(loc) -> GenbankLocation: sub_locations = [] if loc.sub_locations_count > 0: - sub_locations_array = ffi.cast("Location *", loc.sub_locations) + sub_locations_array = ffi.cast("GenbankLocation *", loc.sub_locations) for i in range(loc.sub_locations_count): sub_loc = sub_locations_array[i] sub_locations.append(_convert_location(sub_loc)) diff --git a/py/lib.go b/py/lib.go index 1e9c458..e6a5aef 100644 --- a/py/lib.go +++ b/py/lib.go @@ -115,7 +115,6 @@ typedef struct { */ import "C" import ( - "fmt" "io" "strings" "unsafe" @@ -338,7 +337,6 @@ func goGenbankToCGenbank(gbs []genbank.Genbank) (*C.Genbank, int, *C.char) { slice[i].feature_count = C.int(len(gb.Features)) featureSlice := (*[1 << 30]C.GenbankFeature)(unsafe.Pointer(slice[i].features))[:len(gb.Features):len(gb.Features)] for j, feature := range gb.Features { - fmt.Println(feature.Type) featureSlice[j] = C.GenbankFeature{ type_: C.CString(feature.Type), description: C.CString(feature.Description),