From d150736ba3e22a88bdd74a0ad4519efd38f498a3 Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Sat, 17 Aug 2024 03:12:27 +0000
Subject: [PATCH 1/2] add genbank parser to python

---
 py/dnadesign/definitions.h      |  98 +++++++++++
 py/dnadesign/parsers.py         | 209 +++++++++++++++++++++++
 py/lib.go                       | 282 ++++++++++++++++++++++++++++++++
 py/setup.py                     |   2 +-
 py/tests/data/example.gb        | 141 ++++++++++++++++
 py/tests/test_genbank_parser.py |  64 ++++++++
 6 files changed, 795 insertions(+), 1 deletion(-)
 create mode 100644 py/tests/data/example.gb
 create mode 100644 py/tests/test_genbank_parser.py

diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h
index f1b5612..1602e30 100644
--- a/py/dnadesign/definitions.h
+++ b/py/dnadesign/definitions.h
@@ -39,3 +39,101 @@ typedef struct {
 
 FastqResult ParseFastqFromCFile(void* cfile);
 FastqResult ParseFastqFromCString(char* cstring);
+
+// Genbank definitions
+// GenbankLocation
+typedef struct GenbankLocation {
+    int start;
+    int end;
+    int complement;
+    int join;
+    int five_prime_partial;
+    int three_prime_partial;
+    char* gbk_location_string;
+    struct GenbankLocation* sub_locations;
+    int sub_locations_count;
+} GenbankLocation;
+
+// GenbankFeature
+typedef struct {
+    char* type_;
+    char* description;
+    char** attribute_keys;
+    char*** attribute_values;
+    int* attribute_value_counts;
+    int attribute_count;
+    char* sequence_hash;
+    char* sequence_hash_function;
+    char* sequence;
+    GenbankLocation location;
+} GenbankFeature;
+
+// GenbankReference
+typedef struct {
+    char* authors;
+    char* title;
+    char* journal;
+    char* pub_med;
+    char* remark;
+    char* range_;
+    char* consortium;
+} GenbankReference;
+
+// GenbankLocus
+typedef struct {
+    char* name;
+    char* sequence_length;
+    char* molecule_type;
+    char* genbank_division;
+    char* modification_date;
+    char* sequence_coding;
+    int circular;
+} GenbankLocus;
+
+// GenbankBaseCount
+typedef struct {
+    char base;
+    int count;
+} GenbankBaseCount;
+
+// GenbankMeta
+typedef struct {
+    char* date;
+    char* definition;
+    char* accession;
+    char* version;
+    char* keywords;
+    char* organism;
+    char* source;
+    char** taxonomy;
+    int taxonomy_count;
+    char* origin;
+    GenbankLocus locus;
+    GenbankReference* references;
+    int reference_count;
+    GenbankBaseCount* base_counts;
+    int base_count_count;
+    char** other_keys;
+    char** other_values;
+    int other_count;
+    char* name;
+    char* sequence_hash;
+    char* sequence_hash_function;
+} GenbankMeta;
+
+// Genbank
+typedef struct {
+    GenbankMeta meta;
+    GenbankFeature* features;
+    int feature_count;
+    char* sequence;
+} Genbank;
+
+typedef struct {
+    Genbank* records;
+    int numRecords;
+    char* error;
+} GenbankResult;
+
+GenbankResult ParseGenbankFromCFile(void* cfile);
+GenbankResult ParseGenbankFromCString(char* cstring);
diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py
index 590fd71..454ab28 100644
--- a/py/dnadesign/parsers.py
+++ b/py/dnadesign/parsers.py
@@ -78,3 +78,212 @@ def _process_fastq_result(result) -> List[FastqRecord]:
             optionals
         ))
     return fastq_records
+
+# Genbank time
+class GenbankLocus:
+    def __init__(self, name: str, sequence_length: str, molecule_type: str, genbank_division: str,
+                 modification_date: str, sequence_coding: str, circular: bool):
+        self.name = name
+        self.sequence_length = sequence_length
+        self.molecule_type = molecule_type
+        self.genbank_division = genbank_division
+        self.modification_date = modification_date
+        self.sequence_coding = sequence_coding
+        self.circular = circular
+
+class GenbankReference:
+    def __init__(self, authors: str, title: str, journal: str, pub_med: str, remark: str, range_: str, consortium: str):
+        self.authors = authors
+        self.title = title
+        self.journal = journal
+        self.pub_med = pub_med
+        self.remark = remark
+        self.range_ = range_
+        self.consortium = consortium
+
+class GenbankBaseCount:
+    def __init__(self, base: str, count: int):
+        self.base = base
+        self.count = count
+
+class GenbankMeta:
+    def __init__(self, date: str, definition: str, accession: str, version: str, keywords: str,
+                 organism: str, source: str, taxonomy: List[str], origin: str, locus: GenbankLocus,
+                 references: List[GenbankReference], base_counts: List[GenbankBaseCount], other: Dict[str, str],
+                 name: str, sequence_hash: str, hash_function: str):
+        self.date = date
+        self.definition = definition
+        self.accession = accession
+        self.version = version
+        self.keywords = keywords
+        self.organism = organism
+        self.source = source
+        self.taxonomy = taxonomy
+        self.origin = origin
+        self.locus = locus
+        self.references = references
+        self.base_counts = base_counts
+        self.other = other
+        self.name = name
+        self.sequence_hash = sequence_hash
+        self.hash_function = hash_function
+
+# Update the Location class to match the C struct
+class GenbankLocation:
+    def __init__(self, start: int, end: int, complement: bool, join: bool, five_prime_partial: bool,
+                 three_prime_partial: bool, gbk_location_string: str, sub_locations: List['GenbankLocation']):
+        self.start = start
+        self.end = end
+        self.complement = complement
+        self.join = join
+        self.five_prime_partial = five_prime_partial
+        self.three_prime_partial = three_prime_partial
+        self.gbk_location_string = gbk_location_string
+        self.sub_locations = sub_locations
+
+class GenbankFeature:
+    def __init__(self, type_: str, description: str, attributes: Dict[str, List[str]],
+                 sequence_hash: str, hash_function: str, sequence: str, location: GenbankLocation):
+        self.type_ = type_
+        self.description = description
+        self.attributes = attributes
+        self.sequence_hash = sequence_hash
+        self.hash_function = hash_function
+        self.sequence = sequence
+        self.location = location
+
+class Genbank:
+    def __init__(self, meta: GenbankMeta, features: List[GenbankFeature], sequence: str):
+        self.meta = meta
+        self.features = features
+        self.sequence = sequence
+
+def _safe_open_file(file_path: str):
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"The file {file_path} does not exist.")
+    cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
+    if cfile == ffi.NULL:
+        raise IOError(f"Failed to open the file {file_path}.")
+    return cfile
+
+def parse_genbank_from_c_file(file_path: str) -> List[Genbank]:
+    try:
+        cfile = _safe_open_file(file_path)
+        result = lib.ParseGenbankFromCFile(cfile)
+        return _process_genbank_result(result)
+    finally:
+        if 'cfile' in locals() and cfile != ffi.NULL:
+            lib.fclose(cfile)
+
+def parse_genbank_from_c_string(cstring: str) -> List[Genbank]:
+    result = lib.ParseGenbankFromCString(cstring.encode('utf-8'))
+    return _process_genbank_result(result)
+
+def _process_genbank_result(result) -> List[Genbank]:
+    if result.error != ffi.NULL:
+        error_str = ffi.string(result.error).decode('utf-8')
+        raise Exception("Error parsing Genbank: " + error_str)
+    num_records = result.numRecords
+    records = ffi.cast("Genbank*", result.records)
+    return [_convert_genbank_record(records[i]) for i in range(num_records)]
+
+def _convert_genbank_record(record) -> Genbank:
+    meta = _convert_meta(record.meta)
+    features = [_convert_feature(record.features[i]) for i in range(record.feature_count)]
+    sequence = ffi.string(record.sequence).decode('utf-8')
+    return Genbank(meta, features, sequence)
+
+def _convert_meta(meta) -> GenbankMeta:
+    locus = GenbankLocus(
+        ffi.string(meta.locus.name).decode('utf-8'),
+        ffi.string(meta.locus.sequence_length).decode('utf-8'),
+        ffi.string(meta.locus.molecule_type).decode('utf-8'),
+        ffi.string(meta.locus.genbank_division).decode('utf-8'),
+        ffi.string(meta.locus.modification_date).decode('utf-8'),
+        ffi.string(meta.locus.sequence_coding).decode('utf-8'),
+        meta.locus.circular
+    )
+    
+    references = [
+        GenbankReference(
+            ffi.string(ref.authors).decode('utf-8'),
+            ffi.string(ref.title).decode('utf-8'),
+            ffi.string(ref.journal).decode('utf-8'),
+            ffi.string(ref.pub_med).decode('utf-8'),
+            ffi.string(ref.remark).decode('utf-8'),
+            ffi.string(ref.range_).decode('utf-8'),
+            ffi.string(ref.consortium).decode('utf-8')
+        ) for ref in meta.references[0:meta.reference_count]
+    ]
+    
+    base_counts = [
+        GenbankBaseCount(
+            ffi.string(bc.base).decode('utf-8'),
+            bc.count
+        ) for bc in meta.base_counts[0:meta.base_count_count]
+    ]
+    
+    other = {}
+    for i in range(meta.other_count):
+        key = ffi.string(meta.other_keys[i]).decode('utf-8')
+        value = ffi.string(meta.other_values[i]).decode('utf-8')
+        other[key] = value
+    
+    return GenbankMeta(
+        ffi.string(meta.date).decode('utf-8'),
+        ffi.string(meta.definition).decode('utf-8'),
+        ffi.string(meta.accession).decode('utf-8'),
+        ffi.string(meta.version).decode('utf-8'),
+        ffi.string(meta.keywords).decode('utf-8'),
+        ffi.string(meta.organism).decode('utf-8'),
+        ffi.string(meta.source).decode('utf-8'),
+        [ffi.string(tax).decode('utf-8') for tax in meta.taxonomy[0:meta.taxonomy_count]],
+        ffi.string(meta.origin).decode('utf-8'),
+        locus,
+        references,
+        base_counts,
+        other,
+        ffi.string(meta.name).decode('utf-8'),
+        ffi.string(meta.sequence_hash).decode('utf-8'),
+        ffi.string(meta.sequence_hash_function).decode('utf-8')
+    )
+
+def _convert_location(loc) -> GenbankLocation:
+    sub_locations = []
+
+    if loc.sub_locations_count > 0:
+        sub_locations_array = ffi.cast("Location *", loc.sub_locations)
+        for i in range(loc.sub_locations_count):
+            sub_loc = sub_locations_array[i]
+            sub_locations.append(_convert_location(sub_loc))
+
+    return GenbankLocation(
+        loc.start,
+        loc.end,
+        bool(loc.complement),
+        bool(loc.join),
+        bool(loc.five_prime_partial),
+        bool(loc.three_prime_partial),
+        ffi.string(loc.gbk_location_string).decode('utf-8') if loc.gbk_location_string else "",
+        sub_locations
+    )
+
+def _convert_feature(feature) -> GenbankFeature:
+    attributes = {}
+    for i in range(feature.attribute_count):
+        key = ffi.string(feature.attribute_keys[i]).decode('utf-8')
+        values = [ffi.string(feature.attribute_values[i][j]).decode('utf-8')
+                  for j in range(feature.attribute_value_counts[i])]
+        attributes[key] = values
+    
+    location = _convert_location(feature.location)
+    
+    return GenbankFeature(
+        ffi.string(feature.type_).decode('utf-8'),
+        ffi.string(feature.description).decode('utf-8'),
+        attributes,
+        ffi.string(feature.sequence_hash).decode('utf-8'),
+        ffi.string(feature.sequence_hash_function).decode('utf-8'),
+        ffi.string(feature.sequence).decode('utf-8'),
+        location
+    )
diff --git a/py/lib.go b/py/lib.go
index 35982aa..1e9c458 100644
--- a/py/lib.go
+++ b/py/lib.go
@@ -24,14 +24,104 @@ typedef struct {
     char* sequence;
     char* quality;
 } FastqRecord;
+
+// GenbankLocation
+typedef struct GenbankLocation {
+    int start;
+    int end;
+    int complement;
+    int join;
+    int five_prime_partial;
+    int three_prime_partial;
+    char* gbk_location_string;
+    struct GenbankLocation* sub_locations;
+    int sub_locations_count;
+} GenbankLocation;
+
+// GenbankFeature
+typedef struct {
+    char* type_;
+    char* description;
+    char** attribute_keys;
+    char*** attribute_values;
+    int* attribute_value_counts;
+    int attribute_count;
+    char* sequence_hash;
+    char* sequence_hash_function;
+    char* sequence;
+    GenbankLocation location;
+} GenbankFeature;
+
+// GenbankReference
+typedef struct {
+    char* authors;
+    char* title;
+    char* journal;
+    char* pub_med;
+    char* remark;
+    char* range_;
+    char* consortium;
+} GenbankReference;
+
+// GenbankLocus
+typedef struct {
+    char* name;
+    char* sequence_length;
+    char* molecule_type;
+    char* genbank_division;
+    char* modification_date;
+    char* sequence_coding;
+    int circular;
+} GenbankLocus;
+
+// GenbankBaseCount
+typedef struct {
+    char base;
+    int count;
+} GenbankBaseCount;
+
+// GenbankMeta
+typedef struct {
+    char* date;
+    char* definition;
+    char* accession;
+    char* version;
+    char* keywords;
+    char* organism;
+    char* source;
+    char** taxonomy;
+    int taxonomy_count;
+    char* origin;
+    GenbankLocus locus;
+    GenbankReference* references;
+    int reference_count;
+    GenbankBaseCount* base_counts;
+    int base_count_count;
+    char** other_keys;
+    char** other_values;
+    int other_count;
+    char* name;
+    char* sequence_hash;
+    char* sequence_hash_function;
+} GenbankMeta;
+
+// Genbank
+typedef struct {
+    GenbankMeta meta;
+    GenbankFeature* features;
+    int feature_count;
+    char* sequence;
+} Genbank;
 */
 import "C"
 import (
+	"fmt"
 	"io"
 	"strings"
 	"unsafe"
 
 	"github.com/koeng101/dnadesign/lib/bio"
+	"github.com/koeng101/dnadesign/lib/bio/genbank"
 )
 
 /******************************************************************************
@@ -155,6 +245,198 @@ func ParseFastqFromCString(cstring *C.char) (*C.FastqRecord, int, *C.char) {
 
 /******************************************************************************
 
+Genbank
+
+******************************************************************************/
+
+// goGenbankToCGenbank converts a slice of genbank.Genbank to a C.Genbank array
+func goGenbankToCGenbank(gbs []genbank.Genbank) (*C.Genbank, int, *C.char) {
+	if len(gbs) == 0 {
+		return nil, 0, C.CString("No genbank records provided")
+	}
+
+	cGenbanks := (*C.Genbank)(C.malloc(C.size_t(len(gbs)) * C.size_t(unsafe.Sizeof(C.Genbank{}))))
+	slice := (*[1<<30 - 1]C.Genbank)(unsafe.Pointer(cGenbanks))[:len(gbs):len(gbs)]
+
+	for i, gb := range gbs {
+		// Convert Meta
+		slice[i].meta = C.GenbankMeta{
+			date:                   C.CString(gb.Meta.Date),
+			definition:             C.CString(gb.Meta.Definition),
+			accession:              C.CString(gb.Meta.Accession),
+			version:                C.CString(gb.Meta.Version),
+			keywords:               C.CString(gb.Meta.Keywords),
+			organism:               C.CString(gb.Meta.Organism),
+			source:                 C.CString(gb.Meta.Source),
+			taxonomy:               (**C.char)(C.malloc(C.size_t(len(gb.Meta.Taxonomy)) * C.size_t(unsafe.Sizeof(uintptr(0))))),
+			taxonomy_count:         C.int(len(gb.Meta.Taxonomy)),
+			origin:                 C.CString(gb.Meta.Origin),
+			name:                   C.CString(gb.Meta.Name),
+			sequence_hash:          C.CString(gb.Meta.SequenceHash),
+			sequence_hash_function: C.CString(gb.Meta.SequenceHashFunction),
+		}
+
+		// Convert Taxonomy
+		taxonomySlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.taxonomy))[:len(gb.Meta.Taxonomy):len(gb.Meta.Taxonomy)]
+		for j, taxon := range gb.Meta.Taxonomy {
+			taxonomySlice[j] = C.CString(taxon)
+		}
+
+		// Convert Locus
+		slice[i].meta.locus = C.GenbankLocus{
+			name:              C.CString(gb.Meta.Locus.Name),
+			sequence_length:   C.CString(gb.Meta.Locus.SequenceLength),
+			molecule_type:     C.CString(gb.Meta.Locus.MoleculeType),
+			genbank_division:  C.CString(gb.Meta.Locus.GenbankDivision),
+			modification_date: C.CString(gb.Meta.Locus.ModificationDate),
+			sequence_coding:   C.CString(gb.Meta.Locus.SequenceCoding),
+			circular:          C.int(boolToInt(gb.Meta.Locus.Circular)),
+		}
+
+		// Convert References
+		slice[i].meta.references = (*C.GenbankReference)(C.malloc(C.size_t(len(gb.Meta.References)) * C.size_t(unsafe.Sizeof(C.GenbankReference{}))))
+		slice[i].meta.reference_count = C.int(len(gb.Meta.References))
+		refSlice := (*[1 << 30]C.GenbankReference)(unsafe.Pointer(slice[i].meta.references))[:len(gb.Meta.References):len(gb.Meta.References)]
+		for j, ref := range gb.Meta.References {
+			refSlice[j] = C.GenbankReference{
+				authors:    C.CString(ref.Authors),
+				title:      C.CString(ref.Title),
+				journal:    C.CString(ref.Journal),
+				pub_med:    C.CString(ref.PubMed),
+				remark:     C.CString(ref.Remark),
+				range_:     C.CString(ref.Range),
+				consortium: C.CString(ref.Consortium),
+			}
+		}
+
+		// Convert BaseCount
+		slice[i].meta.base_counts = (*C.GenbankBaseCount)(C.malloc(C.size_t(len(gb.Meta.BaseCount)) * C.size_t(unsafe.Sizeof(C.GenbankBaseCount{}))))
+		slice[i].meta.base_count_count = C.int(len(gb.Meta.BaseCount))
+		baseCountSlice := (*[1 << 30]C.GenbankBaseCount)(unsafe.Pointer(slice[i].meta.base_counts))[:len(gb.Meta.BaseCount):len(gb.Meta.BaseCount)]
+		for j, bc := range gb.Meta.BaseCount {
+			baseCountSlice[j] = C.GenbankBaseCount{
+				base:  C.char(bc.Base[0]),
+				count: C.int(bc.Count),
+			}
+		}
+
+		// Convert Other
+		slice[i].meta.other_keys = (**C.char)(C.malloc(C.size_t(len(gb.Meta.Other)) * C.size_t(unsafe.Sizeof(uintptr(0)))))
+		slice[i].meta.other_values = (**C.char)(C.malloc(C.size_t(len(gb.Meta.Other)) * C.size_t(unsafe.Sizeof(uintptr(0)))))
+		slice[i].meta.other_count = C.int(len(gb.Meta.Other))
+		otherKeysSlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.other_keys))[:len(gb.Meta.Other):len(gb.Meta.Other)]
+		otherValuesSlice := (*[1 << 30]*C.char)(unsafe.Pointer(slice[i].meta.other_values))[:len(gb.Meta.Other):len(gb.Meta.Other)]
+		j := 0
+		for k, v := range gb.Meta.Other {
+			otherKeysSlice[j] = C.CString(k)
+			otherValuesSlice[j] = C.CString(v)
+			j++
+		}
+
+		// Convert Features
+		slice[i].features = (*C.GenbankFeature)(C.malloc(C.size_t(len(gb.Features)) * C.size_t(unsafe.Sizeof(C.GenbankFeature{}))))
+		slice[i].feature_count = C.int(len(gb.Features))
+		featureSlice := (*[1 << 30]C.GenbankFeature)(unsafe.Pointer(slice[i].features))[:len(gb.Features):len(gb.Features)]
+		for j, feature := range gb.Features {
+			fmt.Println(feature.Type)
+			featureSlice[j] = C.GenbankFeature{
+				type_:                  C.CString(feature.Type),
+				description:            C.CString(feature.Description),
+				sequence_hash:          C.CString(feature.SequenceHash),
+				sequence_hash_function: C.CString(feature.SequenceHashFunction),
+				sequence:               C.CString(feature.Sequence),
+			}
+
+			// Convert Attributes
+			featureSlice[j].attribute_keys = (**C.char)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(uintptr(0)))))
+			featureSlice[j].attribute_values = (***C.char)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(uintptr(0)))))
+			featureSlice[j].attribute_value_counts = (*C.int)(C.malloc(C.size_t(len(feature.Attributes)) * C.size_t(unsafe.Sizeof(C.int(0)))))
+			featureSlice[j].attribute_count = C.int(len(feature.Attributes))
+
+			attrKeysSlice := (*[1 << 30]*C.char)(unsafe.Pointer(featureSlice[j].attribute_keys))[:len(feature.Attributes):len(feature.Attributes)]
+			attrValuesSlice := (*[1 << 30]**C.char)(unsafe.Pointer(featureSlice[j].attribute_values))[:len(feature.Attributes):len(feature.Attributes)]
+			attrValueCountsSlice := (*[1 << 30]C.int)(unsafe.Pointer(featureSlice[j].attribute_value_counts))[:len(feature.Attributes):len(feature.Attributes)]
+
+			k := 0
+			for key, values := range feature.Attributes {
+				attrKeysSlice[k] = C.CString(key)
+				attrValuesSlice[k] = (**C.char)(C.malloc(C.size_t(len(values)) * C.size_t(unsafe.Sizeof(uintptr(0)))))
+				attrValueCountsSlice[k] = C.int(len(values))
+				valueSlice := (*[1 << 30]*C.char)(unsafe.Pointer(attrValuesSlice[k]))[:len(values):len(values)]
+				for l, val := range values {
+					valueSlice[l] = C.CString(val)
+				}
+				k++
+			}
+
+			// Convert Location
+			featureSlice[j].location = convertLocation(feature.Location)
+		}
+
+		// Convert Sequence
+		slice[i].sequence = C.CString(gb.Sequence)
+	}
+
+	return cGenbanks, len(gbs), nil
+}
+
+// Convert Location
+func convertLocation(location genbank.Location) C.GenbankLocation {
+	cLocation := C.GenbankLocation{
+		start:               C.int(location.Start),
+		end:                 C.int(location.End),
+		complement:          C.int(boolToInt(location.Complement)),
+		join:                C.int(boolToInt(location.Join)),
+		five_prime_partial:  C.int(boolToInt(location.FivePrimePartial)),
+		three_prime_partial: C.int(boolToInt(location.ThreePrimePartial)),
+		gbk_location_string: C.CString(location.GbkLocationString),
+		sub_locations:       nil,
+		sub_locations_count: 0,
+	}
+
+	if len(location.SubLocations) > 0 {
+		cSubLocations := make([]C.GenbankLocation, len(location.SubLocations))
+		for i, subLocation := range location.SubLocations {
+			cSubLocations[i] = convertLocation(subLocation)
+		}
+		cLocation.sub_locations = &cSubLocations[0]
+		cLocation.sub_locations_count = C.int(len(cSubLocations))
+	}
+
+	return cLocation
+}
+
+func boolToInt(b bool) int {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+//export ParseGenbankFromCFile
+func ParseGenbankFromCFile(cfile *C.FILE) (*C.Genbank, int, *C.char) {
+	reader := readerFromCFile(cfile)
+	parser := bio.NewGenbankParser(reader)
+	genbanks, err := parser.Parse()
+	if err != nil {
+		return nil, len(genbanks), C.CString(err.Error())
+	}
+	return goGenbankToCGenbank(genbanks)
+}
+
+//export ParseGenbankFromCString
+func ParseGenbankFromCString(cstring *C.char) (*C.Genbank, int, *C.char) {
+	reader := strings.NewReader(C.GoString(cstring))
+	parser := bio.NewGenbankParser(reader)
+	genbanks, err := parser.Parse()
+	if err != nil {
+		return nil, len(genbanks), C.CString(err.Error())
+	}
+	return goGenbankToCGenbank(genbanks)
+}
+
+/******************************************************************************
+
 main.go
 
 ******************************************************************************/
diff --git a/py/setup.py b/py/setup.py
index c065e4d..55a6320 100644
--- a/py/setup.py
+++ b/py/setup.py
@@ -13,7 +13,7 @@ def get_shared_lib_ext():
 
 setup(
     name='dnadesign',
-    version='0.1.4',
+    version='0.1.5',
     packages=find_packages(),
     package_data={'dnadesign': ['definitions.h', 'libdnadesign.h', "libdnadesign" + get_shared_lib_ext()]},
     install_requires=[
diff --git a/py/tests/data/example.gb b/py/tests/data/example.gb
new file mode 100644
index 0000000..11a7de5
--- /dev/null
+++ b/py/tests/data/example.gb
@@ -0,0 +1,141 @@
+LOCUS       puc19.gbk               2686 bp    DNA     circular  22-OCT-2019
+DEFINITION  pUC cloning vector.
+ACCESSION   .
+VERSION     .
+KEYWORDS    pUC19
+SOURCE      synthetic DNA construct
+  ORGANISM  synthetic DNA construct
+REFERENCE   1  (bases 1 to 2686)
+  AUTHORS   Norrander J, Kempe T, Messing J
+  TITLE     Construction of improved M13 vectors using 
+            oligodeoxynucleotide-directed mutagenesis.
+  JOURNAL   Gene. 1983 Dec;26(1):101-6.
+  PUBMED    6323249
+REFERENCE   2  (bases 1 to 2686)
+  AUTHORS   .
+  TITLE     Direct Submission
+  JOURNAL   Exported Sep 13, 2018 from SnapGene Server 1.1.58
+            http://www.snapgene.com
+COMMENT             description: pUC cloning vector.
+FEATURES             Location/Qualifiers
+     source          1..2686
+                     /label="synthetic DNA construct"
+                     /organism="synthetic DNA construct"
+                     /mol_type="other DNA"
+     primer_bind     118..137
+                     /label="pBR322ori-F"
+                     /note="pBR322 origin, forward primer"
+     primer_bind     371..388
+                     /label="L4440"
+                     /note="L4440 vector, forward primer"
+     protein_bind    505..526
+                     /label="CAP binding site"
+                     /bound_moiety="E. coli catabolite activator protein"
+                     /note="CAP binding activates transcription in the presenceof cAMP."
+     promoter        541..571
+                     /label="lac promoter"
+                     /note="promoter for the E. coli lac operon"
+     protein_bind    579..595
+                     /label="lac operator"
+                     /bound_moiety="lac repressor encoded by lacI"
+                     /note="The lac repressor binds to the lac operator toinhibit transcription in E. coli. This inhibition can berelieved by adding lactose orisopropyl-beta-D-thiogalactopyranoside (IPTG)."
+     primer_bind     584..606
+                     /label="M13/pUC Reverse"
+                     /note="In lacZ gene"
+     primer_bind     603..619
+                     /label="M13 rev"
+                     /note="common sequencing primer, one of multiple similarvariants"
+     primer_bind     603..619
+                     /label="M13 Reverse"
+                     /note="In lacZ gene. Also called M13-rev"
+     CDS             615..938
+                     /label="lacZ-alpha"
+                     /codon_start="1"
+                     /gene="lacZ fragment"
+                     /product="LacZ-alpha fragment of beta-galactosidase"
+                     /translation="MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICSDAA"
+     misc_feature    632..688
+                     /label="MCS"
+                     /note="pUC18/19 multiple cloning site"
+     primer_bind     complement(689..706)
+                     /label="M13 Forward"
+                     /note="In lacZ gene. Also called M13-F20 or M13 (-21)Forward"
+     primer_bind     complement(689..705)
+                     /label="M13 fwd"
+                     /note="common sequencing primer, one of multiple similarvariants"
+     primer_bind     complement(698..720)
+                     /label="M13/pUC Forward"
+                     /note="In lacZ gene"
+     primer_bind     complement(914..933)
+                     /label="pRS-marker"
+                     /note="pRS vectors, use to sequence yeast selectablemarker"
+     primer_bind     1033..1055
+                     /label="pGEX 3'"
+                     /note="pGEX vectors, reverse primer"
+     primer_bind     complement(1093..1111)
+                     /label="pBRforEco"
+                     /note="pBR322 vectors, upsteam of EcoRI site, forwardprimer"
+     promoter        1179..1283
+                     /label="AmpR promoter"
+                     /gene="bla"
+     CDS             1284..2144
+                     /label="AmpR"
+                     /codon_start="1"
+                     /gene="bla"
+                     /product="beta-lactamase"
+                     /note="confers resistance to ampicillin, carbenicillin, andrelated antibiotics"
+                     /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW"
+     primer_bind     complement(1502..1521)
+                     /label="Amp-R"
+                     /note="Ampicillin resistance gene, reverse primer"
+     rep_origin      2315..217
+                     /label="ori"
+                     /direction="RIGHT"
+                     /note="high-copy-number ColE1/pMB1/pBR322/pUC origin ofreplication"
+ORIGIN      
+        1 gagataccta cagcgtgagc tatgagaaag cgccacgctt cccgaaggga gaaaggcgga
+       61 caggtatccg gtaagcggca gggtcggaac aggagagcgc acgagggagc ttccaggggg
+      121 aaacgcctgg tatctttata gtcctgtcgg gtttcgccac ctctgacttg agcgtcgatt
+      181 tttgtgatgc tcgtcagggg ggcggagcct atggaaaaac gccagcaacg cggccttttt
+      241 acggttcctg gccttttgct ggccttttgc tcacatgttc tttcctgcgt tatcccctga
+      301 ttctgtggat aaccgtatta ccgcctttga gtgagctgat accgctcgcc gcagccgaac
+      361 gaccgagcgc agcgagtcag tgagcgagga agcggaagag cgcccaatac gcaaaccgcc
+      421 tctccccgcg cgttggccga ttcattaatg cagctggcac gacaggtttc ccgactggaa
+      481 agcgggcagt gagcgcaacg caattaatgt gagttagctc actcattagg caccccaggc
+      541 tttacacttt atgcttccgg ctcgtatgtt gtgtggaatt gtgagcggat aacaatttca
+      601 cacaggaaac agctatgacc atgattacgc caagcttgca tgcctgcagg tcgactctag
+      661 aggatccccg ggtaccgagc tcgaattcac tggccgtcgt tttacaacgt cgtgactggg
+      721 aaaaccctgg cgttacccaa cttaatcgcc ttgcagcaca tccccctttc gccagctggc
+      781 gtaatagcga agaggcccgc accgatcgcc cttcccaaca gttgcgcagc ctgaatggcg
+      841 aatggcgcct gatgcggtat tttctcctta cgcatctgtg cggtatttca caccgcatat
+      901 ggtgcactct cagtacaatc tgctctgatg ccgcatagtt aagccagccc cgacacccgc
+      961 caacacccgc tgacgcgccc tgacgggctt gtctgctccc ggcatccgct tacagacaag
+     1021 ctgtgaccgt ctccgggagc tgcatgtgtc agaggttttc accgtcatca ccgaaacgcg
+     1081 cgagacgaaa gggcctcgtg atacgcctat ttttataggt taatgtcatg ataataatgg
+     1141 tttcttagac gtcaggtggc acttttcggg gaaatgtgcg cggaacccct atttgtttat
+     1201 ttttctaaat acattcaaat atgtatccgc tcatgagaca ataaccctga taaatgcttc
+     1261 aataatattg aaaaaggaag agtatgagta ttcaacattt ccgtgtcgcc cttattccct
+     1321 tttttgcggc attttgcctt cctgtttttg ctcacccaga aacgctggtg aaagtaaaag
+     1381 atgctgaaga tcagttgggt gcacgagtgg gttacatcga actggatctc aacagcggta
+     1441 agatccttga gagttttcgc cccgaagaac gttttccaat gatgagcact tttaaagttc
+     1501 tgctatgtgg cgcggtatta tcccgtattg acgccgggca agagcaactc ggtcgccgca
+     1561 tacactattc tcagaatgac ttggttgagt actcaccagt cacagaaaag catcttacgg
+     1621 atggcatgac agtaagagaa ttatgcagtg ctgccataac catgagtgat aacactgcgg
+     1681 ccaacttact tctgacaacg atcggaggac cgaaggagct aaccgctttt ttgcacaaca
+     1741 tgggggatca tgtaactcgc cttgatcgtt gggaaccgga gctgaatgaa gccataccaa
+     1801 acgacgagcg tgacaccacg atgcctgtag caatggcaac aacgttgcgc aaactattaa
+     1861 ctggcgaact acttactcta gcttcccggc aacaattaat agactggatg gaggcggata
+     1921 aagttgcagg accacttctg cgctcggccc ttccggctgg ctggtttatt gctgataaat
+     1981 ctggagccgg tgagcgtggg tctcgcggta tcattgcagc actggggcca gatggtaagc
+     2041 cctcccgtat cgtagttatc tacacgacgg ggagtcaggc aactatggat gaacgaaata
+     2101 gacagatcgc tgagataggt gcctcactga ttaagcattg gtaactgtca gaccaagttt
+     2161 actcatatat actttagatt gatttaaaac ttcattttta atttaaaagg atctaggtga
+     2221 agatcctttt tgataatctc atgaccaaaa tcccttaacg tgagttttcg ttccactgag
+     2281 cgtcagaccc cgtagaaaag atcaaaggat cttcttgaga tccttttttt ctgcgcgtaa
+     2341 tctgctgctt gcaaacaaaa aaaccaccgc taccagcggt ggtttgtttg ccggatcaag
+     2401 agctaccaac tctttttccg aaggtaactg gcttcagcag agcgcagata ccaaatactg
+     2461 ttcttctagt gtagccgtag ttaggccacc acttcaagaa ctctgtagca ccgcctacat
+     2521 acctcgctct gctaatcctg ttaccagtgg ctgctgccag tggcgataag tcgtgtctta
+     2581 ccgggttgga ctcaagacga tagttaccgg ataaggcgca gcggtcgggc tgaacggggg
+     2641 gttcgtgcac acagcccagc ttggagcgaa cgacctacac cgaact 
+//
diff --git a/py/tests/test_genbank_parser.py b/py/tests/test_genbank_parser.py
new file mode 100644
index 0000000..4a80faf
--- /dev/null
+++ b/py/tests/test_genbank_parser.py
@@ -0,0 +1,64 @@
+import pytest
+import os
+from dnadesign.parsers import parse_genbank_from_c_file, parse_genbank_from_c_string, Genbank, GenbankMeta, GenbankFeature
+
+def test_parse_genbank_from_c_file():
+    current_dir = os.path.dirname(__file__)
+    example_path = os.path.join(current_dir, 'data/example.gb')
+    records = parse_genbank_from_c_file(example_path)
+    assert len(records) > 0
+    assert all(isinstance(r, Genbank) for r in records)
+
+    # Test the first record
+    first_record = records[0]
+    assert isinstance(first_record.meta, GenbankMeta)
+    assert isinstance(first_record.features, list)
+    assert all(isinstance(f, GenbankFeature) for f in first_record.features)
+    assert isinstance(first_record.sequence, str)
+
+    # Test some fields of the first record
+    assert first_record.meta.accession
+    assert first_record.meta.version
+    assert first_record.meta.organism
+    assert len(first_record.features) > 0
+    assert first_record.sequence
+
+def test_parse_genbank_from_c_string():
+    genbank_data = """LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
+DEFINITION  Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
+            (AXL2) and Rev7p (REV7) genes, complete cds.
+ACCESSION   U49845
+VERSION     U49845.1  GI:1293613
+KEYWORDS    .
+SOURCE      Saccharomyces cerevisiae (baker's yeast)
+  ORGANISM  Saccharomyces cerevisiae
+            Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
+            Saccharomycetales; Saccharomycetaceae; Saccharomyces.
+REFERENCE   1  (bases 1 to 5028)
+  AUTHORS   Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
+  TITLE     Cloning and sequence of REV7, a gene whose function is required for
+            DNA damage-induced mutagenesis in Saccharomyces cerevisiae
+  JOURNAL   Yeast 10 (11), 1503-1509 (1994)
+  PUBMED    7871890
+FEATURES             Location/Qualifiers
+     source          1..5028
+                     /organism="Saccharomyces cerevisiae"
+                     /db_xref="taxon:4932"
+                     /chromosome="IX"
+                     /map="9"
+ORIGIN
+        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
+       61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
+      121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
+//
+"""
+    records = parse_genbank_from_c_string(genbank_data)
+    assert len(records) == 1
+    record = records[0]
+    assert record.meta.locus.name == "SCU49845"
+    assert record.meta.accession == "U49845"
+    assert record.meta.organism == "Saccharomyces cerevisiae"
+    assert len(record.features) == 1
+    assert record.features[0].type_ == "source"
+    assert record.sequence.startswith("gatcctccat")
+    assert len(record.sequence) == 180  # Based on the ORIGIN section in the example

From 8a6441bb3d6948afb69522e003718d6577de775e Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Sat, 17 Aug 2024 03:38:15 +0000
Subject: [PATCH 2/2] add benchmark

---
 README.md                         |  1 +
 py/README.md                      | 15 ++++++++++-
 py/benchmark_against_biopython.py | 41 +++++++++++++++++++++++++++++++
 py/dnadesign/parsers.py           |  2 +-
 py/lib.go                         |  2 --
 5 files changed, 57 insertions(+), 4 deletions(-)
 create mode 100644 py/benchmark_against_biopython.py

diff --git a/README.md b/README.md
index 2d60916..538de42 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Adds genbank parsing to python package. Release version 0.1.5 of dnadesign python. [#87](https://github.com/Koeng101/dnadesign/pull/87)
 - Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86)
 - Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
 - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)
diff --git a/py/README.md b/py/README.md
index 0b7b365..ac114b3 100644
--- a/py/README.md
+++ b/py/README.md
@@ -1,7 +1,20 @@
 # DnaDesign (Python)
 This directory contains code for allowing python users to use dnadesign through a shared C library.
 
-This is a work-in-progress. Right now, we have only ported the fasta parser. 
+This is a work-in-progress. Right now, we have only ported the fasta, fastq, and genbank parsers.
+
+For genbank files, we have this benchmark with bsub.gbk:
+```
+Running 10 iterations for each parser...
+
+Results:
+DNA design average time: 0.337005 seconds
+BioPython average time:  0.332007 seconds
+
+DNA design is 0.99x faster than BioPython
+```
+
+Turns out, we're just about as performant!
 
 ### Other platforms
 If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.
diff --git a/py/benchmark_against_biopython.py b/py/benchmark_against_biopython.py
new file mode 100644
index 0000000..a05b363
--- /dev/null
+++ b/py/benchmark_against_biopython.py
@@ -0,0 +1,41 @@
+import timeit
+import os
+from Bio import SeqIO
+from dnadesign.parsers import parse_genbank_from_c_file
+
+def benchmark_dnadesign(file_path, num_iterations=10):
+    def parse_with_dnadesign():
+        records = parse_genbank_from_c_file(file_path)
+        return records
+
+    time_taken = timeit.timeit(parse_with_dnadesign, number=num_iterations)
+    return time_taken / num_iterations
+
+def benchmark_biopython(file_path, num_iterations=10):
+    def parse_with_biopython():
+        with open(file_path, "r") as handle:
+            records = list(SeqIO.parse(handle, "genbank"))
+        return records
+
+    time_taken = timeit.timeit(parse_with_biopython, number=num_iterations)
+    return time_taken / num_iterations
+
+def main():
+    current_dir = os.path.dirname(__file__)
+    example_path = os.path.join(current_dir, '../lib/bio/genbank/data/bsub.gbk')
+
+    print(f"Benchmarking GenBank parsing for file: {example_path}")
+    print("Running 10 iterations for each parser...")
+
+    dnadesign_time = benchmark_dnadesign(example_path)
+    biopython_time = benchmark_biopython(example_path)
+
+    print(f"\nResults:")
+    print(f"DNA design average time: {dnadesign_time:.6f} seconds")
+    print(f"BioPython average time:  {biopython_time:.6f} seconds")
+
+    speedup = biopython_time / dnadesign_time
+    print(f"\nDNA design is {speedup:.2f}x faster than BioPython")
+
+if __name__ == "__main__":
+    main()
diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py
index 454ab28..295a7da 100644
--- a/py/dnadesign/parsers.py
+++ b/py/dnadesign/parsers.py
@@ -252,7 +252,7 @@ def _convert_location(loc) -> GenbankLocation:
     sub_locations = []
 
     if loc.sub_locations_count > 0:
-        sub_locations_array = ffi.cast("Location *", loc.sub_locations)
+        sub_locations_array = ffi.cast("GenbankLocation *", loc.sub_locations)
         for i in range(loc.sub_locations_count):
             sub_loc = sub_locations_array[i]
             sub_locations.append(_convert_location(sub_loc))
diff --git a/py/lib.go b/py/lib.go
index 1e9c458..e6a5aef 100644
--- a/py/lib.go
+++ b/py/lib.go
@@ -115,7 +115,6 @@ typedef struct {
 */
 import "C"
 import (
-	"fmt"
 	"io"
 	"strings"
 	"unsafe"
@@ -338,7 +337,6 @@ func goGenbankToCGenbank(gbs []genbank.Genbank) (*C.Genbank, int, *C.char) {
 		slice[i].feature_count = C.int(len(gb.Features))
 		featureSlice := (*[1 << 30]C.GenbankFeature)(unsafe.Pointer(slice[i].features))[:len(gb.Features):len(gb.Features)]
 		for j, feature := range gb.Features {
-			fmt.Println(feature.Type)
 			featureSlice[j] = C.GenbankFeature{
 				type_:                  C.CString(feature.Type),
 				description:            C.CString(feature.Description),