Implement new index type that also includes multihash code

Implement a new CARv2 index that contains enough information to reconstruct the multihashes of the data payload, since `CarIndexSorted` only includes multihash digests. Note that this index intentionally ignores any record whose CID uses the `multihash.IDENTITY` hash. Add a test that asserts that the offsets for the same CID are consistent across the sorted index and the new multihash sorted index. Note that a multicodec still needs to be defined for the new index type. For now, TODOs are left in place since defining it requires coordination across repos.
ipld · Sep 1, 2021 · 22949f0 · 22949f0
1 parent 1bac13d
commit 22949f0
Show file tree

Hide file tree

Showing 3 changed files with 293 additions and 15 deletions.
diff --git a/v2/index/index.go b/v2/index/index.go
@@ -72,6 +72,9 @@ func New(codec multicodec.Code) (Index, error) {
 	switch codec {
 	case multicodec.CarIndexSorted:
 		return newSorted(), nil
+	// TODO replace with proper multicodec once defined.
+	case IndexMultihashSortedCodec:
+		return newMultihashSorted(), nil
 	default:
 		return nil, fmt.Errorf("unknwon index codec: %v", codec)
 	}

diff --git a/v2/index/indexmhsorted.go b/v2/index/indexmhsorted.go
@@ -0,0 +1,174 @@
+package index
+
+import (
+	"encoding/binary"
+	"io"
+	"sort"
+
+	"github.com/ipfs/go-cid"
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-multihash"
+)
+
+// IndexMultihashSortedCodec is the placeholder codec that identifies the
+// multihash-sorted index type.
+// TODO replace with proper multicodec once defined.
+const IndexMultihashSortedCodec multicodec.Code = 0x0401
+
+// multiWidthCodedIndex maps multihash code (i.e. hashing algorithm) to singleWidthCodedIndex.
+// This index type is implemented with the underlying assumption that all digests generated by the
+// same multihash code are of the same length.
+// This index ignores any Record with multihash.IDENTITY.
+type multiWidthCodedIndex map[uint64]*singleWidthCodedIndex
+
+// singleWidthCodedIndex is a singleWidthIndex tagged with the multihash code
+// of the digests it holds. When serialized, the code is written as a
+// little-endian uint64 ahead of the embedded index.
+type singleWidthCodedIndex struct {
+	singleWidthIndex
+	code uint64
+}
+
+// Marshal writes the multihash code as a little-endian uint64 and then the
+// serialized form of the embedded singleWidthIndex. The first error
+// encountered is returned as-is.
+func (m *singleWidthCodedIndex) Marshal(w io.Writer) error {
+	err := binary.Write(w, binary.LittleEndian, m.code)
+	if err != nil {
+		return err
+	}
+	return m.singleWidthIndex.Marshal(w)
+}
+
+// Unmarshal reads the multihash code as a little-endian uint64 and then lets
+// the embedded singleWidthIndex decode the remainder. The first error
+// encountered is returned as-is.
+func (m *singleWidthCodedIndex) Unmarshal(r io.Reader) error {
+	err := binary.Read(r, binary.LittleEndian, &m.code)
+	if err != nil {
+		return err
+	}
+	return m.singleWidthIndex.Unmarshal(r)
+}
+
+// Codec returns IndexMultihashSortedCodec, the codec identifying this index type.
+func (m *multiWidthCodedIndex) Codec() multicodec.Code {
+	// TODO introduce this codec to multicodec table once finalized.
+	return IndexMultihashSortedCodec
+}
+
+// Marshal writes the number of per-code indices as a little-endian int32,
+// followed by every singleWidthCodedIndex in ascending order of multihash code.
+func (m *multiWidthCodedIndex) Marshal(w io.Writer) error {
+	count := int32(len(*m))
+	if err := binary.Write(w, binary.LittleEndian, count); err != nil {
+		return err
+	}
+	// The codes are unique, but map iteration order is not deterministic, so
+	// the buckets are emitted in sorted code order to keep the output stable.
+	// TODO update CARv2 spec to reflect this ordering for the new index type.
+	for _, code := range m.sortedKeys() {
+		if err := (*m)[code].Marshal(w); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// sortedKeys returns every multihash code present in the index, in ascending
+// order. The result is a non-nil, possibly empty, slice.
+func (m *multiWidthCodedIndex) sortedKeys() []uint64 {
+	keys := make([]uint64, len(*m))
+	next := 0
+	for code := range *m {
+		keys[next] = code
+		next++
+	}
+	sort.Slice(keys, func(a, b int) bool { return keys[a] < keys[b] })
+	return keys
+}
+
+// Unmarshal reads a little-endian int32 count and then decodes that many
+// singleWidthCodedIndex entries, storing each under its multihash code.
+// A count of zero or less decodes no entries.
+func (m *multiWidthCodedIndex) Unmarshal(r io.Reader) error {
+	var count int32
+	if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
+		return err
+	}
+	for remaining := int(count); remaining > 0; remaining-- {
+		entry := new(singleWidthCodedIndex)
+		if err := entry.Unmarshal(r); err != nil {
+			return err
+		}
+		m.put(entry)
+	}
+	return nil
+}
+
+// put stores swci under its multihash code, replacing any index previously
+// stored for that code.
+func (m *multiWidthCodedIndex) put(swci *singleWidthCodedIndex) {
+	byCode := *m
+	byCode[swci.code] = swci
+}
+
+func (m *multiWidthCodedIndex) Load(records []Record) error {
+	// Split cids on their digest length
+	byCode := make(map[uint64][]digestRecord)
+	for _, item := range records {
+		dmh, err := multihash.Decode(item.Hash())
+		if err != nil {
+			return err
+		}
+
+		code := dmh.Code
+
+		// Ignore IDENTITY multihashes in the index.
+		if code == multihash.IDENTITY {
+			continue
+		}
+		digest := dmh.Digest
+		swi, ok := byCode[code]
+		if !ok {
+			swi = make([]digestRecord, 0)
+			byCode[code] = swi
+		}
+
+		byCode[code] = append(swi, digestRecord{digest, item.Offset})
+	}
+
+	// Sort each list. then write to compact form.
+	for code, lst := range byCode {
+		sort.Sort(recordSet(lst))
+
+		// None of the lists can possibly be empty at this point; so we grab the first one
+		width := len(lst[0].digest)
+
+		// TODO: refactor compaction as a receiver on singleWidthIndex
+		swci := newSingleWidthCodedIndex(width, lst, code)
+		m.put(swci)
+	}
+	return nil
+}
+
+// newSingleWidthCodedIndex packs lst into one contiguous byte slice of
+// fixed-width records and wraps it in a singleWidthCodedIndex for code.
+// width is the digest length; each record occupies width+8 bytes
+// (presumably the digest followed by an 8-byte offset; see digestRecord.write).
+func newSingleWidthCodedIndex(width int, lst []digestRecord, code uint64) *singleWidthCodedIndex {
+	// TODO refactor duplicate compaction code in singleWidthIndex type
+	recordWidth := width + 8
+	packed := make([]byte, recordWidth*len(lst))
+	for i, rec := range lst {
+		start := i * recordWidth
+		rec.write(packed[start : start+recordWidth])
+	}
+	return &singleWidthCodedIndex{
+		singleWidthIndex: singleWidthIndex{
+			width: uint32(recordWidth),
+			len:   uint64(len(lst)),
+			index: packed,
+		},
+		code: code,
+	}
+}
+
+// GetAll decodes the multihash of the given CID, selects the index stored for
+// its multihash code and hands the digest and f to that index's getAll.
+// It returns the decode error if the multihash is invalid, and ErrNotFound
+// if no index exists for the code.
+func (m *multiWidthCodedIndex) GetAll(cid cid.Cid, f func(uint64) bool) error {
+	dmh, err := multihash.Decode(cid.Hash())
+	if err != nil {
+		return err
+	}
+	codedIdx, found := (*m)[dmh.Code]
+	if !found {
+		return ErrNotFound
+	}
+	return codedIdx.getAll(dmh.Digest, f)
+}
+
+// get returns the index stored for the code of the given decoded multihash,
+// or ErrNotFound if there is none.
+func (m *multiWidthCodedIndex) get(dmh *multihash.DecodedMultihash) (*singleWidthCodedIndex, error) {
+	codedIdx, found := (*m)[dmh.Code]
+	if !found {
+		return nil, ErrNotFound
+	}
+	return codedIdx, nil
+}
+
+// newMultihashSorted returns an empty multihash-sorted index.
+func newMultihashSorted() Index {
+	return &multiWidthCodedIndex{}
+}