From 7597825e3cdbbd0f7c831fac8f7734325daa3d88 Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Tue, 11 Aug 2020 13:54:32 +1000 Subject: [PATCH] BREAKING CHANGE: change Pointer CBOR representation to a kinded union From: type Pointer union { &Node "0" Bucket "1" } representation keyed i.e. {"0": CID} or {"1": [KV...]} To: type Pointer union { &Node link Bucket list } representation kinded i.e. CID or [KV...] Also removes redundant refmt tags Closes: https://github.com/ipfs/go-hamt-ipld/issues/53 --- hamt.go | 30 +++++++--------- hamt_test.go | 85 ++++++++++--------------------------------- pointer_cbor.go | 96 +++++++++++++++++-------------------------------- 3 files changed, 65 insertions(+), 146 deletions(-) diff --git a/hamt.go b/hamt.go index fddee26..6b0ba61 100644 --- a/hamt.go +++ b/hamt.go @@ -59,9 +59,6 @@ var ErrMalformedHamt = fmt.Errorf("HAMT node was malformed") // array. Indexes `[1]` and `[2]` are not present, but index `[3]` is at // the second position of our Pointers array. // -// (Note: the `refmt` tags are ignored by cbor-gen which will generate an -// array type rather than map.) -// // The IPLD Schema representation of this data structure is as follows: // // type Node struct { @@ -69,8 +66,8 @@ var ErrMalformedHamt = fmt.Errorf("HAMT node was malformed") // pointers [Pointer] // } representation tuple type Node struct { - Bitfield *big.Int `refmt:"bf"` - Pointers []*Pointer `refmt:"p"` + Bitfield *big.Int + Pointers []*Pointer bitWidth int hash func([]byte) []byte @@ -81,10 +78,11 @@ type Node struct { // Pointer is an element in a HAMT node's Pointers array, encoded as an IPLD // tuple in DAG-CBOR of shape: -// {"0": CID} or {"1": [KV...]} -// Where a map with a single key of "0" contains a Link, where a map with a -// single key of "1" contains a KV bucket. The map may contain only one of -// these two possible keys. +// CID or [KV...] +// i.e. it is represented as a "kinded union" where a Link is a pointer to a +// child node, while an array is a bucket of elements local to this node. A +// Pointer must represent exactly one of of these two states and cannot be both +// (or neither). // // There are between 1 and 2^bitWidth of these Pointers in any HAMT node. // @@ -93,20 +91,17 @@ type Node struct { // the bucket is replaced with a link to a newly created HAMT node which will // contain the `bucketSize+1` elements in its own Pointers array. // -// (Note: the `refmt` tags are ignored by cbor-gen which will generate an -// array type rather than map.) -// // The IPLD Schema representation of this data structure is as follows: // // type Pointer union { -// &Node "0" -// Bucket "1" -// } representation keyed +// &Node link +// Bucket list +// } representation kinded // // type Bucket [KV] type Pointer struct { - KVs []*KV `refmt:"v,omitempty"` - Link cid.Cid `refmt:"l,omitempty"` + KVs []*KV + Link cid.Cid // cached node to avoid too many serialization operations // TODO(rvagg): we should check that this is actually used optimally. Flush() @@ -373,6 +368,7 @@ func loadNode( isLink := ch.isShard() isBucket := ch.KVs != nil if !((isLink && !isBucket) || (!isLink && isBucket)) { + // Pointer#UnmarshalCBOR shouldn't allow this return nil, ErrMalformedHamt } if isLink && ch.Link.Type() != cid.DagCBOR { // not dag-cbor diff --git a/hamt_test.go b/hamt_test.go index b816eea..6ffbe1a 100644 --- a/hamt_test.go +++ b/hamt_test.go @@ -904,10 +904,9 @@ func TestMalformedHamt(t *testing.T) { bcat(b(0x40+1), b(kv.key)), // bytes(1) "\x??" bcat(b(0x40+1), b(kv.value)))) // bytes(1) "\x??" } - return bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x31)), // string(1) "1" - bcat(b(0x80+byte(len(kvs))), // array(?) - en)) // bucket contents + return bcat( + b(0x80+byte(len(kvs))), // array(?) + en) // bucket contents } // most minimal HAMT node with one k/v entry, sanity check we can load this @@ -964,53 +963,15 @@ func TestMalformedHamt(t *testing.T) { t.Fatal("Should have returned ErrMalformedHamt for mismatch bitfield count") } - // test mixed link & bucket - - // this node contains 2 elements, the first is a plain entry with one bucket - // and with a single key of 0x0100, the second element is a link to a child - // node which happens to be the same CID as this node will be stored in. - // However, this second entry has both a CID and a bucket in the same - // element, which is not allowed. Without checks for exactly one of these - // two things then then a lookup for key 0x0100 would navigate through this - // node and back again as its own child to the first element. - store( - bcat(b(0x80+2), // array(2) - bcat(b(0x40+1), b(0x03)), // bytes(1) "\x03" (bitmap) - bcat(b(0x80+2), // array(2) - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x31)), // string(1) "1" - bcat(b(0x80+1), // array(1) - bcat(b(0x80+2), // array(2) - bcat(b(0x40+2), []byte{0x01, 0x00}), // bytes(2) "\x0100" - bcat(b(0x40+1), b(0xff))))), // bytes(1) "\xff" - bcat(b(0xa0+2), // map(2) - bcat(b(0x60+1), b(0x30)), // string(1) "0" - bcat(b(0xd8), b(0x2a), // tag(42) - b(0x58), b(0x27), // bytes(39) - cidBytes), // cid - bcat(b(0x60+1), b(0x31)), // string(1) "1" - bcat(b(0x80+1), // array(1) - bcat(b(0x80+2), // array(2) - bcat(b(0x40+1), b(0x01)), // bytes(1) "\x00" - bcat(b(0x40+1), b(0xfe)))))))) // bytes(1) "\xff - - n, err = LoadNode(ctx, cs, bcid, UseTreeBitWidth(8), UseHashFunction(identityHash)) - if err == nil || n != nil || err.Error() != "Pointers should be a single element map" { - // no ErrMalformedHamt here possible bcause of cbor-gen wrapping - t.Fatal("Should have returned error for bad Pointer cbor") - } - // test pointers with links have are DAG-CBOR multicodec // sanity check minimal node pointing to a child node store( bcat(b(0x80+2), // array(2) bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap) bcat(b(0x80+1), // array(1) - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x30)), // string(1) "0" - bcat(b(0xd8), b(0x2a), // tag(42) - b(0x58), b(0x27), // bytes(39) - cidBytes))))) // cid + bcat(b(0xd8), b(0x2a), // tag(42) + b(0x58), b(0x27), // bytes(39) + cidBytes)))) // cid load() // node pointing to a non-dag-cbor node @@ -1018,11 +979,9 @@ func TestMalformedHamt(t *testing.T) { bcat(b(0x80+2), // array(2) bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap) bcat(b(0x80+1), // array(1) - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x30)), // string(1) "0" - bcat(b(0xd8), b(0x2a), // tag(42) - b(0x58), b(0x27), // bytes(39) - badCidBytes))))) // cid + bcat(b(0xd8), b(0x2a), // tag(42) + b(0x58), b(0x27), // bytes(39) + badCidBytes)))) // cid n, err = LoadNode(ctx, cs, bcid, UseTreeBitWidth(8), UseHashFunction(identityHash)) if err != ErrMalformedHamt || n != nil { t.Fatal("Should have returned ErrMalformedHamt for bad child link codec") @@ -1102,11 +1061,9 @@ func TestMalformedHamt(t *testing.T) { bcat(b(0x80+2), // array(2) bcat(b(0x40+1), b(0x01)), // bytes(1) "\x01" (bitmap) bcat(b(0x80+1), // array(1) - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x30)), // string(1) "0" - bcat(b(0xd8), b(0x2a), // tag(42) - b(0x58), b(0x27), // bytes(39) - ccidBytes))))) // cid + bcat(b(0xd8), b(0x2a), // tag(42) + b(0x58), b(0x27), // bytes(39) + ccidBytes)))) // cid vg, err := load().FindRaw(ctx, string([]byte{0x00, 0x01})) // without validation of the child block, this would return an ErrNotFound @@ -1145,17 +1102,13 @@ func TestMalformedHamt(t *testing.T) { bcat(b(0x80+2), // array(2) bcat(b(0x40+1), b(0x03)), // bytes(1) "\x03" (bitmap) bcat(b(0x80+2), // array(2) - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x31)), // string(1) "1" - bcat(b(0x80+1), // array(1) - bcat(b(0x80+2), // array(2) - bcat(b(0x40+2), []byte{0x00, 0x01}), // bytes(2) "\x0001" - bcat(b(0x40+1), b(0xff))))), // bytes(1) "\xff" - bcat(b(0xa0+1), // map(1) - bcat(b(0x60+1), b(0x30)), // string(1) "0" - bcat(b(0xd8), b(0x2a), // tag(42) - b(0x58), b(0x27), // bytes(39) - ccidBytes))))) // cid + bcat(b(0x80+1), // array(1) + bcat(b(0x80+2), // array(2) + bcat(b(0x40+2), []byte{0x00, 0x01}), // bytes(2) "\x0001" + bcat(b(0x40+1), b(0xff)))), // bytes(1) "\xff" + bcat(b(0xd8), b(0x2a), // tag(42) + b(0x58), b(0x27), // bytes(39) + ccidBytes)))) // cid vg, err = load().FindRaw(ctx, string([]byte{0x00, 0x01})) // without validation of the child block, this would return an ErrNotFound diff --git a/pointer_cbor.go b/pointer_cbor.go index 8734626..d39db84 100644 --- a/pointer_cbor.go +++ b/pointer_cbor.go @@ -8,8 +8,8 @@ import ( cbg "github.com/whyrusleeping/cbor-gen" ) -var keyZero = []byte("0") -var keyOne = []byte("1") +// implemented as a kinded union - a "Pointer" is either a Link (child node) or +// an Array (bucket) func (t *Pointer) MarshalCBOR(w io.Writer) error { if t.Link != cid.Undef && len(t.KVs) > 0 { @@ -18,36 +18,11 @@ func (t *Pointer) MarshalCBOR(w io.Writer) error { scratch := make([]byte, 9) - if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajMap, 1); err != nil { - return err - } - if t.Link != cid.Undef { - // key for links is "0" - // Refmt (and the general IPLD data model currently) can't deal - // with non string keys. So we have this weird restriction right now - // hoping to be able to use integer keys soon - if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, 1); err != nil { - return err - } - - if _, err := w.Write(keyZero); err != nil { - return err - } - if err := cbg.WriteCidBuf(scratch, w, t.Link); err != nil { return err } } else { - // key for KVs is "1" - if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajTextString, 1); err != nil { - return err - } - - if _, err := w.Write(keyOne); err != nil { - return err - } - if err := cbg.WriteMajorTypeHeaderBuf(scratch, w, cbg.MajArray, uint64(len(t.KVs))); err != nil { return err } @@ -69,51 +44,29 @@ func (t *Pointer) UnmarshalCBOR(br io.Reader) error { if err != nil { return err } - if maj != cbg.MajMap { - return fmt.Errorf("cbor input should be of map") - } - - if extra != 1 { - return fmt.Errorf("Pointers should be a single element map") - } - - maj, val, err := cbg.CborReadHeaderBuf(br, scratch) - if err != nil { - return err - } - if maj != cbg.MajTextString { - return fmt.Errorf("expected text string key") - } - - if val != 1 { - return fmt.Errorf("map keys in pointers must be a single byte long") - } - - if _, err := io.ReadAtLeast(br, scratch[:1], 1); err != nil { - return err - } + if maj == cbg.MajTag { + if extra != 42 { + return fmt.Errorf("expected tag 42 for child node link") + } - switch scratch[0] { - case '0': - c, err := cbg.ReadCid(br) + ba, err := cbg.ReadByteArray(br, 512) if err != nil { return err } - t.Link = c - return nil - case '1': - maj, length, err := cbg.CborReadHeaderBuf(br, scratch) + + c, err := bufToCid(ba) if err != nil { return err } - if maj != cbg.MajArray { - return fmt.Errorf("expected an array of KVs in cbor input") - } + t.Link = c + return nil + } else if maj == cbg.MajArray { + length := extra if length > 32 { - return fmt.Errorf("KV array in cbor input for pointer was too long") + return fmt.Errorf("KV array in CBOR input for pointer was too long") } t.KVs = make([]*KV, length) @@ -127,7 +80,24 @@ func (t *Pointer) UnmarshalCBOR(br io.Reader) error { } return nil - default: - return fmt.Errorf("invalid pointer map key in cbor input: %d", val) + } else { + return fmt.Errorf("expected CBOR child node link or array") + } +} + +// from https://github.com/whyrusleeping/cbor-gen/blob/211df3b9e24c6e0d0c338b440e6ab4ab298505b2/utils.go#L530 +func bufToCid(buf []byte) (cid.Cid, error) { + if len(buf) == 0 { + return cid.Undef, fmt.Errorf("undefined CID") } + + if len(buf) < 2 { + return cid.Undef, fmt.Errorf("DAG-CBOR serialized CIDs must have at least two bytes") + } + + if buf[0] != 0 { + return cid.Undef, fmt.Errorf("DAG-CBOR serialized CIDs must have binary multibase") + } + + return cid.Cast(buf[1:]) }