internal/build: add IPFS CID computation

Compute CIDv1 and base58 multihash for files using only stdlib. Enables content-addressed verification of release archives.
2026-05-31 20:18:37 +00:00 · 2026-01-23 14:51:40 +01:00 · 2026-01-23 14:51:40 +01:00 · 533f51746c
commit 533f51746c
parent 9a8e14e77e
2 changed files with 325 additions and 0 deletions
--- a/internal/build/cid.go
+++ b/internal/build/cid.go
@ -0,0 +1,127 @@
+// Copyright 2024 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+package build
+
+import (
+	"crypto/sha256"
+	"encoding/base32"
+	"io"
+	"math/big"
+	"os"
+	"strings"
+)
+
+// CID holds the textual encodings of an IPFS Content Identifier for raw file content.
+type CID struct {
+	// V1 is the CIDv1 with raw codec: bafkrei... (base32lower, 59 chars).
+	// This is the canonical format for raw binary content.
+	V1 string
+
+	// Multihash is the raw SHA256 multihash (base58btc encoded): Qm... (46 chars).
+	// Note: this is NOT a valid CIDv0 for raw content (CIDv0 requires the dag-pb codec);
+	// it is kept only for tools that expect the Qm... format. Use V1 to address the content.
+	// NOTE(review): `ipfs cid format -v 1 <multihash>` presumably yields a dag-pb CID, not V1 — confirm.
+	Multihash string
+}
+
+// ComputeFileCID opens the file at path and returns the IPFS CID of its bytes.
+//
+// The content is hashed as-is with SHA256 under the raw multicodec (0x55): the
+// digest covers the file's exact bytes, with no wrapping or chunking.
+//
+// The result carries the CIDv1 (bafkrei...) as the primary identifier and the
+// base58-encoded multihash for legacy tooling. An error from opening or reading
+// the file is returned unchanged.
+//
+// Verify with: ipfs add --only-hash --raw-leaves -Q <file>
+func ComputeFileCID(path string) (*CID, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer file.Close()
+	return ComputeCID(file)
+}
+
+// ComputeCID streams r through SHA256 and returns the IPFS CID of the bytes read.
+//
+// The stream is consumed until EOF; a read error is returned as-is with a nil CID.
+func ComputeCID(r io.Reader) (*CID, error) {
+	hasher := sha256.New()
+	_, err := io.Copy(hasher, r)
+	if err != nil {
+		return nil, err
+	}
+
+	// Multihash layout: 0x12 (SHA256 code), 0x20 (digest length, 32 bytes), digest.
+	// Sum appends the digest to the two-byte header, yielding all 34 bytes at once.
+	mh := hasher.Sum([]byte{0x12, 0x20})
+
+	// CIDv1 binary form: 0x01 (CID version 1), 0x55 (raw multicodec), multihash.
+	raw := make([]byte, 2, 2+len(mh))
+	raw[0], raw[1] = 0x01, 0x55
+	raw = append(raw, mh...)
+
+	// Multibase text form: 'b' prefix followed by unpadded, lowercase base32.
+	unpadded := base32.StdEncoding.WithPadding(base32.NoPadding)
+	text := strings.ToLower(unpadded.EncodeToString(raw))
+
+	return &CID{
+		// Base58 multihash (Qm... format) is kept for legacy compatibility.
+		Multihash: base58Encode(mh),
+		V1:        "b" + text,
+	}, nil
+}
+
+// base58Encode renders data in base58 using Bitcoin's alphabet (base58btc).
+//
+// The input is read as one big-endian unsigned integer and written out in
+// radix 58; every leading zero byte of data becomes one leading '1', so the
+// encoding of an empty slice is the empty string. ComputeCID uses it to
+// produce the Qm... form of a SHA256 multihash.
+func base58Encode(data []byte) string {
+	const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
+
+	// Each leading zero byte is emitted as a literal '1' rather than as a digit,
+	// because leading zeros vanish once the input is treated as an integer.
+	zeros := 0
+	for zeros < len(data) && data[zeros] == 0 {
+		zeros++
+	}
+
+	// log(256)/log(58) is below 1.38, so this is enough room for every digit
+	// plus the '1' padding. The buffer is filled from its end backwards.
+	buf := make([]byte, len(data)*138/100+1)
+	pos := len(buf)
+
+	// Repeated division by 58 yields digits least significant first; storing
+	// them right to left leaves the output in its final order.
+	value := new(big.Int).SetBytes(data)
+	radix, digit := big.NewInt(58), new(big.Int)
+	for value.Sign() != 0 {
+		value.DivMod(value, radix, digit)
+		pos--
+		buf[pos] = alphabet[digit.Int64()]
+	}
+
+	// Prepend one '1' per leading zero byte counted above.
+	for ; zeros > 0; zeros-- {
+		pos--
+		buf[pos] = '1'
+	}
+	return string(buf[pos:])
+}
--- a/internal/build/cid_test.go
+++ b/internal/build/cid_test.go
@ -0,0 +1,198 @@
+// Copyright 2024 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+package build
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+)
+
+// TestBase58Encode checks base58Encode against fixed vectors, including
+// empty input and inputs consisting only of zero bytes.
+func TestBase58Encode(t *testing.T) {
+	vectors := []struct {
+		input    []byte
+		expected string
+	}{
+		{input: []byte{}, expected: ""},
+		{input: []byte{0}, expected: "1"},
+		{input: []byte{0, 0, 0}, expected: "111"},
+		{input: []byte("Hello World!"), expected: "2NEpo7TZRRrLZSi2U"},
+	}
+	for _, vec := range vectors {
+		if got := base58Encode(vec.input); got != vec.expected {
+			t.Errorf("base58Encode(%v) = %q, want %q", vec.input, got, vec.expected)
+		}
+	}
+}
+
+// TestComputeCID checks the shape of both encodings for a handful of inputs:
+// the expected prefix and length of each and, for V1, that it is lowercase.
+func TestComputeCID(t *testing.T) {
+	// shape describes what one encoded string must look like: the prefix it
+	// has to start with and its exact length in characters.
+	type shape struct {
+		prefix string
+		length int
+	}
+	cases := []struct {
+		name    string
+		content []byte
+		v1, mh  shape
+	}{
+		{
+			name:    "empty content",
+			content: []byte{},
+			v1:      shape{prefix: "bafkrei", length: 59},
+			mh:      shape{prefix: "Qm", length: 46},
+		},
+		{
+			name:    "hello world",
+			content: []byte("hello world"),
+			v1:      shape{prefix: "bafkrei", length: 59},
+			mh:      shape{prefix: "Qm", length: 46},
+		},
+		{
+			name:    "binary content",
+			content: []byte{0x00, 0x01, 0x02, 0x03, 0xff, 0xfe, 0xfd},
+			v1:      shape{prefix: "bafkrei", length: 59},
+			mh:      shape{prefix: "Qm", length: 46},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := ComputeCID(bytes.NewReader(tc.content))
+			if err != nil {
+				t.Fatalf("ComputeCID() error = %v", err)
+			}
+
+			// CIDv1: prefix, then length.
+			if !strings.HasPrefix(got.V1, tc.v1.prefix) {
+				t.Errorf("V1 = %q, want prefix %q", got.V1, tc.v1.prefix)
+			}
+			if len(got.V1) != tc.v1.length {
+				t.Errorf("V1 length = %d, want %d", len(got.V1), tc.v1.length)
+			}
+
+			// Base58 multihash: prefix, then length.
+			if !strings.HasPrefix(got.Multihash, tc.mh.prefix) {
+				t.Errorf("Multihash = %q, want prefix %q", got.Multihash, tc.mh.prefix)
+			}
+			if len(got.Multihash) != tc.mh.length {
+				t.Errorf("Multihash length = %d, want %d", len(got.Multihash), tc.mh.length)
+			}
+
+			// Lowercasing must be a no-op: the V1 string is expected to be
+			// entirely lowercase already.
+			if strings.ToLower(got.V1) != got.V1 {
+				t.Errorf("V1 should be lowercase: %q", got.V1)
+			}
+		})
+	}
+}
+
+// TestComputeCIDDeterministic hashes the same content twice and requires
+// both encodings to match across the two runs.
+func TestComputeCIDDeterministic(t *testing.T) {
+	content := []byte("deterministic test content")
+	var runs [2]*CID
+	for i := range runs {
+		cid, err := ComputeCID(bytes.NewReader(content))
+		if err != nil {
+			t.Fatalf("ComputeCID() error = %v", err)
+		}
+		runs[i] = cid
+	}
+	first, second := runs[0], runs[1]
+	if first.V1 != second.V1 {
+		t.Errorf("V1 not deterministic: %q != %q", first.V1, second.V1)
+	}
+	if first.Multihash != second.Multihash {
+		t.Errorf("Multihash not deterministic: %q != %q", first.Multihash, second.Multihash)
+	}
+}
+
+// TestKnownCID pins the CIDv1 of the raw bytes "hello" to a known IPFS CID.
+// Verified with: echo -n "hello" | ipfs add --only-hash --raw-leaves -Q
+// Output: bafkreibm6jg3ux5qumhcn2b3flc3tyu6dmlb4xa7u5bf44yegnrjhc4yeq
+func TestKnownCID(t *testing.T) {
+	// CIDv1 for raw "hello" bytes, as printed by the ipfs command above.
+	const wantV1 = "bafkreibm6jg3ux5qumhcn2b3flc3tyu6dmlb4xa7u5bf44yegnrjhc4yeq"
+
+	content := []byte("hello")
+	got, err := ComputeCID(bytes.NewReader(content))
+	if err != nil {
+		t.Fatalf("ComputeCID() error = %v", err)
+	}
+	if got.V1 != wantV1 {
+		t.Errorf("V1 for 'hello' = %q, want %q", got.V1, wantV1)
+	}
+
+	// No reference value is pinned for the multihash; both forms are logged.
+	t.Logf("V1 (CIDv1):    %s", got.V1)
+	t.Logf("Multihash:     %s", got.Multihash)
+}
+
+// TestEmptyContent pins the CIDv1 produced for zero bytes of input.
+// Verified with: echo -n "" | ipfs add --only-hash --raw-leaves -Q
+// Output: bafkreihdwdcefgh4dqkjv67uzcmw7ojee6xedzdetojuzjevtenxquvyku
+func TestEmptyContent(t *testing.T) {
+	// CIDv1 for empty content (SHA256 of nothing).
+	const wantV1 = "bafkreihdwdcefgh4dqkjv67uzcmw7ojee6xedzdetojuzjevtenxquvyku"
+
+	got, err := ComputeCID(bytes.NewReader(nil))
+	if err != nil {
+		t.Fatalf("ComputeCID() error = %v", err)
+	}
+	if got.V1 != wantV1 {
+		t.Errorf("V1 for empty = %q, want %q", got.V1, wantV1)
+	}
+
+	// No reference value is pinned for the multihash; both forms are logged.
+	t.Logf("V1 (CIDv1):    %s", got.V1)
+	t.Logf("Multihash:     %s", got.Multihash)
+}
+
+// TestReadmeFile runs ComputeFileCID on an actual file from the repository.
+// To obtain the expected CID for comparison, run:
+//	ipfs add --only-hash --raw-leaves -Q ../../README.md
+func TestReadmeFile(t *testing.T) {
+	// README.md is expected to exist in the repo; skip the test when it does not.
+	const readme = "../../README.md"
+	_, statErr := os.Stat(readme)
+	if os.IsNotExist(statErr) {
+		t.Skip("README.md not found, skipping file test")
+	}
+	got, err := ComputeFileCID(readme)
+	if err != nil {
+		t.Fatalf("ComputeFileCID() error = %v", err)
+	}
+
+	// No reference value is pinned: only the expected prefixes are checked.
+	if v1 := got.V1; !strings.HasPrefix(v1, "bafkrei") {
+		t.Errorf("V1 should start with bafkrei: %s", v1)
+	}
+	if mh := got.Multihash; !strings.HasPrefix(mh, "Qm") {
+		t.Errorf("Multihash should start with Qm: %s", mh)
+	}
+
+	t.Logf("README.md CIDv1: %s", got.V1)
+	t.Logf("To verify: ipfs add --only-hash --raw-leaves -Q ../../README.md")
+}