doesn't work yet, but the structures are all there
This commit is contained in:
290
microgopt.go
290
microgopt.go
@@ -4,12 +4,24 @@ import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
"math/rand/v2"
|
||||
"slices"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/davecgh/go-spew/spew"
|
||||
)
|
||||
|
||||
// Initialize the parameters, to store the knowledge of the model
|
||||
// Model hyperparameters: the fixed sizes that determine the shape (and
// parameter count) of the transformer.
const (
	nLayer    = 1             // depth of the transformer neural network (number of layers)
	nEmbd     = 16            // width of the network (embedding dimension)
	blockSize = 16            // maximum context length of the attention window (note: the longest name is 15 characters)
	nHead     = 4             // number of attention heads
	headDim   = nEmbd / nHead // derived dimension of each head
)
|
||||
|
||||
// stateMap holds every learnable weight matrix of the model, keyed by name
// (e.g. "wte", "wpe", "lm_head", "layer0.attn_wq").
var stateMap = map[string][][]*value{}
|
||||
|
||||
// btof maps a bool to 1.0/0.0. This type pun just worked in python, but Go
// needs the conversion to be explicit.
func btof(b bool) float64 {
	if !b {
		return 0.0
	}
	return 1.0
}
|
||||
|
||||
func valcmp(a, b *value) int {
|
||||
if a.data < b.data {
|
||||
return -1
|
||||
} else if a.data > b.data {
|
||||
return 1
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func Run(docs []string) {
|
||||
// remove leading and trailing whitespace in documents
|
||||
for i := range docs {
|
||||
docs[i] = strings.TrimSpace(docs[i])
|
||||
}
|
||||
fmt.Printf("num docs: %d", len(docs))
|
||||
fmt.Printf("num docs: %d\n", len(docs))
|
||||
|
||||
// construct the vocabulary from the documents: an ordered list of all characters in the dataset,
|
||||
// plus a "Beginning Of Sequence" (BOS) token
|
||||
@@ -33,9 +55,229 @@ func Run(docs []string) {
|
||||
}
|
||||
}
|
||||
uchars := slices.Sorted(maps.Keys(set))
|
||||
// BOS := len(uchars)
|
||||
BOS := len(uchars)
|
||||
vocabSize := len(uchars) + 1
|
||||
fmt.Printf("vocab size: %d", vocabSize)
|
||||
fmt.Printf("vocab size: %d\n", vocabSize)
|
||||
|
||||
// in the python code, at this point, the Value class was created
|
||||
// and the global parameters were set up
|
||||
|
||||
stateMap["wte"] = genMatrix(vocabSize, nEmbd)
|
||||
stateMap["wpe"] = genMatrix(blockSize, nEmbd)
|
||||
stateMap["lm_head"] = genMatrix(vocabSize, nEmbd)
|
||||
for i := range nLayer {
|
||||
stateMap[fmt.Sprintf("layer%d.attn_wq", i)] = genMatrix(nEmbd, nEmbd)
|
||||
stateMap[fmt.Sprintf("layer%d.attn_wk", i)] = genMatrix(nEmbd, nEmbd)
|
||||
stateMap[fmt.Sprintf("layer%d.attn_wv", i)] = genMatrix(nEmbd, nEmbd)
|
||||
stateMap[fmt.Sprintf("layer%d.attn_wo", i)] = genMatrix(nEmbd, nEmbd)
|
||||
stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd)
|
||||
stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd)
|
||||
}
|
||||
// flatten params into a single []value
|
||||
params := []*value{}
|
||||
for _, mat := range stateMap {
|
||||
for _, row := range mat {
|
||||
for _, p := range row {
|
||||
params = append(params, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Printf("num params: %d\n", len(params))
|
||||
|
||||
// at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined
|
||||
|
||||
// "Let there be Adam, the blessed optimizer and its buffers"
|
||||
learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8
|
||||
m := slices.Repeat([]float64{}, len(params)) // first moment buffer
|
||||
v := slices.Repeat([]float64{}, len(params)) // second moment buffer
|
||||
|
||||
// Repeat in sequence
|
||||
numSteps := 1000 // number of training steps
|
||||
for step := range numSteps {
|
||||
// Take single document, tokenize it, surround it with BOS special token on both sides
|
||||
doc := docs[step%len(docs)]
|
||||
tokens := []int{BOS}
|
||||
for _, ch := range doc {
|
||||
tokens = append(tokens, slices.Index(uchars, ch))
|
||||
}
|
||||
tokens = append(tokens, BOS)
|
||||
n := min(blockSize, len(tokens)-1)
|
||||
|
||||
// Forward the token sequence through the model, building up the computation graph all the way to the loss
|
||||
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
|
||||
losses := []*value{}
|
||||
for posId := range n {
|
||||
tokenId, targetId := tokens[posId], tokens[posId+1]
|
||||
logits := gpt(tokenId, posId, keys, values)
|
||||
probs := softMax(logits)
|
||||
lossT := probs[targetId].Log().Neg()
|
||||
losses = append(losses, lossT)
|
||||
}
|
||||
lossSum := &value{}
|
||||
for _, l := range losses {
|
||||
lossSum.Add(l)
|
||||
}
|
||||
loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low.
|
||||
|
||||
// Backward the loss, calculating the gradients with respect to all model parameters
|
||||
loss.Backward()
|
||||
|
||||
// Adam optimizer update: update the model parameters based on the corresponding gradients
|
||||
lrt := learningRate * (float64(1) - float64(step)/float64(numSteps))
|
||||
for i, p := range params {
|
||||
m[i] = beta1*m[i] + (1-beta1)*p.grad
|
||||
v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0)
|
||||
m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1)))
|
||||
v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1)))
|
||||
p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam)
|
||||
p.grad = 0.0
|
||||
}
|
||||
fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
|
||||
}
|
||||
|
||||
// Inference: may the model babble back to us
|
||||
temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high
|
||||
fmt.Println("--- inference (new, hallucinated names) ---")
|
||||
for sampleIdx := range 20 {
|
||||
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
|
||||
tokenId := BOS
|
||||
sample := []rune{}
|
||||
for posId := range blockSize {
|
||||
logits := gpt(tokenId, posId, keys, values)
|
||||
probs := make([]*value, len(logits))
|
||||
for i, l := range logits {
|
||||
probs[i] = l.Div(&value{data: temperature})
|
||||
}
|
||||
probs = softMax(probs)
|
||||
tokenId := RouletteDraw(probs)
|
||||
if tokenId == BOS {
|
||||
break
|
||||
}
|
||||
sample = append(sample, uchars[tokenId])
|
||||
}
|
||||
fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample))
|
||||
}
|
||||
}
|
||||
|
||||
func genMatrix(out, in int) [][]*value {
|
||||
m := make([][]*value, out)
|
||||
for o := range out {
|
||||
m[o] = make([]*value, in)
|
||||
for i := range in {
|
||||
m[o][i] = &value{data: rand.NormFloat64() * 0.08}
|
||||
}
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func linear(x []*value, w [][]*value) []*value {
|
||||
r := []*value{}
|
||||
for _, wo := range w {
|
||||
for i := range wo {
|
||||
r = append(r, wo[i].Mul(x[i]))
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func softMax(logits []*value) []*value {
|
||||
maxVal := slices.MaxFunc(logits, valcmp)
|
||||
exps := []*value{}
|
||||
for _, val := range logits {
|
||||
exps = append(exps, val.Sub(maxVal).Exp())
|
||||
}
|
||||
total := &value{}
|
||||
for _, e := range exps {
|
||||
total = total.Add(e)
|
||||
}
|
||||
for i := range exps {
|
||||
exps[i] = exps[i].Div(total)
|
||||
}
|
||||
return exps
|
||||
}
|
||||
|
||||
func rmsNorm(x []*value) []*value {
|
||||
ms := &value{}
|
||||
for _, xi := range x {
|
||||
ms = ms.Add(xi.Mul(xi))
|
||||
}
|
||||
ms = ms.Div(&value{data: float64(len(x))})
|
||||
scale := ms.Add(&value{data: 1e-5}).Pow(&value{data: -0.5})
|
||||
for i := range x {
|
||||
x[i] = x[i].Mul(scale)
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// gpt forwards a single token (at a single position) through the transformer
// and returns the unnormalized logits over the vocabulary. keys and values
// are the per-layer KV caches: each call appends this position's projections,
// so attention can look back over all positions processed so far in the
// current sequence.
func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value {
	tokEmb := stateMap["wte"][tokenId] // token embedding
	posEmb := stateMap["wpe"][posId]   // position embedding
	x := []*value{}
	// joint token and position embedding
	for i := range tokEmb {
		x = append(x, tokEmb[i].Add(posEmb[i]))
	}
	x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection

	for li := range nLayer {
		// 1) Multi-head Attention block
		xResidual := slices.Clone(x)
		x = rmsNorm(x)
		// project the normalized input into query/key/value spaces
		q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)])
		k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)])
		v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)])
		// extend this layer's KV cache with the current position
		keys[li] = append(keys[li], k)
		values[li] = append(values[li], v)
		xAttn := []*value{}
		// basically, distribute the work over the "attention heads"
		for h := range nHead {
			hs := h * headDim
			// this head's slice of the query, and of every cached key/value
			q_h := q[hs : hs+headDim]
			k_h := [][]*value{}
			for _, ki := range keys[li] {
				k_h = append(k_h, ki[hs:hs+headDim])
			}
			v_h := [][]*value{}
			for _, vi := range values[li] {
				v_h = append(v_h, vi[hs:hs+headDim])
			}
			// scaled dot-product attention score against each cached position
			attnLogits := []*value{}
			for t := range len(k_h) {
				s := &value{data: 0.0}
				for j := range headDim {
					s = s.Add(q_h[j].Mul(k_h[t][j]))
				}
				attnLogits = append(attnLogits, s.Div(&value{data: math.Pow(float64(headDim), 0.5)}))
			}
			attnWeights := softMax(attnLogits)
			// attention-weighted sum of the cached values, per output dimension
			headOut := []*value{}
			for j := range headDim {
				s := &value{data: 0.0}
				for t := range len(v_h) {
					s = s.Add(attnWeights[t].Mul(v_h[t][j]))
				}
				headOut = append(headOut, s)
			}
			xAttn = append(xAttn, headOut...)
		}
		// project the concatenated heads back to nEmbd, then residual add
		x = linear(xAttn, stateMap[fmt.Sprintf("layer%d.attn_wo", li)])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
		// 2) MLP block: norm → expand 4x → ReLU → contract → residual add
		xResidual = x
		x = rmsNorm(x)
		x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc1", li)])
		for i := range x {
			x[i] = x[i].Relu()
		}
		x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc2", li)])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
	}
	// final projection to vocabulary logits (softmax is the caller's job)
	logits := linear(x, stateMap["lm_head"])
	return logits
}
|
||||
|
||||
type value struct {
|
||||
@@ -45,9 +287,10 @@ type value struct {
|
||||
localGrads []*value
|
||||
}
|
||||
|
||||
// this lets us build a set-like map with our Values.
|
||||
// If the slices were removed from the struct, that would make this method irrelevant.
|
||||
func (v *value) toKey() string {
|
||||
k := fmt.Sprintf("%+v", v)
|
||||
fmt.Println(k)
|
||||
return k
|
||||
}
|
||||
|
||||
@@ -55,7 +298,7 @@ func (v *value) Add(other *value) *value {
|
||||
return &value{
|
||||
data: v.data + other.data,
|
||||
children: []*value{v, other},
|
||||
localGrads: []*value{{data: 1}, {data: 1}},
|
||||
localGrads: []*value{{data: 1.0}, {data: 1.0}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,6 +311,7 @@ func (v *value) Div(other *value) *value {
|
||||
}
|
||||
|
||||
func (v *value) Mul(other *value) *value {
|
||||
// note the swap here: children are stored as v, other but grads are other, v
|
||||
return &value{
|
||||
data: v.data * other.data,
|
||||
children: []*value{v, other},
|
||||
@@ -137,11 +381,39 @@ func (v *value) Backward() {
|
||||
}
|
||||
}
|
||||
buildTopo(v)
|
||||
spew.Dump(topo)
|
||||
v.grad = 1
|
||||
v.grad = 1.0
|
||||
for _, v := range slices.Backward(topo) {
|
||||
for i := range v.children {
|
||||
v.children[i].grad += v.localGrads[i].data * v.grad
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func mkDeepSlice(size int) [][][]*value {
|
||||
a := make([][][]*value, 1, 10)
|
||||
a[0] = make([][]*value, 1, 10)
|
||||
a[0][0] = make([]*value, 1, 10)
|
||||
return a
|
||||
}
|
||||
|
||||
// implement our own weighted random chooser
|
||||
// based on https://cybernetist.com/2019/01/24/random-weighted-draws-in-go/ but without the dependency on gonum
|
||||
func RouletteDraw(p []*value) int {
|
||||
// Initialization: create the discrete CDF
|
||||
cdf := make([]float64, len(p))
|
||||
for i, v := range p {
|
||||
if i == 0 {
|
||||
cdf[i] = v.data
|
||||
} else {
|
||||
cdf[i] = cdf[i-1] + v.data
|
||||
}
|
||||
}
|
||||
// Generation:
|
||||
// 1. Generate a uniformly-random value x in the range [0,1)
|
||||
// 2. Using a binary search, find the index of the smallest element in cdf larger than x
|
||||
var val float64
|
||||
// multiply the sample with the largest CDF value; easier than normalizing to [0,1)
|
||||
val = rand.Float64() * cdf[len(cdf)-1]
|
||||
// Search returns the smallest index i such that cdf[i] > val
|
||||
return sort.Search(len(cdf), func(i int) bool { return cdf[i] > val })
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user