From aa5332e99470bef963ce5d1709140e83d2f40ef2 Mon Sep 17 00:00:00 2001 From: David Ashby Date: Sat, 7 Mar 2026 16:21:17 -0500 Subject: [PATCH] doesn't work yet, but the structures are all there --- .gitignore | 1 + cmd/main.go | 20 +++- go.mod | 2 - go.sum | 2 + microgopt.go | 290 +++++++++++++++++++++++++++++++++++++++++++++++++-- readme.md | 8 +- 6 files changed, 308 insertions(+), 15 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..314f02b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.txt \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 04d7020..1de1f9e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -1,7 +1,23 @@ package main -import "git.yetaga.in/alazyreader/microgopt" +import ( + "log" + "os" + "strings" + + "git.yetaga.in/alazyreader/microgopt" +) func main() { - microgopt.Run([]string{}) + f := "names.txt" + if len(os.Args) > 1 { + f = os.Args[1] + } + b, err := os.ReadFile(f) + if err != nil { + log.Fatalf("%v", err) + return + } + s := string(b) + microgopt.Run(strings.Split(s, "\n")) } diff --git a/go.mod b/go.mod index bd9fc32..e41a1b8 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module git.yetaga.in/alazyreader/microgopt go 1.26.0 - -require github.com/davecgh/go-spew v1.1.1 diff --git a/go.sum b/go.sum index b5e2922..e766d2c 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/mroth/weightedrand/v3 v3.0.0 h1:FPPz/Xpri6qOzZTj8gEA1i6YBSlwLjkDQ+WaZpNHiiI= +github.com/mroth/weightedrand/v3 v3.0.0/go.mod h1:Qfpt3At9/pYtQOzy9c2iHVWiHBPL+gvMY7mIN5WRlGg= diff --git a/microgopt.go b/microgopt.go index a7185d2..c0794a2 100644 --- a/microgopt.go +++ b/microgopt.go @@ -4,12 +4,24 @@ import ( "fmt" "maps" "math" + "math/rand/v2" "slices" + "sort" "strings" - - "github.com/davecgh/go-spew/spew" 
+// This type pun just worked in Python, but Go needs to be more explicit.
stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd) + stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd) + } + // flatten params into a single []value + params := []*value{} + for _, mat := range stateMap { + for _, row := range mat { + for _, p := range row { + params = append(params, p) + } + } + } + fmt.Printf("num params: %d\n", len(params)) + + // at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined + + // "Let there be Adam, the blessed optimizer and its buffers" + learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8 + m := slices.Repeat([]float64{}, len(params)) // first moment buffer + v := slices.Repeat([]float64{}, len(params)) // second moment buffer + + // Repeat in sequence + numSteps := 1000 // number of training steps + for step := range numSteps { + // Take single document, tokenize it, surround it with BOS special token on both sides + doc := docs[step%len(docs)] + tokens := []int{BOS} + for _, ch := range doc { + tokens = append(tokens, slices.Index(uchars, ch)) + } + tokens = append(tokens, BOS) + n := min(blockSize, len(tokens)-1) + + // Forward the token sequence through the model, building up the computation graph all the way to the loss + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) + losses := []*value{} + for posId := range n { + tokenId, targetId := tokens[posId], tokens[posId+1] + logits := gpt(tokenId, posId, keys, values) + probs := softMax(logits) + lossT := probs[targetId].Log().Neg() + losses = append(losses, lossT) + } + lossSum := &value{} + for _, l := range losses { + lossSum.Add(l) + } + loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low. 
+ + // Backward the loss, calculating the gradients with respect to all model parameters + loss.Backward() + + // Adam optimizer update: update the model parameters based on the corresponding gradients + lrt := learningRate * (float64(1) - float64(step)/float64(numSteps)) + for i, p := range params { + m[i] = beta1*m[i] + (1-beta1)*p.grad + v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0) + m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1))) + v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1))) + p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam) + p.grad = 0.0 + } + fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data) + } + + // Inference: may the model babble back to us + temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high + fmt.Println("--- inference (new, hallucinated names) ---") + for sampleIdx := range 20 { + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) + tokenId := BOS + sample := []rune{} + for posId := range blockSize { + logits := gpt(tokenId, posId, keys, values) + probs := make([]*value, len(logits)) + for i, l := range logits { + probs[i] = l.Div(&value{data: temperature}) + } + probs = softMax(probs) + tokenId := RouletteDraw(probs) + if tokenId == BOS { + break + } + sample = append(sample, uchars[tokenId]) + } + fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample)) + } +} + +func genMatrix(out, in int) [][]*value { + m := make([][]*value, out) + for o := range out { + m[o] = make([]*value, in) + for i := range in { + m[o][i] = &value{data: rand.NormFloat64() * 0.08} + } + } + return m +} + +func linear(x []*value, w [][]*value) []*value { + r := []*value{} + for _, wo := range w { + for i := range wo { + r = append(r, wo[i].Mul(x[i])) + } + } + return r +} + +func softMax(logits []*value) []*value { + maxVal := slices.MaxFunc(logits, valcmp) + exps := []*value{} + for _, val := range logits { + exps = append(exps, val.Sub(maxVal).Exp()) + } + total := 
&value{} + for _, e := range exps { + total = total.Add(e) + } + for i := range exps { + exps[i] = exps[i].Div(total) + } + return exps +} + +func rmsNorm(x []*value) []*value { + ms := &value{} + for _, xi := range x { + ms = ms.Add(xi.Mul(xi)) + } + ms = ms.Div(&value{data: float64(len(x))}) + scale := ms.Add(&value{data: 1e-5}).Pow(&value{data: -0.5}) + for i := range x { + x[i] = x[i].Mul(scale) + } + return x +} + +func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value { + tokEmb := stateMap["wte"][tokenId] // token embedding + posEmb := stateMap["wpe"][posId] // position embedding + x := []*value{} + // joint token and position embedding + for i := range tokEmb { + x = append(x, tokEmb[i].Add(posEmb[i])) + } + x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection + + for li := range nLayer { + // 1) Multi-head Attention block + xResidual := slices.Clone(x) + x = rmsNorm(x) + q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)]) + k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)]) + v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)]) + keys[li] = append(keys[li], k) + values[li] = append(values[li], v) + xAttn := []*value{} + // basically, distribute the work over the "attention heads" + for h := range nHead { + hs := h * headDim + q_h := q[hs : hs+headDim] + k_h := [][]*value{} + for _, ki := range keys[li] { + k_h = append(k_h, ki[hs:hs+headDim]) + } + v_h := [][]*value{} + for _, vi := range values[li] { + v_h = append(v_h, vi[hs:hs+headDim]) + } + attnLogits := []*value{} + for t := range len(k_h) { + s := &value{data: 0.0} + for j := range headDim { + s = s.Add(q_h[j].Mul(k_h[t][j])) + } + attnLogits = append(attnLogits, s.Div(&value{data: math.Pow(float64(headDim), 0.5)})) + } + attnWeights := softMax(attnLogits) + headOut := []*value{} + for j := range headDim { + s := &value{data: 0.0} + for t := range len(v_h) { + s = s.Add(attnWeights[t].Mul(v_h[t][j])) + } + 
+	// Note the swap here: children are stored as (v, other) but localGrads as (other, v),
+	// because d(v*other)/dv = other and d(v*other)/d(other) = v.
+* The Value class is more type-safe in Go, using values everywhere instead of mingling floats and values in the localgrad tuple.
+* Initial pass translating the linear algebra functions has me worried that all those value structs aren't going to be very fast...