doesn't work yet, but the structures are all there

2026-03-07 16:21:17 -05:00
parent 8c8a70407b
commit aa5332e994
6 changed files with 308 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 *.txt
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -1,7 +1,23 @@
 package main
-import "git.yetaga.in/alazyreader/microgopt"
+import (
 	"log"
 	"os"
 	"strings"
 	"git.yetaga.in/alazyreader/microgopt"
 )
 // main reads a newline-separated corpus and hands it to microgopt.Run.
 // The corpus path is the first command-line argument, defaulting to
 // "names.txt" in the working directory.
 func main() {
-	microgopt.Run([]string{})
+	f := "names.txt"
 	if len(os.Args) > 1 {
 		f = os.Args[1]
 	}
 	b, err := os.ReadFile(f)
 	if err != nil {
 		// log.Fatal exits the process, so nothing after it would run
 		log.Fatal(err)
 	}
 	// One document per non-blank line. Splitting on "\n" alone would turn the
 	// file's trailing newline (and any empty lines) into empty documents.
 	var docs []string
 	for _, line := range strings.Split(string(b), "\n") {
 		if line = strings.TrimSpace(line); line != "" {
 			docs = append(docs, line)
 		}
 	}
 	microgopt.Run(docs)
 }
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,3 @@
 module git.yetaga.in/alazyreader/microgopt
 go 1.26.0
 require github.com/davecgh/go-spew v1.1.1
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,4 @@
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/mroth/weightedrand/v3 v3.0.0 h1:FPPz/Xpri6qOzZTj8gEA1i6YBSlwLjkDQ+WaZpNHiiI=
 github.com/mroth/weightedrand/v3 v3.0.0/go.mod h1:Qfpt3At9/pYtQOzy9c2iHVWiHBPL+gvMY7mIN5WRlGg=
--- a/microgopt.go
+++ b/microgopt.go
@@ -4,12 +4,24 @@ import (
 	"fmt"
 	"maps"
 	"math"
 	"math/rand/v2"
 	"slices"
 	"sort"
 	"strings"
 	"github.com/davecgh/go-spew/spew"
 )
 // Initialize the parameters, to store the knowledge of the model.
 // The constants below fix the network's architecture; the weight matrices
 // sized from them live in stateMap.
 const (
 	nLayer    = 1             // depth of the transformer neural network (number of layers)
 	nEmbd     = 16            // width of the network (embedding dimension)
 	blockSize = 16            // maximum context length of the attention window (note: the longest name is 15 characters)
 	nHead     = 4             // number of attention heads
 	headDim   = nEmbd / nHead // derived dimension of each head
 )
 // stateMap holds the model's weight matrices keyed by name ("wte", "wpe",
 // "lm_head", and "layer<N>.<matrix>" for the per-layer weights). Each matrix
 // is a slice of rows of *value. Run fills the map in and gpt reads from it.
 var stateMap = map[string][][]*value{}
 // this type pun just worked in python but go needs to be more explicit
 func btof(b bool) float64 {
 	if b {
 		return 1.0
@@ -17,12 +29,22 @@ func btof(b bool) float64 {
 	return 0.0
 }
 // valcmp is a three-way comparison of two values by their data field:
 // -1 when a sorts first, 1 when b sorts first, and 0 otherwise.
 // It has the signature expected by slices.MaxFunc and friends.
 func valcmp(a, b *value) int {
 	switch {
 	case a.data < b.data:
 		return -1
 	case a.data > b.data:
 		return 1
 	default:
 		return 0
 	}
 }
 func Run(docs []string) {
 	// remove leading and trailing whitespace in documents
 	for i := range docs {
 		docs[i] = strings.TrimSpace(docs[i])
 	}
-	fmt.Printf("num docs: %d", len(docs))
+	fmt.Printf("num docs: %d\n", len(docs))
 	// construct the vocabulary from the documents: an ordered list of all characters in the dataset,
 	// plus a "Beginning Of Sequence" (BOS) token
@@ -33,9 +55,229 @@ func Run(docs []string) {
 		}
 	}
 	uchars := slices.Sorted(maps.Keys(set))
-	// BOS := len(uchars)
+	BOS := len(uchars)
 	vocabSize := len(uchars) + 1
-	fmt.Printf("vocab size: %d", vocabSize)
+	fmt.Printf("vocab size: %d\n", vocabSize)
 	// in the python code, at this point, the Value class was created
 	// and the global parameters were set up
 	stateMap["wte"] = genMatrix(vocabSize, nEmbd)
 	stateMap["wpe"] = genMatrix(blockSize, nEmbd)
 	stateMap["lm_head"] = genMatrix(vocabSize, nEmbd)
 	for i := range nLayer {
 		stateMap[fmt.Sprintf("layer%d.attn_wq", i)] = genMatrix(nEmbd, nEmbd)
 		stateMap[fmt.Sprintf("layer%d.attn_wk", i)] = genMatrix(nEmbd, nEmbd)
 		stateMap[fmt.Sprintf("layer%d.attn_wv", i)] = genMatrix(nEmbd, nEmbd)
 		stateMap[fmt.Sprintf("layer%d.attn_wo", i)] = genMatrix(nEmbd, nEmbd)
 		stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd)
 		stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd)
 	}
 	// flatten params into a single []value
 	params := []*value{}
 	for _, mat := range stateMap {
 		for _, row := range mat {
 			for _, p := range row {
 				params = append(params, p)
 			}
 		}
 	}
 	fmt.Printf("num params: %d\n", len(params))
 	// at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined
 	// "Let there be Adam, the blessed optimizer and its buffers"
 	learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8
 	m := slices.Repeat([]float64{}, len(params)) // first moment buffer
 	v := slices.Repeat([]float64{}, len(params)) // second moment buffer
 	// Repeat in sequence
 	numSteps := 1000 // number of training steps
 	for step := range numSteps {
 		// Take single document, tokenize it, surround it with BOS special token on both sides
 		doc := docs[step%len(docs)]
 		tokens := []int{BOS}
 		for _, ch := range doc {
 			tokens = append(tokens, slices.Index(uchars, ch))
 		}
 		tokens = append(tokens, BOS)
 		n := min(blockSize, len(tokens)-1)
 		// Forward the token sequence through the model, building up the computation graph all the way to the loss
 		keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
 		losses := []*value{}
 		for posId := range n {
 			tokenId, targetId := tokens[posId], tokens[posId+1]
 			logits := gpt(tokenId, posId, keys, values)
 			probs := softMax(logits)
 			lossT := probs[targetId].Log().Neg()
 			losses = append(losses, lossT)
 		}
 		lossSum := &value{}
 		for _, l := range losses {
 			lossSum.Add(l)
 		}
 		loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low.
 		// Backward the loss, calculating the gradients with respect to all model parameters
 		loss.Backward()
 		// Adam optimizer update: update the model parameters based on the corresponding gradients
 		lrt := learningRate * (float64(1) - float64(step)/float64(numSteps))
 		for i, p := range params {
 			m[i] = beta1*m[i] + (1-beta1)*p.grad
 			v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0)
 			m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1)))
 			v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1)))
 			p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam)
 			p.grad = 0.0
 		}
 		fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
 	}
 	// Inference: may the model babble back to us
 	temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high
 	fmt.Println("--- inference (new, hallucinated names) ---")
 	for sampleIdx := range 20 {
 		keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
 		tokenId := BOS
 		sample := []rune{}
 		for posId := range blockSize {
 			logits := gpt(tokenId, posId, keys, values)
 			probs := make([]*value, len(logits))
 			for i, l := range logits {
 				probs[i] = l.Div(&value{data: temperature})
 			}
 			probs = softMax(probs)
 			tokenId := RouletteDraw(probs)
 			if tokenId == BOS {
 				break
 			}
 			sample = append(sample, uchars[tokenId])
 		}
 		fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample))
 	}
 }
 // genMatrix builds a fresh weight matrix with out rows of in columns.
 // Every cell is a newly allocated *value whose data is a standard normal
 // sample scaled by 0.08.
 func genMatrix(out, in int) [][]*value {
 	rows := make([][]*value, 0, out)
 	for range out {
 		row := make([]*value, in)
 		for col := range row {
 			row[col] = &value{data: rand.NormFloat64() * 0.08}
 		}
 		rows = append(rows, row)
 	}
 	return rows
 }
 // linear applies the weight matrix w to the vector x and returns one output
 // per row of w: out[o] is the sum over i of w[o][i] * x[i].
 //
 // Each row of w must be no longer than x; a longer row panics on the index
 // into x. The result is built from Mul and Add nodes, so it is part of the
 // computation graph.
 func linear(x []*value, w [][]*value) []*value {
 	out := make([]*value, 0, len(w))
 	for _, row := range w {
 		// accumulate the dot product of this row with x; appending the
 		// individual products instead would yield len(w)*len(x) outputs
 		sum := &value{}
 		for i, wi := range row {
 			sum = sum.Add(wi.Mul(x[i]))
 		}
 		out = append(out, sum)
 	}
 	return out
 }
 // softMax turns logits into a probability distribution. The largest logit
 // (by valcmp) is subtracted from every entry before exponentiating, and each
 // exponential is then divided by their sum. logits must be non-empty:
 // slices.MaxFunc panics on an empty slice.
 func softMax(logits []*value) []*value {
 	peak := slices.MaxFunc(logits, valcmp)
 	// exponentials of the shifted logits
 	shifted := make([]*value, len(logits))
 	for i := range logits {
 		shifted[i] = logits[i].Sub(peak).Exp()
 	}
 	// normalising constant
 	denom := &value{}
 	for i := range shifted {
 		denom = denom.Add(shifted[i])
 	}
 	probs := make([]*value, len(shifted))
 	for i, e := range shifted {
 		probs[i] = e.Div(denom)
 	}
 	return probs
 }
 // rmsNorm scales x by the reciprocal of its root mean square and returns the
 // result as a new slice: out[i] = x[i] * (mean(x[j]^2) + 1e-5)^-0.5.
 //
 // x itself is left untouched. gpt keeps a reference to its pre-norm
 // activations for the residual connection, so writing the normalised values
 // back into x would silently replace that residual.
 func rmsNorm(x []*value) []*value {
 	// mean of squares
 	ms := &value{}
 	for _, xi := range x {
 		ms = ms.Add(xi.Mul(xi))
 	}
 	ms = ms.Div(&value{data: float64(len(x))})
 	// the small epsilon keeps the scale finite when x is all zeros
 	scale := ms.Add(&value{data: 1e-5}).Pow(&value{data: -0.5})
 	out := make([]*value, len(x))
 	for i, xi := range x {
 		out[i] = xi.Mul(scale)
 	}
 	return out
 }
 // gpt runs the model forward for a single token at a single position and
 // returns the output of the final lm_head projection.
 //
 // tokenId and posId index the rows of the "wte" and "wpe" matrices in
 // stateMap. keys and values are per-layer caches, indexed [layer][position];
 // this call appends the current position's key and value vectors to them, so
 // attention sees every position passed in so far. Callers start a sequence
 // with empty caches and reuse them for each successive position.
 func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value {
 	tokEmb := stateMap["wte"][tokenId] // token embedding
 	posEmb := stateMap["wpe"][posId]   // position embedding
 	// joint token and position embedding
 	x := make([]*value, 0, len(tokEmb))
 	for i := range tokEmb {
 		x = append(x, tokEmb[i].Add(posEmb[i]))
 	}
 	x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection
 	// attention logits are divided by sqrt(headDim); compute the divisor once
 	attnScale := math.Sqrt(float64(headDim))
 	for li := range nLayer {
 		// 1) Multi-head Attention block
 		// The residual is a copy of the slice: the same slice is handed to
 		// rmsNorm next, and the residual must not depend on what rmsNorm does to it.
 		xResidual := slices.Clone(x)
 		x = rmsNorm(x)
 		q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)])
 		k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)])
 		v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)])
 		keys[li] = append(keys[li], k)
 		values[li] = append(values[li], v)
 		xAttn := make([]*value, 0, nEmbd)
 		// basically, distribute the work over the "attention heads"
 		for h := range nHead {
 			// each head works on its own headDim-wide window of q, k and v
 			hs := h * headDim
 			qHead := q[hs : hs+headDim]
 			kHead := make([][]*value, 0, len(keys[li]))
 			for _, ki := range keys[li] {
 				kHead = append(kHead, ki[hs:hs+headDim])
 			}
 			vHead := make([][]*value, 0, len(values[li]))
 			for _, vi := range values[li] {
 				vHead = append(vHead, vi[hs:hs+headDim])
 			}
 			// scaled dot product of the query with every cached key
 			attnLogits := make([]*value, 0, len(kHead))
 			for t := range kHead {
 				s := &value{data: 0.0}
 				for j := range headDim {
 					s = s.Add(qHead[j].Mul(kHead[t][j]))
 				}
 				attnLogits = append(attnLogits, s.Div(&value{data: attnScale}))
 			}
 			attnWeights := softMax(attnLogits)
 			// attention-weighted sum of the cached values
 			headOut := make([]*value, 0, headDim)
 			for j := range headDim {
 				s := &value{data: 0.0}
 				for t := range vHead {
 					s = s.Add(attnWeights[t].Mul(vHead[t][j]))
 				}
 				headOut = append(headOut, s)
 			}
 			xAttn = append(xAttn, headOut...)
 		}
 		x = linear(xAttn, stateMap[fmt.Sprintf("layer%d.attn_wo", li)])
 		for i := range x {
 			x[i] = x[i].Add(xResidual[i])
 		}
 		// 2) MLP block
 		// Clone here as well: a plain `xResidual = x` shares the backing array
 		// with the slice passed to rmsNorm, so the residual would follow any
 		// in-place normalisation.
 		xResidual = slices.Clone(x)
 		x = rmsNorm(x)
 		x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc1", li)])
 		for i := range x {
 			x[i] = x[i].Relu()
 		}
 		x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc2", li)])
 		for i := range x {
 			x[i] = x[i].Add(xResidual[i])
 		}
 	}
 	logits := linear(x, stateMap["lm_head"])
 	return logits
 }
 type value struct {
@@ -45,9 +287,10 @@ type value struct {
 	localGrads []*value
 }
 // toKey formats the value with fmt's %+v verb so the resulting string can be
 // used as a map key, which lets us build a set-like map with our Values.
 // If the slices were removed from the struct, that would make this method irrelevant.
 // The key is also echoed to stdout before it is returned.
 func (v *value) toKey() string {
 	key := fmt.Sprintf("%+v", v)
 	fmt.Println(key)
 	return key
 }
@@ -55,7 +298,7 @@ func (v *value) Add(other *value) *value {
 	return &value{
 		data:       v.data + other.data,
 		children:   []*value{v, other},
-		localGrads: []*value{{data: 1}, {data: 1}},
+		localGrads: []*value{{data: 1.0}, {data: 1.0}},
 	}
 }
@@ -68,6 +311,7 @@ func (v *value) Div(other *value) *value {
 }
 func (v *value) Mul(other *value) *value {
 	// note the swap here: children are stored as v, other but grads are other, v
 	return &value{
 		data:     v.data * other.data,
 		children: []*value{v, other},
@@ -137,11 +381,39 @@ func (v *value) Backward() {
 		}
 	}
 	buildTopo(v)
-	spew.Dump(topo)
+	v.grad = 1.0
 	v.grad = 1
 	for _, v := range slices.Backward(topo) {
 		for i := range v.children {
 			v.children[i].grad += v.localGrads[i].data * v.grad
 		}
 	}
 }
 // mkDeepSlice returns an empty per-layer cache for gpt: a slice with size
 // entries (one per layer), each an empty list of per-position vectors ready
 // to be appended to.
 //
 // The layers must start empty. gpt attends over every entry already in a
 // layer's list, so a pre-seeded placeholder entry would be treated as a real
 // position and its nil elements dereferenced.
 func mkDeepSlice(size int) [][][]*value {
 	return make([][][]*value, size)
 }
 // RouletteDraw is our own weighted random chooser,
 // based on https://cybernetist.com/2019/01/24/random-weighted-draws-in-go/ but without the dependency on gonum.
 //
 // It returns an index into p, chosen with probability proportional to
 // p[i].data. The weights need not sum to 1. They are assumed to be
 // non-negative, since the binary search relies on a non-decreasing CDF.
 // The result is always a valid index, except that an empty p yields -1.
 func RouletteDraw(p []*value) int {
 	if len(p) == 0 {
 		return -1
 	}
 	// Initialization: create the discrete CDF as a running sum of the weights
 	cdf := make([]float64, len(p))
 	total := 0.0
 	for i, v := range p {
 		total += v.data
 		cdf[i] = total
 	}
 	// Generation:
 	// 1. Generate a uniformly-random value x in the range [0,total);
 	//    scaling the sample is easier than normalizing the CDF to [0,1)
 	// 2. Using a binary search, find the index of the smallest element in cdf larger than x
 	x := rand.Float64() * total
 	idx := sort.Search(len(cdf), func(i int) bool { return cdf[i] > x })
 	// Search reports len(cdf) when no element exceeds x (all-zero weights, or
 	// the product rounding up to total); clamp so callers can always index with it
 	if idx >= len(cdf) {
 		idx = len(cdf) - 1
 	}
 	return idx
 }
--- a/readme.md
+++ b/readme.md
@@ -6,6 +6,10 @@ Original python is included in the repo for reference against bitrot.
 To use: `go run cmd/main.go input.txt`
-Differences between the Go and the Python:
+Differences between the Go and the Python, as well as notes more generally:
-* go is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details
+* The GPT is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details.
 * The Value class is more type-safe in go, using values everywhere as opposed to mingling floats and values in the localgrad tuple.
 * The Value struct has actual tests confirming the backward propagation logic.
 * When writing the Value struct and its methods, I accidentally swapped the order of the values in the `localGrads` slice in `Mul` and tore my hair out trying to figure out where the bug was. When I broke down and asked copilot to "compare these two implementations and tell me how they differ," it managed to find the error -- but also reported three non-existent differences and told me that `slices.Backward()` doesn't exist.
 * Initial pass translating the linear algebra functions has me worried that all those value structs aren't going to be very fast...