From aa5332e99470bef963ce5d1709140e83d2f40ef2 Mon Sep 17 00:00:00 2001 From: David Ashby Date: Sat, 7 Mar 2026 16:21:17 -0500 Subject: [PATCH] doesn't work yet, but the structures are all there --- .gitignore | 1 + cmd/main.go | 20 +++- go.mod | 2 - go.sum | 2 + microgopt.go | 290 +++++++++++++++++++++++++++++++++++++++++++++++++-- readme.md | 8 +- 6 files changed, 308 insertions(+), 15 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..314f02b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.txt \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 04d7020..1de1f9e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -1,7 +1,23 @@ package main -import "git.yetaga.in/alazyreader/microgopt" +import ( + "log" + "os" + "strings" + + "git.yetaga.in/alazyreader/microgopt" +) func main() { - microgopt.Run([]string{}) + f := "names.txt" + if len(os.Args) > 1 { + f = os.Args[1] + } + b, err := os.ReadFile(f) + if err != nil { + log.Fatalf("%v", err) + return + } + s := string(b) + microgopt.Run(strings.Split(s, "\n")) } diff --git a/go.mod b/go.mod index bd9fc32..e41a1b8 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module git.yetaga.in/alazyreader/microgopt go 1.26.0 - -require github.com/davecgh/go-spew v1.1.1 diff --git a/go.sum b/go.sum index b5e2922..e766d2c 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/mroth/weightedrand/v3 v3.0.0 h1:FPPz/Xpri6qOzZTj8gEA1i6YBSlwLjkDQ+WaZpNHiiI= +github.com/mroth/weightedrand/v3 v3.0.0/go.mod h1:Qfpt3At9/pYtQOzy9c2iHVWiHBPL+gvMY7mIN5WRlGg= diff --git a/microgopt.go b/microgopt.go index a7185d2..c0794a2 100644 --- a/microgopt.go +++ b/microgopt.go @@ -4,12 +4,24 @@ import ( "fmt" "maps" "math" + "math/rand/v2" "slices" + "sort" "strings" - - "github.com/davecgh/go-spew/spew" 
+// This type pun just worked in Python, but Go needs to be more explicit.
stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd) + stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd) + } + // flatten params into a single []value + params := []*value{} + for _, mat := range stateMap { + for _, row := range mat { + for _, p := range row { + params = append(params, p) + } + } + } + fmt.Printf("num params: %d\n", len(params)) + + // at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined + + // "Let there be Adam, the blessed optimizer and its buffers" + learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8 + m := slices.Repeat([]float64{}, len(params)) // first moment buffer + v := slices.Repeat([]float64{}, len(params)) // second moment buffer + + // Repeat in sequence + numSteps := 1000 // number of training steps + for step := range numSteps { + // Take single document, tokenize it, surround it with BOS special token on both sides + doc := docs[step%len(docs)] + tokens := []int{BOS} + for _, ch := range doc { + tokens = append(tokens, slices.Index(uchars, ch)) + } + tokens = append(tokens, BOS) + n := min(blockSize, len(tokens)-1) + + // Forward the token sequence through the model, building up the computation graph all the way to the loss + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) + losses := []*value{} + for posId := range n { + tokenId, targetId := tokens[posId], tokens[posId+1] + logits := gpt(tokenId, posId, keys, values) + probs := softMax(logits) + lossT := probs[targetId].Log().Neg() + losses = append(losses, lossT) + } + lossSum := &value{} + for _, l := range losses { + lossSum.Add(l) + } + loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low. 
+ + // Backward the loss, calculating the gradients with respect to all model parameters + loss.Backward() + + // Adam optimizer update: update the model parameters based on the corresponding gradients + lrt := learningRate * (float64(1) - float64(step)/float64(numSteps)) + for i, p := range params { + m[i] = beta1*m[i] + (1-beta1)*p.grad + v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0) + m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1))) + v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1))) + p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam) + p.grad = 0.0 + } + fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data) + } + + // Inference: may the model babble back to us + temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high + fmt.Println("--- inference (new, hallucinated names) ---") + for sampleIdx := range 20 { + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) + tokenId := BOS + sample := []rune{} + for posId := range blockSize { + logits := gpt(tokenId, posId, keys, values) + probs := make([]*value, len(logits)) + for i, l := range logits { + probs[i] = l.Div(&value{data: temperature}) + } + probs = softMax(probs) + tokenId := RouletteDraw(probs) + if tokenId == BOS { + break + } + sample = append(sample, uchars[tokenId]) + } + fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample)) + } +} + +func genMatrix(out, in int) [][]*value { + m := make([][]*value, out) + for o := range out { + m[o] = make([]*value, in) + for i := range in { + m[o][i] = &value{data: rand.NormFloat64() * 0.08} + } + } + return m +} + +func linear(x []*value, w [][]*value) []*value { + r := []*value{} + for _, wo := range w { + for i := range wo { + r = append(r, wo[i].Mul(x[i])) + } + } + return r +} + +func softMax(logits []*value) []*value { + maxVal := slices.MaxFunc(logits, valcmp) + exps := []*value{} + for _, val := range logits { + exps = append(exps, val.Sub(maxVal).Exp()) + } + total := 
&value{} + for _, e := range exps { + total = total.Add(e) + } + for i := range exps { + exps[i] = exps[i].Div(total) + } + return exps +} + +func rmsNorm(x []*value) []*value { + ms := &value{} + for _, xi := range x { + ms = ms.Add(xi.Mul(xi)) + } + ms = ms.Div(&value{data: float64(len(x))}) + scale := ms.Add(&value{data: 1e-5}).Pow(&value{data: -0.5}) + for i := range x { + x[i] = x[i].Mul(scale) + } + return x +} + +func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value { + tokEmb := stateMap["wte"][tokenId] // token embedding + posEmb := stateMap["wpe"][posId] // position embedding + x := []*value{} + // joint token and position embedding + for i := range tokEmb { + x = append(x, tokEmb[i].Add(posEmb[i])) + } + x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection + + for li := range nLayer { + // 1) Multi-head Attention block + xResidual := slices.Clone(x) + x = rmsNorm(x) + q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)]) + k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)]) + v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)]) + keys[li] = append(keys[li], k) + values[li] = append(values[li], v) + xAttn := []*value{} + // basically, distribute the work over the "attention heads" + for h := range nHead { + hs := h * headDim + q_h := q[hs : hs+headDim] + k_h := [][]*value{} + for _, ki := range keys[li] { + k_h = append(k_h, ki[hs:hs+headDim]) + } + v_h := [][]*value{} + for _, vi := range values[li] { + v_h = append(v_h, vi[hs:hs+headDim]) + } + attnLogits := []*value{} + for t := range len(k_h) { + s := &value{data: 0.0} + for j := range headDim { + s = s.Add(q_h[j].Mul(k_h[t][j])) + } + attnLogits = append(attnLogits, s.Div(&value{data: math.Pow(float64(headDim), 0.5)})) + } + attnWeights := softMax(attnLogits) + headOut := []*value{} + for j := range headDim { + s := &value{data: 0.0} + for t := range len(v_h) { + s = s.Add(attnWeights[t].Mul(v_h[t][j])) + } + 
+	// Note the swap here: children are stored as (v, other) but localGrads as (other, v),
+	// because d(v*other)/dv = other and d(v*other)/d(other) = v.
+* The Value class is more type-safe in Go, using values everywhere instead of mingling floats and values in the localgrad tuple.
+* Initial pass translating the linear algebra functions has me worried that all those value structs aren't going to be very fast...