From 40fd5e99d123287d61f051e44d2c26ce35be1fd2 Mon Sep 17 00:00:00 2001 From: David Ashby Date: Sun, 8 Mar 2026 21:29:51 -0400 Subject: [PATCH] add some tests --- microgopt.go | 40 +++++++++++++++-------- microgopt_test.go | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 14 deletions(-) diff --git a/microgopt.go b/microgopt.go index ef471ad..5d01392 100644 --- a/microgopt.go +++ b/microgopt.go @@ -39,11 +39,23 @@ func valcmp(a, b *value) int { } } +func sum(l []*value) *value { + r := &value{} + for v := range l { + r = r.Add(l[v]) + } + return r +} + func Run(docs []string) { // remove leading and trailing whitespace in documents for i := range docs { docs[i] = strings.TrimSpace(docs[i]) } + rand.Shuffle( + len(docs), + func(i, j int) { docs[i], docs[j] = docs[j], docs[i] }, + ) fmt.Printf("num docs: %d\n", len(docs)) // construct the vocabulary from the documents: an ordered list of all characters in the dataset, @@ -104,7 +116,7 @@ func Run(docs []string) { n := min(blockSize, len(tokens)-1) // Forward the token sequence through the model, building up the computation graph all the way to the loss - keys, values := mkDeepSlice(), mkDeepSlice() + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) losses := []*value{} for posId := range n { tokenId, targetId := tokens[posId], tokens[posId+1] @@ -113,10 +125,7 @@ func Run(docs []string) { lossT := probs[targetId].Log().Neg() losses = append(losses, lossT) } - lossSum := &value{} - for _, l := range losses { - lossSum = lossSum.Add(l) - } + lossSum := sum(losses) loss := (&value{data: 1 / float64(n)}).Mul(lossSum) // final average loss over the document sequence. May yours be low. // Backward the loss, calculating the gradients with respect to all model parameters loss.Backward() @@ -138,7 +147,7 @@ func Run(docs []string) { temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high fmt.Println("\n--- inference (new, hallucinated names) ---") for sampleIdx := range 20 { - keys, values := mkDeepSlice(), mkDeepSlice() + keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer) tokenId := BOS sample := []rune{} for posId := range blockSize { @@ -187,10 +196,7 @@ func softMax(logits []*value) []*value { for _, val := range logits { exps = append(exps, val.Sub(maxVal).Exp()) } - total := &value{} - for _, e := range exps { - total = total.Add(e) - } + total := sum(exps) for i := range exps { exps[i] = exps[i].Div(total) } @@ -222,7 +228,7 @@ func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*valu for li := range nLayer { // 1) Multi-head Attention block - xResidual := slices.Clone(x) + xResidual := x x = rmsNorm(x) q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)]) k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)]) @@ -290,6 +296,10 @@ type value struct { rLocalGrad *value } +func (v *value) String() string { + return fmt.Sprintf("%.16f", v.data) +} + func (v *value) Add(other *value) *value { return &value{ data: v.data + other.data, @@ -384,9 +394,11 @@ func (v *value) Backward() { } } -func mkDeepSlice() [][][]*value { - a := make([][][]*value, 1, 10) - a[0] = make([][]*value, 0, 10) +func mkDeepSlice(size int) [][][]*value { + a := make([][][]*value, size) + for i := range size { + a[i] = make([][]*value, 0) + } return a } diff --git a/microgopt_test.go b/microgopt_test.go index 724941c..6362b1f 100644 --- a/microgopt_test.go +++ b/microgopt_test.go @@ -33,3 +33,86 @@ func TestValue(t *testing.T) { }) } } + +func TestLinear(t *testing.T) { + tests := []struct { + name string + x []*value + w [][]*value + want []*value + }{ + { + name: "base case", + x: []*value{{data: 1}, {data: 2}, {data: 3}}, + w: [][]*value{{{data: 4}, {data: 5}, {data: 6}}, {{data: 7}, {data: 8}, {data: 9}}}, + want: []*value{{data: 32}, {data: 50}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := linear(tt.x, tt.w) + if len(tt.want) != len(got) { + t.Errorf("linear() = %v, want %v", got, tt.want) + } + for i, v := range tt.want { + if v.data != got[i].data { + t.Errorf("linear() = %v, want %v", got, tt.want) + } + } + }) + } +} + +func TestSoftMax(t *testing.T) { + tests := []struct { + name string + logits []*value + want []*value + }{ + { + name: "base case", + logits: []*value{{data: 1}, {data: 2}, {data: 3}}, + want: []*value{{data: 0.09003057317038045}, {data: 0.2447284710547976}, {data: 0.6652409557748218}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := softMax(tt.logits) + if len(tt.want) != len(got) { + t.Errorf("softMax() = %v, want %v", got, tt.want) + } + for i, v := range tt.want { + if v.data != got[i].data { + t.Errorf("softMax() = %v, want %v", got, tt.want) + } + } + }) + } +} + +func TestRmsNorm(t *testing.T) { + tests := []struct { + name string + x []*value + want []*value + }{ + { + name: "base case", + x: []*value{{data: 1}, {data: 2}, {data: 3}}, + want: []*value{{data: 0.4629095539120195}, {data: 0.925819107824039}, {data: 1.3887286617360584}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := rmsNorm(tt.x) + if len(tt.want) != len(got) { + t.Errorf("rmsNorm() = %v, want %v", got, tt.want) + } + for i, v := range tt.want { + if v.data != got[i].data { + t.Errorf("rmsNorm() = %v, want %v", got, tt.want) + } + } + }) + } +}