add some tests

2026-03-08 21:29:51 -04:00
parent aabd0087d5
commit 40fd5e99d1
2 changed files with 109 additions and 14 deletions
--- a/microgopt.go
+++ b/microgopt.go
@@ -39,11 +39,23 @@ func valcmp(a, b *value) int {
 	}
 }

+func sum(l []*value) *value {
+	total := &value{} // zero-valued accumulator; returned as-is when l is empty
+	for _, term := range l {
+		total = total.Add(term) // Add returns a fresh *value, so rebind the running total
+	}
+	return total
+}
+
 func Run(docs []string) {
 	// remove leading and trailing whitespace in documents
 	for i := range docs {
 		docs[i] = strings.TrimSpace(docs[i])
 	}
+	rand.Shuffle(
+		len(docs),
+		func(i, j int) { docs[i], docs[j] = docs[j], docs[i] },
+	)
 	fmt.Printf("num docs: %d\n", len(docs))

 	// construct the vocabulary from the documents: an ordered list of all characters in the dataset,
@@ -104,7 +116,7 @@ func Run(docs []string) {
 		n := min(blockSize, len(tokens)-1)

 		// Forward the token sequence through the model, building up the computation graph all the way to the loss
-		keys, values := mkDeepSlice(), mkDeepSlice()
+		keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
 		losses := []*value{}
 		for posId := range n {
 			tokenId, targetId := tokens[posId], tokens[posId+1]
@@ -113,10 +125,7 @@ func Run(docs []string) {
 			lossT := probs[targetId].Log().Neg()
 			losses = append(losses, lossT)
 		}
-		lossSum := &value{}
-		for _, l := range losses {
-			lossSum = lossSum.Add(l)
-		}
+		lossSum := sum(losses)
 		loss := (&value{data: 1 / float64(n)}).Mul(lossSum) // final average loss over the document sequence. May yours be low.
 		// Backward the loss, calculating the gradients with respect to all model parameters
 		loss.Backward()
@@ -138,7 +147,7 @@ func Run(docs []string) {
 	temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high
 	fmt.Println("\n--- inference (new, hallucinated names) ---")
 	for sampleIdx := range 20 {
-		keys, values := mkDeepSlice(), mkDeepSlice()
+		keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
 		tokenId := BOS
 		sample := []rune{}
 		for posId := range blockSize {
@@ -187,10 +196,7 @@ func softMax(logits []*value) []*value {
 	for _, val := range logits {
 		exps = append(exps, val.Sub(maxVal).Exp())
 	}
-	total := &value{}
-	for _, e := range exps {
-		total = total.Add(e)
-	}
+	total := sum(exps)
 	for i := range exps {
 		exps[i] = exps[i].Div(total)
 	}
@@ -222,7 +228,7 @@ func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*valu

 	for li := range nLayer {
 		// 1) Multi-head Attention block
-		xResidual := slices.Clone(x)
+		xResidual := x
 		x = rmsNorm(x)
 		q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)])
 		k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)])
@@ -290,6 +296,10 @@ type value struct {
 	rLocalGrad *value
 }

+func (v *value) String() string {
+	return fmt.Sprintf("%.16f", v.data) // renders only the data field: fixed-point with 16 fractional digits
+}
+
 func (v *value) Add(other *value) *value {
 	return &value{
 		data:       v.data + other.data,
@@ -384,9 +394,11 @@ func (v *value) Backward() {
 	}
 }

-func mkDeepSlice() [][][]*value {
-	a := make([][][]*value, 1, 10)
-	a[0] = make([][]*value, 0, 10)
+func mkDeepSlice(size int) [][][]*value {
+	a := make([][][]*value, size) // outer slice gets exactly size entries
+	for i := range a {
+		a[i] = [][]*value{} // each entry starts empty but non-nil
+	}
 	return a
 }