Doesn't work yet, but the structures are all there

This commit is contained in:
2026-03-07 16:21:17 -05:00
parent 8c8a70407b
commit aa5332e994
6 changed files with 308 additions and 15 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.txt

View File

@@ -1,7 +1,23 @@
package main package main
import "git.yetaga.in/alazyreader/microgopt" import (
"log"
"os"
"strings"
"git.yetaga.in/alazyreader/microgopt"
)
func main() { func main() {
microgopt.Run([]string{}) f := "names.txt"
if len(os.Args) > 1 {
f = os.Args[1]
}
b, err := os.ReadFile(f)
if err != nil {
log.Fatalf("%v", err)
return
}
s := string(b)
microgopt.Run(strings.Split(s, "\n"))
} }

2
go.mod
View File

@@ -1,5 +1,3 @@
module git.yetaga.in/alazyreader/microgopt module git.yetaga.in/alazyreader/microgopt
go 1.26.0 go 1.26.0
require github.com/davecgh/go-spew v1.1.1

2
go.sum
View File

@@ -1,2 +1,4 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/mroth/weightedrand/v3 v3.0.0 h1:FPPz/Xpri6qOzZTj8gEA1i6YBSlwLjkDQ+WaZpNHiiI=
github.com/mroth/weightedrand/v3 v3.0.0/go.mod h1:Qfpt3At9/pYtQOzy9c2iHVWiHBPL+gvMY7mIN5WRlGg=

View File

@@ -4,12 +4,24 @@ import (
"fmt" "fmt"
"maps" "maps"
"math" "math"
"math/rand/v2"
"slices" "slices"
"sort"
"strings" "strings"
"github.com/davecgh/go-spew/spew"
) )
// Initialize the parameters, to store the knowledge of the model
const (
	nLayer    = 1             // depth of the transformer neural network (number of layers)
	nEmbd     = 16            // width of the network (embedding dimension)
	blockSize = 16            // maximum context length of the attention window (note: the longest name is 15 characters)
	nHead     = 4             // number of attention heads
	headDim   = nEmbd / nHead // derived dimension of each head
)

// stateMap holds every learnable parameter matrix of the model, keyed by a
// human-readable name ("wte", "wpe", "lm_head", "layer<i>.attn_wq", ...).
// Populated at the start of Run, read by gpt on every forward pass.
var stateMap = map[string][][]*value{}
// this type pun just worked in python but go needs to be more explicit
func btof(b bool) float64 { func btof(b bool) float64 {
if b { if b {
return 1.0 return 1.0
@@ -17,12 +29,22 @@ func btof(b bool) float64 {
return 0.0 return 0.0
} }
// valcmp orders two values by their underlying data field, returning
// -1, 0, or 1 in the manner of a standard three-way comparison
// (suitable for slices.MaxFunc and friends).
func valcmp(a, b *value) int {
	switch {
	case a.data < b.data:
		return -1
	case a.data > b.data:
		return 1
	default:
		return 0
	}
}
func Run(docs []string) { func Run(docs []string) {
// remove leading and trailing whitespace in documents // remove leading and trailing whitespace in documents
for i := range docs { for i := range docs {
docs[i] = strings.TrimSpace(docs[i]) docs[i] = strings.TrimSpace(docs[i])
} }
fmt.Printf("num docs: %d", len(docs)) fmt.Printf("num docs: %d\n", len(docs))
// construct the vocabulary from the documents: an ordered list of all characters in the dataset, // construct the vocabulary from the documents: an ordered list of all characters in the dataset,
// plus a "Beginning Of Sequence" (BOS) token // plus a "Beginning Of Sequence" (BOS) token
@@ -33,9 +55,229 @@ func Run(docs []string) {
} }
} }
uchars := slices.Sorted(maps.Keys(set)) uchars := slices.Sorted(maps.Keys(set))
// BOS := len(uchars) BOS := len(uchars)
vocabSize := len(uchars) + 1 vocabSize := len(uchars) + 1
fmt.Printf("vocab size: %d", vocabSize) fmt.Printf("vocab size: %d\n", vocabSize)
// in the python code, at this point, the Value class was created
// and the global parameters were set up
stateMap["wte"] = genMatrix(vocabSize, nEmbd)
stateMap["wpe"] = genMatrix(blockSize, nEmbd)
stateMap["lm_head"] = genMatrix(vocabSize, nEmbd)
for i := range nLayer {
stateMap[fmt.Sprintf("layer%d.attn_wq", i)] = genMatrix(nEmbd, nEmbd)
stateMap[fmt.Sprintf("layer%d.attn_wk", i)] = genMatrix(nEmbd, nEmbd)
stateMap[fmt.Sprintf("layer%d.attn_wv", i)] = genMatrix(nEmbd, nEmbd)
stateMap[fmt.Sprintf("layer%d.attn_wo", i)] = genMatrix(nEmbd, nEmbd)
stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd)
stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd)
}
// flatten params into a single []value
params := []*value{}
for _, mat := range stateMap {
for _, row := range mat {
for _, p := range row {
params = append(params, p)
}
}
}
fmt.Printf("num params: %d\n", len(params))
// at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined
// "Let there be Adam, the blessed optimizer and its buffers"
learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8
m := slices.Repeat([]float64{}, len(params)) // first moment buffer
v := slices.Repeat([]float64{}, len(params)) // second moment buffer
// Repeat in sequence
numSteps := 1000 // number of training steps
for step := range numSteps {
// Take single document, tokenize it, surround it with BOS special token on both sides
doc := docs[step%len(docs)]
tokens := []int{BOS}
for _, ch := range doc {
tokens = append(tokens, slices.Index(uchars, ch))
}
tokens = append(tokens, BOS)
n := min(blockSize, len(tokens)-1)
// Forward the token sequence through the model, building up the computation graph all the way to the loss
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
losses := []*value{}
for posId := range n {
tokenId, targetId := tokens[posId], tokens[posId+1]
logits := gpt(tokenId, posId, keys, values)
probs := softMax(logits)
lossT := probs[targetId].Log().Neg()
losses = append(losses, lossT)
}
lossSum := &value{}
for _, l := range losses {
lossSum.Add(l)
}
loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low.
// Backward the loss, calculating the gradients with respect to all model parameters
loss.Backward()
// Adam optimizer update: update the model parameters based on the corresponding gradients
lrt := learningRate * (float64(1) - float64(step)/float64(numSteps))
for i, p := range params {
m[i] = beta1*m[i] + (1-beta1)*p.grad
v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0)
m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1)))
v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1)))
p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam)
p.grad = 0.0
}
fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
}
// Inference: may the model babble back to us
temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high
fmt.Println("--- inference (new, hallucinated names) ---")
for sampleIdx := range 20 {
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
tokenId := BOS
sample := []rune{}
for posId := range blockSize {
logits := gpt(tokenId, posId, keys, values)
probs := make([]*value, len(logits))
for i, l := range logits {
probs[i] = l.Div(&value{data: temperature})
}
probs = softMax(probs)
tokenId := RouletteDraw(probs)
if tokenId == BOS {
break
}
sample = append(sample, uchars[tokenId])
}
fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample))
}
}
// genMatrix builds an out×in matrix of freshly-allocated values, each
// initialized from a normal distribution scaled down to std-dev 0.08.
func genMatrix(out, in int) [][]*value {
	rows := make([][]*value, out)
	for r := range rows {
		row := make([]*value, in)
		for c := range row {
			row[c] = &value{data: rand.NormFloat64() * 0.08}
		}
		rows[r] = row
	}
	return rows
}
// linear computes the matrix-vector product w·x, returning one value per
// row of w (each the dot product of that row with x).
//
// BUG FIX: the previous version appended every elementwise product
// wo[i]*x[i] without summing, yielding len(w)*len(w[0]) outputs instead of
// len(w) dot products. Callers (gpt) slice the result assuming len(w)
// entries (e.g. q[hs : hs+headDim] with len(q) == nEmbd), so nothing
// downstream could have worked.
func linear(x []*value, w [][]*value) []*value {
	r := make([]*value, 0, len(w))
	for _, row := range w {
		// accumulate through the value graph so gradients flow back
		s := &value{data: 0.0}
		for i := range row {
			s = s.Add(row[i].Mul(x[i]))
		}
		r = append(r, s)
	}
	return r
}
// softMax converts a slice of logits into a probability distribution,
// subtracting the maximum logit first for numerical stability.
func softMax(logits []*value) []*value {
	peak := slices.MaxFunc(logits, valcmp)
	out := make([]*value, len(logits))
	total := &value{}
	for i, l := range logits {
		out[i] = l.Sub(peak).Exp()
		total = total.Add(out[i])
	}
	for i := range out {
		out[i] = out[i].Div(total)
	}
	return out
}
// rmsNorm scales x by the inverse of its root-mean-square (with a small
// epsilon for stability). NOTE: mutates x in place and returns the same
// slice — callers that need the pre-norm activations must clone first.
func rmsNorm(x []*value) []*value {
	sumSq := &value{}
	for _, xi := range x {
		sumSq = sumSq.Add(xi.Mul(xi))
	}
	meanSq := sumSq.Div(&value{data: float64(len(x))})
	invRMS := meanSq.Add(&value{data: 1e-5}).Pow(&value{data: -0.5})
	for i, xi := range x {
		x[i] = xi.Mul(invRMS)
	}
	return x
}
func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value {
tokEmb := stateMap["wte"][tokenId] // token embedding
posEmb := stateMap["wpe"][posId] // position embedding
x := []*value{}
// joint token and position embedding
for i := range tokEmb {
x = append(x, tokEmb[i].Add(posEmb[i]))
}
x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection
for li := range nLayer {
// 1) Multi-head Attention block
xResidual := slices.Clone(x)
x = rmsNorm(x)
q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)])
k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)])
v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)])
keys[li] = append(keys[li], k)
values[li] = append(values[li], v)
xAttn := []*value{}
// basically, distribute the work over the "attention heads"
for h := range nHead {
hs := h * headDim
q_h := q[hs : hs+headDim]
k_h := [][]*value{}
for _, ki := range keys[li] {
k_h = append(k_h, ki[hs:hs+headDim])
}
v_h := [][]*value{}
for _, vi := range values[li] {
v_h = append(v_h, vi[hs:hs+headDim])
}
attnLogits := []*value{}
for t := range len(k_h) {
s := &value{data: 0.0}
for j := range headDim {
s = s.Add(q_h[j].Mul(k_h[t][j]))
}
attnLogits = append(attnLogits, s.Div(&value{data: math.Pow(float64(headDim), 0.5)}))
}
attnWeights := softMax(attnLogits)
headOut := []*value{}
for j := range headDim {
s := &value{data: 0.0}
for t := range len(v_h) {
s = s.Add(attnWeights[t].Mul(v_h[t][j]))
}
headOut = append(headOut, s)
}
xAttn = append(xAttn, headOut...)
}
x = linear(xAttn, stateMap[fmt.Sprintf("layer%d.attn_wo", li)])
for i := range x {
x[i] = x[i].Add(xResidual[i])
}
// 2) MLP block
xResidual = x
x = rmsNorm(x)
x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc1", li)])
for i := range x {
x[i] = x[i].Relu()
}
x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc2", li)])
for i := range x {
x[i] = x[i].Add(xResidual[i])
}
}
logits := linear(x, stateMap["lm_head"])
return logits
} }
type value struct { type value struct {
@@ -45,9 +287,10 @@ type value struct {
localGrads []*value localGrads []*value
} }
// this lets us build a set-like map with our Values.
// If the slices were removed from the struct, that would make this method irrelevant.
func (v *value) toKey() string { func (v *value) toKey() string {
k := fmt.Sprintf("%+v", v) k := fmt.Sprintf("%+v", v)
fmt.Println(k)
return k return k
} }
@@ -55,7 +298,7 @@ func (v *value) Add(other *value) *value {
return &value{ return &value{
data: v.data + other.data, data: v.data + other.data,
children: []*value{v, other}, children: []*value{v, other},
localGrads: []*value{{data: 1}, {data: 1}}, localGrads: []*value{{data: 1.0}, {data: 1.0}},
} }
} }
@@ -68,6 +311,7 @@ func (v *value) Div(other *value) *value {
} }
func (v *value) Mul(other *value) *value { func (v *value) Mul(other *value) *value {
// note the swap here: children are stored as v, other but grads are other, v
return &value{ return &value{
data: v.data * other.data, data: v.data * other.data,
children: []*value{v, other}, children: []*value{v, other},
@@ -137,11 +381,39 @@ func (v *value) Backward() {
} }
} }
buildTopo(v) buildTopo(v)
spew.Dump(topo) v.grad = 1.0
v.grad = 1
for _, v := range slices.Backward(topo) { for _, v := range slices.Backward(topo) {
for i := range v.children { for i := range v.children {
v.children[i].grad += v.localGrads[i].data * v.grad v.children[i].grad += v.localGrads[i].data * v.grad
} }
} }
} }
// mkDeepSlice allocates an empty per-layer KV-cache slot for each of the
// `size` transformer layers. Each layer's slice starts nil so that gpt can
// append exactly one key/value vector per processed position.
//
// BUG FIX: the previous version ignored size (always one layer) and
// pre-seeded a[0] with a one-element slice containing a nil *value; gpt
// would then treat that phantom entry as a cached key/value and crash on
// the nil dereference / short-slice bounds when slicing per-head views.
func mkDeepSlice(size int) [][][]*value {
	return make([][][]*value, size)
}
// implement our own weighted random chooser
// based on https://cybernetist.com/2019/01/24/random-weighted-draws-in-go/ but without the dependency on gonum
//
// RouletteDraw returns an index into p, chosen with probability
// proportional to each element's data (weights need not be normalized).
func RouletteDraw(p []*value) int {
	// Build the discrete CDF over the weights with a running total.
	cdf := make([]float64, len(p))
	running := 0.0
	for i, w := range p {
		running += w.data
		cdf[i] = running
	}
	// Scale a uniform draw by the total weight (easier than normalizing the
	// CDF to [0,1)), then binary-search for the first bucket exceeding it:
	// sort.Search returns the smallest index i such that cdf[i] > target.
	target := rand.Float64() * cdf[len(cdf)-1]
	return sort.Search(len(cdf), func(i int) bool { return cdf[i] > target })
}

View File

@@ -6,6 +6,10 @@ Original python is included in the repo for reference against bitrot.
To use: `go run cmd/main.go input.txt` To use: `go run cmd/main.go input.txt`
Differences between the Go and the Python: Differences between the Go and the Python, as well as notes more generally:
* go is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details * The GPT is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details.
* The Value class is more type-safe in Go, using values everywhere as opposed to mingling floats and values in the localgrad tuple.
* The Value struct has actual tests confirming the backward propagation logic.
* When writing the Value struct and its methods, I accidentally swapped the order of the values in the `localGrads` slice in `Mul` and tore my hair out trying to figure out where the bug was. When I broke down and asked copilot to "compare these two implementations and tell me how they differ," it managed to find the error -- but also reported three non-existent differences and told me that `slices.Backward()` doesn't exist.
* Initial pass translating the linear algebra functions has me worried that all those value structs aren't going to be very fast...