doesn't work yet, but the structures are all there
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.txt
|
||||||
20
cmd/main.go
20
cmd/main.go
@@ -1,7 +1,23 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import "git.yetaga.in/alazyreader/microgopt"
|
import (
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.yetaga.in/alazyreader/microgopt"
|
||||||
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
microgopt.Run([]string{})
|
f := "names.txt"
|
||||||
|
if len(os.Args) > 1 {
|
||||||
|
f = os.Args[1]
|
||||||
|
}
|
||||||
|
b, err := os.ReadFile(f)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("%v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s := string(b)
|
||||||
|
microgopt.Run(strings.Split(s, "\n"))
|
||||||
}
|
}
|
||||||
|
|||||||
2
go.mod
2
go.mod
@@ -1,5 +1,3 @@
|
|||||||
module git.yetaga.in/alazyreader/microgopt
|
module git.yetaga.in/alazyreader/microgopt
|
||||||
|
|
||||||
go 1.26.0
|
go 1.26.0
|
||||||
|
|
||||||
require github.com/davecgh/go-spew v1.1.1
|
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -1,2 +1,4 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/mroth/weightedrand/v3 v3.0.0 h1:FPPz/Xpri6qOzZTj8gEA1i6YBSlwLjkDQ+WaZpNHiiI=
|
||||||
|
github.com/mroth/weightedrand/v3 v3.0.0/go.mod h1:Qfpt3At9/pYtQOzy9c2iHVWiHBPL+gvMY7mIN5WRlGg=
|
||||||
|
|||||||
290
microgopt.go
290
microgopt.go
@@ -4,12 +4,24 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
"maps"
|
||||||
"math"
|
"math"
|
||||||
|
"math/rand/v2"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/davecgh/go-spew/spew"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Initialize the parameters, to store the knowledge of the model
|
||||||
|
const (
|
||||||
|
nLayer = 1 // depth of the transformer neural network (number of layers)
|
||||||
|
nEmbd = 16 // width of the network (embedding dimension)
|
||||||
|
blockSize = 16 // maximum context length of the attention window (note: the longest name is 15 characters)
|
||||||
|
nHead = 4 // number of attention heads
|
||||||
|
headDim = nEmbd / nHead // derived dimension of each head
|
||||||
|
)
|
||||||
|
|
||||||
|
var stateMap = map[string][][]*value{}
|
||||||
|
|
||||||
|
// this type pun just worked in python but go needs to be more explicit
|
||||||
func btof(b bool) float64 {
|
func btof(b bool) float64 {
|
||||||
if b {
|
if b {
|
||||||
return 1.0
|
return 1.0
|
||||||
@@ -17,12 +29,22 @@ func btof(b bool) float64 {
|
|||||||
return 0.0
|
return 0.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func valcmp(a, b *value) int {
|
||||||
|
if a.data < b.data {
|
||||||
|
return -1
|
||||||
|
} else if a.data > b.data {
|
||||||
|
return 1
|
||||||
|
} else {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func Run(docs []string) {
|
func Run(docs []string) {
|
||||||
// remove leading and trailing whitespace in documents
|
// remove leading and trailing whitespace in documents
|
||||||
for i := range docs {
|
for i := range docs {
|
||||||
docs[i] = strings.TrimSpace(docs[i])
|
docs[i] = strings.TrimSpace(docs[i])
|
||||||
}
|
}
|
||||||
fmt.Printf("num docs: %d", len(docs))
|
fmt.Printf("num docs: %d\n", len(docs))
|
||||||
|
|
||||||
// construct the vocabulary from the documents: an ordered list of all characters in the dataset,
|
// construct the vocabulary from the documents: an ordered list of all characters in the dataset,
|
||||||
// plus a "Beginning Of Sequence" (BOS) token
|
// plus a "Beginning Of Sequence" (BOS) token
|
||||||
@@ -33,9 +55,229 @@ func Run(docs []string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
uchars := slices.Sorted(maps.Keys(set))
|
uchars := slices.Sorted(maps.Keys(set))
|
||||||
// BOS := len(uchars)
|
BOS := len(uchars)
|
||||||
vocabSize := len(uchars) + 1
|
vocabSize := len(uchars) + 1
|
||||||
fmt.Printf("vocab size: %d", vocabSize)
|
fmt.Printf("vocab size: %d\n", vocabSize)
|
||||||
|
|
||||||
|
// in the python code, at this point, the Value class was created
|
||||||
|
// and the global parameters were set up
|
||||||
|
|
||||||
|
stateMap["wte"] = genMatrix(vocabSize, nEmbd)
|
||||||
|
stateMap["wpe"] = genMatrix(blockSize, nEmbd)
|
||||||
|
stateMap["lm_head"] = genMatrix(vocabSize, nEmbd)
|
||||||
|
for i := range nLayer {
|
||||||
|
stateMap[fmt.Sprintf("layer%d.attn_wq", i)] = genMatrix(nEmbd, nEmbd)
|
||||||
|
stateMap[fmt.Sprintf("layer%d.attn_wk", i)] = genMatrix(nEmbd, nEmbd)
|
||||||
|
stateMap[fmt.Sprintf("layer%d.attn_wv", i)] = genMatrix(nEmbd, nEmbd)
|
||||||
|
stateMap[fmt.Sprintf("layer%d.attn_wo", i)] = genMatrix(nEmbd, nEmbd)
|
||||||
|
stateMap[fmt.Sprintf("layer%d.mlp_fc1", i)] = genMatrix(4*nEmbd, nEmbd)
|
||||||
|
stateMap[fmt.Sprintf("layer%d.mlp_fc2", i)] = genMatrix(nEmbd, 4*nEmbd)
|
||||||
|
}
|
||||||
|
// flatten params into a single []value
|
||||||
|
params := []*value{}
|
||||||
|
for _, mat := range stateMap {
|
||||||
|
for _, row := range mat {
|
||||||
|
for _, p := range row {
|
||||||
|
params = append(params, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Printf("num params: %d\n", len(params))
|
||||||
|
|
||||||
|
// at this point in the python, linear(), softmax(), rmsnorm(), and gpt() are all defined
|
||||||
|
|
||||||
|
// "Let there be Adam, the blessed optimizer and its buffers"
|
||||||
|
learningRate, beta1, beta2, epsAdam := 0.01, 0.85, 0.99, 1e-8
|
||||||
|
m := slices.Repeat([]float64{}, len(params)) // first moment buffer
|
||||||
|
v := slices.Repeat([]float64{}, len(params)) // second moment buffer
|
||||||
|
|
||||||
|
// Repeat in sequence
|
||||||
|
numSteps := 1000 // number of training steps
|
||||||
|
for step := range numSteps {
|
||||||
|
// Take single document, tokenize it, surround it with BOS special token on both sides
|
||||||
|
doc := docs[step%len(docs)]
|
||||||
|
tokens := []int{BOS}
|
||||||
|
for _, ch := range doc {
|
||||||
|
tokens = append(tokens, slices.Index(uchars, ch))
|
||||||
|
}
|
||||||
|
tokens = append(tokens, BOS)
|
||||||
|
n := min(blockSize, len(tokens)-1)
|
||||||
|
|
||||||
|
// Forward the token sequence through the model, building up the computation graph all the way to the loss
|
||||||
|
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
|
||||||
|
losses := []*value{}
|
||||||
|
for posId := range n {
|
||||||
|
tokenId, targetId := tokens[posId], tokens[posId+1]
|
||||||
|
logits := gpt(tokenId, posId, keys, values)
|
||||||
|
probs := softMax(logits)
|
||||||
|
lossT := probs[targetId].Log().Neg()
|
||||||
|
losses = append(losses, lossT)
|
||||||
|
}
|
||||||
|
lossSum := &value{}
|
||||||
|
for _, l := range losses {
|
||||||
|
lossSum.Add(l)
|
||||||
|
}
|
||||||
|
loss := lossSum.Mul(&value{data: float64(1 / n)}) // final average loss over the document sequence. May yours be low.
|
||||||
|
|
||||||
|
// Backward the loss, calculating the gradients with respect to all model parameters
|
||||||
|
loss.Backward()
|
||||||
|
|
||||||
|
// Adam optimizer update: update the model parameters based on the corresponding gradients
|
||||||
|
lrt := learningRate * (float64(1) - float64(step)/float64(numSteps))
|
||||||
|
for i, p := range params {
|
||||||
|
m[i] = beta1*m[i] + (1-beta1)*p.grad
|
||||||
|
v[i] = beta2*v[i] + (1-beta2)*math.Pow(p.grad, 2.0)
|
||||||
|
m_hat := m[i] / (1 - math.Pow(beta1, float64(step+1)))
|
||||||
|
v_hat := v[i] / (1 - math.Pow(beta2, float64(step+1)))
|
||||||
|
p.data = p.data - (lrt*m_hat)/(math.Pow(v_hat, 0.5)+epsAdam)
|
||||||
|
p.grad = 0.0
|
||||||
|
}
|
||||||
|
fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Inference: may the model babble back to us
|
||||||
|
temperature := 0.5 // in (0, 1], control the "creativity" of generated text, low to high
|
||||||
|
fmt.Println("--- inference (new, hallucinated names) ---")
|
||||||
|
for sampleIdx := range 20 {
|
||||||
|
keys, values := mkDeepSlice(nLayer), mkDeepSlice(nLayer)
|
||||||
|
tokenId := BOS
|
||||||
|
sample := []rune{}
|
||||||
|
for posId := range blockSize {
|
||||||
|
logits := gpt(tokenId, posId, keys, values)
|
||||||
|
probs := make([]*value, len(logits))
|
||||||
|
for i, l := range logits {
|
||||||
|
probs[i] = l.Div(&value{data: temperature})
|
||||||
|
}
|
||||||
|
probs = softMax(probs)
|
||||||
|
tokenId := RouletteDraw(probs)
|
||||||
|
if tokenId == BOS {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sample = append(sample, uchars[tokenId])
|
||||||
|
}
|
||||||
|
fmt.Printf("sample %2d: %s\n", sampleIdx, string(sample))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func genMatrix(out, in int) [][]*value {
|
||||||
|
m := make([][]*value, out)
|
||||||
|
for o := range out {
|
||||||
|
m[o] = make([]*value, in)
|
||||||
|
for i := range in {
|
||||||
|
m[o][i] = &value{data: rand.NormFloat64() * 0.08}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
func linear(x []*value, w [][]*value) []*value {
|
||||||
|
r := []*value{}
|
||||||
|
for _, wo := range w {
|
||||||
|
for i := range wo {
|
||||||
|
r = append(r, wo[i].Mul(x[i]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func softMax(logits []*value) []*value {
|
||||||
|
maxVal := slices.MaxFunc(logits, valcmp)
|
||||||
|
exps := []*value{}
|
||||||
|
for _, val := range logits {
|
||||||
|
exps = append(exps, val.Sub(maxVal).Exp())
|
||||||
|
}
|
||||||
|
total := &value{}
|
||||||
|
for _, e := range exps {
|
||||||
|
total = total.Add(e)
|
||||||
|
}
|
||||||
|
for i := range exps {
|
||||||
|
exps[i] = exps[i].Div(total)
|
||||||
|
}
|
||||||
|
return exps
|
||||||
|
}
|
||||||
|
|
||||||
|
func rmsNorm(x []*value) []*value {
|
||||||
|
ms := &value{}
|
||||||
|
for _, xi := range x {
|
||||||
|
ms = ms.Add(xi.Mul(xi))
|
||||||
|
}
|
||||||
|
ms = ms.Div(&value{data: float64(len(x))})
|
||||||
|
scale := ms.Add(&value{data: 1e-5}).Pow(&value{data: -0.5})
|
||||||
|
for i := range x {
|
||||||
|
x[i] = x[i].Mul(scale)
|
||||||
|
}
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpt(tokenId int, posId int, keys [][][]*value, values [][][]*value) []*value {
|
||||||
|
tokEmb := stateMap["wte"][tokenId] // token embedding
|
||||||
|
posEmb := stateMap["wpe"][posId] // position embedding
|
||||||
|
x := []*value{}
|
||||||
|
// joint token and position embedding
|
||||||
|
for i := range tokEmb {
|
||||||
|
x = append(x, tokEmb[i].Add(posEmb[i]))
|
||||||
|
}
|
||||||
|
x = rmsNorm(x) // note: not redundant due to backward pass via the residual connection
|
||||||
|
|
||||||
|
for li := range nLayer {
|
||||||
|
// 1) Multi-head Attention block
|
||||||
|
xResidual := slices.Clone(x)
|
||||||
|
x = rmsNorm(x)
|
||||||
|
q := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wq", li)])
|
||||||
|
k := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wk", li)])
|
||||||
|
v := linear(x, stateMap[fmt.Sprintf("layer%d.attn_wv", li)])
|
||||||
|
keys[li] = append(keys[li], k)
|
||||||
|
values[li] = append(values[li], v)
|
||||||
|
xAttn := []*value{}
|
||||||
|
// basically, distribute the work over the "attention heads"
|
||||||
|
for h := range nHead {
|
||||||
|
hs := h * headDim
|
||||||
|
q_h := q[hs : hs+headDim]
|
||||||
|
k_h := [][]*value{}
|
||||||
|
for _, ki := range keys[li] {
|
||||||
|
k_h = append(k_h, ki[hs:hs+headDim])
|
||||||
|
}
|
||||||
|
v_h := [][]*value{}
|
||||||
|
for _, vi := range values[li] {
|
||||||
|
v_h = append(v_h, vi[hs:hs+headDim])
|
||||||
|
}
|
||||||
|
attnLogits := []*value{}
|
||||||
|
for t := range len(k_h) {
|
||||||
|
s := &value{data: 0.0}
|
||||||
|
for j := range headDim {
|
||||||
|
s = s.Add(q_h[j].Mul(k_h[t][j]))
|
||||||
|
}
|
||||||
|
attnLogits = append(attnLogits, s.Div(&value{data: math.Pow(float64(headDim), 0.5)}))
|
||||||
|
}
|
||||||
|
attnWeights := softMax(attnLogits)
|
||||||
|
headOut := []*value{}
|
||||||
|
for j := range headDim {
|
||||||
|
s := &value{data: 0.0}
|
||||||
|
for t := range len(v_h) {
|
||||||
|
s = s.Add(attnWeights[t].Mul(v_h[t][j]))
|
||||||
|
}
|
||||||
|
headOut = append(headOut, s)
|
||||||
|
}
|
||||||
|
xAttn = append(xAttn, headOut...)
|
||||||
|
}
|
||||||
|
x = linear(xAttn, stateMap[fmt.Sprintf("layer%d.attn_wo", li)])
|
||||||
|
for i := range x {
|
||||||
|
x[i] = x[i].Add(xResidual[i])
|
||||||
|
}
|
||||||
|
// 2) MLP block
|
||||||
|
xResidual = x
|
||||||
|
x = rmsNorm(x)
|
||||||
|
x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc1", li)])
|
||||||
|
for i := range x {
|
||||||
|
x[i] = x[i].Relu()
|
||||||
|
}
|
||||||
|
x = linear(x, stateMap[fmt.Sprintf("layer%d.mlp_fc2", li)])
|
||||||
|
for i := range x {
|
||||||
|
x[i] = x[i].Add(xResidual[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logits := linear(x, stateMap["lm_head"])
|
||||||
|
return logits
|
||||||
}
|
}
|
||||||
|
|
||||||
type value struct {
|
type value struct {
|
||||||
@@ -45,9 +287,10 @@ type value struct {
|
|||||||
localGrads []*value
|
localGrads []*value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this lets us build a set-like map with our Values.
|
||||||
|
// If the slices were removed from the struct, that would make this method irrelevant.
|
||||||
func (v *value) toKey() string {
|
func (v *value) toKey() string {
|
||||||
k := fmt.Sprintf("%+v", v)
|
k := fmt.Sprintf("%+v", v)
|
||||||
fmt.Println(k)
|
|
||||||
return k
|
return k
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -55,7 +298,7 @@ func (v *value) Add(other *value) *value {
|
|||||||
return &value{
|
return &value{
|
||||||
data: v.data + other.data,
|
data: v.data + other.data,
|
||||||
children: []*value{v, other},
|
children: []*value{v, other},
|
||||||
localGrads: []*value{{data: 1}, {data: 1}},
|
localGrads: []*value{{data: 1.0}, {data: 1.0}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,6 +311,7 @@ func (v *value) Div(other *value) *value {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (v *value) Mul(other *value) *value {
|
func (v *value) Mul(other *value) *value {
|
||||||
|
// note the swap here: children are stored as v, other but grads are other, v
|
||||||
return &value{
|
return &value{
|
||||||
data: v.data * other.data,
|
data: v.data * other.data,
|
||||||
children: []*value{v, other},
|
children: []*value{v, other},
|
||||||
@@ -137,11 +381,39 @@ func (v *value) Backward() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
buildTopo(v)
|
buildTopo(v)
|
||||||
spew.Dump(topo)
|
v.grad = 1.0
|
||||||
v.grad = 1
|
|
||||||
for _, v := range slices.Backward(topo) {
|
for _, v := range slices.Backward(topo) {
|
||||||
for i := range v.children {
|
for i := range v.children {
|
||||||
v.children[i].grad += v.localGrads[i].data * v.grad
|
v.children[i].grad += v.localGrads[i].data * v.grad
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mkDeepSlice(size int) [][][]*value {
|
||||||
|
a := make([][][]*value, 1, 10)
|
||||||
|
a[0] = make([][]*value, 1, 10)
|
||||||
|
a[0][0] = make([]*value, 1, 10)
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
|
||||||
|
// implement our own weighted random chooser
|
||||||
|
// based on https://cybernetist.com/2019/01/24/random-weighted-draws-in-go/ but without the dependency on gonum
|
||||||
|
func RouletteDraw(p []*value) int {
|
||||||
|
// Initialization: create the discrete CDF
|
||||||
|
cdf := make([]float64, len(p))
|
||||||
|
for i, v := range p {
|
||||||
|
if i == 0 {
|
||||||
|
cdf[i] = v.data
|
||||||
|
} else {
|
||||||
|
cdf[i] = cdf[i-1] + v.data
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Generation:
|
||||||
|
// 1. Generate a uniformly-random value x in the range [0,1)
|
||||||
|
// 2. Using a binary search, find the index of the smallest element in cdf larger than x
|
||||||
|
var val float64
|
||||||
|
// multiply the sample with the largest CDF value; easier than normalizing to [0,1)
|
||||||
|
val = rand.Float64() * cdf[len(cdf)-1]
|
||||||
|
// Search returns the smallest index i such that cdf[i] > val
|
||||||
|
return sort.Search(len(cdf), func(i int) bool { return cdf[i] > val })
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,6 +6,10 @@ Original python is included in the repo for reference against bitrot.
|
|||||||
|
|
||||||
To use: `go run cmd/main.go input.txt`
|
To use: `go run cmd/main.go input.txt`
|
||||||
|
|
||||||
Differences between the Go and the Python:
|
Differences between the Go and the Python, as well as notes more generally:
|
||||||
|
|
||||||
* go is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details
|
* The GPT is implemented as a package and, separately, as a command-line wrapper that calls it, just to keep the algorithm separate from the invocation details.
|
||||||
|
* The Value class is more type-safe in Go, using values everywhere as opposed to mingling floats and values in the `localGrads` tuple.
|
||||||
|
* The Value struct has actual tests confirming the backward propagation logic.
|
||||||
|
* When writing the Value struct and its methods, I accidentally swapped the order of the values in the `localGrads` slice in `Mul` and tore my hair out trying to figure out where the bug was. When I broke down and asked Copilot to "compare these two implementations and tell me how they differ," it managed to find the error -- but it also reported three non-existent differences and told me that `slices.Backward()` doesn't exist.
|
||||||
|
* Initial pass translating the linear algebra functions has me worried that all those value structs aren't going to be very fast...
|
||||||
|
|||||||
Reference in New Issue
Block a user