package main

import (
	"log"
	"math"
	"math/rand"

	"gonum.org/v1/gonum/mat"
)

/* Dataset + LSTM + helpers as listed elsewhere in the project. */

// ================== LSTM (gonum) ==================

// LSTM is a single-layer LSTM with a scalar linear read-out head.
// Gate naming follows the usual convention: i = input, f = forget,
// o = output, g = candidate cell. Wx* act on the input, Wh* on the
// previous hidden state, b* are biases; Why/by map the final hidden
// state to the scalar prediction.
type LSTM struct {
	InputSize  int
	HiddenSize int

	Wxi, Whi, bi *mat.Dense
	Wxf, Whf, bf *mat.Dense
	Wxo, Who, bo *mat.Dense
	Wxg, Whg, bg *mat.Dense

	// Read-out layer: Why is 1 x HiddenSize, by is 1 x 1.
	Why, by *mat.Dense

	// Running hidden and cell state, each HiddenSize x 1,
	// carried across forward calls until resetState.
	h, c *mat.Dense

	LR float64 // SGD learning rate
}

// newLSTM builds an LSTM with small random weights and zero biases/state.
// Weights are drawn uniformly from [-0.1/sqrt(fanIn), +0.1/sqrt(fanIn)],
// using a dedicated RNG so results are reproducible per seed.
func newLSTM(inputSize, hiddenSize int, lr float64, seed int64) *LSTM {
	rng := rand.New(rand.NewSource(seed))
	init := func(r, c int) *mat.Dense {
		data := make([]float64, r*c)
		scale := 1.0 / math.Sqrt(float64(c))
		for i := range data {
			data[i] = (rng.Float64()*2 - 1) * 0.1 * scale
		}
		return mat.NewDense(r, c, data)
	}
	zeros := func(r, c int) *mat.Dense { return mat.NewDense(r, c, nil) }
	return &LSTM{
		InputSize:  inputSize,
		HiddenSize: hiddenSize,
		Wxi:        init(hiddenSize, inputSize),
		Whi:        init(hiddenSize, hiddenSize),
		bi:         zeros(hiddenSize, 1),
		Wxf:        init(hiddenSize, inputSize),
		Whf:        init(hiddenSize, hiddenSize),
		bf:         zeros(hiddenSize, 1),
		Wxo:        init(hiddenSize, inputSize),
		Who:        init(hiddenSize, hiddenSize),
		bo:         zeros(hiddenSize, 1),
		Wxg:        init(hiddenSize, inputSize),
		Whg:        init(hiddenSize, hiddenSize),
		bg:         zeros(hiddenSize, 1),
		Why:        init(1, hiddenSize),
		by:         zeros(1, 1),
		h:          zeros(hiddenSize, 1),
		c:          zeros(hiddenSize, 1),
		LR:         lr,
	}
}

// resetState zeroes the hidden and cell state (call before each sequence).
func (m *LSTM) resetState() {
	m.h.Zero()
	m.c.Zero()
}

// denseOf returns an independent dense copy of a.
func denseOf(a mat.Matrix) *mat.Dense { return mat.DenseCopyOf(a) }

// mm returns the matrix product a*b as a new matrix.
func mm(a, b mat.Matrix) *mat.Dense {
	var out mat.Dense
	out.Mul(a, b)
	return &out
}

// addM returns a+b as a new matrix.
func addM(a, b mat.Matrix) *mat.Dense {
	out := denseOf(a)
	out.Add(out, b)
	return out
}

// hadM returns the Hadamard (element-wise) product of a and b.
// It panics when the two operands have different dimensions.
func hadM(a, b mat.Matrix) *mat.Dense {
	ar, ac := a.Dims()
	br, bc := b.Dims()
	if ar != br || ac != bc {
		panic("hadM: dimensioni incompatibili")
	}
	var out mat.Dense
	out.MulElem(a, b)
	return &out
}

// applyM returns f applied element-wise to a, as a new matrix.
func applyM(a mat.Matrix, f func(float64) float64) *mat.Dense {
	var out mat.Dense
	out.Apply(func(_, _ int, v float64) float64 { return f(v) }, a)
	return &out
}

// scaleM returns a scaled by s, as a new matrix.
func scaleM(a mat.Matrix, s float64) *mat.Dense {
	return applyM(a, func(v float64) float64 { return v * s })
}

// sigmoid is the logistic function 1/(1+e^-x).
func sigmoid(x float64) float64 { return 1.0 / (1.0 + math.Exp(-x)) }

// lstmCache stores all per-timestep intermediates of a forward pass
// so that backward can run full BPTT. hs/cs hold T+1 entries (index 0
// is the state before the first step); the others hold T entries.
type lstmCache struct {
	xs, hs, cs []*mat.Dense
	is, fs, os []*mat.Dense
	gs, ys     []*mat.Dense
}

// forward runs the LSTM over seq and returns the prediction at the last
// timestep plus the cache needed for backpropagation. The model's running
// state (m.h, m.c) is updated to the final state.
//
// NOTE(review): each timestep wraps a single scalar into an InputSize x 1
// vector, so this assumes InputSize == 1 (mat.NewDense panics otherwise)
// and len(seq) >= 1 — confirm against callers.
func (m *LSTM) forward(seq []float64) (float64, lstmCache) {
	T := len(seq)
	cache := lstmCache{
		xs: make([]*mat.Dense, T),
		hs: make([]*mat.Dense, T+1),
		cs: make([]*mat.Dense, T+1),
		is: make([]*mat.Dense, T),
		fs: make([]*mat.Dense, T),
		os: make([]*mat.Dense, T),
		gs: make([]*mat.Dense, T),
		ys: make([]*mat.Dense, T),
	}
	cache.hs[0] = denseOf(m.h)
	cache.cs[0] = denseOf(m.c)
	h := denseOf(m.h)
	c := denseOf(m.c)
	for t := 0; t < T; t++ {
		xt := mat.NewDense(m.InputSize, 1, []float64{seq[t]})
		cache.xs[t] = xt
		// Standard LSTM cell equations.
		i := applyM(addM(addM(mm(m.Wxi, xt), mm(m.Whi, h)), m.bi), sigmoid)
		f := applyM(addM(addM(mm(m.Wxf, xt), mm(m.Whf, h)), m.bf), sigmoid)
		o := applyM(addM(addM(mm(m.Wxo, xt), mm(m.Who, h)), m.bo), sigmoid)
		g := applyM(addM(addM(mm(m.Wxg, xt), mm(m.Whg, h)), m.bg), math.Tanh)
		c = addM(hadM(f, c), hadM(i, g))
		h = hadM(o, applyM(c, math.Tanh))
		y := addM(mm(m.Why, h), m.by)
		cache.is[t] = i
		cache.fs[t] = f
		cache.os[t] = o
		cache.gs[t] = g
		cache.hs[t+1] = denseOf(h)
		cache.cs[t+1] = denseOf(c)
		cache.ys[t] = y
	}
	pred := cache.ys[T-1].At(0, 0)
	m.h.Copy(cache.hs[T])
	m.c.Copy(cache.cs[T])
	return pred, cache
}

// backward runs BPTT from the last-timestep Huber loss against target,
// clips each gradient to a max L2 norm, applies one SGD step, and
// returns the loss value.
func (m *LSTM) backward(cache lstmCache, target float64) float64 {
	T := len(cache.xs)
	yT := cache.ys[T-1].At(0, 0)

	// Huber loss (delta = 1): quadratic near zero, linear in the tails.
	delta := 1.0
	err := yT - target
	var loss, grad float64
	if math.Abs(err) <= delta {
		loss = 0.5 * err * err
		grad = err
	} else {
		loss = delta*math.Abs(err) - 0.5*delta*delta
		if err >= 0 {
			grad = delta
		} else {
			grad = -delta
		}
	}

	// Read-out layer gradients; dh seeds the recurrent backward pass.
	dy := mat.NewDense(1, 1, []float64{grad})
	var dWhy mat.Dense
	dWhy.Mul(dy, cache.hs[T].T())
	dby := dy
	var dh mat.Dense
	dh.Mul(m.Why.T(), dy)

	// Gradient accumulators for every gate's parameters.
	dWxi := mat.NewDense(m.HiddenSize, m.InputSize, nil)
	dWhi := mat.NewDense(m.HiddenSize, m.HiddenSize, nil)
	dbi := mat.NewDense(m.HiddenSize, 1, nil)
	dWxf := mat.NewDense(m.HiddenSize, m.InputSize, nil)
	dWhf := mat.NewDense(m.HiddenSize, m.HiddenSize, nil)
	dbf := mat.NewDense(m.HiddenSize, 1, nil)
	dWxo := mat.NewDense(m.HiddenSize, m.InputSize, nil)
	dWho := mat.NewDense(m.HiddenSize, m.HiddenSize, nil)
	dbo := mat.NewDense(m.HiddenSize, 1, nil)
	dWxg := mat.NewDense(m.HiddenSize, m.InputSize, nil)
	dWhg := mat.NewDense(m.HiddenSize, m.HiddenSize, nil)
	dbg := mat.NewDense(m.HiddenSize, 1, nil)
	dcNext := mat.NewDense(m.HiddenSize, 1, nil)

	for t := T - 1; t >= 0; t-- {
		c := cache.cs[t+1]
		cPrev := cache.cs[t]
		i := cache.is[t]
		f := cache.fs[t]
		o := cache.os[t]
		g := cache.gs[t]
		x := cache.xs[t]
		hPrev := cache.hs[t]

		// h = o * tanh(c): split dh into the o-path and the c-path.
		tanhc := applyM(c, math.Tanh)
		do := hadM(&dh, tanhc)
		o1mo := applyM(o, func(v float64) float64 { return v * (1 - v) }) // sigmoid'
		do = hadM(do, o1mo)
		oneMinusTanh2 := applyM(tanhc, func(v float64) float64 { return 1 - v*v }) // tanh'
		tmp := hadM(&dh, o)
		dc := hadM(tmp, oneMinusTanh2)
		dc = addM(dc, dcNext) // add gradient flowing back through c_{t+1}

		// c = f*cPrev + i*g: gate gradients through their activations.
		df := hadM(dc, cPrev)
		f1mf := applyM(f, func(v float64) float64 { return v * (1 - v) })
		df = hadM(df, f1mf)
		di := hadM(dc, g)
		i1mi := applyM(i, func(v float64) float64 { return v * (1 - v) })
		di = hadM(di, i1mi)
		dg := hadM(dc, i)
		g1mg2 := applyM(g, func(v float64) float64 { return 1 - v*v })
		dg = hadM(dg, g1mg2)

		// Accumulate parameter gradients for this timestep.
		dWxi.Add(dWxi, mm(di, x.T()))
		dWhi.Add(dWhi, mm(di, hPrev.T()))
		dbi.Add(dbi, di)
		dWxf.Add(dWxf, mm(df, x.T()))
		dWhf.Add(dWhf, mm(df, hPrev.T()))
		dbf.Add(dbf, df)
		dWxo.Add(dWxo, mm(do, x.T()))
		dWho.Add(dWho, mm(do, hPrev.T()))
		dbo.Add(dbo, do)
		dWxg.Add(dWxg, mm(dg, x.T()))
		dWhg.Add(dWhg, mm(dg, hPrev.T()))
		dbg.Add(dbg, dg)

		// Propagate to the previous hidden and cell state.
		var dhi, dhf, dho, dhg mat.Dense
		dhi.Mul(m.Whi.T(), di)
		dhf.Mul(m.Whf.T(), df)
		dho.Mul(m.Who.T(), do)
		dhg.Mul(m.Whg.T(), dg)
		var dhSum mat.Dense
		dhSum.Add(&dhi, &dhf)
		dhSum.Add(&dhSum, &dho)
		dhSum.Add(&dhSum, &dhg)
		dh = dhSum
		dcNext = hadM(dc, f)
	}

	// clip rescales d in place so its Frobenius norm is at most maxNorm.
	clip := func(d *mat.Dense, maxNorm float64) {
		r, c := d.Dims()
		sum := 0.0
		for i := 0; i < r; i++ {
			for j := 0; j < c; j++ {
				v := d.At(i, j)
				sum += v * v
			}
		}
		norm := math.Sqrt(sum)
		if norm > maxNorm && norm > 0 {
			f := maxNorm / norm
			for i := 0; i < r; i++ {
				for j := 0; j < c; j++ {
					d.Set(i, j, d.At(i, j)*f)
				}
			}
		}
	}
	maxG := 5.0
	clip(dWxi, maxG)
	clip(dWhi, maxG)
	clip(dbi, maxG)
	clip(dWxf, maxG)
	clip(dWhf, maxG)
	clip(dbf, maxG)
	clip(dWxo, maxG)
	clip(dWho, maxG)
	clip(dbo, maxG)
	clip(dWxg, maxG)
	clip(dWhg, maxG)
	clip(dbg, maxG)
	clip(&dWhy, maxG)
	clip(dby, maxG)

	// Vanilla SGD step: W -= LR * dW.
	negLR := -m.LR
	m.Wxi.Add(m.Wxi, scaleM(dWxi, negLR))
	m.Whi.Add(m.Whi, scaleM(dWhi, negLR))
	m.bi.Add(m.bi, scaleM(dbi, negLR))
	m.Wxf.Add(m.Wxf, scaleM(dWxf, negLR))
	m.Whf.Add(m.Whf, scaleM(dWhf, negLR))
	m.bf.Add(m.bf, scaleM(dbf, negLR))
	m.Wxo.Add(m.Wxo, scaleM(dWxo, negLR))
	m.Who.Add(m.Who, scaleM(dWho, negLR))
	m.bo.Add(m.bo, scaleM(dbo, negLR))
	m.Wxg.Add(m.Wxg, scaleM(dWxg, negLR))
	m.Whg.Add(m.Whg, scaleM(dWhg, negLR))
	m.bg.Add(m.bg, scaleM(dbg, negLR))
	m.Why.Add(m.Why, scaleM(&dWhy, negLR))
	m.by.Add(m.by, scaleM(dby, negLR))
	return loss
}

// train runs per-sample SGD over dsTrain for up to maxEpochs epochs,
// tracking MAE on dsVal. It stops early when the average training loss
// drops to earlyStopFrac of the first epoch's loss. Returns the first
// epoch's average loss, the best validation MAE seen, and the number of
// epochs actually run. (batchSize only chunks the shuffled index walk;
// updates are still applied per sample.)
func (m *LSTM) train(dsTrain, dsVal Dataset, batchSize int, maxEpochs int, earlyStopFrac float64, seed int64) (firstLoss float64, bestValMAE float64, epochsRun int) {
	rng := rand.New(rand.NewSource(seed))
	// indices returns a freshly shuffled permutation of 0..n-1.
	indices := func(n int) []int {
		idx := make([]int, n)
		for i := range idx {
			idx[i] = i
		}
		rng.Shuffle(n, func(i, j int) { idx[i], idx[j] = idx[j], idx[i] })
		return idx
	}
	// valMAE computes mean absolute error on the validation set
	// (NaN when there is no validation data).
	valMAE := func() float64 {
		if len(dsVal.Seqs) == 0 {
			return math.NaN()
		}
		sum := 0.0
		for i := range dsVal.Seqs {
			m.resetState()
			p, _ := m.forward(dsVal.Seqs[i])
			sum += math.Abs(p - dsVal.Labels[i])
		}
		return sum / float64(len(dsVal.Seqs))
	}

	bestValMAE = math.Inf(1)
	var baseLoss float64
	for epoch := 1; epoch <= maxEpochs; epoch++ {
		idx := indices(len(dsTrain.Seqs))
		totalLoss := 0.0
		n := 0
		for start := 0; start < len(idx); start += batchSize {
			end := start + batchSize
			if end > len(idx) {
				end = len(idx)
			}
			for _, ii := range idx[start:end] {
				m.resetState()
				_, cache := m.forward(dsTrain.Seqs[ii])
				loss := m.backward(cache, dsTrain.Labels[ii])
				totalLoss += loss
				n++
			}
		}
		avgLoss := totalLoss / float64(max(1, n))
		curVal := valMAE()
		if epoch == 1 {
			firstLoss = avgLoss
			baseLoss = avgLoss
		}
		if curVal < bestValMAE {
			bestValMAE = curVal
		}
		epochsRun = epoch
		log.Printf("epoca=%d avgLoss=%.6f firstLoss=%.6f valMAE=%.6f", epoch, avgLoss, firstLoss, curVal)
		if avgLoss <= baseLoss*earlyStopFrac {
			log.Printf("early-stopping: avgLoss=%.6f soglia=%.6f (%.2f%% di firstLoss) epoca=%d", avgLoss, baseLoss*earlyStopFrac, earlyStopFrac*100, epoch)
			break
		}
	}
	return
}