python - 我的 LSTM 学习，损失减少，但数值梯度与分析梯度不匹配

coder 2023-08-27 原文

以下是自包含，当您运行它时，它将:

1. 打印损失以验证它正在减少(学习 sin 波)，

2.对照我的手推梯度函数检查数值梯度。

这两个梯度倾向于在 1e-1 到 1e-2 内匹配(这仍然很糟糕，但表明它正在尝试)并且偶尔会出现极端异常值。

我整个星期六都在退回到正常的 FFNN，让它工作(耶，梯度匹配!)，现在星期天在这个 LSTM 上，好吧，我找不到我逻辑中的错误。哦，这在很大程度上取决于我的随机种子，有时很好，有时很糟糕。

我已经根据 LSTM 方程的手推导数(我做了微积分)和这 3 个博客/要点中的实现手工检查了我的实现:

并尝试了此处建议的(惊人的)调试方法:https://blog.slavv.com/37-reasons-why-your-neural-network-is-not-working-4020854bd607

你能帮忙看看我哪里做错了吗？

import numpy as np
np.set_printoptions(precision=3, suppress=True)

def check_grad(params, In, Target, f, df_analytical, delta=1e-5, tolerance=1e-7, num_checks=10):
    """
    delta : how far on either side of the param value to go

    tolerance : how far the analytical and numerical values can diverge
    """

    h_n = params['Wf'].shape[1] # TODO: h & c should be passed in (?)
    h = np.zeros(h_n)
    c = np.zeros(h_n)

    y, outputs, loss, h, c, caches = f(params, h, c, inputs, targets)
    dparams = df_analytical(params, inputs, targets, outputs, caches)

    passes = True
    for _ in range(num_checks):
        print()
        for pname, p, dpname, dp in zip(params.keys(), params.values(), dparams.keys(), dparams.values()):

            pix = np.random.randint(0, p.size)
            old_val = p.flat[pix]

            # d = delta * abs(old_val) if old_val != 0 else 1e-5
            d = delta

            p.flat[pix] = old_val + d
            _, _, loss_plus, _, _, _ = f(params, h, c, In, Target) # note `_` is the cache
            p.flat[pix] = old_val - d
            _, _, loss_minus, _, _, _ = f(params, h, c, In, Target)
            p.flat[pix] = old_val

            grad_analytic = dp.flat[pix]
            grad_numeric = (loss_plus - loss_minus) / (2 * d)

            denom = abs(grad_numeric + grad_analytic) + 1e-12 # max((abs(grad_numeric), abs(grad_analytic)))
            relative_error = abs(grad_analytic - grad_numeric) / denom

            if relative_error > tolerance:
                print(("fails: %s % 4d |  r: % 3.4f,   a: % 3.4f,   n: % 3.4f,   a/n: %0.2f") % (pname, pix, relative_error, grad_analytic, grad_numeric, grad_analytic/grad_numeric))
            passes &= relative_error <= tolerance

    return passes


# ----------

def lstm(params, inp, h_old, c_old):

    Wf, Wi, Wg, Wo, Wy = params['Wf'], params['Wi'], params['Wg'], params['Wo'], params['Wy']
    bf, bi, bg, bo, by = params['bf'], params['bi'], params['bg'], params['bo'], params['by']

    xh = np.concatenate([inp, h_old])

    f = np.dot(xh, Wf) + bf
    f_sigm = 1 / (1 + np.exp(-f))

    i = np.dot(xh, Wi) + bi
    i_sigm = 1 / (1 + np.exp(-i))

    g = np.dot(xh, Wg) + bg # C-tilde or C-bar in the literature
    g_tanh = np.tanh(g)

    o = np.dot(xh, Wo) + bo
    o_sigm = 1 / (1 + np.exp(-o))

    c = f_sigm * c_old + i_sigm * g_tanh

    c_tanh = np.tanh(c)
    h = o_sigm * c_tanh

    y = np.dot(h, Wy) + by # NOTE: this is a dense layer bolted on after a normal LSTM
    # TODO: should it have a nonlinearity after it? MSE would not work well with, for ex, a sigmoid

    cache = (xh, f, f_sigm, i, i_sigm, g, g_tanh, o, o_sigm, c, c_tanh, c_old, h)
    return y, h, c, cache


def dlstm(params, dy, dh_next, dc_next, cache):

    Wf, Wi, Wg, Wo, Wy = params['Wf'], params['Wi'], params['Wg'], params['Wo'], params['Wy']
    bf, bi, bg, bo, by = params['bf'], params['bi'], params['bg'], params['bo'], params['by']

    xh, f, f_sigm, i, i_sigm, g, g_tanh, o, o_sigm, c, c_tanh, c_old, h = cache

    dby = dy.copy()
    dWy = np.outer(h, dy)
    dh = np.dot(dy, Wy.T) + dh_next.copy()
    do = c_tanh * dh * o_sigm * (1 - o_sigm)
    dc = dc_next.copy() + o_sigm * dh * (1 - c_tanh ** 2) # TODO: copy?
    dg = i_sigm * dc * (1 - g_tanh ** 2)
    di = g_tanh * dc * i_sigm * (1 - i_sigm)
    df = c_old  * dc * f_sigm * (1 - f_sigm) # ERROR FIXED: ??? c_old -> c?, c->c_old?

    dWo = np.outer(xh, do); dbo = do; dXo = np.dot(do, Wo.T)
    dWg = np.outer(xh, dg); dbg = dg; dXg = np.dot(dg, Wg.T)
    dWi = np.outer(xh, di); dbi = di; dXi = np.dot(di, Wi.T)
    dWf = np.outer(xh, df); dbf = df; dXf = np.dot(df, Wf.T)

    dX = dXo + dXg + dXi + dXf
    dh_next = dX[-h.size:]
    dc_next = f_sigm * dc

    dparams = dict(Wf = dWf, Wi = dWi, Wg = dWg, Wo = dWo, Wy = dWy,
                   bf = dbf, bi = dbi, bg = dbg, bo = dbo, by = dby)

    return dparams, dh_next, dc_next


def lstm_loss(params, h, c, inputs, targets):
    loss = 0
    outputs = []
    caches = []
    for inp, target in zip(inputs, targets):
        y, h, c, cache = lstm(params, inp, h, c)
        loss += np.mean((y - target) ** 2)
        outputs.append(y)
        caches.append(cache)
    loss = loss # / inputs.shape[0]
    return y, outputs, loss, h, c, caches

def dlstm_loss(params, inputs, targets, outputs, caches):
    h_shape = caches[0][-1].shape
    dparams = {k:np.zeros_like(v) for k, v in params.items()}
    dh = np.zeros(h_shape)
    dc = np.zeros(h_shape)

    for inp, out, target, cache in reversed(list(zip(inputs, outputs, targets, caches))):
        dy = 2 * (out - target)
        dps, dh, dc = dlstm(params, dy, dh, dc, cache)
        for dpk, dpv in dps.items():
            dparams[dpk] += dpv
    return  dparams


# ----------
# setup

x_n = 1
h_n = 5
o_n = 1

params = dict(
    Wf = np.random.normal(size=(x_n + h_n, h_n)),
    Wi = np.random.normal(size=(x_n + h_n, h_n)),
    Wg = np.random.normal(size=(x_n + h_n, h_n)),
    Wo = np.random.normal(size=(x_n + h_n, h_n)),
    Wy = np.random.normal(size=(h_n, o_n)),
    bf = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
    bi = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
    bg = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
    bo = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
    by = np.zeros(o_n) + np.random.normal(size=o_n) * 0.1,
)

for name in ['Wf', 'Wi', 'Wg', 'Wo', 'Wy']:
    W = params[name]
    W *= np.sqrt(2 / (W.shape[0] + W.shape[1])) # Xavier initialization
for name in params:
    params[name] = params[name].astype('float64')


# ----------
# Sanity check, learn sin wave

def test_sin():
    emaloss = 1 # EMA average
    emak = 0.99

    for t in range(5000):
        data = np.sin(np.linspace(0, 3 * np.pi, 30))
        start = np.random.randint(0, data.size // 4)
        end = np.random.randint((data.size * 3) // 4, data.size)
        inputs = data[start:end, None]
        targets = np.roll(inputs, 1, axis=0)


        h_n = params['Wf'].shape[1] # TODO: h & c should be passed in
        h = np.random.normal(size=h_n)
        c = np.random.normal(size=h_n)

        y, outputs, loss, h, c, caches = lstm_loss(params, h, c, inputs, targets)
        dparams = dlstm_loss(params, inputs, targets, outputs, caches)

        for k in params.keys():
            params[k] -= dparams[k] * 0.01


        emaloss = emaloss * emak + loss * (1 - emak)
        if t % 100 == 0:
            print('%.4f' % emaloss)
test_sin()

# ----------
data = np.sin(np.linspace(0, 4 * np.pi, 90))
start = np.random.randint(0, data.size // 4)
end = np.random.randint((data.size * 3) // 4, data.size)
inputs = data[start:end, None]
targets = np.roll(inputs, 1, axis=0)

for inp, targ in zip(inputs, targets):
    assert(check_grad(params, inputs, targets, lstm_loss, dlstm_loss, delta=1e-5, tolerance=1e-7, num_checks=10))
print('grads are ok') # <- i never reach here

最佳答案

解决了!在我的 check_grad 中，我需要构建用于 df_analytical 的 caches，但在这样做时，我也覆盖了 h 和 c 应该是 np.zeroes。

y, outputs, loss, h, c, caches = f(params, h, c, inputs, targets)

_, _, loss_minus, _, _, _ = f(params, h, c, inputs, targets)
p.flat[pix] = old_val

因此，只需不覆盖 h 和 c 即可修复它，并且 LSTM 代码正常。

_, outputs, loss, _, _, caches = f(params, h, c, inputs, targets)

关于python - 我的 LSTM 学习，损失减少，但数值梯度与分析梯度不匹配，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/54282206/

python LSTM params np 39 machine-learning recurrent-neural-network

有关python - 我的 LSTM 学习，损失减少，但数值梯度与分析梯度不匹配的更多相关文章

python - 如何使用 Ruby 或 Python 创建一系列高音调和低音调的蜂鸣声？ - 2
关闭。这个问题是opinion-based.它目前不接受答案。想要改进这个问题？更新问题，以便editingthispost可以用事实和引用来回答它.关闭4年前。Improvethisquestion我想在固定时间创建一系列低音和高音调的哔哔声。例如:在150毫秒时发出高音调的蜂鸣声在151毫秒时发出低音调的蜂鸣声200毫秒时发出低音调的蜂鸣声250毫秒的高音调蜂鸣声有没有办法在Ruby或Python中做到这一点？我真的不在乎输出编码是什么(.wav、.mp3、.ogg等等)，但我确实想创建一个输出文件。
ruby 正则表达式 - 如何替换字符串中匹配项的第 n 个实例 - 2
在我的应用程序中，我需要能够找到所有数字子字符串，然后扫描每个子字符串，找到第一个匹配范围(例如5到15之间)的子字符串，并将该实例替换为另一个字符串“X”。我的测试字符串s="1foo100bar10gee1"我的初始模式是1个或多个数字的任何字符串，例如，re=Regexp.new(/\d+/)matches=s.scan(re)给出["1","100","10","1"]如果我想用“X”替换第N个匹配项，并且只替换第N个匹配项，我该怎么做？例如，如果我想替换第三个匹配项“10”(匹配项[2])，我不能只说s[matches[2]]="X"因为它做了两次替换“1fooX0barXg
ruby - 匹配未转义的平衡定界符对 - 2
如何匹配未被反斜杠转义的平衡定界符对(其本身未被反斜杠转义)(无需考虑嵌套)？例如对于反引号，我试过了，但是转义的反引号没有像转义那样工作。regex=/(?!$1:"how\\"#expected"how\\`are"上面的正则表达式不考虑由反斜杠转义并位于反引号前面的反斜杠，但我愿意考虑。StackOverflow如何做到这一点？这样做的目的并不复杂。我有文档文本，其中包括内联代码的反引号，就像StackOverflow一样，我想在HTML文件中显示它，内联代码用一些spanMaterial装饰。不会有嵌套，但转义反引号或转义反斜杠可能出现在任何地方。
ruby-on-rails - 如何在我的 Rails 应用程序 View 中打印 ruby 变量的内容？ - 2
我是一个Rails初学者，但我想从我的RailsView(html.haml文件)中查看Ruby变量的内容。我试图在ruby中打印出变量(认为它会在终端中出现)，但没有得到任何结果。有什么建议吗？我知道Rails调试器，但更喜欢使用inspect来打印我的变量。最佳答案您可以在View中使用puts方法将信息输出到服务器控制台。您应该能够在View中的任何位置使用Haml执行以下操作:-puts@my_variable.inspect 关于ruby-on-rails-如何在我的R
ruby - 匹配大写字母并用后续字母填充，直到一定的字符串长度 - 2
我有一个驼峰式字符串，例如:JustAString。我想按照以下规则形成长度为4的字符串:抓取所有大写字母；如果超过4个大写字母，只保留前4个；如果少于4个大写字母，则将最后大写字母后的字母大写并添加字母，直到长度变为4。以下是可能发生的3种情况:ThisIsMyString将产生TIMS(大写字母)；ThisIsOneVeryLongString将产生TIOV(前4个大写字母)；MyString将生成MSTR(大写字母+tr大写)。我设法用这个片段解决了前两种情况:str.scan(/[A-Z]/).first(4).join但是，我不太确定如何最好地修改上面的代码片段以处理最后一种
ruby - 我可以将我的 README.textile 以正确的格式放入我的 RDoc 中吗？ - 2
我喜欢使用Textile或Markdown为我的项目编写自述文件，但是当我生成RDoc时，自述文件被解释为RDoc并且看起来非常糟糕。有没有办法让RDoc通过RedCloth或BlueCloth而不是它自己的格式化程序运行文件？它可以配置为自动检测文件后缀的格式吗？(例如README.textile通过RedCloth运行，但README.mdown通过BlueCloth运行) 最佳答案使用YARD直接代替RDoc将允许您包含Textile或Markdown文件，只要它们的文件后缀是合理的。我经常使用类似于以下Rake任务的东西:
jquery - 我的 jquery AJAX POST 请求无需发送 Authenticity Token (Rails) - 2
rails中是否有任何规定允许站点的所有AJAXPOST请求在没有authenticity_token的情况下通过？我有一个调用Controller方法的JqueryPOSTajax调用，但我没有在其中放置任何真实性代码，但调用成功。我的ApplicationController确实有'request_forgery_protection'并且我已经改变了config.action_controller.consider_all_requests_local在我的environments/development.rb中为false我还搜索了我的代码以确保我没有重载ajaxSend来发送
ruby-on-rails - Rails 3，嵌套资源，没有路由匹配 [PUT] - 2
我真的为这个而疯狂。我一直在搜索答案并尝试我找到的所有内容，包括相关问题和stackoverflow上的答案，但仍然无法正常工作。我正在使用嵌套资源，但无法使表单正常工作。我总是遇到错误，例如没有路线匹配[PUT]"/galleries/1/photos"表格在这里:/galleries/1/photos/1/edit路线.rbresources:galleriesdoresources:photosendresources:galleriesresources:photos照片Controller.rbdefnew@gallery=Gallery.find(params[:galle
Python 相当于 Perl/Ruby ||= - 2
这个问题在这里已经有了答案:关闭10年前。PossibleDuplicate:Pythonconditionalassignmentoperator对于这样一个简单的问题表示歉意，但是谷歌搜索||=并不是很有帮助；)Python中是否有与Ruby和Perl中的||=语句等效的语句？例如:foo="hey"foo||="what"#assignfooifit'sundefined#fooisstill"hey"bar||="yeah"#baris"yeah"另外，类似这样的东西的通用术语是什么？条件分配是我的第一个猜测，但Wikipediapage跟我想的不太一样。
java - 我的模型类或其他类中应该有逻辑吗 - 2
我只想对我一直在思考的这个问题有其他意见，例如我有classuser_controller和classuserclassUserattr_accessor:name,:usernameendclassUserController//dosomethingaboutanythingaboutusersend问题是我的User类中是否应该有逻辑user=User.newuser.do_something(user1)oritshouldbeuser_controller=UserController.newuser_controller.do_something(user1,user2)我

python - 我的 LSTM 学习，损失减少，但数值梯度与分析梯度不匹配

有关python - 我的 LSTM 学习，损失减少，但数值梯度与分析梯度不匹配的更多相关文章

随机推荐