Checking that the gradients of the original model are not zero when copy_initial_weights=False is passed to higher.innerloop_ctx.
def test_training_initial_weights():
    import torch
    import torch.optim as optim
    import torch.nn as nn
    import higher  # needed for higher.innerloop_ctx below
    from collections import OrderedDict

    ## training config
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    episodes = 5
    nb_inner_train_steps = 5

    ## get base model
    base_mdl = nn.Sequential(OrderedDict([
        ('fc', nn.Linear(1, 1, bias=False)),
        ('relu', nn.ReLU())
    ]))

    ## get outer optimizer (not differentiable nor trainable)
    outer_opt = optim.Adam(base_mdl.parameters(), lr=0.01)

    for episode in range(episodes):
        spt_x, spt_y, qry_x, qry_y = torch.randn(1), torch.randn(1), torch.randn(1), torch.randn(1)
        inner_opt = torch.optim.SGD(base_mdl.parameters(), lr=1e-1)
        with higher.innerloop_ctx(base_mdl, inner_opt, copy_initial_weights=False, track_higher_grads=False) as (fmodel, diffopt):
            for i_inner in range(nb_inner_train_steps):  # full gradient descent on the k-shot examples (k is usually small, e.g. 5)
                fmodel.train()
                # base/child model forward pass
                inner_loss = 0.5*((fmodel(spt_x) - spt_y))**2
                # inner-opt update
                diffopt.step(inner_loss)
            ## evaluate on the query set for the current task
            qry_loss = 0.5*((fmodel(qry_x) - qry_y))**2
            qry_loss.backward()  # backward inside the ctx, for memory-efficient computation
        ## outer update
        print(f'episode = {episode}')
        print(f'base_mdl.fc.weight.grad = {base_mdl.fc.weight.grad}')
        outer_opt.step()
        outer_opt.zero_grad()

if __name__ == '__main__':
    test_training_initial_weights()
    print('Done \a')
Output of the code:
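For context, the sharing behaviour this test exercises can be seen in isolation. The sketch below is an illustration based on higher's documented semantics, not part of the original gist; the helper name grad_of_original is made up for this example. With copy_initial_weights=True the patched module starts from copied weights that are not on the gradient tape, so no gradient reaches the original module; with copy_initial_weights=False the original parameters themselves are the initial weights, so an outer backward populates their .grad. Unlike the script above, it leaves track_higher_grads at its default of True so the inner step stays differentiable.

import torch
import torch.nn as nn
import higher

def grad_of_original(copy_initial_weights):
    # hypothetical helper: one differentiable inner step, then an outer backward
    mdl = nn.Linear(1, 1, bias=False)
    inner_opt = torch.optim.SGD(mdl.parameters(), lr=1e-1)
    # track_higher_grads defaults to True, so the inner update stays on the tape
    with higher.innerloop_ctx(mdl, inner_opt, copy_initial_weights=copy_initial_weights) as (fmodel, diffopt):
        inner_loss = 0.5*((fmodel(torch.randn(1)) - torch.randn(1)))**2
        diffopt.step(inner_loss)
        outer_loss = 0.5*((fmodel(torch.randn(1)) - torch.randn(1)))**2
        outer_loss.backward()
    return mdl.weight.grad

print(grad_of_original(copy_initial_weights=True))   # expected: None (initial weights copied off the tape)
print(grad_of_original(copy_initial_weights=False))  # expected: a (generally) non-zero tensor

If this expectation holds, it also explains the main script's print: with copy_initial_weights=False the original base_mdl.fc.weight is on the tape, which is what the test checks episode by episode.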