This post presents a work-in-progress DQN / Policy Gradient implementation ("digging a hole now, to be filled in later"); hopefully it offers some reference value to developers who run into the same programming problems.
Problems
1. Backpropagation does not work (the loss cannot be backpropagated).
2. Should the computed loss be summed, or summed and averaged, over the episode?
import torch.nn as nn
import torch.nn.functional as F
import torch
import gym
import numpy as np
import torch.optim as optim
import random
import collections
from torch.distributions import Categorical


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.l1 = nn.Linear(N_STATES, 32)
        self.l2 = nn.Linear(32, N_ACTIONS)

    def forward(self, x):
        out = self.l1(x)
        out = F.relu(out)
        out = self.l2(out)
        out = F.relu(out)
        return F.softmax(out, dim=-1)


def calc_future_reward(reward_list):
    # in-place discounted return: G_t = r_t + gamma * G_{t+1}
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += gamma * reward_list[i + 1]
    return reward_list


env = gym.make('CartPole-v0')
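A quick sanity check on calc_future_reward, which overwrites reward_list in place with the discounted return G_t = r_t + gamma * G_{t+1}. The numbers below are only an illustration of mine (gamma = 0.9 matches the value set further down):

gamma = 0.9
rewards = [1.0, 1.0, 1.0]
# G_2 = 1, G_1 = 1 + 0.9 * 1 = 1.9, G_0 = 1 + 0.9 * 1.9 = 2.71
print(calc_future_reward(rewards))  # [2.71, 1.9, 1.0] (up to float rounding)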
env = env.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
print(N_ACTIONS)
print(N_STATES)

lr = 0.01
gamma = 0.9
greedy = 1
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
print('\nCollecting experience...')
for i_episode in range(1):
    s = env.reset()
    step_cnt = 0
    episode = []
    a_list = []
    s_list = []
    r_list = []
    prob_list = []
    while True:
        step_cnt += 1
        env.render()
        prob = policy.forward(torch.FloatTensor(s))
        print(prob)
        # .data.numpy() drops the autograd history of prob (see problem 1)
        prob_list.append(list(prob.data.numpy()))
        m = Categorical(prob)
        action = m.sample()
        a = action.item()
        s_, r, done, _ = env.step(a)
        s_list.append(s)
        a_list.append(a)
        r_list.append(r)
        if done:
            s_tensor = torch.FloatTensor(s_list)
            a_tensor = torch.LongTensor(a_list).view(-1, 1)
            G_list = calc_future_reward(r_list)
            G_tensor = torch.FloatTensor(G_list).view(-1, 1)
            # rebuilt from numpy, so prob_tensor is detached from the network
            prob_tensor = torch.FloatTensor(prob_list)
            one_hot = torch.zeros(step_cnt, N_ACTIONS).scatter_(1, a_tensor, 1)
            loss = -1.0 * (prob_tensor.log() * one_hot).sum(dim=1).view(-1, 1) * G_tensor
            optimizer.zero_grad()
            print(loss)
            # problem 2: sum vs. average over the episode
            mess_loss = torch.cat((loss)).sum() / step_cnt
            mess_loss.backward()
            optimizer.step()
            break
        s = s_
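On problem 1: prob.data.numpy() followed by torch.FloatTensor(prob_list) rebuilds the probabilities as fresh tensors with no grad history, so the loss above is detached from the network and backward() has nothing to propagate into policy. Below is a minimal sketch of the usual REINFORCE-style fix: keep m.log_prob(action) on the graph during the episode and stack the log-probs at the end. The variable names are my own, and it reuses policy, optimizer, env, gamma and calc_future_reward as defined above.

log_prob_list = []
r_list = []
s = env.reset()
while True:
    prob = policy(torch.FloatTensor(s))
    m = Categorical(prob)
    action = m.sample()
    log_prob_list.append(m.log_prob(action))  # stays attached to the autograd graph
    s_, r, done, _ = env.step(action.item())
    r_list.append(r)
    s = s_
    if done:
        break

G_tensor = torch.FloatTensor(calc_future_reward(r_list))
log_prob_tensor = torch.stack(log_prob_list)
loss = -(log_prob_tensor * G_tensor).sum()  # .mean() only rescales the gradient

optimizer.zero_grad()
loss.backward()
optimizer.step()

On problem 2: summing versus averaging over the episode only rescales the gradient by 1/step_cnt, so either works as long as the learning rate is chosen accordingly; torch.cat((loss)) is unnecessary either way, since loss.sum() or loss.mean() already gives a scalar.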
That concludes this work-in-progress post on DQN / Policy Gradient; hopefully the notes above are of some help to fellow developers.