import numpy as np
import torch
import torch.nn as nn

num_episodes = 200        # number of episodes to train
batch_size = 64           # batch size for sampling from replay
gamma = 0.99              # discount factor
tau = 0.005               # target network update rate (tau)
exploration_noise = 0.1   # stddev for Gaussian exploration noise

for episode in range(num_episodes):
    state, _ = env.reset()
    state = state.astype(np.float32)
    episode_reward = 0.0

    for step in range(500):  # max steps per episode (Pendulum is typically truncated at 200)
        # Select action according to current policy + exploration noise
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = actor(state_tensor).cpu().numpy()[0]
        # Add exploration noise (Gaussian)
        action = action + np.random.normal(0, exploration_noise * max_action, size=action_dim)
        action = np.clip(action, -max_action, max_action)

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        next_state = next_state.astype(np.float32)

        # Store transition in replay buffer
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        # Train the networks once we have enough samples in the replay buffer
        if len(buffer) >= batch_size:
            # Sample a batch
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            # Compute target Q values using the target networks
            with torch.no_grad():
                # Target actor picks the next action
                next_actions = target_actor(next_states)
                target_Q = target_critic(next_states, next_actions)
                # If done (terminal), there is no future reward; use the (1 - done) mask
                target_Q = rewards + gamma * (1 - dones) * target_Q

            # Critic loss = MSE between current Q and target Q
            current_Q = critic(states, actions)
            critic_loss = nn.MSELoss()(current_Q, target_Q)

            # Update critic
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Actor loss = -mean(Q): we want to maximize Q, so we minimize -Q
            actor_actions = actor(states)
            actor_loss = -critic(states, actor_actions).mean()

            # Update actor
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Soft update the target networks
            for param, target_param in zip(critic.parameters(), target_critic.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
            for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        if done:
            break  # episode ends

    # Log the cumulative reward of the episode
    print(f"Episode {episode+1}: Reward = {episode_reward:.2f}")
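# Note: the loop above assumes a replay buffer exposing add(), sample(), and __len__(),
# with sample() returning torch tensors whose rewards/dones columns broadcast against
# the critic's [batch, 1] output. If such a buffer is not already defined earlier, a
# minimal uniform-sampling sketch could look like the following (the class name,
# capacity, and tensor shapes are assumptions for illustration, not the exact
# implementation used here):
import random


class ReplayBuffer:
    def __init__(self, capacity=100_000):
        self.capacity = capacity
        self.storage = []
        self.position = 0  # index of the next slot to overwrite once full

    def add(self, state, action, reward, next_state, done):
        transition = (state, action, reward, next_state, float(done))
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.position] = transition  # overwrite oldest transitions
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniformly sample transitions and stack them into batched torch tensors
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return (
            torch.as_tensor(states, dtype=torch.float32),
            torch.as_tensor(actions, dtype=torch.float32),
            torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1),
            torch.as_tensor(next_states, dtype=torch.float32),
            torch.as_tensor(dones, dtype=torch.float32).unsqueeze(1),
        )

    def __len__(self):
        return len(self.storage)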