import numpy as np
import torch
import torch.nn as nn

num_episodes = 200        # number of episodes to train
batch_size = 64           # batch size for sampling from replay
gamma = 0.99              # discount factor
tau = 0.005               # target network update rate (tau)
exploration_noise = 0.1   # stddev for Gaussian exploration noise

for episode in range(num_episodes):
    state, _ = env.reset()
    state = state.astype(np.float32)
    episode_reward = 0.0

    for step in range(500):  # max steps per episode (Pendulum is typically truncated at 200)
        # Select action according to current policy + exploration noise
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action = actor(state_tensor).cpu().numpy()[0]
        # Add exploration noise (Gaussian)
        action = action + np.random.normal(0, exploration_noise * max_action, size=action_dim)
        action = np.clip(action, -max_action, max_action)

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        next_state = next_state.astype(np.float32)

        # Store transition in replay buffer
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        # Train the networks once we have enough samples in the replay buffer
        if len(buffer) >= batch_size:
            # Sample a batch
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            # Compute target Q values using the target networks
            with torch.no_grad():
                # Target actor picks the next action
                next_actions = target_actor(next_states)
                target_Q = target_critic(next_states, next_actions)
                # If done (terminal), there is no future reward; use the (1 - done) mask
                target_Q = rewards + gamma * (1 - dones) * target_Q

            # Critic loss = MSE between current Q and target Q
            current_Q = critic(states, actions)
            critic_loss = nn.MSELoss()(current_Q, target_Q)

            # Update critic
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Actor loss = -mean(Q): we want to maximize Q, so we minimize -Q
            actor_actions = actor(states)
            actor_loss = -critic(states, actor_actions).mean()

            # Update actor
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Soft update the target networks
            for param, target_param in zip(critic.parameters(), target_critic.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
            for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        if done:
            break  # episode ends

    # Log the cumulative reward of the episode
    print(f"Episode {episode+1}: Reward = {episode_reward:.2f}")
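# Note: the loop above assumes a replay buffer exposing add(), sample(), and __len__(),
# with sample() returning torch tensors whose rewards/dones columns broadcast against
# the critic's [batch, 1] output. If such a buffer is not already defined earlier, a
# minimal uniform-sampling sketch could look like the following (the class name,
# capacity, and tensor shapes are assumptions for illustration, not the exact
# implementation used here):
import random


class ReplayBuffer:
    def __init__(self, capacity=100_000):
        self.capacity = capacity
        self.storage = []
        self.position = 0  # index of the next slot to overwrite once full

    def add(self, state, action, reward, next_state, done):
        transition = (state, action, reward, next_state, float(done))
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.position] = transition  # overwrite oldest transitions
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniformly sample transitions and stack them into batched torch tensors
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return (
            torch.as_tensor(states, dtype=torch.float32),
            torch.as_tensor(actions, dtype=torch.float32),
            torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1),
            torch.as_tensor(next_states, dtype=torch.float32),
            torch.as_tensor(dones, dtype=torch.float32).unsqueeze(1),
        )

    def __len__(self):
        return len(self.storage)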