import torch
import torch.nn as nn
import torch.optim as optim

# Initialize the Gymnasium environment
# (TCLabEnv, Actor, Critic, and ReplayBuffer are assumed to be defined earlier)
env = TCLabEnv(setpoint=50)

# Online and target networks; the targets start as copies of the online networks
actor = Actor(state_dim=1, action_dim=1, max_action=100)
critic = Critic(state_dim=1, action_dim=1)
target_actor = Actor(state_dim=1, action_dim=1, max_action=100)
target_critic = Critic(state_dim=1, action_dim=1)
target_actor.load_state_dict(actor.state_dict())
target_critic.load_state_dict(critic.state_dict())

actor_optimizer = optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

buffer = ReplayBuffer()
gamma = 0.99  # discount factor
tau = 0.005   # soft-update rate for the target networks

for episode in range(100):
    state, _ = env.reset()
    episode_reward = 0.0

    for step in range(200):
        # Select an action with the current (deterministic) policy
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        action = actor(state_tensor).detach().cpu().numpy().flatten()[0]  # convert to a scalar

        # Gymnasium returns separate terminated/truncated flags
        next_state, reward, terminated, truncated, _ = env.step([action])  # wrap action in a list
        done = terminated or truncated

        buffer.add(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        # Update once the buffer holds more than one batch of transitions
        if len(buffer) > 64:
            # (assumes ReplayBuffer.sample returns batched tensors)
            states, actions, rewards, next_states, dones = buffer.sample(64)

            # Critic update: regress Q(s, a) onto the Bellman target
            with torch.no_grad():
                next_actions = target_actor(next_states)
                target_q = target_critic(next_states, next_actions)
                target_q = rewards + gamma * (1 - dones) * target_q
            current_q = critic(states, actions)
            critic_loss = nn.MSELoss()(current_q, target_q)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Actor update: maximize the critic's value of the actor's actions
            actor_loss = -critic(states, actor(states)).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Soft (Polyak) update of the target networks
            for param, target_param in zip(critic.parameters(), target_critic.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
            for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        if done:
            break

    print(f"Episode {episode+1}: Reward = {episode_reward:.2f}")

env.close()
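As written, the loop always applies the actor's deterministic output, so the agent never explores. DDPG is normally trained with exploration noise added to the action before it is sent to the environment. The helper below is a minimal sketch of one common choice, zero-mean Gaussian noise clipped to the heater range; the function name select_action, the noise scale, and the 0-100 bounds are illustrative assumptions, not part of the code above.

import numpy as np
import torch

def select_action(actor, state, noise_scale=5.0, low=0.0, high=100.0):
    """Deterministic actor output plus Gaussian exploration noise,
    clipped to the assumed heater range [low, high]."""
    state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action = actor(state_tensor).detach().cpu().numpy().flatten()[0]
    return float(np.clip(action + np.random.normal(0.0, noise_scale), low, high))

Inside the step loop, action = select_action(actor, state) would replace the two action-selection lines; the noise scale is typically decayed as training progresses so the policy becomes greedier over time.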