In [None]:
import gym
import random
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim


In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output head.

    Args:
        state_size: Number of input features per observation.
        action_size: Number of discrete actions; the network emits one Q-value
            per action.
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that used to be hard-coded, so existing two-argument callers get
            the same architecture.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # The attribute names fc1/fc2/fc3 double as state_dict keys, so they
        # are kept stable for load_state_dict() between networks.
        self.fc1 = nn.Linear(state_size, hidden_size)    # first hidden layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)   # second hidden layer
        self.fc3 = nn.Linear(hidden_size, action_size)   # one Q-value per action

    def forward(self, x):
        """Return the Q-values of all actions for input `x`.

        The last dimension of `x` must equal `state_size`; the last dimension
        of the result equals `action_size`.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)  # raw Q-values, no output activation


In [None]:
# Seed the Python, NumPy and PyTorch RNGs so that exploration (random.random /
# random.choice), minibatch sampling (random.sample) and network weight
# initialisation repeat across Restart-and-Run-All.
# NOTE: the environment's own RNG is not seeded here, so initial states can
# still differ between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]  # 4 features
action_size = env.action_space.n             # 2 actions (left/right)

# DQN Hyperparameters
gamma = 0.99             # Discount factor applied to the bootstrapped next-state value
epsilon = 1.0            # Initial exploration rate (probability of a random action)
epsilon_min = 0.01       # Threshold below which epsilon is no longer decayed
epsilon_decay = 0.995    # Multiplicative epsilon decay applied once per episode
learning_rate = 0.001    # Adam step size
batch_size = 64          # Transitions sampled per replay() update
memory_size = 10000      # Maximum number of transitions kept in the replay buffer


  deprecation(
  deprecation(


In [None]:
# Replay buffer: a bounded deque that starts empty. Once it holds
# `memory_size` transitions, each append discards the oldest entry.
memory = deque([], maxlen=memory_size)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# policy_net is the network being trained; target_net is a lagged copy used
# only to compute bootstrap targets.
policy_net = DQN(state_size, action_size).to(device)
target_net = DQN(state_size, action_size).to(device)
target_net.load_state_dict(policy_net.state_dict())  # Start from identical weights

# eval() only switches layer behaviour (dropout/batch-norm style layers); it
# does NOT disable gradient tracking. Freeze the parameters explicitly so the
# target network never accumulates gradients. load_state_dict() still
# overwrites frozen parameters, so periodic weight syncing keeps working.
target_net.eval()
for target_param in target_net.parameters():
    target_param.requires_grad_(False)

# Only the policy network's parameters are optimised.
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()


In [None]:
def get_action(state, epsilon):
    """Pick an action epsilon-greedily.

    With probability `epsilon` a uniformly random action index in
    `range(action_size)` is returned. Otherwise the index of the largest
    Q-value that `policy_net` predicts for `state` is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(range(action_size))

    # Greedy branch: add a leading batch dimension and move the observation
    # to the device the policy network lives on.
    observation = torch.FloatTensor(state).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd graph needed
        best_action = policy_net(observation).argmax()
    return best_action.item()


In [None]:
def replay():
    """Run one gradient step on `policy_net` from a random minibatch of `memory`.

    Returns immediately (without training) while `memory` holds fewer than
    `batch_size` transitions. Each stored transition is a
    `(state, action, reward, next_state, done)` tuple.

    The regression target for the taken action is
    `reward + gamma * max_a Q_target(next_state, a)`, where the bootstrap
    term is multiplied by `(1 - done)` so it vanishes for transitions flagged
    as done.
    """
    if len(memory) < batch_size:
        return  # Don't train until we have enough samples

    minibatch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Stack every field into a single ndarray before handing it to torch.
    # Building a tensor directly from a tuple of ndarrays is the slow path
    # PyTorch warns about; explicit dtypes also avoid relying on the legacy
    # FloatTensor/LongTensor constructors.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=device)
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=device)
    # Column vectors of shape (batch_size, 1) so they line up with gather().
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=device).unsqueeze(1)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=device).unsqueeze(1)
    dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=device).unsqueeze(1)

    # Predicted Q-values for the actions that were actually taken.
    current_q = policy_net(states).gather(1, actions)

    # Target Q-values: r + gamma * max(Q_target(next_state)).
    # Computed without autograd so no gradient flows into the target network.
    with torch.no_grad():
        next_q = target_net(next_states).max(1)[0].unsqueeze(1)
        target_q = rewards + gamma * next_q * (1 - dones)

    # Compute loss and update the policy network's weights.
    loss = loss_fn(current_q, target_q)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
episodes = 500            # Number of training episodes
max_steps = 500           # Per-episode step cap for the inner loop
target_update_freq = 10   # Sync target_net with policy_net every N episodes

for episode in range(episodes):
    reset_result = env.reset()
    # Newer gym returns (observation, info); older gym returns the observation alone.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0

    for _step in range(max_steps):
        action = get_action(state, epsilon)
        step_result = env.step(action)

        if len(step_result) == 5:
            # Newer API: termination and time-limit truncation are reported separately.
            next_state, reward, terminated, truncated, _info = step_result
            done = terminated or truncated
            terminal = bool(terminated)
        else:
            # Older API: a single `done` flag. The TimeLimit wrapper marks a
            # time-limit cutoff with info["TimeLimit.truncated"].
            next_state, reward, done, info = step_result
            terminal = bool(done) and not info.get("TimeLimit.truncated", False)

        # Store the true-termination flag rather than `done`: a time-limit
        # truncation is not a terminal state, so replay() must still
        # bootstrap from next_state for such transitions.
        memory.append((state, action, reward, next_state, terminal))
        state = next_state
        total_reward += reward

        replay()
        if done:
            break

    # Decay exploration once per episode, never dropping below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Periodically copy the trained weights into the target network.
    if episode % target_update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")



Episode 0, Total Reward: 20.0, Epsilon: 0.995
Episode 1, Total Reward: 17.0, Epsilon: 0.990
Episode 2, Total Reward: 37.0, Epsilon: 0.985
Episode 3, Total Reward: 16.0, Epsilon: 0.980
Episode 4, Total Reward: 12.0, Epsilon: 0.975


  states = torch.FloatTensor(states).to(device)


Episode 5, Total Reward: 16.0, Epsilon: 0.970
Episode 6, Total Reward: 29.0, Epsilon: 0.966
Episode 7, Total Reward: 19.0, Epsilon: 0.961
Episode 8, Total Reward: 27.0, Epsilon: 0.956
Episode 9, Total Reward: 22.0, Epsilon: 0.951
Episode 10, Total Reward: 18.0, Epsilon: 0.946
Episode 11, Total Reward: 12.0, Epsilon: 0.942
Episode 12, Total Reward: 13.0, Epsilon: 0.937
Episode 13, Total Reward: 17.0, Epsilon: 0.932
Episode 14, Total Reward: 14.0, Epsilon: 0.928
Episode 15, Total Reward: 16.0, Epsilon: 0.923
Episode 16, Total Reward: 12.0, Epsilon: 0.918
Episode 17, Total Reward: 11.0, Epsilon: 0.914
Episode 18, Total Reward: 19.0, Epsilon: 0.909
Episode 19, Total Reward: 24.0, Epsilon: 0.905
Episode 20, Total Reward: 14.0, Epsilon: 0.900
Episode 21, Total Reward: 24.0, Epsilon: 0.896
Episode 22, Total Reward: 20.0, Epsilon: 0.891
Episode 23, Total Reward: 23.0, Epsilon: 0.887
Episode 24, Total Reward: 28.0, Epsilon: 0.882
Episode 25, Total Reward: 12.0, Epsilon: 0.878
Episode 26, Total 