Examples
In this section, you will find a variety of examples that demonstrate how to use this library to solve reinforcement learning tasks. With the knowledge and skills you gain from working through these examples, you will be well on your way to solving your own reinforcement learning problems.
Table of Contents
Note
It is recommended to use the Table of Contents in the sidebar or in this section for a better browsing experience.
Gym/Gymnasium
Gym/Gymnasium environments
These examples train one agent in a Gym/Gymnasium environment (one agent, one environment).
The following components or practices are exemplified (highlighted):
Load and wrap a Gym environment: Pendulum (DDPG), CartPole (CEM)
Recurrent neural network models (RNN, GRU, LSTM): PendulumNoVel (DDPG)
Instantiate models using the model instantiation utility: CartPole (DQN)
Create a tabular model (\(\epsilon\)-greedy policy): Taxi (SARSA), FrozenLake (Q-Learning)
Load a checkpoint during evaluation: Pendulum (DDPG), CartPole (CEM), CartPole (DQN), Taxi (SARSA), FrozenLake (Q-Learning)
Benchmark results are listed in Benchmark results #32 (Gym/Gymnasium).
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class Actor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.action_layer(x)), {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("Pendulum-v1")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0]
print("Pendulum-v1 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device)
models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["discount_factor"] = 0.98
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 1000
cfg_ddpg["learning_starts"] = 1000
# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 75
cfg_ddpg["experiment"]["checkpoint_interval"] = 750
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by the newly run experiments.
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined.
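For reference, a minimal sketch (with a hypothetical directory pattern, and assuming the agent has already been instantiated as in the evaluation script below) that selects the most recent DDPG run and loads its final checkpoint:
import glob
import os
run_dir = sorted(glob.glob("./runs/*_DDPG"))[-1]  # run directory names start with a timestamp, so they sort chronologically
agent_ddpg.load(os.path.join(run_dir, "checkpoints", "agent_15000.pt"))  # final checkpoint of a 15000-timestep run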
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define only the policy for evaluation
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("Pendulum-v1")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0]
print("Pendulum-v1 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's policy.
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["random_timesteps"] = 0
# logging to TensorBoard each 300 timesteps and ignore checkpoints
cfg_ddpg["experiment"]["write_interval"] = 300
cfg_ddpg["experiment"]["checkpoint_interval"] = 0
agent_ddpg = DDPG(models=models_ddpg,
memory=None,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint
agent_ddpg.load("./runs/22-09-10_11-02-46-773796_DDPG/checkpoints/agent_15000.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# evaluate the agent
trainer.eval()
Note
The examples use a wrapper around the original environment to mask the velocity component in the observation. The intention is to make the MDP partially observable and to showcase the capabilities of recurrent neural networks (a brief numeric illustration follows this note).
More examples with other algorithms can be found in the repository's documentation examples folder and in the benchmark results linked above.
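As a quick illustration of the masking (the observation values below are arbitrary), two Pendulum states that differ only in angular velocity produce identical observations once the velocity component is zeroed out, so a memoryless policy cannot distinguish them:
import numpy as np
mask = np.array([1, 1, 0])                    # keep x and y, zero out the angular velocity
obs_slow = np.array([0.5, 0.87, 0.1]) * mask  # arbitrary state with low angular velocity
obs_fast = np.array([0.5, 0.87, 7.9]) * mask  # same position, high angular velocity
assert np.allclose(obs_slow, obs_fast)        # indistinguishable without memory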
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class Actor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hout
self.sequence_length = sequence_length
self.rnn = nn.RNN(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states = inputs["rnn"][0]
# training
if self.training:
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
# get the hidden states corresponding to the initial sequence
sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states)
hidden_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, hidden_states = self.rnn(rnn_input, hidden_states)
# rollout
else:
rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1
rnn_output, hidden_states = self.rnn(rnn_input, hidden_states)
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(rnn_output))
x = F.relu(self.linear_layer_2(x))
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hout
self.sequence_length = sequence_length
self.rnn = nn.RNN(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states = inputs["rnn"][0]
# critic is only used during training
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
# get the hidden states corresponding to the initial sequence
sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, hidden_states = self.rnn(rnn_input[:,i0:i1,:], hidden_states)
hidden_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, hidden_states = self.rnn(rnn_input, hidden_states)
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {"rnn": [hidden_states]}
# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py)
class NoVelocityWrapper(gym.ObservationWrapper):
def observation(self, observation):
# observation: x, y, angular velocity
return observation * np.array([1, 1, 0])
gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1")))
# Load and wrap the Gym environment
env = gym.make("PendulumNoVel-v1")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["discount_factor"] = 0.98
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 1000
# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 75
cfg_ddpg["experiment"]["checkpoint_interval"] = 750
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class Actor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hout
self.sequence_length = sequence_length
self.gru = nn.GRU(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states = inputs["rnn"][0]
# training
if self.training:
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
# get the hidden states corresponding to the initial sequence
sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states)
hidden_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, hidden_states = self.gru(rnn_input, hidden_states)
# rollout
else:
rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1
rnn_output, hidden_states = self.gru(rnn_input, hidden_states)
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(rnn_output))
x = F.relu(self.linear_layer_2(x))
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.action_layer(x)), {"rnn": [hidden_states]}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hout
self.sequence_length = sequence_length
self.gru = nn.GRU(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size)]}} # hidden states (D ∗ num_layers, N, Hout)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states = inputs["rnn"][0]
# critic is only used during training
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
# get the hidden states corresponding to the initial sequence
sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, hidden_states = self.gru(rnn_input[:,i0:i1,:], hidden_states)
hidden_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, hidden_states = self.gru(rnn_input, hidden_states)
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {"rnn": [hidden_states]}
# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py)
class NoVelocityWrapper(gym.ObservationWrapper):
def observation(self, observation):
# observation: x, y, angular velocity
return observation * np.array([1, 1, 0])
gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1")))
# Load and wrap the Gym environment
env = gym.make("PendulumNoVel-v1")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["discount_factor"] = 0.98
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 1000
# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 75
cfg_ddpg["experiment"]["checkpoint_interval"] = 750
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class Actor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0)
self.sequence_length = sequence_length
self.lstm = nn.LSTM(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout)
(self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1]
# training
if self.training:
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell)
# get the hidden/cell states corresponding to the initial sequence
sequence_index = 1 if role == "target_policy" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states))
hidden_states[:, (terminated[:,i1-1]), :] = 0
cell_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_states = (hidden_states, cell_states)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states))
# rollout
else:
rnn_input = states.view(-1, 1, states.shape[-1]) # (N, L, Hin): N=num_envs, L=1
rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states))
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(rnn_output))
x = F.relu(self.linear_layer_2(x))
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.action_layer(x)), {"rnn": [rnn_states[0], rnn_states[1]]}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
num_envs=1, num_layers=1, hidden_size=400, sequence_length=20):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.num_envs = num_envs
self.num_layers = num_layers
self.hidden_size = hidden_size # Hcell (Hout is Hcell because proj_size = 0)
self.sequence_length = sequence_length
self.lstm = nn.LSTM(input_size=self.num_observations,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
batch_first=True) # batch_first -> (batch, sequence, features)
self.linear_layer_1 = nn.Linear(self.hidden_size + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def get_specification(self):
# batch size (N) is the number of envs
return {"rnn": {"sequence_length": self.sequence_length,
"sizes": [(self.num_layers, self.num_envs, self.hidden_size), # hidden states (D ∗ num_layers, N, Hout)
(self.num_layers, self.num_envs, self.hidden_size)]}} # cell states (D ∗ num_layers, N, Hcell)
def compute(self, inputs, role):
states = inputs["states"]
terminated = inputs.get("terminated", None)
hidden_states, cell_states = inputs["rnn"][0], inputs["rnn"][1]
# critic is only used during training
rnn_input = states.view(-1, self.sequence_length, states.shape[-1]) # (N, L, Hin): N=batch_size, L=sequence_length
hidden_states = hidden_states.view(self.num_layers, -1, self.sequence_length, hidden_states.shape[-1]) # (D * num_layers, N, L, Hout)
cell_states = cell_states.view(self.num_layers, -1, self.sequence_length, cell_states.shape[-1]) # (D * num_layers, N, L, Hcell)
# get the hidden/cell states corresponding to the initial sequence
sequence_index = 1 if role == "target_critic" else 0 # target networks act on the next state of the environment
hidden_states = hidden_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hout)
cell_states = cell_states[:,:,sequence_index,:].contiguous() # (D * num_layers, N, Hcell)
# reset the RNN state in the middle of a sequence
if terminated is not None and torch.any(terminated):
rnn_outputs = []
terminated = terminated.view(-1, self.sequence_length)
indexes = [0] + (terminated[:,:-1].any(dim=0).nonzero(as_tuple=True)[0] + 1).tolist() + [self.sequence_length]
for i in range(len(indexes) - 1):
i0, i1 = indexes[i], indexes[i + 1]
rnn_output, (hidden_states, cell_states) = self.lstm(rnn_input[:,i0:i1,:], (hidden_states, cell_states))
hidden_states[:, (terminated[:,i1-1]), :] = 0
cell_states[:, (terminated[:,i1-1]), :] = 0
rnn_outputs.append(rnn_output)
rnn_states = (hidden_states, cell_states)
rnn_output = torch.cat(rnn_outputs, dim=1)
# no need to reset the RNN state in the sequence
else:
rnn_output, rnn_states = self.lstm(rnn_input, (hidden_states, cell_states))
# flatten the RNN output
rnn_output = torch.flatten(rnn_output, start_dim=0, end_dim=1) # (N, L, D ∗ Hout) -> (N * L, D ∗ Hout)
x = F.relu(self.linear_layer_1(torch.cat([rnn_output, inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {"rnn": [rnn_states[0], rnn_states[1]]}
# Gym environment observation wrapper used to mask velocity. Adapted from rl_zoo3 (rl_zoo3/wrappers.py)
class NoVelocityWrapper(gym.ObservationWrapper):
def observation(self, observation):
# observation: x, y, angular velocity
return observation * np.array([1, 1, 0])
gym.envs.registration.register(id="PendulumNoVel-v1", entry_point=lambda: NoVelocityWrapper(gym.make("Pendulum-v1")))
# Load and wrap the Gym environment
env = gym.make("PendulumNoVel-v1")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device, num_envs=env.num_envs)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["discount_factor"] = 0.98
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 1000
# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 75
cfg_ddpg["experiment"]["checkpoint_interval"] = 750
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
import gym
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, CategoricalMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (categorical model) for the CEM agent using mixin
# - Policy: takes as input the environment's observation/state and returns an action
class Policy(CategoricalMixin, Model):
def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True):
Model.__init__(self, observation_space, action_space, device)
CategoricalMixin.__init__(self, unnormalized_log_prob)
self.linear_layer_1 = nn.Linear(self.num_observations, 64)
self.linear_layer_2 = nn.Linear(64, 64)
self.output_layer = nn.Linear(64, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return self.output_layer(x), {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("CartPole-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0]
print("CartPole-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=1000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's model (function approximator).
# CEM requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models
models_cem = {}
models_cem["policy"] = Policy(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_cem.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters
cfg_cem = CEM_DEFAULT_CONFIG.copy()
cfg_cem["rollouts"] = 1000
cfg_cem["learning_starts"] = 100
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_cem["experiment"]["write_interval"] = 1000
cfg_cem["experiment"]["checkpoint_interval"] = 5000
agent_cem = CEM(models=models_cem,
memory=memory,
cfg=cfg_cem,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by the newly run experiments.
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined.
import gym
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, CategoricalMixin
from skrl.agents.torch.cem import CEM, CEM_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (categorical model) for the CEM agent using mixin
# - Policy: takes as input the environment's observation/state and returns an action
class Policy(CategoricalMixin, Model):
def __init__(self, observation_space, action_space, device, unnormalized_log_prob=True):
Model.__init__(self, observation_space, action_space, device)
CategoricalMixin.__init__(self, unnormalized_log_prob)
self.linear_layer_1 = nn.Linear(self.num_observations, 64)
self.linear_layer_2 = nn.Linear(64, 64)
self.output_layer = nn.Linear(64, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return self.output_layer(x), {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("CartPole-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0]
print("CartPole-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's model (function approximator).
# CEM requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#spaces-and-models
models_cem = {}
models_cem["policy"] = Policy(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.cem.html#configuration-and-hyperparameters
cfg_cem = CEM_DEFAULT_CONFIG.copy()
cfg_cem["rollouts"] = 1000
cfg_cem["learning_starts"] = 100
# logging to TensorBoard each 1000 timesteps and ignore checkpoints
cfg_cem["experiment"]["write_interval"] = 1000
cfg_cem["experiment"]["checkpoint_interval"] = 0
agent_cem = CEM(models=models_cem,
memory=None,
cfg=cfg_cem,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint
agent_cem.load("./runs/22-09-07_21-41-05-854385_CEM/checkpoints/best_agent.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(env=env, agents=[agent_cem], cfg=cfg_trainer)
# evaluate the agent
trainer.eval()
import gym
# Import the skrl components to build the RL system
from skrl.utils.model_instantiators import deterministic_model, Shape
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("CartPole-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0]
print("CartPole-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=50000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators) using the model instantiator utility
# DQN requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models
models_dqn = {}
models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space,
action_space=env.action_space,
device=device,
clip_actions=False,
input_shape=Shape.OBSERVATIONS,
hiddens=[64, 64],
hidden_activation=["relu", "relu"],
output_shape=Shape.ACTIONS,
output_activation=None,
output_scale=1.0)
models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space,
action_space=env.action_space,
device=device,
clip_actions=False,
input_shape=Shape.OBSERVATIONS,
hiddens=[64, 64],
hidden_activation=["relu", "relu"],
output_shape=Shape.ACTIONS,
output_activation=None,
output_scale=1.0)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_dqn.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters
cfg_dqn = DQN_DEFAULT_CONFIG.copy()
cfg_dqn["learning_starts"] = 100
cfg_dqn["exploration"]["final_epsilon"] = 0.04
cfg_dqn["exploration"]["timesteps"] = 1500
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_dqn["experiment"]["write_interval"] = 1000
cfg_dqn["experiment"]["checkpoint_interval"] = 5000
agent_dqn = DQN(models=models_dqn,
memory=memory,
cfg=cfg_dqn,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by the newly run experiments.
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined.
import gym
# Import the skrl components to build the RL system
from skrl.utils.model_instantiators import deterministic_model, Shape
from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("CartPole-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0]
print("CartPole-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate only the policy for evaluation.
# DQN requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models
models_dqn = {}
models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space,
action_space=env.action_space,
device=device,
clip_actions=False,
input_shape=Shape.OBSERVATIONS,
hiddens=[64, 64],
hidden_activation=["relu", "relu"],
output_shape=Shape.ACTIONS,
output_activation=None,
output_scale=1.0)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters
cfg_dqn = DQN_DEFAULT_CONFIG.copy()
cfg_dqn["exploration"]["timesteps"] = 0
# logging to TensorBoard each 1000 timesteps and ignore checkpoints
cfg_dqn["experiment"]["write_interval"] = 1000
cfg_dqn["experiment"]["checkpoint_interval"] = 0
agent_dqn = DQN(models=models_dqn,
memory=None,
cfg=cfg_dqn,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint
agent_dqn.load("./runs/22-09-10_10-48-10-551426_DQN/checkpoints/best_agent.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn)
# evaluate the agent
trainer.eval()
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the SARSA agent using mixin
class EpilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("Taxi-v3")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Taxi-v")][0]
print("Taxi-v3 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# SARSA requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models
models_sarsa = {}
models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters
cfg_sarsa = SARSA_DEFAULT_CONFIG.copy()
cfg_sarsa["discount_factor"] = 0.999
cfg_sarsa["alpha"] = 0.4
# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively
cfg_sarsa["experiment"]["write_interval"] = 1600
cfg_sarsa["experiment"]["checkpoint_interval"] = 8000
agent_sarsa = SARSA(models=models_sarsa,
memory=None,
cfg=cfg_sarsa,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by the newly run experiments.
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined.
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the SARSA agent using a helper class
class EpilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("Taxi-v3")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Taxi-v")][0]
print("Taxi-v3 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# SARSA requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models
models_sarsa = {}
models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters
cfg_sarsa = SARSA_DEFAULT_CONFIG.copy()
cfg_sarsa["random_timesteps"] = 0
# logging to TensorBoard each 1600 timesteps and ignore checkpoints
cfg_sarsa["experiment"]["write_interval"] = 1600
cfg_sarsa["experiment"]["checkpoint_interval"] = 0
agent_sarsa = SARSA(models=models_sarsa,
memory=None,
cfg=cfg_sarsa,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint
agent_sarsa.load("./runs/22-09-10_13-13-41-011999_SARSA/checkpoints/agent_80000.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa)
# evaluate the agent
trainer.eval()
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the Q-Learning agent using mixin
class EpilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("FrozenLake-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("FrozenLake-v")][0]
print("FrozenLake-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# Q-learning requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models
models_q_learning = {}
models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters
cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy()
cfg_q_learning["discount_factor"] = 0.999
cfg_q_learning["alpha"] = 0.4
# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively
cfg_q_learning["experiment"]["write_interval"] = 1600
cfg_q_learning["experiment"]["checkpoint_interval"] = 8000
agent_q_learning = Q_LEARNING(models=models_q_learning,
memory=None,
cfg=cfg_q_learning,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by the newly run experiments.
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined.
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the Q-learning agent using mixin
class EpsilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.make("FrozenLake-v0")
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("FrozenLake-v")][0]
print("FrozenLake-v0 not found. Trying {}".format(env_id))
env = gym.make(env_id)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# Q-learning requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models
models_q_learning = {}
models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters
cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy()
cfg_q_learning["random_timesteps"] = 0
# logging to TensorBoard each 1600 timesteps and ignore checkpoints
cfg_q_learning["experiment"]["write_interval"] = 1600
cfg_q_learning["experiment"]["checkpoint_interval"] = 0
agent_q_learning = Q_LEARNING(models=models_q_learning,
memory=None,
cfg=cfg_q_learning,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint
agent_q_learning.load("./runs/22-09-10_17-54-20-381109_Q_LEARNING/checkpoints/best_agent.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning)
# evaluate the agent
trainer.eval()
Gym/Gymnasium vectorized environments
These examples perform the training of one agent in a Gym/Gymnasium vectorized environment (one agent, multiple independent copies of the same environment in parallel)
The following components or practices are exemplified (highlighted):
Load and wrap a Gym vectorized environment: Pendulum (DDPG), CartPole (DQN), Taxi (SARSA), FrozenLake (Q-Learning)
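As a minimal sketch (assuming the environment id below is available in the installed gym version), a vectorized environment is created with gym.vector.make and then wrapped exactly like a single environment; the wrapper exposes num_envs and device, which the memory and agent use:
import gym
from skrl.envs.torch import wrap_env
# create 10 independent copies of the same environment (synchronous vectorization)
env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False)
env = wrap_env(env)
print(env.num_envs)  # 10, e.g. for RandomMemory(..., num_envs=env.num_envs)
print(env.device)    # torch device shared by the agent's models and memory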
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2
class DeterministicCritic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.vector.make("Pendulum-v1", num_envs=10, asynchronous=False)
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Pendulum-v")][0]
print("Pendulum-v1 not found. Trying {}".format(env_id))
env = gym.vector.make(env_id, num_envs=10, asynchronous=False)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 100
cfg_ddpg["learning_starts"] = 100
# logging to TensorBoard and write checkpoints each 1000 and 1000 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 1000
cfg_ddpg["experiment"]["checkpoint_interval"] = 1000
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
import gym
# Import the skrl components to build the RL system
from skrl.utils.model_instantiators import deterministic_model, Shape
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.vector.make("CartPole-v0", num_envs=5, asynchronous=False)
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("CartPole-v")][0]
print("CartPole-v0 not found. Trying {}".format(env_id))
env = gym.vector.make(env_id, num_envs=5, asynchronous=False)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=200000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators) using the model instantiator utility
# DQN requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models
models_dqn = {}
models_dqn["q_network"] = deterministic_model(observation_space=env.observation_space,
action_space=env.action_space,
device=device,
clip_actions=False,
input_shape=Shape.OBSERVATIONS,
hiddens=[64, 64],
hidden_activation=["relu", "relu"],
output_shape=Shape.ACTIONS,
output_activation=None,
output_scale=1.0)
models_dqn["target_q_network"] = deterministic_model(observation_space=env.observation_space,
action_space=env.action_space,
device=device,
clip_actions=False,
input_shape=Shape.OBSERVATIONS,
hiddens=[64, 64],
hidden_activation=["relu", "relu"],
output_shape=Shape.ACTIONS,
output_activation=None,
output_scale=1.0)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_dqn.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters
cfg_dqn = DQN_DEFAULT_CONFIG.copy()
cfg_dqn["learning_starts"] = 100
cfg_dqn["exploration"]["final_epsilon"] = 0.04
cfg_dqn["exploration"]["timesteps"] = 1500
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_dqn["experiment"]["write_interval"] = 1000
cfg_dqn["experiment"]["checkpoint_interval"] = 5000
agent_dqn = DQN(models=models_dqn,
memory=memory,
cfg=cfg_dqn,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn)
# start training
trainer.train()
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.sarsa import SARSA, SARSA_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the SARSA agent using mixin
class EpsilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.vector.make("Taxi-v3", num_envs=10, asynchronous=False)
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("Taxi-v")][0]
print("Taxi-v3 not found. Trying {}".format(env_id))
env = gym.vector.make(env_id, num_envs=10, asynchronous=False)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# SARSA requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#spaces-and-models
models_sarsa = {}
models_sarsa["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sarsa.html#configuration-and-hyperparameters
cfg_sarsa = SARSA_DEFAULT_CONFIG.copy()
cfg_sarsa["discount_factor"] = 0.999
cfg_sarsa["alpha"] = 0.4
# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively
cfg_sarsa["experiment"]["write_interval"] = 1600
cfg_sarsa["experiment"]["checkpoint_interval"] = 8000
agent_sarsa = SARSA(models=models_sarsa,
memory=None,
cfg=cfg_sarsa,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sarsa)
# start training
trainer.train()
import gym
import torch
# Import the skrl components to build the RL system
from skrl.models.torch import Model, TabularMixin
from skrl.agents.torch.q_learning import Q_LEARNING, Q_LEARNING_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (tabular model) for the Q-learning agent using mixin
class EpsilonGreedyPolicy(TabularMixin, Model):
def __init__(self, observation_space, action_space, device, num_envs=1, epsilon=0.1):
Model.__init__(self, observation_space, action_space, device)
TabularMixin.__init__(self, num_envs)
self.epsilon = epsilon
self.q_table = torch.ones((num_envs, self.num_observations, self.num_actions),
dtype=torch.float32, device=self.device)
def compute(self, inputs, role):
actions = torch.argmax(self.q_table[torch.arange(self.num_envs).view(-1, 1), inputs["states"]],
dim=-1, keepdim=True).view(-1,1)
# choose random actions for exploration according to epsilon
indexes = (torch.rand(inputs["states"].shape[0], device=self.device) < self.epsilon).nonzero().view(-1)
if indexes.numel():
actions[indexes] = torch.randint(self.num_actions, (indexes.numel(), 1), device=self.device)
return actions, {}
# Load and wrap the Gym environment.
# Note: the environment version may change depending on the gym version
try:
env = gym.vector.make("FrozenLake-v0", num_envs=10, asynchronous=False)
except gym.error.DeprecatedEnv as e:
env_id = [spec.id for spec in gym.envs.registry.all() if spec.id.startswith("FrozenLake-v")][0]
print("FrozenLake-v0 not found. Trying {}".format(env_id))
env = gym.vector.make(env_id, num_envs=10, asynchronous=False)
env = wrap_env(env)
device = env.device
# Instantiate the agent's models (table)
# Q-learning requires 1 model, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#spaces-and-models
models_q_learning = {}
models_q_learning["policy"] = EpilonGreedyPolicy(env.observation_space, env.action_space, device, num_envs=env.num_envs, epsilon=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.q_learning.html#configuration-and-hyperparameters
cfg_q_learning = Q_LEARNING_DEFAULT_CONFIG.copy()
cfg_q_learning["discount_factor"] = 0.999
cfg_q_learning["alpha"] = 0.4
# logging to TensorBoard and write checkpoints each 1600 and 8000 timesteps respectively
cfg_q_learning["experiment"]["write_interval"] = 1600
cfg_q_learning["experiment"]["checkpoint_interval"] = 8000
agent_q_learning = Q_LEARNING(models=models_q_learning,
memory=None,
cfg=cfg_q_learning,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_q_learning)
# start training
trainer.train()
Farama Shimmy (converted environments)
The following examples show the training in several popular environments (Atari, DeepMind Control and OpenAI Gym) that have been converted to the Gymnasium API using the Shimmy (API conversion tool) package
Note
On the skrl side, no extra implementation is necessary, since the library fully supports the Gymnasium API
Note
Because the Gymnasium API requires that the rendering mode be specified when the environment is initialized, setting the headless option in the trainer configuration is not enough to render the environment. In this case, it is necessary to call the gymnasium.make function with render_mode="human" or any other supported option
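As a minimal sketch (the environment id is only illustrative), rendering is requested when the environment is created rather than through the trainer:
import gymnasium as gym
from skrl.envs.torch import wrap_env
# the rendering mode must be requested at environment creation time
env = gym.make("ALE/Pong-v5", render_mode="human")
env = wrap_env(env)
# the trainer's "headless" flag does not control rendering for Gymnasium environments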
import gymnasium as gym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.dqn import DQN, DQN_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the model (deterministic models) for the DQN agent using mixin
class QNetwork(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
nn.ReLU(),
nn.Linear(64, 64),
nn.ReLU(),
nn.Linear(64, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the environment
env = gym.make("ALE/Pong-v5")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=15000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DQN requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#spaces-and-models
models = {}
models["q_network"] = QNetwork(env.observation_space, env.action_space, device)
models["target_q_network"] = QNetwork(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.dqn.html#configuration-and-hyperparameters
cfg_agent = DQN_DEFAULT_CONFIG.copy()
cfg_agent["learning_starts"] = 100
cfg_agent["exploration"]["initial_epsilon"] = 1.0
cfg_agent["exploration"]["final_epsilon"] = 0.04
cfg_agent["exploration"]["timesteps"] = 1500
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_agent["experiment"]["write_interval"] = 1000
cfg_agent["experiment"]["checkpoint_interval"] = 5000
agent_dqn = DQN(models=models,
memory=memory,
cfg=cfg_agent,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_dqn)
# start training
trainer.train()
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class Actor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.action_layer(x)), {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {}
# Load and wrap the environment
env = gym.make("dm_control/acrobot-swingup_sparse-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=20000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = Actor(env.observation_space, env.action_space, device)
models_ddpg["target_policy"] = Actor(env.observation_space, env.action_space, device)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["discount_factor"] = 0.98
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 1000
cfg_ddpg["learning_starts"] = 1000
# logging to TensorBoard and write checkpoints each 75 and 750 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 75
cfg_ddpg["experiment"]["checkpoint_interval"] = 750
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixin
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return 2 * torch.tanh(self.action_layer(x)), {} # Pendulum-v1 action_space is -2 to 2
class DeterministicCritic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations + self.num_actions, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.linear_layer_3 = nn.Linear(300, 1)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)))
x = F.relu(self.linear_layer_2(x))
return self.linear_layer_3(x), {}
# Load and wrap the Gymnasium environment.
env = gym.make("GymV26Environment-v0", env_id="Pendulum-v1")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=15000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 100
cfg_ddpg["learning_starts"] = 100
# logging to TensorBoard and write checkpoints each 300 and 1500 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 300
cfg_ddpg["experiment"]["checkpoint_interval"] = 1500
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 15000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
Other supported APIs
DeepMind environments
These examples perform the training of one agent in a DeepMind environment (one agent, one environment)
The following components or practices are exemplified (highlighted):
Load and wrap a DeepMind environment: cartpole (DDPG)
Map the observation/state space (flat tensor) to the original environment space to be used by the model: reach_site_vision (SAC)
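The snippet below is a minimal sketch of that mapping (the layer size and dictionary keys are illustrative, taken from the reach_site_vision observation spec shown further down): inside a model's compute method, the flat states tensor is converted back to the original dictionary space with tensor_to_space:
import torch
import torch.nn as nn
from skrl.models.torch import Model, DeterministicMixin

class FlatToDictCritic(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)
        # 18 = 12 (jaco_arm/joints_pos: 1x6x2) + 6 (jaco_arm/joints_vel: 1x6), assumed shapes
        self.net = nn.Linear(18 + self.num_actions, 1)
    def compute(self, inputs, role):
        # map the flat tensor (batch_size, size_of_flat_space) back to the original Dict space
        space = self.tensor_to_space(inputs["states"], self.observation_space)
        joints_pos = space["jaco_arm/joints_pos"].view(inputs["states"].shape[0], -1)
        joints_vel = space["jaco_arm/joints_vel"].view(inputs["states"].shape[0], -1)
        return self.net(torch.cat([joints_pos, joints_vel, inputs["taken_actions"]], dim=-1)), {}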
dm_suite_cartpole_swingup_ddpg.py
from dm_control import suite
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.resources.noises.torch import OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the DDPG agent using mixins
# and programming with two approaches (torch functional and torch.nn.Sequential class).
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return torch.tanh(self.action_layer(x)), {}
class DeterministicCritic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400),
nn.ReLU(),
nn.Linear(400, 300),
nn.ReLU(),
nn.Linear(300, 1))
def compute(self, inputs, role):
return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}
# Load and wrap the DeepMind environment
env = suite.load(domain_name="cartpole", task_name="swingup")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=25000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = DeterministicCritic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=1.0, device=device)
cfg_ddpg["batch_size"] = 100
cfg_ddpg["random_timesteps"] = 100
cfg_ddpg["learning_starts"] = 100
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 1000
cfg_ddpg["experiment"]["checkpoint_interval"] = 5000
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_ddpg)
# start training
trainer.train()
from dm_control import manipulation
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (stochastic and deterministic models) for the SAC agent using the mixins.
# - StochasticActor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=2, stride=1),
nn.ReLU(),
nn.Flatten(),
nn.Linear(7744, 512),
nn.ReLU(),
nn.Linear(512, 8),
nn.Tanh())
self.net = nn.Sequential(nn.Linear(26, 32),
nn.ReLU(),
nn.Linear(32, 32),
nn.ReLU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
states = inputs["states"]
# The dm_control.manipulation tasks have as observation/state spec a `collections.OrderedDict` object as follows:
# OrderedDict([('front_close', BoundedArray(shape=(1, 84, 84, 3), dtype=dtype('uint8'), name='front_close', minimum=0, maximum=255)),
# ('jaco_arm/joints_pos', Array(shape=(1, 6, 2), dtype=dtype('float64'), name='jaco_arm/joints_pos')),
# ('jaco_arm/joints_torque', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_torque')),
# ('jaco_arm/joints_vel', Array(shape=(1, 6), dtype=dtype('float64'), name='jaco_arm/joints_vel')),
# ('jaco_arm/jaco_hand/joints_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_pos')),
# ('jaco_arm/jaco_hand/joints_vel', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/joints_vel')),
# ('jaco_arm/jaco_hand/pinch_site_pos', Array(shape=(1, 3), dtype=dtype('float64'), name='jaco_arm/jaco_hand/pinch_site_pos')),
# ('jaco_arm/jaco_hand/pinch_site_rmat', Array(shape=(1, 9), dtype=dtype('float64'), name='jaco_arm/jaco_hand/pinch_site_rmat'))])
# This spec is converted to a `gym.spaces.Dict` space by the `wrap_env` function as follows:
# Dict(front_close: Box(0, 255, (1, 84, 84, 3), uint8),
# jaco_arm/jaco_hand/joints_pos: Box(-inf, inf, (1, 3), float64),
# jaco_arm/jaco_hand/joints_vel: Box(-inf, inf, (1, 3), float64),
# jaco_arm/jaco_hand/pinch_site_pos: Box(-inf, inf, (1, 3), float64),
# jaco_arm/jaco_hand/pinch_site_rmat: Box(-inf, inf, (1, 9), float64),
# jaco_arm/joints_pos: Box(-inf, inf, (1, 6, 2), float64),
# jaco_arm/joints_torque: Box(-inf, inf, (1, 6), float64),
# jaco_arm/joints_vel: Box(-inf, inf, (1, 6), float64))
# The `spaces` parameter is a flat tensor of the flattened observation/state space with shape (batch_size, size_of_flat_space).
# Using the model's method `tensor_to_space` we can convert the flattened tensor to the original space.
# https://skrl.readthedocs.io/en/latest/modules/skrl.models.base_class.html#skrl.models.torch.base.Model.tensor_to_space
space = self.tensor_to_space(states, self.observation_space)
# For this case, the `space` variable is a Python dictionary with the following structure and shapes:
# {'front_close': torch.Tensor(shape=[batch_size, 1, 84, 84, 3], dtype=torch.float32),
# 'jaco_arm/jaco_hand/joints_pos': torch.Tensor(shape=[batch_size, 1, 3], dtype=torch.float32)
# 'jaco_arm/jaco_hand/joints_vel': torch.Tensor(shape=[batch_size, 1, 3], dtype=torch.float32)
# 'jaco_arm/jaco_hand/pinch_site_pos': torch.Tensor(shape=[batch_size, 1, 3], dtype=torch.float32)
# 'jaco_arm/jaco_hand/pinch_site_rmat': torch.Tensor(shape=[batch_size, 1, 9], dtype=torch.float32)
# 'jaco_arm/joints_pos': torch.Tensor(shape=[batch_size, 1, 6, 2], dtype=torch.float32)
# 'jaco_arm/joints_torque': torch.Tensor(shape=[batch_size, 1, 6], dtype=torch.float32)
# 'jaco_arm/joints_vel': torch.Tensor(shape=[batch_size, 1, 6], dtype=torch.float32)}
# permute and normalize the images (samples, width, height, channels) -> (samples, channels, width, height)
features = self.features_extractor(space['front_close'][:,0].permute(0, 3, 1, 2) / 255.0)
mean_actions = torch.tanh(self.net(torch.cat([features,
space["jaco_arm/joints_pos"].view(states.shape[0], -1),
space["jaco_arm/joints_vel"].view(states.shape[0], -1)], dim=-1)))
return mean_actions, self.log_std_parameter, {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.features_extractor = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=3),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=2, stride=1),
nn.ReLU(),
nn.Flatten(),
nn.Linear(7744, 512),
nn.ReLU(),
nn.Linear(512, 8),
nn.Tanh())
self.net = nn.Sequential(nn.Linear(26 + self.num_actions, 32),
nn.ReLU(),
nn.Linear(32, 32),
nn.ReLU(),
nn.Linear(32, 1))
def compute(self, inputs, role):
states = inputs["states"]
# map the observations/states to the original space.
# See the explanation above (StochasticActor.compute)
space = self.tensor_to_space(states, self.observation_space)
# permute and normalize the images (samples, width, height, channels) -> (samples, channels, width, height)
features = self.features_extractor(space['front_close'][:,0].permute(0, 3, 1, 2) / 255.0)
return self.net(torch.cat([features,
space["jaco_arm/joints_pos"].view(states.shape[0], -1),
space["jaco_arm/joints_vel"].view(states.shape[0], -1),
inputs["taken_actions"]], dim=-1)), {}
# Load and wrap the DeepMind environment
env = manipulation.load("reach_site_vision")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=50000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_sac.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 256
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 10000
cfg_sac["learn_entropy"] = True
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 1000
cfg_sac["experiment"]["checkpoint_interval"] = 5000
agent_sac = SAC(models=models_sac,
memory=memory,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent_sac)
# start training
trainer.train()
Robosuite environments
These examples perform the training of one agent in a robosuite environment (one agent, one environment)
The following components or practices are exemplified (highlighted):
Load and wrap a robosuite environment: TwoArmLift (TD3)
td3_robosuite_two_arm_lift.py
(not tuned)
import robosuite
from robosuite.controllers import load_controller_config
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import the skrl components to build the RL system
from skrl.models.torch import Model, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.resources.noises.torch import GaussianNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (deterministic models) for the TD3 agent using mixins
# and programming with two approaches (torch functional and torch.nn.Sequential class).
# - Actor (policy): takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.linear_layer_1 = nn.Linear(self.num_observations, 400)
self.linear_layer_2 = nn.Linear(400, 300)
self.action_layer = nn.Linear(300, self.num_actions)
def compute(self, inputs, role):
x = F.relu(self.linear_layer_1(inputs["states"]))
x = F.relu(self.linear_layer_2(x))
return torch.tanh(self.action_layer(x)), {}
class DeterministicCritic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 400),
nn.ReLU(),
nn.Linear(400, 300),
nn.ReLU(),
nn.Linear(300, 1))
def compute(self, inputs, role):
return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}
# Load and wrap the robosuite environment
controller_config = load_controller_config(default_controller="OSC_POSE")
env = robosuite.make("TwoArmLift",
robots=["Sawyer", "Panda"], # load a Sawyer robot and a Panda robot
gripper_types="default", # use default grippers per robot arm
controller_configs=controller_config, # each arm is controlled using OSC
env_configuration="single-arm-opposed", # (two-arm envs only) arms face each other
has_renderer=True, # on-screen rendering
render_camera="frontview", # visualize the "frontview" camera
has_offscreen_renderer=False, # no off-screen rendering
control_freq=20, # 20 hz control for applied actions
horizon=200, # each episode terminates after 200 steps
use_object_obs=True, # provide object observations to agent
use_camera_obs=False, # don't provide image observations to agent
reward_shaping=True) # use a dense reward signal for learning
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (without replacement) as experience replay memory
memory = RandomMemory(memory_size=25000, num_envs=env.num_envs, device=device, replacement=False)
# Instantiate the agent's models (function approximators).
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models = {}
models["policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device)
models["critic_1"] = DeterministicCritic(env.observation_space, env.action_space, device)
models["critic_2"] = DeterministicCritic(env.observation_space, env.action_space, device)
models["target_critic_1"] = DeterministicCritic(env.observation_space, env.action_space, device)
models["target_critic_2"] = DeterministicCritic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_agent = TD3_DEFAULT_CONFIG.copy()
cfg_agent["exploration"]["noise"] = GaussianNoise(0, 0.1, device=device)
cfg_agent["smooth_regularization_noise"] = GaussianNoise(0, 0.2, device=device)
cfg_agent["smooth_regularization_clip"] = 0.5
cfg_agent["batch_size"] = 100
cfg_agent["random_timesteps"] = 100
cfg_agent["learning_starts"] = 100
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_agent["experiment"]["write_interval"] = 1000
cfg_agent["experiment"]["checkpoint_interval"] = 5000
agent = TD3(models=models,
memory=memory,
cfg=cfg_agent,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 50000, "headless": False}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
Isaac Gym preview
Isaac Gym environments
These examples perform the training of an agent in the Isaac Gym environments (one agent, multiple environments)
The following components or practices are exemplified (highlighted):
Load an Isaac Gym environment (easy-to-use API from NVIDIA): AllegroHand, Ingenuity
Load and wrap an Isaac Gym environment: Ant, Anymal
Set an input preprocessor: AnymalTerrain, BallBalance
Set a random seed for reproducibility: Cartpole
Set a learning rate scheduler: FrankaCabinet, Humanoid
Define a reward shaping function: Quadcopter, ShadowHand, Trifinger
Access to environment-specific properties and methods: Humanoid (AMP)
Load a checkpoint during evaluation: Cartpole
The PPO agent configuration is mapped, as far as possible, from rl_games' A2C-PPO configuration for the Isaac Gym preview environments. Shared or separate models are used depending on the value of the network.separate variable. The following list shows the mapping between the two configurations (a minimal sketch applying this mapping follows the list):
# memory
memory_size = horizon_length
# agent
rollouts = horizon_length
learning_epochs = mini_epochs
mini_batches = horizon_length * num_actors / minibatch_size
discount_factor = gamma
lambda = tau
learning_rate = learning_rate
learning_rate_scheduler = skrl.resources.schedulers.torch.KLAdaptiveRL
learning_rate_scheduler_kwargs = {"kl_threshold": kl_threshold}
random_timesteps = 0
learning_starts = 0
grad_norm_clip = grad_norm
ratio_clip = e_clip
value_clip = e_clip
clip_predicted_values = clip_value
entropy_loss_scale = entropy_coef
value_loss_scale = 0.5 * critic_coef
kl_threshold = 0
rewards_shaper = lambda rewards, timestep, timesteps: rewards * scale_value
# trainer
timesteps = horizon_length * max_epochs
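As a minimal sketch (the rl_games values below are illustrative, as if read from a task's training YAML), the mapping translates directly into the skrl PPO configuration:
from skrl.agents.torch.ppo import PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL

# illustrative rl_games A2C-PPO values (hypothetical task configuration)
horizon_length, num_actors, minibatch_size = 16, 4096, 32768
mini_epochs, gamma, tau, learning_rate = 4, 0.99, 0.95, 3e-4
e_clip, grad_norm, critic_coef, kl_threshold, scale_value = 0.2, 1.0, 2.0, 0.008, 0.01

cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = horizon_length            # also used as the memory size
cfg_ppo["learning_epochs"] = mini_epochs
cfg_ppo["mini_batches"] = horizon_length * num_actors // minibatch_size
cfg_ppo["discount_factor"] = gamma
cfg_ppo["lambda"] = tau
cfg_ppo["learning_rate"] = learning_rate
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": kl_threshold}
cfg_ppo["grad_norm_clip"] = grad_norm
cfg_ppo["ratio_clip"] = e_clip
cfg_ppo["value_clip"] = e_clip
cfg_ppo["value_loss_scale"] = 0.5 * critic_coef
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * scale_value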
Benchmark results are listed in Benchmark results #32 (NVIDIA Isaac Gym)
Note
Isaac Gym environments implement functionality to get their configuration from the command line. Because of this feature, setting the headless option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: python script.py headless=True for Isaac Gym environments (preview 3 and preview 4), or python script.py --headless for Isaac Gym environments (preview 2)
import isaacgym
import isaacgymenvs
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.utils import set_seed
# set the seed for reproducibility
seed = set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA
env = isaacgymenvs.make(seed=seed,
task="AllegroHand",
num_envs=16384,
sim_device="cuda:0",
rl_device="cuda:0",
graphics_device_id=0,
headless=False)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 8 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.02}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 200
cfg_ppo["experiment"]["checkpoint_interval"] = 2000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 40000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Ant") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 4
cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 40 and 400 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Anymal") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=24, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 24 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 3 # 24 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 120 and 1200 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 120
cfg_ppo["experiment"]["checkpoint_interval"] = 1200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 24000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="AnymalTerrain") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=24, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Policy(env.observation_space, env.action_space, device)
models_ppo["value"] = Value(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 24 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 6 # 24 * 4096 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.001
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 180 and 1800 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 180
cfg_ppo["experiment"]["checkpoint_interval"] = 1800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 36000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU(),
nn.Linear(64, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="BallBalance") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 20 and 200 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 20
cfg_ppo["experiment"]["checkpoint_interval"] = 200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 4000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(32)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 16 and 80 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 80
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.trpo import TRPO, TRPO_DEFAULT_CONFIG
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# TRPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#spaces-and-models
models_trpo = {}
models_trpo["policy"] = Policy(env.observation_space, env.action_space, device)
models_trpo["value"] = Value(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.trpo.html#configuration-and-hyperparameters
cfg_trpo = TRPO_DEFAULT_CONFIG.copy()
cfg_trpo["rollouts"] = 16 # memory_size
cfg_trpo["learning_epochs"] = 8
cfg_trpo["mini_batches"] = 1
cfg_trpo["discount_factor"] = 0.99
cfg_trpo["lambda"] = 0.95
cfg_trpo["learning_rate"] = 3e-4
cfg_trpo["grad_norm_clip"] = 1.0
cfg_trpo["value_loss_scale"] = 2.0
cfg_trpo["state_preprocessor"] = RunningStandardScaler
cfg_trpo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_trpo["value_preprocessor"] = RunningStandardScaler
cfg_trpo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 16 and 80 timesteps respectively
cfg_trpo["experiment"]["write_interval"] = 16
cfg_trpo["experiment"]["checkpoint_interval"] = 80
agent = TRPO(models=models_trpo,
memory=memory,
cfg=cfg_trpo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="FrankaCabinet") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 120 and 1200 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 120
cfg_ppo["experiment"]["checkpoint_interval"] = 1200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 24000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 400),
nn.ELU(),
nn.Linear(400, 200),
nn.ELU(),
nn.Linear(200, 100),
nn.ELU())
self.mean_layer = nn.Linear(100, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(100, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Humanoid") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=32, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 32 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 32 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 160 and 1600 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 160
cfg_ppo["experiment"]["checkpoint_interval"] = 1600
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 32000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.amp import AMP, AMP_DEFAULT_CONFIG
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
# - Discriminator: differentiates between policy-generated behaviors and behaviors from the motion dataset
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 1024),
nn.ReLU(),
nn.Linear(1024, 512),
nn.ReLU(),
nn.Linear(512, self.num_actions))
# set a fixed log standard deviation for the policy
self.log_std_parameter = nn.Parameter(torch.full((self.num_actions,), fill_value=-2.9), requires_grad=False)
def compute(self, inputs, role):
return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 1024),
nn.ReLU(),
nn.Linear(1024, 512),
nn.ReLU(),
nn.Linear(512, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
class Discriminator(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 1024),
nn.ReLU(),
nn.Linear(1024, 512),
nn.ReLU(),
nn.Linear(512, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="HumanoidAMP") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# AMP requires 3 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#spaces-and-models
models_amp = {}
models_amp["policy"] = Policy(env.observation_space, env.action_space, device)
models_amp["value"] = Value(env.observation_space, env.action_space, device)
models_amp["discriminator"] = Discriminator(env.amp_observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.amp.html#configuration-and-hyperparameters
cfg_amp = AMP_DEFAULT_CONFIG.copy()
cfg_amp["rollouts"] = 16 # memory_size
cfg_amp["learning_epochs"] = 6
cfg_amp["mini_batches"] = 2 # 16 * 4096 / 32768
cfg_amp["discount_factor"] = 0.99
cfg_amp["lambda"] = 0.95
cfg_amp["learning_rate"] = 5e-5
cfg_amp["random_timesteps"] = 0
cfg_amp["learning_starts"] = 0
cfg_amp["grad_norm_clip"] = 0.0
cfg_amp["ratio_clip"] = 0.2
cfg_amp["value_clip"] = 0.2
cfg_amp["clip_predicted_values"] = False
cfg_amp["entropy_loss_scale"] = 0.0
cfg_amp["value_loss_scale"] = 2.5
cfg_amp["discriminator_loss_scale"] = 5.0
cfg_amp["amp_batch_size"] = 512
cfg_amp["task_reward_weight"] = 0.0
cfg_amp["style_reward_weight"] = 1.0
cfg_amp["discriminator_batch_size"] = 4096
cfg_amp["discriminator_reward_scale"] = 2
cfg_amp["discriminator_logit_regularization_scale"] = 0.05
cfg_amp["discriminator_gradient_penalty_scale"] = 5
cfg_amp["discriminator_weight_decay_scale"] = 0.0001
cfg_amp["state_preprocessor"] = RunningStandardScaler
cfg_amp["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_amp["value_preprocessor"] = RunningStandardScaler
cfg_amp["value_preprocessor_kwargs"] = {"size": 1, "device": device}
cfg_amp["amp_state_preprocessor"] = RunningStandardScaler
cfg_amp["amp_state_preprocessor_kwargs"] = {"size": env.amp_observation_space, "device": device}
# logging to TensorBoard and write checkpoints each 160 and 4000 timesteps respectively
cfg_amp["experiment"]["write_interval"] = 160
cfg_amp["experiment"]["checkpoint_interval"] = 4000
agent = AMP(models=models_amp,
memory=memory,
cfg=cfg_amp,
observation_space=env.observation_space,
action_space=env.action_space,
device=device,
amp_observation_space=env.amp_observation_space,
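# AMP-specific memories (assumed roles): the motion dataset stores reference motions and the reply buffer stores collected AMP observations, both used to train the discriminator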
motion_dataset=RandomMemory(memory_size=200000, device=device),
reply_buffer=RandomMemory(memory_size=1000000, device=device),
collect_reference_motions=lambda num_samples: env.fetch_amp_obs_demo(num_samples),
collect_observation=lambda: env.reset_done()[0]["obs"])
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 80000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import isaacgymenvs
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.utils import set_seed
# set the seed for reproducibility
seed = set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment using the easy-to-use API from NVIDIA
env = isaacgymenvs.make(seed=seed,
task="Ingenuity",
num_envs=4096,
sim_device="cuda:0",
rl_device="cuda:0",
graphics_device_id=0,
headless=True)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 40 and 400 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Quadcopter") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 8 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 8 * 8192 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 20 and 200 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 20
cfg_ppo["experiment"]["checkpoint_interval"] = 200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 4000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="ShadowHand") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 8 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 8 * 16384 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 200 and 2000 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 200
cfg_ppo["experiment"]["checkpoint_interval"] = 2000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 40000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Trifinger") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=8, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 8 # memory_size
cfg_ppo["learning_epochs"] = 4
cfg_ppo["mini_batches"] = 8 # 8 * 16384 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0.016
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 800 and 8000 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 800
cfg_ppo["experiment"]["checkpoint_interval"] = 8000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 160000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
Note: It is necessary to adjust the checkpoint path according to the directories generated by your own experiments
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance
can be safely ignored. They appear because, during evaluation, not all components (such as optimizers or models other than the policy) are defined
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate the agent's policy (only the policy is needed for evaluation).
# PPO requires 2 models during training, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["random_timesteps"] = 0
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
# logging to TensorBoard each 16 timesteps and ignore checkpoints
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 0
agent = PPO(models=models_ppo,
memory=None,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint (agent)
agent.load("./runs/22-09-12_18-56-10-110956_PPO/checkpoints/agent_1600.pt")
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# evaluate the agent
trainer.eval()
Isaac Gym environments (learning by scopes)
These examples perform the training of 3 agents by scopes in Isaac Gym’s Cartpole environment in the same run (multiple agents and environments)
Two versions are presented:
Simultaneous (sequential) training of agents that share the same memory and whose scopes are automatically selected to be as equal as possible
Simultaneous (sequential and parallel) training and evaluation of agents with local memories (no memory sharing) and whose scopes are manually specified and differ from each other (see the sketch after the list below)
The following components or practices are exemplified (highlighted):
Create a shared memory: Shared memory
Learning by scopes (automatically defined): Shared memory
Create non-shared memories: No shared memory
Learning by scopes (manually defined): No shared memory
Load a checkpoint during evaluation: Shared memory, No shared memory
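For orientation, the following minimal sketch shows how manually defined scopes could be passed to the trainer. The agent instances and scope sizes are illustrative assumptions; the complete scripts in this section show the full setup.
# assuming agent_ddpg, agent_td3 and agent_sac have already been instantiated as in the scripts below,
# assign each agent a disjoint subset of the 512 parallel Cartpole environments (scopes sum to 512)
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=[agent_ddpg, agent_td3, agent_sac], agents_scope=[100, 200, 212])
trainer.train()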
Note
Isaac Gym environments read their configuration from the command line. Because of this, setting the headless option in the trainer configuration has no effect. Instead, invoke the scripts as follows: python script.py headless=True for Isaac Gym environments (preview 3 and preview 4), or python script.py --headless for Isaac Gym environments (preview 2)
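Before walking through the full scripts, the following minimal, self-contained sketch (not the skrl implementation) illustrates how the trainer's agents_scope argument partitions the vectorized environments among the agents: an empty list splits the environments as evenly as possible, while an explicit list assigns the given number of environments to each agent in order. The figure of 512 total environments and the 100/200/212 split are only illustrative values taken from the scripts below.
def partition_envs(num_envs, num_agents, agents_scope=None):
    """Return (start, end) environment-index ranges, one per agent."""
    if not agents_scope:  # automatic scopes: split as evenly as possible
        base, extra = divmod(num_envs, num_agents)
        sizes = [base + (1 if i < extra else 0) for i in range(num_agents)]
    else:  # manual scopes: sizes are given explicitly
        assert sum(agents_scope) == num_envs, "scopes must cover all environments"
        sizes = list(agents_scope)
    ranges, start = [], 0
    for size in sizes:
        ranges.append((start, start + size))
        start += size
    return ranges
print(partition_envs(512, 3))                   # automatic: [(0, 171), (171, 342), (342, 512)]
print(partition_envs(512, 3, [100, 200, 212]))  # manual:    [(0, 100), (100, 300), (300, 512)]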
isaacgym_sequential_shared_memory.py
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define the models (stochastic and deterministic models) for the agents using mixins.
# - StochasticActor: takes as input the environment's observation/state and returns an action
# - DeterministicActor: takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, 1))
def compute(self, inputs, role):
return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (sampling with replacement) as the shared experience replay memory
memory = RandomMemory(memory_size=8000, num_envs=env.num_envs, device=device, replacement=True)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_td3.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_sac.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=0.5, device=device)
cfg_ddpg["gradient_steps"] = 1
cfg_ddpg["batch_size"] = 512
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.2, device=device)
cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.1, device=device)
cfg_td3["smooth_regularization_clip"] = 0.1
cfg_td3["gradient_steps"] = 1
cfg_td3["batch_size"] = 512
cfg_td3["random_timesteps"] = 0
cfg_td3["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 512
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 0
cfg_sac["learn_entropy"] = True
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 1000
agent_ddpg = DDPG(models=models_ddpg,
memory=memory,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=memory,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=memory,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[])
# start training
trainer.train()
isaacgym_sequential_shared_memory_eval.py
Note: It is necessary to adjust the checkpoint paths to match the directories generated by your own experiments (a helper sketch for locating the most recent checkpoint is shown after this script)
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored: during evaluation only the policy is defined, so components such as optimizers and the other models are not loaded
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define only the policies for evaluation
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate the agent's policies.
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 0
agent_ddpg = DDPG(models=models_ddpg,
memory=None,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=None,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=None,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint (agent)
agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt")
agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt")
agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt")
# Configure and instantiate the RL trainer
cfg = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[])
# evaluate the agents
trainer.eval()
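Because each run directory is timestamped, the checkpoint paths above will differ between experiments. The following standalone sketch (standard Python only; the runs/<timestamp>_<AGENT>/checkpoints/agent_<step>.pt layout is assumed from the paths used in these examples) resolves the most recent checkpoint for a given agent instead of hard-coding it:
import glob
import os
def latest_checkpoint(agent_name, step):
    # pick the most recent run directory for the given agent (timestamps sort lexicographically)
    runs = sorted(glob.glob(os.path.join("runs", f"*_{agent_name}")))
    if not runs:
        raise FileNotFoundError(f"no runs found for {agent_name}")
    return os.path.join(runs[-1], "checkpoints", f"agent_{step}.pt")
# e.g. agent_ddpg.load(latest_checkpoint("DDPG", 8000))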
isaacgym_sequential_no_shared_memory.py
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define the models (stochastic and deterministic models) for the agents using mixins.
# - StochasticActor: takes as input the environment's observation/state and returns an action
# - DeterministicActor: takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, 1))
def compute(self, inputs, role):
return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (sampling with replacement) as a separate experience replay memory for each agent
memory_ddpg = RandomMemory(memory_size=8000, num_envs=100, device=device, replacement=True)
memory_td3 = RandomMemory(memory_size=8000, num_envs=200, device=device, replacement=True)
memory_sac = RandomMemory(memory_size=8000, num_envs=212, device=device, replacement=True)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_td3.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_sac.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=0.5, device=device)
cfg_ddpg["gradient_steps"] = 1
cfg_ddpg["batch_size"] = 512
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.2, device=device)
cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.1, device=device)
cfg_td3["smooth_regularization_clip"] = 0.1
cfg_td3["gradient_steps"] = 1
cfg_td3["batch_size"] = 512
cfg_td3["random_timesteps"] = 0
cfg_td3["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 512
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 0
cfg_sac["learn_entropy"] = True
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 1000
agent_ddpg = DDPG(models=models_ddpg,
memory=memory_ddpg,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=memory_td3,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=memory_sac,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer and define the agent scopes
cfg = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[100, 200, 212]) # agent scopes
# start training
trainer.train()
isaacgym_parallel_no_shared_memory.py
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.resources.noises.torch import GaussianNoise, OrnsteinUhlenbeckNoise
from skrl.trainers.torch import ParallelTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define the models (stochastic and deterministic models) for the agents using mixins.
# - StochasticActor: takes as input the environment's observation/state and returns an action
# - DeterministicActor: takes as input the environment's observation/state and returns an action
# - Critic: takes the state and action as input and provides a value to guide the policy
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
class Critic(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations + self.num_actions, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, 1))
def compute(self, inputs, role):
return self.net(torch.cat([inputs["states"], inputs["taken_actions"]], dim=1)), {}
if __name__ == '__main__':
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory (sampling with replacement) as a separate experience replay memory for each agent
memory_ddpg = RandomMemory(memory_size=8000, num_envs=100, device=device, replacement=True)
memory_td3 = RandomMemory(memory_size=8000, num_envs=200, device=device, replacement=True)
memory_sac = RandomMemory(memory_size=8000, num_envs=212, device=device, replacement=True)
# Instantiate the agent's models (function approximators).
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_ddpg["critic"] = Critic(env.observation_space, env.action_space, device)
models_ddpg["target_critic"] = Critic(env.observation_space, env.action_space, device)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["target_policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
models_td3["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_td3["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ddpg.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_td3.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
for model in models_sac.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["exploration"]["noise"] = OrnsteinUhlenbeckNoise(theta=0.15, sigma=0.1, base_scale=0.5, device=device)
cfg_ddpg["gradient_steps"] = 1
cfg_ddpg["batch_size"] = 512
cfg_ddpg["random_timesteps"] = 0
cfg_ddpg["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["exploration"]["noise"] = GaussianNoise(0, 0.2, device=device)
cfg_td3["smooth_regularization_noise"] = GaussianNoise(0, 0.1, device=device)
cfg_td3["smooth_regularization_clip"] = 0.1
cfg_td3["gradient_steps"] = 1
cfg_td3["batch_size"] = 512
cfg_td3["random_timesteps"] = 0
cfg_td3["learning_starts"] = 0
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 1000
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 512
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 0
cfg_sac["learn_entropy"] = True
# logging to TensorBoard and write checkpoints each 25 and 1000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 1000
agent_ddpg = DDPG(models=models_ddpg,
memory=memory_ddpg,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=memory_td3,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=memory_sac,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer and define the agent scopes
cfg = {"timesteps": 8000, "headless": True}
trainer = ParallelTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[100, 200, 212]) # agent scopes
# start training
trainer.train()
isaacgym_sequential_no_shared_memory_eval.py
Note: It is necessary to adjust the checkpoint paths to match the directories generated by your own experiments
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored: during evaluation only the policy is defined, so components such as optimizers and the other models are not loaded
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define only the policies for evaluation
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate the agent's policies.
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 0
agent_ddpg = DDPG(models=models_ddpg,
memory=None,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=None,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=None,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint (agent)
agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt")
agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt")
agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt")
# Configure and instantiate the RL trainer
cfg = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[100, 200, 212])
# evaluate the agents
trainer.eval()
isaacgym_parallel_no_shared_memory_eval.py
Note: It is necessary to adjust the checkpoint paths to match the directories generated by your own experiments
Note: Warnings such as [skrl:WARNING] Cannot load the <module> module. The agent doesn't have such an instance can be safely ignored: during evaluation only the policy is defined, so components such as optimizers and the other models are not loaded
import isaacgym
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.agents.torch.ddpg import DDPG, DDPG_DEFAULT_CONFIG
from skrl.agents.torch.td3 import TD3, TD3_DEFAULT_CONFIG
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.trainers.torch import ParallelTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaacgym_env_preview4
# Define only the policies for evaluation
class StochasticActor(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class DeterministicActor(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU(),
nn.Linear(32, self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
if __name__ == '__main__':
# Load and wrap the Isaac Gym environment
env = load_isaacgym_env_preview4(task_name="Cartpole") # preview 3 and 4 use the same loader
env = wrap_env(env)
device = env.device
# Instantiate the agent's policies.
# DDPG requires 4 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#spaces-and-models
models_ddpg = {}
models_ddpg["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# TD3 requires 6 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#spaces-and-models
models_td3 = {}
models_td3["policy"] = DeterministicActor(env.observation_space, env.action_space, device, clip_actions=True)
# SAC requires 5 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#spaces-and-models
models_sac = {}
models_sac["policy"] = StochasticActor(env.observation_space, env.action_space, device, clip_actions=True)
# Configure and instantiate the agents.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ddpg.html#configuration-and-hyperparameters
cfg_ddpg = DDPG_DEFAULT_CONFIG.copy()
cfg_ddpg["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_ddpg["experiment"]["write_interval"] = 25
cfg_ddpg["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.td3.html#configuration-and-hyperparameters
cfg_td3 = TD3_DEFAULT_CONFIG.copy()
cfg_td3["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_td3["experiment"]["write_interval"] = 25
cfg_td3["experiment"]["checkpoint_interval"] = 0
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.sac.html#configuration-and-hyperparameters
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["random_timesteps"] = 0
# logging to TensorBoard each 25 timesteps and ignore checkpoints
cfg_sac["experiment"]["write_interval"] = 25
cfg_sac["experiment"]["checkpoint_interval"] = 0
agent_ddpg = DDPG(models=models_ddpg,
memory=None,
cfg=cfg_ddpg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_td3 = TD3(models=models_td3,
memory=None,
cfg=cfg_td3,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
agent_sac = SAC(models=models_sac,
memory=None,
cfg=cfg_sac,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# load checkpoint (agent)
agent_ddpg.load("./runs/22-09-12_22-30-58-982355_DDPG/checkpoints/agent_8000.pt")
agent_td3.load("./runs/22-09-12_22-30-58-986295_TD3/checkpoints/agent_8000.pt")
agent_sac.load("./runs/22-09-12_22-30-58-987142_SAC/checkpoints/agent_8000.pt")
# Configure and instantiate the RL trainer and define the agent scopes
cfg = {"timesteps": 8000, "headless": True}
trainer = ParallelTrainer(cfg=cfg,
env=env,
agents=[agent_ddpg, agent_td3, agent_sac],
agents_scope=[100, 200, 212]) # agent scopes
# evaluate the agents
trainer.eval()
Isaac Orbit
Isaac Orbit environments
These examples perform the training of an agent in the Isaac Orbit environments (one agent, multiple environments)
The following components or practices are exemplified (highlighted):
Load and wrap an Isaac Orbit environment: Ant
The PPO agent configuration is mapped, as far as possible, from the rl_games A2C-PPO configuration for Isaac Orbit environments. Shared or separate models are used depending on the value of the network.separate variable. The following list shows the mapping between the two configurations (a sketch of this mapping in code follows the list):
# memory
memory_size = horizon_length
# agent
rollouts = horizon_length
learning_epochs = mini_epochs
mini_batches = horizon_length * num_envs / minibatch_size
discount_factor = gamma
lambda = tau
learning_rate = learning_rate
learning_rate_scheduler = skrl.resources.schedulers.torch.KLAdaptiveRL
learning_rate_scheduler_kwargs = {"kl_threshold": kl_threshold}
random_timesteps = 0
learning_starts = 0
grad_norm_clip = grad_norm
ratio_clip = e_clip
value_clip = e_clip
clip_predicted_values = clip_value
entropy_loss_scale = entropy_coef
value_loss_scale = 0.5 * critic_coef
kl_threshold = 0
rewards_shaper = lambda rewards, timestep, timesteps: rewards * scale_value
# trainer
timesteps = horizon_length * max_epochs
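As a concrete illustration, the sketch below builds a skrl PPO configuration from a dictionary of rl_games parameters following the mapping above. The params dictionary and its keys are assumptions mirroring the rl_games names listed here; reading them from the original YAML file is left to the user.
from skrl.agents.torch.ppo import PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
def map_rl_games_to_skrl_ppo(params, num_envs):
    cfg = PPO_DEFAULT_CONFIG.copy()
    cfg["rollouts"] = params["horizon_length"]  # also used as the memory size
    cfg["learning_epochs"] = params["mini_epochs"]
    cfg["mini_batches"] = params["horizon_length"] * num_envs // params["minibatch_size"]
    cfg["discount_factor"] = params["gamma"]
    cfg["lambda"] = params["tau"]
    cfg["learning_rate"] = params["learning_rate"]
    cfg["learning_rate_scheduler"] = KLAdaptiveRL
    cfg["learning_rate_scheduler_kwargs"] = {"kl_threshold": params["kl_threshold"]}
    cfg["random_timesteps"] = 0
    cfg["learning_starts"] = 0
    cfg["grad_norm_clip"] = params["grad_norm"]
    cfg["ratio_clip"] = params["e_clip"]
    cfg["value_clip"] = params["e_clip"]
    cfg["clip_predicted_values"] = params["clip_value"]
    cfg["entropy_loss_scale"] = params["entropy_coef"]
    cfg["value_loss_scale"] = 0.5 * params["critic_coef"]
    cfg["kl_threshold"] = 0
    cfg["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * params["scale_value"]
    return cfg
# the total number of trainer timesteps follows the same mapping:
# timesteps = params["horizon_length"] * params["max_epochs"]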
Benchmark results are listed in Benchmark results #32 (NVIDIA Isaac Orbit)
Note
Isaac Orbit environments read their configuration from the command line. Because of this, setting the headless option in the trainer configuration has no effect. Instead, invoke the scripts as follows: orbit -p script.py --headless
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Ant-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device, clip_actions=True)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 16 * 1024 / 4096
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 40 and 400 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Cartpole-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device, clip_actions=True)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 16 and 80 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 80
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 400),
nn.ELU(),
nn.Linear(400, 200),
nn.ELU(),
nn.Linear(200, 100),
nn.ELU())
self.mean_layer = nn.Linear(100, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(100, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Humanoid-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=32, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device, clip_actions=True)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 32 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 32 * 1024 / 4096
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and write checkpoints each 80 and 800 timesteps respectively
cfg_ppo["experiment"]["write_interval"] = 80
cfg_ppo["experiment"]["checkpoint_interval"] = 800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 16000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Lift-Franka-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 1024 / 2048
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 120 timesteps and write checkpoints every 1200 timesteps
cfg_ppo["experiment"]["write_interval"] = 120
cfg_ppo["experiment"]["checkpoint_interval"] = 1200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 24000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Reach-Franka-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 2048 / 4096
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.01}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 80 timesteps and write checkpoints every 800 timesteps
cfg_ppo["experiment"]["write_interval"] = 80
cfg_ppo["experiment"]["checkpoint_interval"] = 800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 16000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_isaac_orbit_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions=False)
self.net = nn.Sequential(nn.Linear(self.num_observations, 128),
nn.ELU(),
nn.Linear(128, 128),
nn.ELU(),
nn.Linear(128, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return torch.tanh(self.mean_layer(self.net(inputs["states"]))), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Isaac Orbit environment
env = load_isaac_orbit_env(task_name="Isaac-Velocity-Anymal-C-v0")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=24, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 24 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 24 * 4096 / 24576
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 60 timesteps and write checkpoints every 600 timesteps
cfg_ppo["experiment"]["write_interval"] = 60
cfg_ppo["experiment"]["checkpoint_interval"] = 600
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 12000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
Omniverse Isaac Gym
Omniverse Isaac Gym environments
These examples perform the training of one agent in an Omniverse Isaac Gym environment (one agent, multiple environments)
The following components or practices are exemplified (highlighted):
Load and wrap an Omniverse Isaac Gym environment: AllegroHand, Ant, Anymal
Load and wrap an Omniverse Isaac Gym multi-threaded environment: Ant (multi-threaded), Cartpole (multi-threaded)
Set an input preprocessor: AnymalTerrain, BallBalance
Set a random seed for reproducibility: Cartpole, Crazyflie
Set a learning rate scheduler: FrankaCabinet, Humanoid
Define a reward shaping function: Ingenuity, Quadcopter, ShadowHand
The PPO agent configuration is mapped, as far as possible, from the rl_games' A2C-PPO configuration for Omniverse Isaac Gym environments. Shared models or separated models are used depending on the value of the network.separate variable. The following list shows the mapping between the two configurations (a sketch applying this mapping is shown after the list):
# memory
memory_size = horizon_length
# agent
rollouts = horizon_length
learning_epochs = mini_epochs
mini_batches = horizon_length * num_actors / minibatch_size
discount_factor = gamma
lambda = tau
learning_rate = learning_rate
learning_rate_scheduler = skrl.resources.schedulers.torch.KLAdaptiveRL
learning_rate_scheduler_kwargs = {"kl_threshold": kl_threshold}
random_timesteps = 0
learning_starts = 0
grad_norm_clip = grad_norm
ratio_clip = e_clip
value_clip = e_clip
clip_predicted_values = clip_value
entropy_loss_scale = entropy_coef
value_loss_scale = 0.5 * critic_coef
kl_threshold = 0
rewards_shaper = lambda rewards, timestep, timesteps: rewards * scale_value
# trainer
timesteps = horizon_length * max_epochs
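As an illustration, the following minimal sketch applies the mapping above to build a skrl PPO configuration from a hand-written dictionary of rl_games A2C-PPO values. It is not part of the library's examples: the rl_games_cfg dictionary and the num_actors value are hypothetical stand-ins for the entries normally read from an rl_games YAML file (params.config), and only the keys listed in the mapping are translated
# Minimal sketch (assumption: rl_games_cfg mirrors the params.config section of an rl_games YAML file)
from skrl.agents.torch.ppo import PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
# hypothetical rl_games A2C-PPO values (hand-written stand-in for a parsed YAML file)
rl_games_cfg = {"horizon_length": 16, "mini_epochs": 8, "minibatch_size": 32768,
                "gamma": 0.99, "tau": 0.95, "learning_rate": 3e-4, "kl_threshold": 0.008,
                "grad_norm": 1.0, "e_clip": 0.2, "clip_value": True, "entropy_coef": 0.0,
                "critic_coef": 4.0, "max_epochs": 500}
num_actors = 4096  # hypothetical number of parallel environments (num_envs)
# memory
memory_size = rl_games_cfg["horizon_length"]
# agent
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = rl_games_cfg["horizon_length"]
cfg_ppo["learning_epochs"] = rl_games_cfg["mini_epochs"]
cfg_ppo["mini_batches"] = rl_games_cfg["horizon_length"] * num_actors // rl_games_cfg["minibatch_size"]
cfg_ppo["discount_factor"] = rl_games_cfg["gamma"]
cfg_ppo["lambda"] = rl_games_cfg["tau"]
cfg_ppo["learning_rate"] = rl_games_cfg["learning_rate"]
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": rl_games_cfg["kl_threshold"]}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = rl_games_cfg["grad_norm"]
cfg_ppo["ratio_clip"] = rl_games_cfg["e_clip"]
cfg_ppo["value_clip"] = rl_games_cfg["e_clip"]
cfg_ppo["clip_predicted_values"] = rl_games_cfg["clip_value"]
cfg_ppo["entropy_loss_scale"] = rl_games_cfg["entropy_coef"]
cfg_ppo["value_loss_scale"] = 0.5 * rl_games_cfg["critic_coef"]
cfg_ppo["kl_threshold"] = 0
# trainer
timesteps = rl_games_cfg["horizon_length"] * rl_games_cfg["max_epochs"]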
Benchmark results are listed in Benchmark results #32 (NVIDIA Omniverse Isaac Gym)
Note
Omniverse Isaac Gym environments implement functionality to get their configuration from the command line. Because of this feature, setting the headless option from the trainer configuration will not work. In this case, it is necessary to invoke the scripts as follows: python script.py headless=True
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="AllegroHand")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 16 * 8192 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.02}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 800 timesteps and write checkpoints every 8000 timesteps
cfg_ppo["experiment"]["write_interval"] = 800
cfg_ppo["experiment"]["checkpoint_interval"] = 8000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 160000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Ant")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 4
cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 40 timesteps and write checkpoints every 400 timesteps
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import threading
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Ant", multi_threaded=True, timeout=30)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 4
cfg_ppo["mini_batches"] = 2 # 16 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 40 timesteps and write checkpoints every 400 timesteps
cfg_ppo["experiment"]["write_interval"] = 40
cfg_ppo["experiment"]["checkpoint_interval"] = 400
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 8000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training in a separate thread
threading.Thread(target=trainer.train).start()
# run the simulation in the main thread
env.run()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Anymal")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=24, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 24 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 3 # 24 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 120 timesteps and write checkpoints every 1200 timesteps
cfg_ppo["experiment"]["write_interval"] = 120
cfg_ppo["experiment"]["checkpoint_interval"] = 1200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 24000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return self.net(inputs["states"]), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="AnymalTerrain")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=48, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Policy(env.observation_space, env.action_space, device)
models_ppo["value"] = Value(env.observation_space, env.action_space, device)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 48 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 6 # 48 * 2048 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.001
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = None
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 480 timesteps and write checkpoints every 4800 timesteps
cfg_ppo["experiment"]["write_interval"] = 480
cfg_ppo["experiment"]["checkpoint_interval"] = 4800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 96000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU(),
nn.Linear(64, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="BallBalance")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 20 timesteps and write checkpoints every 200 timesteps
cfg_ppo["experiment"]["write_interval"] = 20
cfg_ppo["experiment"]["checkpoint_interval"] = 200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 4000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(40)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Cartpole")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 16 timesteps and write checkpoints every 80 timesteps
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 80
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import threading
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 32),
nn.ELU(),
nn.Linear(32, 32),
nn.ELU())
self.mean_layer = nn.Linear(32, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(32, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the multi-threaded Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Cartpole", multi_threaded=True, timeout=30)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 1 # 16 * 512 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 3e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 16 timesteps and write checkpoints every 80 timesteps
cfg_ppo["experiment"]["write_interval"] = 16
cfg_ppo["experiment"]["checkpoint_interval"] = 80
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 1600, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training in a separate thread
threading.Thread(target=trainer.train).start()
# run the simulation in the main thread
env.run()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.Tanh(),
nn.Linear(256, 256),
nn.Tanh(),
nn.Linear(256, 128),
nn.Tanh())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Crazyflie")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 80 timesteps and write checkpoints every 800 timesteps
cfg_ppo["experiment"]["write_interval"] = 80
cfg_ppo["experiment"]["checkpoint_interval"] = 800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 16000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU(),
nn.Linear(128, 64),
nn.ELU())
self.mean_layer = nn.Linear(64, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(64, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="FrankaCabinet")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 8 # 16 * 4096 / 8192
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# log to TensorBoard every 120 timesteps and write checkpoints every 1200 timesteps
cfg_ppo["experiment"]["write_interval"] = 120
cfg_ppo["experiment"]["checkpoint_interval"] = 1200
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 24000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 400),
nn.ELU(),
nn.Linear(400, 200),
nn.ELU(),
nn.Linear(200, 100),
nn.ELU())
self.mean_layer = nn.Linear(100, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(100, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Humanoid")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=32, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 32 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 32 * 4096 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.008}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and writing checkpoints every 160 and 1600 timesteps, respectively
cfg_ppo["experiment"]["write_interval"] = 160
cfg_ppo["experiment"]["checkpoint_interval"] = 1600
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 32000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Ingenuity")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and writing checkpoints every 32 and 320 timesteps, respectively
cfg_ppo["experiment"]["write_interval"] = 32
cfg_ppo["experiment"]["checkpoint_interval"] = 320
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 6400, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 256),
nn.ELU(),
nn.Linear(256, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="Quadcopter")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 8
cfg_ppo["mini_batches"] = 4 # 16 * 4096 / 16384
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 1e-3
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 1.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.1
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and writing checkpoints every 80 and 800 timesteps, respectively
cfg_ppo["experiment"]["write_interval"] = 80
cfg_ppo["experiment"]["checkpoint_interval"] = 800
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 16000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
from skrl.envs.torch import load_omniverse_isaacgym_env
from skrl.utils import set_seed
# set the seed for reproducibility
set_seed(42)
# Define the shared model (stochastic and deterministic models) for the agent using mixins.
class Shared(GaussianMixin, DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 512),
nn.ELU(),
nn.Linear(512, 512),
nn.ELU(),
nn.Linear(512, 256),
nn.ELU(),
nn.Linear(256, 128),
nn.ELU())
self.mean_layer = nn.Linear(128, self.num_actions)
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
self.value_layer = nn.Linear(128, 1)
def act(self, inputs, role):
if role == "policy":
return GaussianMixin.act(self, inputs, role)
elif role == "value":
return DeterministicMixin.act(self, inputs, role)
def compute(self, inputs, role):
if role == "policy":
return self.mean_layer(self.net(inputs["states"])), self.log_std_parameter, {}
elif role == "value":
return self.value_layer(self.net(inputs["states"])), {}
# Load and wrap the Omniverse Isaac Gym environment
env = load_omniverse_isaacgym_env(task_name="ShadowHand")
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=16, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Shared(env.observation_space, env.action_space, device)
models_ppo["value"] = models_ppo["policy"] # same instance: shared model
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 16 # memory_size
cfg_ppo["learning_epochs"] = 5
cfg_ppo["mini_batches"] = 4 # 16 * 8192 / 32768
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 5e-4
cfg_ppo["learning_rate_scheduler"] = KLAdaptiveRL
cfg_ppo["learning_rate_scheduler_kwargs"] = {"kl_threshold": 0.016}
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = True
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 2.0
cfg_ppo["kl_threshold"] = 0
cfg_ppo["rewards_shaper"] = lambda rewards, timestep, timesteps: rewards * 0.01
cfg_ppo["state_preprocessor"] = RunningStandardScaler
cfg_ppo["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg_ppo["value_preprocessor"] = RunningStandardScaler
cfg_ppo["value_preprocessor_kwargs"] = {"size": 1, "device": device}
# logging to TensorBoard and writing checkpoints every 800 and 8000 timesteps, respectively
cfg_ppo["experiment"]["write_interval"] = 800
cfg_ppo["experiment"]["checkpoint_interval"] = 8000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 160000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
Omniverse Isaac Sim (single environment)
These examples show how to train an agent in an Omniverse Isaac Sim environment that is implemented using the Gym interface (one agent, one environment)
This example performs the training of an agent in Isaac Sim’s Cartpole environment described in the Creating New RL Environment tutorial
Use the steps described below to set up and launch the experiment after following the tutorial
# download the sample code from GitHub in the directory containing the cartpole_task.py script
wget https://raw.githubusercontent.com/Toni-SM/skrl/main/docs/source/examples/isaacsim/cartpole_example_skrl.py
# run the experiment
PYTHON_PATH cartpole_example_skrl.py
# Omniverse Isaac Sim tutorial: Creating New RL Environment
# https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/tutorial_gym_new_rl_example.html
# Instance of VecEnvBase and create the task
from omni.isaac.gym.vec_env import VecEnvBase
env = VecEnvBase(headless=True)
from cartpole_task import CartpoleTask
task = CartpoleTask(name="Cartpole")
env.set_task(task, backend="torch")
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
nn.Tanh(),
nn.Linear(64, 64),
nn.Tanh(),
nn.Linear(64, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
nn.Tanh(),
nn.Linear(64, 64),
nn.Tanh(),
nn.Linear(64, 1))
def compute(self, inputs, role):
return self.net(inputs["states"]), {}
# Load and wrap the environment
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=1000, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Policy(env.observation_space, env.action_space, device)
models_ppo["value"] = Value(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ppo.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 1000
cfg_ppo["learning_epochs"] = 20
cfg_ppo["mini_batches"] = 1
cfg_ppo["discount_factor"] = 0.99
cfg_ppo["lambda"] = 0.95
cfg_ppo["learning_rate"] = 0.001
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 1.0
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = False
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 0.5
cfg_ppo["kl_threshold"] = 0
cfg_ppo["experiment"]["write_interval"] = 1000
cfg_ppo["experiment"]["checkpoint_interval"] = 10000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
This example performs the training of an agent in Isaac Sim’s JetBot environment. The following components or practices are exemplified (highlighted):
Define and instantiate Convolutional Neural Networks (CNNs) to learn from 128 x 128 RGB images
Use the steps described below (for a local workstation or a remote container) to set up and launch the experiment
# create a working directory and change to it
mkdir ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example
cd ~/.local/share/ov/pkg/isaac_sim-2021.2.1/standalone_examples/api/omni.isaac.jetbot/skrl_example
# install the skrl library in editable mode from the working directory
~/.local/share/ov/pkg/isaac_sim-2021.2.1/python.sh -m pip install -e git+https://github.com/Toni-SM/skrl.git#egg=skrl
# download the sample code from GitHub
wget https://raw.githubusercontent.com/Toni-SM/skrl/main/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py
# copy the Isaac Sim sample environment (JetBotEnv) to the working directory
cp ../stable_baselines_example/env.py .
# run the experiment
~/.local/share/ov/pkg/isaac_sim-2021.2.1/python.sh isaacsim_jetbot_ppo.py
# create a working directory and change to it
mkdir /isaac-sim/standalone_examples/api/omni.isaac.jetbot/skrl_example
cd /isaac-sim/standalone_examples/api/omni.isaac.jetbot/skrl_example
# install the skrl library in editable mode from the working directory
/isaac-sim/kit/python/bin/python3 -m pip install -e git+https://github.com/Toni-SM/skrl.git#egg=skrl
# download the sample code from GitHub
wget https://raw.githubusercontent.com/Toni-SM/skrl/main/docs/source/examples/isaacsim/isaacsim_jetbot_ppo.py
# copy the Isaac Sim sample environment (JetBotEnv) to the working directory
cp ../stable_baselines_example/env.py .
# run the experiment
/isaac-sim/python.sh isaacsim_jetbot_ppo.py
# import JetBot environment
from env import JetBotEnv
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import wrap_env
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU(),
nn.Flatten(),
nn.Linear(9216, 512),
nn.ReLU(),
nn.Linear(512, 16),
nn.Tanh(),
nn.Linear(16, 64),
nn.Tanh(),
nn.Linear(64, 32),
nn.Tanh(),
nn.Linear(32, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
# view (samples, width * height * channels) -> (samples, width, height, channels)
# permute (samples, width, height, channels) -> (samples, channels, width, height)
x = self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2))
return 10 * torch.tanh(x), self.log_std_parameter, {} # JetBotEnv action_space is -10 to 10
class Value(DeterministicMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False):
Model.__init__(self, observation_space, action_space, device)
DeterministicMixin.__init__(self, clip_actions)
self.net = nn.Sequential(nn.Conv2d(3, 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU(),
nn.Flatten(),
nn.Linear(9216, 512),
nn.ReLU(),
nn.Linear(512, 16),
nn.Tanh(),
nn.Linear(16, 64),
nn.Tanh(),
nn.Linear(64, 32),
nn.Tanh(),
nn.Linear(32, 1))
def compute(self, inputs, role):
# view (samples, width * height * channels) -> (samples, width, height, channels)
# permute (samples, width, height, channels) -> (samples, channels, width, height)
return self.net(inputs["states"].view(-1, *self.observation_space.shape).permute(0, 3, 1, 2)), {}
# Load and wrap the JetBot environment (a subclass of Gym)
env = JetBotEnv(headless=True)
env = wrap_env(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=10000, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#spaces-and-models
models_ppo = {}
models_ppo["policy"] = Policy(env.observation_space, env.action_space, device)
models_ppo["value"] = Value(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_ppo.values():
model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
# Only modify some of the default configuration, visit its documentation to see all the options
# https://skrl.readthedocs.io/en/latest/modules/skrl.agents.ppo.html#configuration-and-hyperparameters
cfg_ppo = PPO_DEFAULT_CONFIG.copy()
cfg_ppo["rollouts"] = 10000
cfg_ppo["learning_epochs"] = 10
cfg_ppo["mini_batches"] = 10
cfg_ppo["discount_factor"] = 0.9995
cfg_ppo["lambda"] = 0.95
cfg_ppo["policy_learning_rate"] = 0.00025
cfg_ppo["value_learning_rate"] = 0.00025
cfg_ppo["random_timesteps"] = 0
cfg_ppo["learning_starts"] = 0
cfg_ppo["grad_norm_clip"] = 10
cfg_ppo["ratio_clip"] = 0.2
cfg_ppo["value_clip"] = 0.2
cfg_ppo["clip_predicted_values"] = False
cfg_ppo["entropy_loss_scale"] = 0.0
cfg_ppo["value_loss_scale"] = 0.5
cfg_ppo["kl_threshold"] = 0
# logging to TensorBoard and writing checkpoints every 10000 timesteps
cfg_ppo["experiment"]["write_interval"] = 10000
cfg_ppo["experiment"]["checkpoint_interval"] = 10000
agent = PPO(models=models_ppo,
memory=memory,
cfg=cfg_ppo,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 500000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
# close the environment
env.close()
Real-world examples
These examples show basic real-world use cases to guide and support advanced RL implementations
3D reaching task (Franka’s gripper must reach a certain target point in space). The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python API of a modified version of frankx (see frankx’s pull request #44), a high-level motion library around libfranka. Training and evaluation are performed for both Cartesian and joint control spaces
Implementation (see details in the table below):
The observation space is composed of the episode’s normalized progress, the robot joints’ normalized positions (\(q\)) in the interval -1 to 1, the robot joints’ velocities (\(\dot{q}\)) affected by a random uniform scale for generalization, and the target’s position in space (\(target_{_{XYZ}}\)) with respect to the robot’s base
The action space, bounded in the range -1 to 1, consists of the scaled change of the robot joints’ positions for the joint control, or the scaled change of the end-effector’s position (\(ee_{_{XYZ}}\)) for the Cartesian control. The end-effector position frame corresponds to the point where the left finger connects to the gripper base in simulation, whereas in the real world it corresponds to the end of the fingers. The gripper fingers remain closed at all times in both cases
The instantaneous reward is the negative value of the Euclidean distance (\(\text{d}\)) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached (a minimal sketch of this logic is shown after the table below)
The target position lies within a rectangular cuboid of dimensions 0.5 x 0.5 x 0.2 meters centered at (0.5, 0.0, 0.2) meters with respect to the robot’s base. The robot joints’ positions are drawn from an initial configuration [0º, -45º, 0º, -135º, 0º, 90º, 45º] modified with uniform random values between -7º and 7º approximately
| Variable | Formula / value | Size |
|---|---|---|
| Observation space | \(\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}\) | 18 |
| Action space (joint) | \(\dfrac{2.5}{120} \, \Delta q\) | 7 |
| Action space (Cartesian) | \(\dfrac{1}{100} \, \Delta ee_{_{XYZ}}\) | 3 |
| Reward | \(-\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}})\) | |
| Episode termination | \(\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad\) or \(\quad t \ge t_{max} - 1\) | |
| Maximum timesteps (\(t_{max}\)) | 100 | |
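The observation, reward and termination logic summarized in the table can be illustrated with a minimal sketch. The helper functions below are illustrative assumptions (they are not taken from the provided environment files) and only restate the formulas above using NumPy:
import numpy as np

def compute_observation(t, q, dq, target_pos, q_min, q_max, max_timesteps=100, rng=None):
    # observation: normalized progress, normalized joint positions, randomly scaled joint velocities
    # and target position with respect to the robot's base (size 1 + 7 + 7 + 3 = 18)
    rng = rng or np.random.default_rng()
    progress = np.array([t / max_timesteps])
    q_norm = 2.0 * (q - q_min) / (q_max - q_min) - 1.0
    dq_scaled = 0.1 * dq * rng.uniform(0.5, 1.5)
    return np.concatenate([progress, q_norm, dq_scaled, target_pos])

def compute_reward_and_done(ee_pos, target_pos, t, max_timesteps=100, threshold=0.035):
    # reward: negative Euclidean distance d(ee_XYZ, target_XYZ)
    # termination: distance below the threshold (0.035 m in simulation, 0.075 m in the real world)
    # or maximum timestep reached
    distance = np.linalg.norm(ee_pos - target_pos)
    return -distance, bool(distance <= threshold or t >= max_timesteps - 1)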
Workflows
Warning
Make sure you have the e-stop on hand in case something goes wrong in the run. Control via RL can be dangerous and unsafe for both the operator and the robot
Target position in X and Y obtained with a USB-camera (position in Z fixed at 0.2 m)
Prerequisites:
A physical Franka Emika Panda robot with Franka Control Interface (FCI) is required. Additionally, the frankx library must be available in the python environment (see frankx’s pull request #44 for the RL-compatible version installation)
Files
Environment: reaching_franka_real_env.py
Evaluation script: reaching_franka_real_skrl_eval.py
Checkpoints (agent_joint.pt, agent_cartesian.pt): trained_checkpoints.zip
Evaluation:
python3 reaching_franka_real_skrl_eval.py
Main environment configuration:
Note
In the joint control space the final control of the robot is performed through the Cartesian pose (forward kinematics from specified values for the joints)
The control space (Cartesian or joint), the robot motion type (waypoint or impedance) and the target position acquisition (command prompt / automatically generated or USB-camera) can be specified in the environment class constructor (from reaching_franka_real_skrl_eval.py) as follows:
control_space = "joint" # joint or cartesian
motion_type = "waypoint" # waypoint or impedance
camera_tracking = False # True for USB-camera tracking
Prerequisites:
All installation steps described in Omniverse Isaac Gym’s Overview & Getting Started section must be fulfilled (especially the subsection 1.3. Installing Examples Repository)
Files (the implementation is self-contained so no specific location is required):
Environment: reaching_franka_omniverse_isaacgym_env.py
Training script: reaching_franka_omniverse_isaacgym_skrl_train.py
Evaluation script: reaching_franka_omniverse_isaacgym_skrl_eval.py
Checkpoints (agent_joint.pt, agent_cartesian.pt): trained_checkpoints.zip
Training and evaluation:
# training (local workstation)
~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_omniverse_isaacgym_skrl_train.py
# training (docker container)
/isaac-sim/python.sh reaching_franka_omniverse_isaacgym_skrl_train.py
# evaluation (local workstation)
~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_franka_omniverse_isaacgym_skrl_eval.py
# evaluation (docker container)
/isaac-sim/python.sh reaching_franka_omniverse_isaacgym_skrl_eval.py
Main environment configuration:
The control space (Cartesian or joint) can be specified in the task configuration dictionary (from reaching_franka_omniverse_isaacgym_skrl_train.py) as follows:
TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian"
Prerequisites:
All installation steps described in Isaac Gym’s Installation section must be fulfilled
Files (the implementation is self-contained so no specific location is required):
Environment: reaching_franka_isaacgym_env.py
Training script: reaching_franka_isaacgym_skrl_train.py
Evaluation script: reaching_franka_isaacgym_skrl_eval.py
Training and evaluation:
Note
The checkpoints obtained in Isaac Gym were not evaluated with the real robot. However, they were evaluated in Omniverse Isaac Gym showing successful performance
# training (with the Python virtual environment active)
python reaching_franka_isaacgym_skrl_train.py
# evaluation (with the Python virtual environment active)
python reaching_franka_isaacgym_skrl_eval.py
Main environment configuration:
The control space (Cartesian or joint) can be specified in the task configuration dictionary (from reaching_franka_isaacgym_skrl_train.py) as follows:
TASK_CFG["env"]["controlSpace"] = "joint" # "joint" or "cartesian"
3D reaching task (iiwa’s end-effector must reach a certain target point in space). The training was done in Omniverse Isaac Gym. The real robot control is performed through the Python, ROS and ROS2 APIs of libiiwa, a scalable multi-control framework for the KUKA LBR iiwa robots. Training and evaluation are performed for both Cartesian and joint control spaces
Implementation (see details in the table below):
The observation space is composed of the episode’s normalized progress, the robot joints’ normalized positions (\(q\)) in the interval -1 to 1, the robot joints’ velocities (\(\dot{q}\)) affected by a random uniform scale for generalization, and the target’s position in space (\(target_{_{XYZ}}\)) with respect to the robot’s base
The action space, bounded in the range -1 to 1, consists of the scaled change of the robot joints’ positions for the joint control, or the scaled change of the end-effector’s position (\(ee_{_{XYZ}}\)) for the Cartesian control
The instantaneous reward is the negative value of the Euclidean distance (\(\text{d}\)) between the robot end-effector and the target point position. The episode terminates when this distance is less than 0.035 meters in simulation (0.075 meters in real-world) or when the defined maximum timestep is reached
The target position lies within a rectangular cuboid of dimensions 0.2 x 0.4 x 0.4 meters centered at (0.6, 0.0, 0.4) meters with respect to the robot’s base. The robot joints’ positions are drawn from an initial configuration [0º, 0º, 0º, -90º, 0º, 90º, 0º] modified with uniform random values between -7º and 7º approximately (a minimal sketch of this reset logic is shown after the table below)
| Variable | Formula / value | Size |
|---|---|---|
| Observation space | \(\dfrac{t}{t_{max}},\; 2 \dfrac{q - q_{min}}{q_{max} - q_{min}} - 1,\; 0.1\,\dot{q}\,U(0.5,1.5),\; target_{_{XYZ}}\) | 18 |
| Action space (joint) | \(\dfrac{2.5}{120} \, \Delta q\) | 7 |
| Action space (Cartesian) | \(\dfrac{1}{100} \, \Delta ee_{_{XYZ}}\) | 3 |
| Reward | \(-\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}})\) | |
| Episode termination | \(\text{d}(ee_{_{XYZ}},\; target_{_{XYZ}}) \le 0.035 \quad\) or \(\quad t \ge t_{max} - 1\) | |
| Maximum timesteps (\(t_{max}\)) | 100 | |
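The reset described above (target sampled inside the cuboid and joints initialized around the base configuration) can be illustrated with the following minimal sketch. The function name and the use of NumPy are illustrative assumptions, not taken from the provided environment files:
import numpy as np

def sample_reset(rng=None):
    rng = rng or np.random.default_rng()
    # target inside a 0.2 x 0.4 x 0.4 m cuboid centered at (0.6, 0.0, 0.4) m with respect to the robot's base
    center = np.array([0.6, 0.0, 0.4])
    half_extents = np.array([0.2, 0.4, 0.4]) / 2.0
    target_pos = center + rng.uniform(-half_extents, half_extents)
    # joints around [0º, 0º, 0º, -90º, 0º, 90º, 0º] with approximately ±7º of uniform noise (values in radians)
    initial_q = np.radians([0.0, 0.0, 0.0, -90.0, 0.0, 90.0, 0.0])
    q0 = initial_q + rng.uniform(np.radians(-7.0), np.radians(7.0), size=7)
    return target_pos, q0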
Workflows
Warning
Make sure you have the smartHMI on hand in case something goes wrong in the run. Control via RL can be dangerous and unsafe for both the operator and the robot
Prerequisites:
A physical Kuka LBR iiwa robot is required. Additionally, the libiiwa library must be installed (visit the libiiwa documentation for installation details)
Files
Environment: reaching_iiwa_real_env.py
Evaluation script: reaching_iiwa_real_skrl_eval.py
Checkpoints (agent_joint.pt, agent_cartesian.pt): trained_checkpoints.zip
Evaluation:
python3 reaching_iiwa_real_skrl_eval.py
Main environment configuration:
The control space (Cartesian or joint) can be specified in the environment class constructor (from reaching_iiwa_real_skrl_eval.py) as follows:
control_space = "joint" # joint or cartesian
Warning
Make sure you have the smartHMI on hand in case something goes wrong in the run. Control via RL can be dangerous and unsafe for both the operator and the robot
Prerequisites:
A physical Kuka LBR iiwa robot is required. Additionally, the libiiwa library must be installed (visit the libiiwa documentation for installation details) and a Robot Operating System (ROS or ROS2) distribution must be available
Files
Environment (ROS): reaching_iiwa_real_ros_env.py
Environment (ROS2): reaching_iiwa_real_ros2_env.py
Evaluation script: reaching_iiwa_real_ros_ros2_skrl_eval.py
Checkpoints (agent_joint.pt, agent_cartesian.pt): trained_checkpoints.zip
Note
Source the ROS/ROS2 distribution and the ROS/ROS2 workspace containing the libiiwa packages before executing the scripts
Evaluation:
Note
The environment (reaching_iiwa_real_ros_env.py or reaching_iiwa_real_ros2_env.py) to be loaded will be selected automatically based on the sourced ROS distribution (ROS or ROS2) at script execution (a minimal sketch of one way to do this is shown below)
python3 reaching_iiwa_real_ros_ros2_skrl_eval.py
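One possible way to implement this selection is sketched below, assuming the ROS_VERSION environment variable exported by the setup script of the sourced distribution; the imported class name is hypothetical and the actual evaluation script may differ:
import os

# ROS_VERSION is exported as "1" or "2" by the setup script of the sourced distribution
ros_version = os.environ.get("ROS_VERSION", "")
if ros_version == "1":
    from reaching_iiwa_real_ros_env import ReachingIiwa   # hypothetical environment class name
elif ros_version == "2":
    from reaching_iiwa_real_ros2_env import ReachingIiwa  # hypothetical environment class name
else:
    raise RuntimeError("No ROS/ROS2 distribution has been sourced (ROS_VERSION is not set)")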
Main environment configuration:
The control space (Cartesian or joint) can be specified in the environment class constructor (from reaching_iiwa_real_ros_ros2_skrl_eval.py) as follows:
control_space = "joint" # joint or cartesian
Prerequisites:
All installation steps described in Omniverse Isaac Gym’s Overview & Getting Started section must be fulfilled (especially the subsection 1.3. Installing Examples Repository)
Files (the implementation is self-contained so no specific location is required):
Environment: reaching_iiwa_omniverse_isaacgym_env.py
Training script: reaching_iiwa_omniverse_isaacgym_skrl_train.py
Evaluation script: reaching_iiwa_omniverse_isaacgym_skrl_eval.py
Checkpoints (agent_joint.pt, agent_cartesian.pt): trained_checkpoints.zip
Simulation files (.usd assets and robot class): simulation_files.zip
Simulation files must be structured as follows:
<some_folder>
├── agent_cartesian.pt
├── agent_joint.pt
├── assets
│ ├── iiwa14_instanceable_meshes.usd
│ └── iiwa14.usd
├── reaching_iiwa_omniverse_isaacgym_env.py
├── reaching_iiwa_omniverse_isaacgym_skrl_eval.py
├── reaching_iiwa_omniverse_isaacgym_skrl_train.py
├── robots
│ ├── iiwa14.py
│ └── __init__.py
Training and evaluation:
# training (local workstation)
~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_iiwa_omniverse_isaacgym_skrl_train.py
# training (docker container)
/isaac-sim/python.sh reaching_iiwa_omniverse_isaacgym_skrl_train.py
# evaluation (local workstation)
~/.local/share/ov/pkg/isaac_sim-*/python.sh reaching_iiwa_omniverse_isaacgym_skrl_eval.py
# evaluation (docker container)
/isaac-sim/python.sh reaching_iiwa_omniverse_isaacgym_skrl_eval.py
Main environment configuration:
The control space (Cartesian or joint) can be specified in the task configuration dictionary (from reaching_iiwa_omniverse_isaacgym_skrl_train.py) as follows:
TASK_CFG["task"]["env"]["controlSpace"] = "joint" # "joint" or "cartesian"
Library utilities (skrl.utils module)
This example shows how to use the library utilities to carry out the post-processing of files and data generated by the experiments
Example of a figure, generated by the code, showing the total reward (left) and the mean and standard deviation (right) of all experiments located in the runs folder
Note: The code will load all the TensorBoard files of the experiments located in the runs folder. It is necessary to adjust the iterator’s parameters for other paths
import numpy as np
import matplotlib.pyplot as plt
from skrl.utils import postprocessing
labels = []
rewards = []
# load the Tensorboard files and iterate over them (tag: "Reward / Total reward (mean)")
tensorboard_iterator = postprocessing.TensorboardFileIterator("runs/*/events.out.tfevents.*",
tags=["Reward / Total reward (mean)"])
for dirname, data in tensorboard_iterator:
rewards.append(data["Reward / Total reward (mean)"])
labels.append(dirname)
# convert to numpy arrays and compute mean and std
rewards = np.array(rewards)
mean = np.mean(rewards[:,:,1], axis=0)
std = np.std(rewards[:,:,1], axis=0)
# create two subplots (one for the reward of each experiment and one for the mean and std across experiments)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# plot the rewards for each experiment
for reward, label in zip(rewards, labels):
ax[0].plot(reward[:,0], reward[:,1], label=label)
ax[0].set_title("Total reward (for each experiment)")
ax[0].set_xlabel("Timesteps")
ax[0].set_ylabel("Reward")
ax[0].grid(True)
ax[0].legend()
# plot the mean and std (across experiments)
ax[1].fill_between(rewards[0,:,0], mean - std, mean + std, alpha=0.5, label="std")
ax[1].plot(rewards[0,:,0], mean, label="mean")
ax[1].set_title("Total reward (mean and std of all experiments)")
ax[1].set_xlabel("Timesteps")
ax[1].set_ylabel("Reward")
ax[1].grid(True)
ax[1].legend()
# save and show the figure
plt.savefig("total_reward.png")
plt.show()