defaults:
- experiment_config
- _self_
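# Note (illustrative, not part of the upstream defaults): this file is composed via Hydra,
# so any field below can typically be overridden from the command line of the launching
# script, e.g. an override such as experiment.lr=0.0003 (the exact prefix depends on how
# this config is mounted in your Hydra tree).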
# The device for collection (e.g. cuda)
sampling_device: "cpu"
# The device for training (e.g. cuda)
train_device: "cpu"
# The device for the replay buffer of off-policy algorithms (e.g. cuda)
buffer_device: "cpu"
# Whether to share the parameters of the policy within agent groups
share_policy_params: True
# If an algorithm and an env support both continuous and discrete actions, whether continuous actions should be preferred
prefer_continuous_actions: True
# If False, collection is done using a collector (under no grad). If True, collection is done with gradients.
collect_with_grad: False
# Discount factor
gamma: 0.9
# Learning rate
lr: 0.00005
# The epsilon parameter of the adam optimizer
adam_eps: 0.000001
# Clips grad norm if true and clips grad value if false
clip_grad_norm: True
# The value for the clipping, if null no clipping
clip_grad_val: 5
# Whether to use soft or hard target updates
soft_target_update: True
# If soft_target_update is True, this is its polyak_tau
polyak_tau: 0.005
# If soft_target_update is False, this is the frequency of the hard target updates in terms of n_optimizer_steps
hard_target_update_frequency: 5
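# Note (standard convention, stated here as a clarification rather than taken from this file):
# a soft update typically blends the target network as
#   target_param <- polyak_tau * param + (1 - polyak_tau) * target_param
# at every optimizer step, while a hard update copies the parameters outright every
# hard_target_update_frequency optimizer steps.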
# When an exploration wrapper is used, this is its initial epsilon for annealing
exploration_eps_init: 0.8
# When an exploration wrapper is used, this is its final epsilon after annealing
exploration_eps_end: 0.01
# Number of frames for annealing of exploration strategy in deterministic policy algorithms
# If null it will default to max_n_frames / 3
exploration_anneal_frames: null
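# Example (arithmetic on the defaults in this file): with exploration_anneal_frames: null
# and max_n_frames: 3_000_000, annealing would run over 3_000_000 / 3 = 1_000_000 frames,
# decaying epsilon from 0.8 down to 0.01.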
# The maximum number of experiment iterations before the experiment terminates, mutually exclusive with max_n_frames
max_n_iters: null
# Maximum number of collected frames before the experiment terminates, mutually exclusive with max_n_iters
max_n_frames: 3_000_000
# Number of frames collected in each experiment iteration
on_policy_collected_frames_per_batch: 6000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
on_policy_n_envs_per_worker: 10
# Number of times on_policy_collected_frames_per_batch will be split into minibatches and trained over
on_policy_n_minibatch_iters: 45
# In on-policy algorithms the train_batch_size will be equal to the on_policy_collected_frames_per_batch
# and it will be split into minibatches with this number of frames for training
on_policy_minibatch_size: 400
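# Example (arithmetic on the defaults above, for illustration): each on-policy iteration
# collects 6000 frames, which split into 6000 / 400 = 15 minibatches of 400 frames;
# repeating the split 45 times gives about 45 * 15 = 675 gradient steps per iteration.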
# Number of frames collected in each experiment iteration
off_policy_collected_frames_per_batch: 6000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
off_policy_n_envs_per_worker: 10
# This is the number of times off_policy_train_batch_size will be sampled from the buffer and trained over.
off_policy_n_optimizer_steps: 1000
# Number of frames used for each off_policy_n_optimizer_steps when training off-policy algorithms
off_policy_train_batch_size: 128
# Maximum number of frames to keep in replay buffer memory for off-policy algorithms
off_policy_memory_size: 1_000_000
# Number of random action frames to prefill the replay buffer with
off_policy_init_random_frames: 0
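# Example (illustration of the defaults above): each off-policy iteration collects 6000
# frames into a replay buffer holding at most 1_000_000 frames, then runs 1000 optimizer
# steps, each on a 128-frame batch sampled from that buffer.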
# Whether to run evaluation episodes during the experiment
evaluation: True
# Whether to render the evaluation (if rendering is available)
render: True
# Frequency of evaluation in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch)
evaluation_interval: 120_000
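# Example (arithmetic on the defaults in this file): with 6000-frame collection batches,
# 120_000 corresponds to running evaluation every 120_000 / 6000 = 20 iterations.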
# Number of episodes that evaluation is run on
evaluation_episodes: 10
# If True, when stochastic policies are evaluated their deterministic value is taken; if False, actions are sampled
evaluation_deterministic_actions: True
# List of loggers to use, options are: wandb, csv, tensorboard, mlflow
loggers: [csv]
# Wandb project name
project_name: "benchmarl"
# Create a json folder as part of the output in the format of marl-eval
create_json: True
# Absolute path to the folder where the experiment will log.
# If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
# If you are reloading an experiment with "restore_file", this will default to the reloaded experiment folder.
save_folder: null
# Absolute path to a checkpoint file where the experiment was saved. If null the experiment is started fresh.
restore_file: null
# Map location given to `torch.load()` when reloading.
# If you are reloading in a cpu-only machine a gpu experiment, you can use `restore_map_location: {"cuda:0":"cpu"}`
# to map gpu tensors to the cpu
restore_map_location: null
# Interval for experiment saving in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch).
# Set it to 0 to disable checkpointing
checkpoint_interval: 0
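# Example (illustrative override, not a default): setting checkpoint_interval: 60_000 with
# 6000-frame collection batches would save a checkpoint every 60_000 / 6000 = 10 iterations.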
# Whether to checkpoint when the experiment is done
checkpoint_at_end: False
# How many checkpoints to keep. As new checkpoints are taken, temporally older checkpoints are deleted to keep this number of
# checkpoints. The checkpoint at the end is included in this number. Set to `null` to keep all checkpoints.
keep_checkpoints_num: 3