# ============================================================
# Core Configuration (required)
# ============================================================
model = "Qwen/Qwen3-30B-A3B-Instruct-2507" # HuggingFace model ID
max_steps = 100 # Total training steps
batch_size = 256 # Rollouts per training batch
rollouts_per_example = 8 # Rollouts generated per dataset example
# ============================================================
# Training Hyperparameters (optional)
# ============================================================
# learning_rate = 1e-4 # Learning rate for LoRA
# lora_alpha = 16 # LoRA alpha scaling factor
# oversampling_factor = 2.0 # Oversampling factor for rollout generation
# max_async_level = 2 # Maximum async generation level
# trajectory_strategy = "interleaved" # "interleaved" or "branching"
# ============================================================
# Secrets (optional)
# ============================================================
# env_file = ["secrets.env"] # File(s) containing environment secrets
# ============================================================
# Sampling Configuration (required)
# ============================================================
[sampling]
max_tokens = 512 # Max tokens per model response
# ============================================================
# Environment(s) (at least one required)
# ============================================================
[[env]]
id = "primeintellect/alphabet-sort" # Environments Hub ID (owner/name)
# args = { min_turns = 3, max_turns = 5 } # Arguments passed to load_environment()
# Add multiple [[env]] sections for multi-environment training:
# [[env]]
# id = "primeintellect/another-env"
# args = { split = "train", max_examples = 1000 }
# ============================================================
# Weights & Biases Logging (optional)
# ============================================================
# [wandb]
# project = "my-project" # W&B project name
# name = "my-run-name" # W&B run name
# entity = "my-team" # W&B team/entity
# ============================================================
# Online Evaluation (optional)
# ============================================================
# [eval]
# interval = 100 # Run eval every N training steps
# num_examples = -1 # Number of eval examples (-1 = all)
# rollouts_per_example = 1 # Rollouts per eval example
# eval_base_model = true # Also evaluate the base (untrained) model
#
# [[eval.env]] # Environment-specific eval overrides
# id = "primeintellect/eval-env"
# args = { split = "test" }
# num_examples = 30
# rollouts_per_example = 4
# ============================================================
# Validation During Training (optional)
# ============================================================
# [val]
# num_examples = 64 # Validation examples per check
# rollouts_per_example = 1 # Rollouts per validation example
# interval = 5 # Validate every N steps
# ============================================================
# Buffer / Difficulty Filtering (optional)
# ============================================================
# [buffer]
# online_difficulty_filtering = false # Enable online difficulty-based filtering
# easy_threshold = 0.8 # Reward above this = "easy"
# hard_threshold = 0.2 # Reward below this = "hard"
# easy_fraction = 0.0 # Fraction of easy examples to include
# hard_fraction = 0.0 # Fraction of hard examples to include
# env_ratios = [0.5, 0.5] # Ratio between envs (multi-env only)
# seed = 42 # Random seed