explainer-env / constants.py
kgdrathan's picture
Upload folder using huggingface_hub
5869d56 verified
"""Shared limits and scoring helpers for explainer episodes."""
# --- Step budgets -----------------------------------------------------------
# MAX_EXPLORE_STEPS is also a factor in the denominator computed by
# normalized_episode_score().
MAX_EXPLORE_STEPS = 6
MAX_REPAIR_STEPS = 3

# --- Tool identifiers -------------------------------------------------------
# Kept as a tuple so the collection is immutable.
# NOTE(review): presumably these are the tool names an agent may call during
# an episode -- confirm against the environment that consumes this tuple.
AVAILABLE_TOOLS = (
    "search_wikipedia",
    "search_hf_papers",
    "search_arxiv",
    "search_scholar",
    "fetch_docs",
    "search_hf_hub",
)

# --- Reward ceilings --------------------------------------------------------
# The explore and generate ceilings feed normalized_episode_score(); the
# repair ceiling is deliberately left out of that denominator.
MAX_EXPLORE_REWARD = 1.0
MAX_GENERATE_REWARD = 1.0
MAX_REPAIR_REWARD = 1.0

# --- Success cutoff ---------------------------------------------------------
# NOTE(review): not referenced in this part of the file; presumably compared
# against a normalized episode score -- confirm the comparison (>= vs >) at
# the call site.
SUCCESS_SCORE_THRESHOLD = 0.3
def clamp_action_reward(value: float) -> float:
    """Clamp any single action reward to the required [0, 1] range.

    Args:
        value: Raw reward for one action. May be negative, greater than 1,
            infinite, or NaN.

    Returns:
        ``value`` limited to ``[0.0, 1.0]``. NaN maps to ``0.0`` so that an
        undefined reward earns no credit instead of leaking out of range.
    """
    # Deliberately ``not value > 0.0`` rather than ``value <= 0.0``: every
    # ordering comparison against NaN is False, so this one guard catches NaN
    # together with zero and negatives. A plain min(max(value, 0.0), 1.0)
    # would hand NaN back unchanged.
    if not value > 0.0:
        return 0.0
    # value is now a real positive number (possibly +inf); cap it at 1.
    return min(value, 1.0)
def normalized_episode_score(total_reward: float) -> float:
    """Normalize an episode's accumulated reward to the required [0, 1] range.

    The reward is divided by the best achievable explore + generate total
    (``MAX_EXPLORE_STEPS * MAX_EXPLORE_REWARD + MAX_GENERATE_REWARD``) and the
    quotient is clamped to ``[0.0, 1.0]``.

    Repair is intentionally not added to the denominator: repair rewards are
    discounted so a failed generate + successful repair should not beat a clean
    first-pass generation.

    Args:
        total_reward: Sum of the rewards collected over one episode.

    Returns:
        The normalized score in ``[0.0, 1.0]``. Returns ``0.0`` when the
        denominator is not positive or when the quotient is NaN.
    """
    max_possible = MAX_EXPLORE_STEPS * MAX_EXPLORE_REWARD + MAX_GENERATE_REWARD
    # A non-positive budget means there is nothing to normalize against;
    # report zero instead of dividing by it.
    if not max_possible > 0:
        return 0.0

    score = total_reward / max_possible
    # ``not score > 0.0`` is also True for NaN (all ordering comparisons with
    # NaN are False), so a NaN total cannot escape the [0, 1] contract the way
    # it would through min(max(score, 0.0), 1.0).
    if not score > 0.0:
        return 0.0
    return min(score, 1.0)