Quickstart Guide
This guide will get you up and running with POMDPPlanners in just a few minutes.
Your First POMDP Solution
Let’s solve the classic Tiger POMDP problem using POMCP:
from POMDPPlanners.environments.tiger_pomdp import TigerPOMDP
from POMDPPlanners.planners.mcts_planners.pomcp import POMCP
from POMDPPlanners.core.belief import get_initial_belief
# Create the environment and initial belief
env = TigerPOMDP(discount_factor=0.95)
belief = get_initial_belief(env, n_particles=500)
# Create the planner
planner = POMCP(
environment=env,
discount_factor=0.95,
depth=10,
exploration_constant=50.0,
name="tiger_planner",
n_simulations=1000,
)
# Plan: returns a list of actions (length=1 for closed-loop planning)
actions, run_data = planner.action(belief)
action = actions[0]
print(f"Recommended action: {action}")
# Execute: sample the next state, observation, and reward
state = belief.sample()
next_state, observation, reward = env.sample_next_step(state=state, action=action)
print(f"Observation: {observation}, Reward: {reward}")
Running a Complete Episode
Use run_episode to run a full episode with automatic belief updates:
from POMDPPlanners.simulations.episodes import run_episode
from POMDPPlanners.utils.logger import get_logger
logger = get_logger("quickstart")
history = run_episode(
environment=env,
policy=planner,
initial_belief=belief,
num_steps=20,
logger=logger,
)
total_reward = sum(step.reward for step in history.history if step.reward is not None)
print(f"Steps: {len(history.history)}, Total reward: {total_reward:.2f}")
# Each step exposes: action, observation, reward, state
for i, step in enumerate(history.history[:5]):
print(f"Step {i}: action={step.action}, obs={step.observation}, reward={step.reward}")
Core Concepts
Environments
Environments can be created directly or via EnvironmentConfigsAPI:
# Direct construction
from POMDPPlanners.environments.tiger_pomdp import TigerPOMDP
env = TigerPOMDP(discount_factor=0.95)
# Via config API (also returns a ready-made initial belief)
from POMDPPlanners.configs.environment_configs import EnvironmentConfigsAPI
config_api = EnvironmentConfigsAPI(discount_factor=0.95)
env, belief = config_api.tiger_pomdp_config(n_particles=500)
# Discrete environments expose their state/action/observation spaces
print(env.states) # ['tiger_left', 'tiger_right']
print(env.actions) # ['listen', 'open_left', 'open_right']
print(env.observations) # ['hear_left', 'hear_right', 'hear_nothing']
# Core interaction method
next_state, observation, reward = env.sample_next_step(state=state, action=action)
done = env.is_terminal(next_state)
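Putting these calls together, a minimal random-policy rollout looks like the sketch below. It bypasses belief tracking and planning entirely (run_episode, shown earlier, handles both) and relies only on the methods demonstrated above:
import random

state = belief.sample()
for _ in range(10):
    action = random.choice(env.actions)  # pick any available action at random
    state, observation, reward = env.sample_next_step(state=state, action=action)
    print(f"action={action}, obs={observation}, reward={reward}")
    if env.is_terminal(state):  # stop once a terminal state is reached
        break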
Belief States
from POMDPPlanners.core.belief import get_initial_belief
belief = get_initial_belief(env, n_particles=500)
# Sample a single state from the belief
state = belief.sample()
# Inspect the weighted distribution
distribution = belief.to_unique_support_distribution()
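Because the belief is particle-based, repeated sampling approximates the underlying distribution. A quick sanity check for the Tiger POMDP, sketched here using only belief.sample() and assuming the sampled states are hashable (they print as plain strings above):
from collections import Counter

counts = Counter(belief.sample() for _ in range(1000))
print(counts)  # expect roughly equal counts for 'tiger_left' and 'tiger_right'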
Planners
All planners share the same interface: planner.action(belief) returns
(List[action], PolicyRunData). A single-element list means closed-loop
(replans each step); a multi-element list means open-loop (executes the
sequence before replanning).
from POMDPPlanners.planners.mcts_planners.pomcp import POMCP
planner = POMCP(
environment=env,
discount_factor=0.95,
depth=10,
exploration_constant=50.0,
name="my_planner",
n_simulations=1000,
)
actions, run_data = planner.action(belief)
action = actions[0] # closed-loop: take the single planned action
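If a planner returns more than one action (open-loop), the whole sequence is executed before planning again. A minimal sketch of that execution loop, using only the environment interface shown earlier; a real loop would also update the belief with each observation before replanning (run_episode does this for you):
state = belief.sample()
for action in actions:  # open-loop: execute the entire planned sequence
    state, observation, reward = env.sample_next_step(state=state, action=action)
    if env.is_terminal(state):
        break
# ...then replan with planner.action(...) once the belief has been updated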
Continuous Action Spaces
For environments with continuous actions, pair PFT_DPW with an action sampler:
import numpy as np
from POMDPPlanners.environments.light_dark_pomdp.continuous_light_dark_pomdp import (
ContinuousLightDarkPOMDP, RewardModelType,
)
from POMDPPlanners.planners.mcts_planners.pft_dpw import PFT_DPW
from POMDPPlanners.planners.planners_utils.dpw import ActionSampler
from POMDPPlanners.core.belief import get_initial_belief
env = ContinuousLightDarkPOMDP(
discount_factor=0.95,
goal_state=np.array([10, 5]),
start_state=np.array([0, 5]),
reward_model_type=RewardModelType.STANDARD,
)
class VelocityActionSampler(ActionSampler):
def sample(self, belief_node=None):
angle = np.random.uniform(0, 2 * np.pi)
speed = np.random.uniform(0, 1.0)
return np.array([speed * np.cos(angle), speed * np.sin(angle)])
planner = PFT_DPW(
environment=env,
discount_factor=0.95,
depth=10,
name="navigation_planner",
action_sampler=VelocityActionSampler(),
n_simulations=500,
)
belief = get_initial_belief(env, n_particles=500)
actions, _ = planner.action(belief)
print(f"Navigation action: {actions[0]}")
Comparing Planners
Use LocalSimulationsAPI to run a statistically rigorous comparison of multiple planners across multiple environments:
from pathlib import Path
from POMDPPlanners.configs.environment_configs import EnvironmentConfigsAPI
from POMDPPlanners.planners.mcts_planners.pomcpow import POMCPOW
from POMDPPlanners.planners.mcts_planners.pft_dpw import PFT_DPW
from POMDPPlanners.simulations.simulation_apis.local_simulations_api import LocalSimulationsAPI
from POMDPPlanners.core.simulation import EnvironmentRunParams
from POMDPPlanners.utils.action_samplers import DiscreteActionSampler
config_api = EnvironmentConfigsAPI(discount_factor=0.95)
env, belief = config_api.continuous_observations_discrete_actions_light_dark_pomdp_config(
n_particles=500
)
action_sampler = DiscreteActionSampler(actions=env.get_actions())
planners = [
POMCPOW(
environment=env, discount_factor=0.95, depth=10,
exploration_constant=100.0, k_o=10, k_a=4,
alpha_o=0.01, alpha_a=0.01,
action_sampler=action_sampler, n_simulations=1500,
name="POMCPOW",
),
PFT_DPW(
environment=env, discount_factor=0.95, depth=10,
k_a=4, alpha_a=0.01, k_o=10, alpha_o=0.01,
exploration_constant=100.0, action_sampler=action_sampler,
n_simulations=1500, name="PFT_DPW",
),
]
run_params = [
EnvironmentRunParams(
environment=env, belief=belief, policies=planners,
num_episodes=100, num_steps=30,
)
]
api = LocalSimulationsAPI(cache_dir_path=Path("./results"))
results, stats_df = api.run_multiple_environments_and_policies_with_initial_debug_run(
environment_run_params=run_params,
alpha=0.05,
confidence_interval_level=0.95,
experiment_name="planner_comparison",
n_jobs=-1,
)
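stats_df summarises per-planner performance. Assuming it is a pandas DataFrame with the same summary columns as the tuning example below (an assumption, since only the tuning output is shown there), you can inspect it directly:
print(stats_df.head())
# columns such as policy_name, mean_total_return, ci_lower and ci_upper are
# expected here, mirroring the hyperparameter-tuning example below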
Hyperparameter Tuning
Automatically find the best hyperparameters using Optuna, then evaluate the optimized policy:
from pathlib import Path
from POMDPPlanners.environments.tiger_pomdp import TigerPOMDP
from POMDPPlanners.planners.mcts_planners.pomcp import POMCP
from POMDPPlanners.core.belief import get_initial_belief
from POMDPPlanners.core.simulation import NumericalHyperParameter
from POMDPPlanners.core.simulation.hyperparameter_tuning import (
HyperParamPlannerConfig, HyperParameterRunParams,
HyperParameterOptimizationDirection,
)
from POMDPPlanners.simulations.simulation_apis.local_simulations_api import LocalSimulationsAPI
env = TigerPOMDP(discount_factor=0.95)
belief = get_initial_belief(env, n_particles=200)
optimization_config = HyperParameterRunParams(
environment=env,
belief=belief,
hyper_param_planner_config=HyperParamPlannerConfig(
policy_cls=POMCP,
hyper_parameters=[
NumericalHyperParameter(0.1, 100.0, "exploration_constant"),
NumericalHyperParameter(3, 10, "depth"),
],
constant_parameters={
"discount_factor": 0.95,
"n_simulations": 500,
"name": "OptimizedPOMCP",
},
),
num_episodes=20,
num_steps=30,
n_trials=50,
parameters_to_optimize=[
("average_return", HyperParameterOptimizationDirection.MAXIMIZE)
],
)
api = LocalSimulationsAPI(cache_dir_path=Path("./tuning_results"), debug=True)
# Optimize then evaluate in one call
results, stats_df = api.run_optimize_and_evaluate(
configs=[optimization_config],
evaluation_episodes=100,
evaluation_steps=30,
evaluation_n_jobs=-1,
optimization_n_jobs=-1,
confidence_interval_level=0.95,
alpha=0.05,
experiment_name="tiger_pomcp_tuning",
)
print(stats_df[["environment_name", "policy_name", "mean_total_return", "ci_lower", "ci_upper"]])
Use predefined search spaces from PlannersHyperparamConfigs to skip
writing parameter ranges by hand:
from POMDPPlanners.configs.planners_hyperparam_configs import PlannersHyperparamConfigs
from POMDPPlanners.utils.action_samplers import DiscreteActionSampler
action_sampler = DiscreteActionSampler(actions=env.get_actions())
planner_configs = PlannersHyperparamConfigs(discount_factor=0.95)
predefined = planner_configs.pomcpow_config(
env=env, action_sampler=action_sampler, name="POMCPOW_Tuned"
)
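Assuming pomcpow_config returns a HyperParamPlannerConfig (the same type built by hand above; the return type is not shown here, so treat this as a sketch), the predefined search space can be dropped straight into HyperParameterRunParams:
optimization_config = HyperParameterRunParams(
    environment=env,
    belief=belief,
    hyper_param_planner_config=predefined,  # predefined search space instead of hand-written ranges
    num_episodes=20,
    num_steps=30,
    n_trials=50,
    parameters_to_optimize=[
        ("average_return", HyperParameterOptimizationDirection.MAXIMIZE)
    ],
)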
Viewing Results
All simulation runs and optimization trials are tracked in MLflow. After any run, launch the UI from the cache directory:
cd ./results # or whichever cache_dir_path you used
mlflow ui
Then open http://localhost:5000 to browse metrics, compare runs, and inspect confidence intervals.
Available Environments
from POMDPPlanners.configs.environment_configs import EnvironmentConfigsAPI
config_api = EnvironmentConfigsAPI(discount_factor=0.95)
# Classic
env, belief = config_api.tiger_pomdp_config(n_particles=500)
# Navigation (discrete actions, continuous observations)
env, belief = config_api.continuous_observations_discrete_actions_light_dark_pomdp_config(n_particles=500)
# Navigation (fully continuous)
env, belief = config_api.continuous_observations_continuous_actions_light_dark_pomdp_config(n_particles=500)
# Manipulation
env, belief = config_api.push_pomdp_config(n_particles=500)
# Classic control
env, belief = config_api.cartpole_pomdp_config(n_particles=500)
env, belief = config_api.mountain_car_pomdp_config(n_particles=500)
Available Planners
# POMCP — discrete actions and observations
from POMDPPlanners.planners.mcts_planners.pomcp import POMCP
planner = POMCP(environment=env, discount_factor=0.95, depth=10,
exploration_constant=50.0, name="pomcp", n_simulations=1000)
# POMCPOW — continuous actions/observations via double progressive widening
from POMDPPlanners.planners.mcts_planners.pomcpow import POMCPOW
planner = POMCPOW(environment=env, discount_factor=0.95, depth=10,
exploration_constant=100.0,
k_o=10, k_a=4, alpha_o=0.01, alpha_a=0.01,
action_sampler=action_sampler, n_simulations=1500, name="pomcpow")
# PFT-DPW — particle filter trees with double progressive widening
from POMDPPlanners.planners.mcts_planners.pft_dpw import PFT_DPW
planner = PFT_DPW(environment=env, discount_factor=0.95, depth=10,
k_a=4, alpha_a=0.01, k_o=10, alpha_o=0.01,
exploration_constant=100.0, action_sampler=action_sampler,
n_simulations=1500, name="pft_dpw")
# Sparse Sampling — simple model-based baseline (depth=2, branching_factor=10)
from POMDPPlanners.planners.sparse_sampling_planners.sparse_sampling import SparseSamplingDiscreteActionsPlanner
planner = SparseSamplingDiscreteActionsPlanner(env, branching_factor=10, depth=2)
Next Steps
Run the example notebooks:
jupyter notebook docs/examples/basic_usage.ipynb
jupyter notebook docs/examples/planners_comparison.ipynb
jupyter notebook docs/examples/hyperparameter_tuning.ipynb
jupyter notebook docs/examples/advanced_optimization.ipynb
API Reference
Browse the complete API documentation: POMDPPlanners package