diff --git a/include/cuda_mppi_controller/mppi_gpu.hpp b/include/cuda_mppi_controller/mppi_gpu.hpp
index 4f635148..3cfc571d 100644
--- a/include/cuda_mppi_controller/mppi_gpu.hpp
+++ b/include/cuda_mppi_controller/mppi_gpu.hpp
@@ -83,6 +83,10 @@ struct MppiResult
   float vy = 0.0f;
   float w = 0.0f;
   float best_cost = 0.0f;   // min sampled trajectory cost (collision diagnosis)
+  float mean_cost = 0.0f;   // mean sampled trajectory cost from the final iteration
+  int sampled_rollouts = 0;
+  int valid_rollouts = 0;    // sampled trajectories with no collision-cost hit
+  float valid_rollout_ratio = 0.0f;
   bool all_colliding = false;
   bool retreating = false;   // true when command is a recovery back-out action
 };
diff --git a/python/core/include/cuda_mppi_controller/mppi_gpu.hpp b/python/core/include/cuda_mppi_controller/mppi_gpu.hpp
index 4f635148..3cfc571d 100644
--- a/python/core/include/cuda_mppi_controller/mppi_gpu.hpp
+++ b/python/core/include/cuda_mppi_controller/mppi_gpu.hpp
@@ -83,6 +83,10 @@ struct MppiResult
   float vy = 0.0f;
   float w = 0.0f;
   float best_cost = 0.0f;   // min sampled trajectory cost (collision diagnosis)
+  float mean_cost = 0.0f;   // mean sampled trajectory cost from the final iteration
+  int sampled_rollouts = 0;
+  int valid_rollouts = 0;    // sampled trajectories with no collision-cost hit
+  float valid_rollout_ratio = 0.0f;
   bool all_colliding = false;
   bool retreating = false;   // true when command is a recovery back-out action
 };
diff --git a/python/core/src/mppi_gpu.cu b/python/core/src/mppi_gpu.cu
index 046ee5dc..5439122b 100644
--- a/python/core/src/mppi_gpu.cu
+++ b/python/core/src/mppi_gpu.cu
@@ -707,7 +707,19 @@ MppiResult MppiGpu::computeInternal(
   }
 
   MppiResult res;
+  res.sampled_rollouts = K;
+  double cost_sum = 0.0;
+  int valid_rollouts = 0;
+  for (const float cost : im.h_costs) {
+    cost_sum += cost;
+    if (cost < mp.collision_cost) {
+      ++valid_rollouts;
+    }
+  }
   res.best_cost = min_cost;
+  res.mean_cost = static_cast<float>(cost_sum / static_cast<double>(K));
+  res.valid_rollouts = valid_rollouts;
+  res.valid_rollout_ratio = static_cast<float>(valid_rollouts) / static_cast<float>(K);
   res.all_colliding = min_cost >= mp.collision_cost;
 
   if (res.all_colliding) {
diff --git a/python/src/cudarobotics/bindings.cpp b/python/src/cudarobotics/bindings.cpp
index c68dd78e..e65719b2 100644
--- a/python/src/cudarobotics/bindings.cpp
+++ b/python/src/cudarobotics/bindings.cpp
@@ -422,6 +422,10 @@ class PyMppiPlanner
 
     nb::dict info;
     info["best_cost"] = result.best_cost;
+    info["mean_cost"] = result.mean_cost;
+    info["sampled_rollouts"] = result.sampled_rollouts;
+    info["valid_rollouts"] = result.valid_rollouts;
+    info["valid_rollout_ratio"] = result.valid_rollout_ratio;
     info["all_colliding"] = result.all_colliding;
     info["retreating"] = result.retreating;
     return nb::make_tuple(result.v, result.vy, result.w, info);
@@ -737,6 +741,10 @@ NB_MODULE(_cudarobotics, m)
     .def_rw("vy", &cr::MppiResult::vy)
     .def_rw("w", &cr::MppiResult::w)
     .def_rw("best_cost", &cr::MppiResult::best_cost)
+    .def_rw("mean_cost", &cr::MppiResult::mean_cost)
+    .def_rw("sampled_rollouts", &cr::MppiResult::sampled_rollouts)
+    .def_rw("valid_rollouts", &cr::MppiResult::valid_rollouts)
+    .def_rw("valid_rollout_ratio", &cr::MppiResult::valid_rollout_ratio)
     .def_rw("all_colliding", &cr::MppiResult::all_colliding)
     .def_rw("retreating", &cr::MppiResult::retreating);
 
diff --git a/python/tests/test_import.py b/python/tests/test_import.py
index 18077796..43d0f8c9 100644
--- a/python/tests/test_import.py
+++ b/python/tests/test_import.py
@@ -37,6 +37,15 @@ def test_mppi_planner_smoke():
     assert isinstance(vy, float)
     assert isinstance(w, float)
     assert isinstance(info, dict)
+    assert {
+        "best_cost",
+        "mean_cost",
+        "sampled_rollouts",
+        "valid_rollouts",
+        "valid_rollout_ratio",
+        "all_colliding",
+        "retreating",
+    }.issubset(info)
 
 
 def test_mppi_planner_cuda_dlpack_costmap_smoke():
@@ -67,6 +76,7 @@ def test_mppi_planner_cuda_dlpack_costmap_smoke():
     assert isinstance(vy, float)
     assert isinstance(w, float)
     assert isinstance(info, dict)
+    assert "valid_rollout_ratio" in info
 
 
 @pytest.mark.parametrize(
diff --git a/ros2_ws/src/cuda_mppi_controller/README.md b/ros2_ws/src/cuda_mppi_controller/README.md
index e9483132..d50f0d2f 100644
--- a/ros2_ws/src/cuda_mppi_controller/README.md
+++ b/ros2_ws/src/cuda_mppi_controller/README.md
@@ -174,12 +174,20 @@ controller_server:
 | `yaw_goal_activation_dist` | 0.5 | [m] range to enable the yaw goal cost |
 | `lookahead_dist` | 3.0 | [m] global plan window fed to the GPU |
 | `transform_tolerance` | 0.1 | [s] TF lookup tolerance |
+| `diagnostics_log_period` | 0.0 | [s] periodic one-line solve/valid-rollout logging; 0 disables |
+| `diagnostics_csv_path` | `""` | optional per-cycle diagnostics CSV path |
 
 Parameters above are validated at configure time and during live ROS parameter
 updates. Invalid values, such as zero horizon length, non-positive model step,
 unknown motion models, or negative cost weights, are rejected before the GPU
 optimizer is rebuilt.
 
+Set `diagnostics_log_period` to a positive value for throttled controller logs,
+or set `diagnostics_csv_path` to capture one row per control cycle. The CSV
+includes solve time, best/mean rollout cost, valid rollout count and ratio,
+all-colliding/retreat flags, path window size, costmap size, and the selected
+command.
+
 ## Benchmark scenarios
 
 `controller_benchmark` runs closed-loop CPU vs GPU comparisons on synthetic maps:
@@ -188,12 +196,21 @@ optimizer is rebuilt.
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench wall_gap
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench narrow_corridor
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench u_turn
+ros2 run cuda_mppi_controller controller_benchmark /tmp/bench double_gap
+ros2 run cuda_mppi_controller controller_benchmark /tmp/bench moving_crossing quick
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench all
+ros2 run cuda_mppi_controller controller_benchmark /tmp/bench double_gap quick
+ros2 run cuda_mppi_controller controller_benchmark /tmp/bench double_gap cpu_gpu
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench esdf
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench path_angle
 ros2 run cuda_mppi_controller controller_benchmark /tmp/bench curvature_speed
 ```
 
+The optional preset is `full` by default. Use `quick` for GPU K=2,048/8,192
+smoke runs, or `cpu_gpu` for CPU K=2,000 vs GPU K=8,192. The `esdf`,
+`path_angle`, and `curvature_speed` benchmark families keep their fixed
+comparison sets.
+
 `all` also runs Ackermann/Omni GPU configs (`gpu_ackermann_K8192`, `gpu_omni_K8192`).
 `esdf` runs a GPU-only comparison of the default costmap critic against the
 optional distance-field clearance critic. Results:
diff --git a/ros2_ws/src/cuda_mppi_controller/config/cuda_mppi_params.example.yaml b/ros2_ws/src/cuda_mppi_controller/config/cuda_mppi_params.example.yaml
index 71574ef8..490c3fcc 100644
--- a/ros2_ws/src/cuda_mppi_controller/config/cuda_mppi_params.example.yaml
+++ b/ros2_ws/src/cuda_mppi_controller/config/cuda_mppi_params.example.yaml
@@ -42,3 +42,5 @@ controller_server:
       yaw_goal_activation_dist: 0.5
       lookahead_dist: 3.0       # [m] global plan window fed to the GPU
       transform_tolerance: 0.1
+      diagnostics_log_period: 0.0 # [s] 0 disables periodic solve/valid-rollout logs
+      diagnostics_csv_path: ""    # optional CSV trace path for per-cycle diagnostics
diff --git a/ros2_ws/src/cuda_mppi_controller/include/cuda_mppi_controller/cuda_mppi_controller.hpp b/ros2_ws/src/cuda_mppi_controller/include/cuda_mppi_controller/cuda_mppi_controller.hpp
index 6b9c5a10..21d9c532 100644
--- a/ros2_ws/src/cuda_mppi_controller/include/cuda_mppi_controller/cuda_mppi_controller.hpp
+++ b/ros2_ws/src/cuda_mppi_controller/include/cuda_mppi_controller/cuda_mppi_controller.hpp
@@ -2,6 +2,7 @@
 #define CUDA_MPPI_CONTROLLER__CUDA_MPPI_CONTROLLER_HPP_
 
 #include <memory>
+#include <fstream>
 #include <string>
 #include <vector>
 
@@ -48,6 +49,12 @@ class CudaMppiController : public nav2_core::Controller
   void reset();
 
 private:
+  struct DiagnosticsCsv  // sink for the optional per-cycle diagnostics CSV
+  {
+    std::ofstream file;    // output stream; stays closed while no CSV path is configured
+    bool enabled = false;  // set to true by openDiagnosticsCsv after `file` opens successfully
+  };
+
   // Extract the local window of the global plan around the robot, transformed
   // into the costmap global frame. Returns flattened [x0,y0,x1,y1,...] points;
   // sets goal pose (window end) and whether it is the true final goal.
@@ -55,6 +62,11 @@ class CudaMppiController : public nav2_core::Controller
     const geometry_msgs::msg::PoseStamped & robot_pose,
     float & goal_x, float & goal_y, float & goal_yaw, bool & goal_is_final);
 
+  DiagnosticsCsv openDiagnosticsCsv(const std::string & path) const;
+  void emitDiagnostics(
+    const MppiResult & result, double solve_ms, int path_points,
+    int costmap_size_x, int costmap_size_y);
+
   rclcpp_lifecycle::LifecycleNode::WeakPtr node_;
   std::string name_;
   std::shared_ptr<tf2_ros::Buffer> tf_;
@@ -67,6 +79,11 @@ class CudaMppiController : public nav2_core::Controller
 
   double lookahead_dist_ = 3.0;
   double transform_tolerance_ = 0.1;
+  double diagnostics_log_period_ = 0.0;
+  std::string diagnostics_csv_path_;
+  DiagnosticsCsv diagnostics_csv_;
+  rclcpp::Time last_diagnostics_log_time_{0, 0, RCL_ROS_TIME};
+  bool has_diagnostics_log_time_ = false;
   rclcpp::node_interfaces::OnSetParametersCallbackHandle::SharedPtr param_callback_;
   bool updateParamsFromNode(const rclcpp_lifecycle::LifecycleNode::SharedPtr & node);
 };
diff --git a/ros2_ws/src/cuda_mppi_controller/src/cuda_mppi_controller.cpp b/ros2_ws/src/cuda_mppi_controller/src/cuda_mppi_controller.cpp
index 3798ac72..f63a318c 100644
--- a/ros2_ws/src/cuda_mppi_controller/src/cuda_mppi_controller.cpp
+++ b/ros2_ws/src/cuda_mppi_controller/src/cuda_mppi_controller.cpp
@@ -2,6 +2,7 @@
 #include "cuda_mppi_controller/nav2_compat.hpp"
 
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <limits>
 #include <mutex>
@@ -73,7 +74,7 @@ void requireNonNegative(const std::string & name, double value)
 }
 
 void validateControllerParams(const MppiParams & params, double lookahead_dist,
-                              double transform_tolerance)
+                              double transform_tolerance, double diagnostics_log_period)
 {
   requireParam(params.batch_size > 0, "batch_size", "must be greater than 0");
   requireParam(params.time_steps > 0, "time_steps", "must be greater than 0");
@@ -114,11 +115,15 @@ void validateControllerParams(const MppiParams & params, double lookahead_dist,
 
   requirePositive("lookahead_dist", lookahead_dist);
   requireNonNegative("transform_tolerance", transform_tolerance);
+  requireNonNegative("diagnostics_log_period", diagnostics_log_period);
 }
 
 bool applyControllerParameter(const std::string & key, const rclcpp::Parameter & parameter,
                               MppiParams & params, double & lookahead_dist,
-                              double & transform_tolerance)
+                              double & transform_tolerance,
+                              double & diagnostics_log_period,
+                              std::string & diagnostics_csv_path,
+                              bool & optimizer_params_changed)
 {
   if (key == "batch_size") {
     params.batch_size = static_cast<int>(parameter.as_int());
@@ -190,9 +195,16 @@ bool applyControllerParameter(const std::string & key, const rclcpp::Parameter &
     lookahead_dist = parameter.as_double();
   } else if (key == "transform_tolerance") {
     transform_tolerance = parameter.as_double();
+  } else if (key == "diagnostics_log_period") {
+    diagnostics_log_period = parameter.as_double();
+    return true;
+  } else if (key == "diagnostics_csv_path") {
+    diagnostics_csv_path = parameter.as_string();
+    return true;
   } else {
     return false;
   }
+  optimizer_params_changed = true;
   return true;
 }
 
@@ -208,6 +220,8 @@ bool CudaMppiController::updateParamsFromNode(
   MppiParams next = params_;
   double next_lookahead_dist = lookahead_dist_;
   double next_transform_tolerance = transform_tolerance_;
+  double next_diagnostics_log_period = diagnostics_log_period_;
+  std::string next_diagnostics_csv_path = diagnostics_csv_path_;
 
   int batch_size = next.batch_size;
   int time_steps = next.time_steps;
@@ -273,6 +287,8 @@ bool CudaMppiController::updateParamsFromNode(
   node->get_parameter(name_ + ".retreat_scale", retreat_scale);
   node->get_parameter(name_ + ".lookahead_dist", next_lookahead_dist);
   node->get_parameter(name_ + ".transform_tolerance", next_transform_tolerance);
+  node->get_parameter(name_ + ".diagnostics_log_period", next_diagnostics_log_period);
+  node->get_parameter(name_ + ".diagnostics_csv_path", next_diagnostics_csv_path);
 
   next.batch_size = batch_size;
   next.time_steps = time_steps;
@@ -308,11 +324,14 @@ bool CudaMppiController::updateParamsFromNode(
   next.enable_retreat = enable_retreat;
   next.retreat_scale = static_cast<float>(retreat_scale);
 
-  validateControllerParams(next, next_lookahead_dist, next_transform_tolerance);
+  validateControllerParams(
+    next, next_lookahead_dist, next_transform_tolerance, next_diagnostics_log_period);
 
   params_ = next;
   lookahead_dist_ = next_lookahead_dist;
   transform_tolerance_ = next_transform_tolerance;
+  diagnostics_log_period_ = next_diagnostics_log_period;
+  diagnostics_csv_path_ = next_diagnostics_csv_path;
   return true;
 }
 
@@ -373,8 +392,11 @@ void CudaMppiController::configure(
   declare_param("retreat_scale", static_cast<double>(params_.retreat_scale));
   declare_param("lookahead_dist", lookahead_dist_);
   declare_param("transform_tolerance", transform_tolerance_);
+  declare_param("diagnostics_log_period", diagnostics_log_period_);
+  declare_param("diagnostics_csv_path", diagnostics_csv_path_);
 
   updateParamsFromNode(node);
+  diagnostics_csv_ = openDiagnosticsCsv(diagnostics_csv_path_);
 
   optimizer_ = std::make_unique<MppiGpu>(params_);
 
@@ -386,30 +408,49 @@ void CudaMppiController::configure(
       MppiParams next_params = params_;
       double next_lookahead_dist = lookahead_dist_;
       double next_transform_tolerance = transform_tolerance_;
+      double next_diagnostics_log_period = diagnostics_log_period_;
+      std::string next_diagnostics_csv_path = diagnostics_csv_path_;
       bool changed = false;
+      bool optimizer_params_changed = false;
       for (const auto & parameter : parameters) {
         const std::string & full_name = parameter.get_name();
         if (full_name.rfind(prefix, 0) != 0) {
           continue;
         }
         const std::string key = full_name.substr(prefix.size());
-        changed = applyControllerParameter(key, parameter, next_params, next_lookahead_dist,
-                                           next_transform_tolerance) ||
+        changed = applyControllerParameter(
+          key, parameter, next_params, next_lookahead_dist, next_transform_tolerance,
+          next_diagnostics_log_period, next_diagnostics_csv_path, optimizer_params_changed) ||
                   changed;
       }
       if (!changed) {
         return result;
       }
       try {
-        validateControllerParams(next_params, next_lookahead_dist, next_transform_tolerance);
+        validateControllerParams(
+          next_params, next_lookahead_dist, next_transform_tolerance,
+          next_diagnostics_log_period);
         std::unique_ptr<MppiGpu> next_optimizer;
-        if (optimizer_) {
+        if (optimizer_ && optimizer_params_changed) {
           next_optimizer = std::make_unique<MppiGpu>(next_params);
         }
+        DiagnosticsCsv next_diagnostics_csv;
+        const bool diagnostics_csv_changed =
+          next_diagnostics_csv_path != diagnostics_csv_path_;
+        if (diagnostics_csv_changed) {
+          next_diagnostics_csv = openDiagnosticsCsv(next_diagnostics_csv_path);
+        }
         params_ = next_params;
         lookahead_dist_ = next_lookahead_dist;
         transform_tolerance_ = next_transform_tolerance;
-        optimizer_ = std::move(next_optimizer);
+        diagnostics_log_period_ = next_diagnostics_log_period;
+        diagnostics_csv_path_ = next_diagnostics_csv_path;
+        if (optimizer_params_changed) {
+          optimizer_ = std::move(next_optimizer);
+        }
+        if (diagnostics_csv_changed) {
+          diagnostics_csv_ = std::move(next_diagnostics_csv);
+        }
       } catch (const std::exception & ex) {
         result.successful = false;
         result.reason = ex.what();
@@ -427,6 +468,7 @@ void CudaMppiController::cleanup()
 {
   param_callback_.reset();
   optimizer_.reset();
+  diagnostics_csv_ = DiagnosticsCsv{};
 }
 
 void CudaMppiController::activate()
@@ -452,6 +494,119 @@ void CudaMppiController::reset()
   }
 }
 
+// Opens the diagnostics CSV sink for `path`.
+//
+// An empty path returns a disabled sink without touching the filesystem. Otherwise the file
+// is opened in append mode so rows from earlier runs are kept, and the column header is
+// written only when the file has no content yet; appending a second header in the middle of
+// existing rows would break consumers that expect a single table.
+//
+// Throws std::runtime_error when the file cannot be opened.
+CudaMppiController::DiagnosticsCsv CudaMppiController::openDiagnosticsCsv(
+  const std::string & path) const
+{
+  DiagnosticsCsv output;
+  if (path.empty()) {
+    return output;
+  }
+
+  output.file.open(path, std::ios::out | std::ios::app);
+  if (!output.file.is_open()) {
+    throw std::runtime_error(
+            "CudaMppiController: failed to open diagnostics_csv_path '" + path + "'");
+  }
+  output.enabled = true;
+
+  // Find out how much data the file already holds. Seek explicitly first: a freshly opened
+  // append stream may still report position 0 until its first write.
+  output.file.seekp(0, std::ios::end);
+  const std::streamoff existing_bytes =
+    output.file.fail() ? std::streamoff(0) : static_cast<std::streamoff>(output.file.tellp());
+  // A failed seek/tell (for example on a non-seekable target) must not block later writes;
+  // in that case fall back to always emitting the header.
+  output.file.clear();
+
+  if (existing_bytes <= 0) {
+    output.file
+      << "stamp_sec,solve_ms,best_cost,mean_cost,sampled_rollouts,valid_rollouts,"
+      << "valid_rollout_ratio,all_colliding,retreating,path_points,costmap_size_x,"
+      << "costmap_size_y,cmd_v,cmd_vy,cmd_w\n";
+  }
+  return output;
+}
+
+// Reports the diagnostics of one control cycle.
+//
+// result            optimizer output for this cycle (costs, rollout counts, flags, command)
+// solve_ms          duration of the optimizer call, in milliseconds
+// path_points       number of (x, y) points in the local path window
+// costmap_size_x/y  costmap dimensions, in cells
+//
+// When diagnostics_log_period_ is positive and the node is still alive, one INFO line is
+// logged at most once per period. When the CSV sink is enabled, one row is appended whose
+// columns follow the header written by openDiagnosticsCsv.
+void CudaMppiController::emitDiagnostics(
+  const MppiResult & result, double solve_ms, int path_points,
+  int costmap_size_x, int costmap_size_y)
+{
+  const auto node = node_.lock();
+  const rclcpp::Time now = node ? node->now() : rclcpp::Clock(RCL_ROS_TIME).now();
+
+  if (diagnostics_log_period_ > 0.0 && node) {
+    bool due = !has_diagnostics_log_time_;
+    if (!due) {
+      const double elapsed = (now - last_diagnostics_log_time_).seconds();
+      // A negative elapsed time means the clock moved backwards (for example simulated time
+      // was restarted); log immediately rather than stay silent until the old stamp is
+      // reached again.
+      due = elapsed < 0.0 || elapsed >= diagnostics_log_period_;
+    }
+    if (due) {
+      last_diagnostics_log_time_ = now;
+      has_diagnostics_log_time_ = true;
+      RCLCPP_INFO(
+        logger_,
+        "CUDA MPPI diagnostics: solve=%.2f ms valid=%d/%d (%.1f%%) best=%.3f mean=%.3f "
+        "retreat=%s cmd=(%.3f, %.3f, %.3f)",
+        solve_ms, result.valid_rollouts, result.sampled_rollouts,
+        100.0 * result.valid_rollout_ratio, result.best_cost, result.mean_cost,
+        result.retreating ? "true" : "false", result.v, result.vy, result.w);
+    }
+  }
+
+  if (diagnostics_csv_.enabled && diagnostics_csv_.file.is_open()) {
+    std::ofstream & csv = diagnostics_csv_.file;
+
+    // Stamps can be epoch-sized (~1e9 s). With the stream's default 6 significant digits
+    // such a value prints as e.g. 1.71235e+09, so consecutive rows would be
+    // indistinguishable. Write the stamp in fixed notation with microsecond digits, then
+    // restore the previous formatting so the other columns are unchanged.
+    const std::ios_base::fmtflags saved_flags = csv.flags();
+    const std::streamsize saved_precision = csv.precision();
+    csv.setf(std::ios_base::fixed, std::ios_base::floatfield);
+    csv.precision(6);
+    csv << now.seconds() << ',';
+    csv.flags(saved_flags);
+    csv.precision(saved_precision);
+
+    csv
+      << solve_ms << ','
+      << result.best_cost << ','
+      << result.mean_cost << ','
+      << result.sampled_rollouts << ','
+      << result.valid_rollouts << ','
+      << result.valid_rollout_ratio << ','
+      << (result.all_colliding ? 1 : 0) << ','
+      << (result.retreating ? 1 : 0) << ','
+      << path_points << ','
+      << costmap_size_x << ','
+      << costmap_size_y << ','
+      << result.v << ','
+      << result.vy << ','
+      << result.w << '\n';
+  }
+}
+
 std::vector<float> CudaMppiController::extractLocalPath(
   const geometry_msgs::msg::PoseStamped & robot_pose,
   float & goal_x, float & goal_y, float & goal_yaw, bool & goal_is_final)
@@ -541,22 +648,32 @@ geometry_msgs::msg::TwistStamped CudaMppiController::computeVelocityCommands(
 
   nav2_costmap_2d::Costmap2D * costmap = costmap_ros_->getCostmap();
   MppiResult result;
+  int costmap_size_x = 0;
+  int costmap_size_y = 0;
+  double solve_ms = 0.0;
   {
     std::unique_lock<nav2_costmap_2d::Costmap2D::mutex_t> lock(*costmap->getMutex());
+    costmap_size_x = static_cast<int>(costmap->getSizeInCellsX());
+    costmap_size_y = static_cast<int>(costmap->getSizeInCellsY());
+    const auto solve_start = std::chrono::steady_clock::now();
     result = optimizer_->compute(
       static_cast<float>(pose.pose.position.x),
       static_cast<float>(pose.pose.position.y),
       static_cast<float>(tf2::getYaw(pose.pose.orientation)),
       costmap->getCharMap(),
-      static_cast<int>(costmap->getSizeInCellsX()),
-      static_cast<int>(costmap->getSizeInCellsY()),
+      costmap_size_x,
+      costmap_size_y,
       static_cast<float>(costmap->getOriginX()),
       static_cast<float>(costmap->getOriginY()),
       static_cast<float>(costmap->getResolution()),
       path_xy.data(), static_cast<int>(path_xy.size() / 2),
       goal_x, goal_y, goal_yaw, goal_is_final,
       footprint_xy.data(), static_cast<int>(footprint_xy.size() / 2));
+    const auto solve_end = std::chrono::steady_clock::now();
+    solve_ms = std::chrono::duration<double, std::milli>(solve_end - solve_start).count();
   }
+  emitDiagnostics(
+    result, solve_ms, static_cast<int>(path_xy.size() / 2), costmap_size_x, costmap_size_y);
 
   if (result.all_colliding && !result.retreating) {
     throw NoValidControl(
diff --git a/ros2_ws/src/cuda_mppi_controller/test/controller_benchmark.cpp b/ros2_ws/src/cuda_mppi_controller/test/controller_benchmark.cpp
index e6c77763..e5b6c046 100644
--- a/ros2_ws/src/cuda_mppi_controller/test/controller_benchmark.cpp
+++ b/ros2_ws/src/cuda_mppi_controller/test/controller_benchmark.cpp
@@ -3,10 +3,12 @@
 // nav2's controller_server loads them, driving the same unicycle plant
 // through synthetic costmaps.
 //
-// Usage: controller_benchmark <out_dir> [scenario]
-//   scenario: wall_gap | narrow_corridor | u_turn | all | esdf | path_angle | curvature_speed
+// Usage: controller_benchmark <out_dir> [scenario] [preset]
+//   scenario: wall_gap | narrow_corridor | u_turn | double_gap | moving_crossing
+//             | all | esdf | path_angle | curvature_speed
 //             (default: wall_gap)
-//   writes <out_dir>/summary.csv and <out_dir>/traj_<label>.csv
+//   preset  : full | quick | cpu_gpu (standard scenarios only; default: full)
+//   writes <out_dir>/<scenario>/summary.csv and <out_dir>/<scenario>/traj_<label>.csv
 #include <algorithm>
 #include <array>
 #include <chrono>
@@ -47,7 +49,12 @@ struct Scenario
   double goal_x;
   double goal_y;
   std::function<bool(double, double)> lethal;
+  std::function<bool(double, double, double)> dynamic_lethal;
+  std::function<void(nav2_costmap_2d::Costmap2D &, double)> paint_costmap;
   std::function<nav_msgs::msg::Path(const rclcpp::Time &)> make_plan;
+  bool dynamic_costmap = false;
+  int dynamic_repaint_period = 1;
+  bool inflate_costmap = true;
 };
 
 bool inRect(double x, double y, double x0, double x1, double y0, double y1)
@@ -55,6 +62,62 @@ bool inRect(double x, double y, double x0, double x1, double y0, double y1)
   return x >= x0 && x < x1 && y >= y0 && y < y1;
 }
 
+bool scenarioLethal(const Scenario & scenario, double x, double y, double sim_time)
+{
+  const bool hits_static = scenario.lethal && scenario.lethal(x, y);  // unset hook => no hit
+  return hits_static || (scenario.dynamic_lethal && scenario.dynamic_lethal(x, y, sim_time));
+}
+
+void clearCostmap(nav2_costmap_2d::Costmap2D & costmap)
+{
+  const unsigned int cells_x = costmap.getSizeInCellsX(), cells_y = costmap.getSizeInCellsY();
+  costmap.resetMapToValue(0, 0, cells_x, cells_y, nav2_costmap_2d::FREE_SPACE);  // whole grid
+}
+
+void paintRect(
+  nav2_costmap_2d::Costmap2D & costmap,
+  double x0, double x1, double y0, double y1, unsigned char cost)
+{
+  // Inclusive cell span per axis: floor of the low edge through ceil of the high edge,
+  // clamped to the grid. When a span comes out empty (lo > hi) the loops below simply do
+  // not run, so nothing is painted.
+  const double res = costmap.getResolution();
+  const double ox = costmap.getOriginX();
+  const double oy = costmap.getOriginY();
+  const int last_col = static_cast<int>(costmap.getSizeInCellsX()) - 1;
+  const int last_row = static_cast<int>(costmap.getSizeInCellsY()) - 1;
+  const int col_lo = std::max(0, static_cast<int>(std::floor((x0 - ox) / res)));
+  const int col_hi = std::min(last_col, static_cast<int>(std::ceil((x1 - ox) / res)));
+  const int row_lo = std::max(0, static_cast<int>(std::floor((y0 - oy) / res)));
+  const int row_hi = std::min(last_row, static_cast<int>(std::ceil((y1 - oy) / res)));
+  for (int row = row_lo; row <= row_hi; ++row) {
+    for (int col = col_lo; col <= col_hi; ++col) {
+      costmap.setCost(static_cast<unsigned int>(col), static_cast<unsigned int>(row), cost);
+    }
+  }
+}
+
+void appendPathSegment(
+  nav_msgs::msg::Path & path,
+  double x0, double y0, double x1, double y1,
+  double step = 0.05)
+{
+  // Samples the segment every ~`step` (both endpoints included) with identity orientation.
+  // On a non-empty path the first sample is skipped so chained segments share one joint pose.
+  const double dx = x1 - x0;
+  const double dy = y1 - y0;
+  const int intervals = std::max(1, static_cast<int>(std::hypot(dx, dy) / step));
+  geometry_msgs::msg::PoseStamped pose;
+  pose.header = path.header;
+  pose.pose.orientation.w = 1.0;
+  for (int k = path.poses.empty() ? 0 : 1; k <= intervals; ++k) {
+    const double t = static_cast<double>(k) / intervals;
+    pose.pose.position.x = x0 + t * dx;
+    pose.pose.position.y = y0 + t * dy;
+    path.poses.push_back(pose);
+  }
+}
+
 Scenario makeWallGap()
 {
   Scenario s;
@@ -132,45 +195,109 @@ Scenario makeUTurn()
       const std::array<std::array<double, 2>, 4> pts = {{
         {1.5, 1.5}, {8.5, 1.5}, {8.5, 8.5}, {1.5, 8.5}}};
       for (size_t seg = 0; seg + 1 < pts.size(); ++seg) {
-        const double x0 = pts[seg][0], y0 = pts[seg][1];
-        const double x1 = pts[seg + 1][0], y1 = pts[seg + 1][1];
-        const double len = std::hypot(x1 - x0, y1 - y0);
-        const int steps = std::max(1, static_cast<int>(len / 0.05));
-        for (int i = 0; i <= steps; ++i) {
-          const double t = static_cast<double>(i) / steps;
-          geometry_msgs::msg::PoseStamped p;
-          p.header = path.header;
-          p.pose.position.x = x0 + t * (x1 - x0);
-          p.pose.position.y = y0 + t * (y1 - y0);
-          p.pose.orientation.w = 1.0;
-          path.poses.push_back(p);
-        }
+        appendPathSegment(path, pts[seg][0], pts[seg][1], pts[seg + 1][0], pts[seg + 1][1]);
       }
       return path;
     };
   return s;
 }
 
+Scenario makeDoubleGap()
+{
+  Scenario s;
+  s.name = "double_gap";
+  s.start_x = 1.0;
+  s.start_y = 5.0;
+  s.goal_x = 9.0;
+  s.goal_y = 5.0;
+  // Two thin walls with one opening each (y in [2.2, 3.8) and [6.2, 7.8)); the plan threads both.
+  s.lethal = [](double x, double y) {
+      if (inRect(x, y, 3.8, 4.0, 0.0, 2.2) || inRect(x, y, 3.8, 4.0, 3.8, 10.0)) {
+        return true;
+      }
+      return inRect(x, y, 6.0, 6.2, 0.0, 6.2) || inRect(x, y, 6.0, 6.2, 7.8, 10.0);
+    };
+  s.make_plan = [](const rclcpp::Time & stamp) {
+      nav_msgs::msg::Path plan;
+      plan.header.frame_id = "odom";
+      plan.header.stamp = stamp;
+      const std::array<std::array<double, 2>, 6> wp = {{
+        {1.0, 5.0}, {3.4, 3.0}, {4.4, 3.0}, {5.6, 7.0}, {6.6, 7.0}, {9.0, 5.0}}};
+      for (size_t i = 1; i < wp.size(); ++i) {
+        appendPathSegment(plan, wp[i - 1][0], wp[i - 1][1], wp[i][0], wp[i][1]);
+      }
+      return plan;
+    };
+  return s;
+}
+
+Scenario makeMovingCrossing()
+{
+  // Open field with a single box sweeping in +y across the straight start-to-goal line.
+  // Both the collision test and the costmap painter derive the box from these values.
+  constexpr double kX0 = 4.65, kX1 = 5.35, kHalfY = 0.45;
+  const auto center_y = [](double sim_time) {return 1.0 + 0.5 * sim_time;};
+  Scenario s;
+  s.name = "moving_crossing";
+  s.start_x = 1.0;
+  s.start_y = 5.0;
+  s.goal_x = 9.0;
+  s.goal_y = 5.0;
+  s.lethal = [](double, double) {return false;};
+  s.dynamic_lethal = [=](double x, double y, double sim_time) {
+      const double cy = center_y(sim_time);
+      return inRect(x, y, kX0, kX1, cy - kHalfY, cy + kHalfY);
+    };
+  s.dynamic_costmap = true;
+  s.dynamic_repaint_period = 4;  // 5 Hz obstacle-map updates inside a 20 Hz control loop.
+  s.inflate_costmap = false;  // keep per-step repaint cheap for this moving-obstacle smoke.
+  s.paint_costmap = [=](nav2_costmap_2d::Costmap2D & costmap, double sim_time) {
+      const double cy = center_y(sim_time);
+      paintRect(costmap, kX0, kX1, cy - kHalfY, cy + kHalfY, nav2_costmap_2d::LETHAL_OBSTACLE);
+    };
+  s.make_plan = [](const rclcpp::Time & stamp) {
+      nav_msgs::msg::Path path;
+      path.header.frame_id = "odom";
+      path.header.stamp = stamp;
+      appendPathSegment(path, 1.0, 5.0, 9.0, 5.0);
+      return path;
+    };
+  return s;
+}
+
 std::vector<Scenario> allScenarios()
 {
-  return {makeWallGap(), makeNarrowCorridor(), makeUTurn()};
+  return {
+    makeWallGap(), makeNarrowCorridor(), makeUTurn(), makeDoubleGap(), makeMovingCrossing()};
 }
 
-void paintCostmap(nav2_costmap_2d::Costmap2D & costmap, const Scenario & scenario)
+void paintCostmap(
+  nav2_costmap_2d::Costmap2D & costmap, const Scenario & scenario, double sim_time)
 {
   const unsigned int nx = costmap.getSizeInCellsX();
   const unsigned int ny = costmap.getSizeInCellsY();
+  clearCostmap(costmap);
+  if (scenario.paint_costmap) {
+    scenario.paint_costmap(costmap, sim_time);
+    return;
+  }
+
   std::vector<std::pair<double, double>> lethal_centers;
   for (unsigned int my = 0; my < ny; ++my) {
     for (unsigned int mx = 0; mx < nx; ++mx) {
       double wx, wy;
       costmap.mapToWorld(mx, my, wx, wy);
-      if (scenario.lethal(wx, wy)) {
+      if (scenarioLethal(scenario, wx, wy, sim_time)) {
         costmap.setCost(mx, my, nav2_costmap_2d::LETHAL_OBSTACLE);
         lethal_centers.emplace_back(wx, wy);
       }
     }
   }
+
+  if (!scenario.inflate_costmap || lethal_centers.empty()) {
+    return;
+  }
+
   for (unsigned int my = 0; my < ny; ++my) {
     for (unsigned int mx = 0; mx < nx; ++mx) {
       if (costmap.getCost(mx, my) == nav2_costmap_2d::LETHAL_OBSTACLE) {
@@ -220,6 +347,7 @@ struct RunResult
 RunResult runClosedLoop(
   nav2_core::Controller & controller,
   const rclcpp_lifecycle::LifecycleNode::SharedPtr & node,
+  nav2_costmap_2d::Costmap2D & costmap,
   const Scenario & scenario)
 {
   RunResult res;
@@ -235,6 +363,11 @@ RunResult runClosedLoop(
   int command_samples = 0;
 
   for (res.steps = 0; res.steps < kMaxSteps; ++res.steps) {
+    const double sim_time = res.steps * kControlDt;
+    if (scenario.dynamic_costmap && res.steps % scenario.dynamic_repaint_period == 0) {
+      paintCostmap(costmap, scenario, sim_time);
+    }
+
     res.traj.push_back({x, y, yaw});
     geometry_msgs::msg::PoseStamped pose;
     pose.header.frame_id = "odom";
@@ -274,7 +407,7 @@ RunResult runClosedLoop(
       std::sin(yaw + kControlDt * cmd.angular.z),
       std::cos(yaw + kControlDt * cmd.angular.z));
 
-    if (scenario.lethal(x, y)) {
+    if (scenarioLethal(scenario, x, y, (res.steps + 1) * kControlDt)) {
       res.collided = true;
       break;
     }
@@ -338,6 +471,22 @@ std::vector<Config> benchmarkConfigs(bool include_motion_models)
   return configs;
 }
 
+std::vector<Config> quickBenchmarkConfigs()
+{
+  // GPU-only smoke preset: the CUDA controller at K=2048 and K=8192.
+  const char * const kGpuPlugin = "cuda_mppi_controller::CudaMppiController";
+  return {
+    {"gpu_mppi_K2048", kGpuPlugin, 2048}, {"gpu_mppi_K8192", kGpuPlugin, 8192}};
+}
+
+std::vector<Config> cpuGpuBenchmarkConfigs()
+{
+  // Head-to-head preset: nav2's CPU MPPI (K=2000) versus the CUDA controller (K=8192).
+  const Config cpu{"cpu_mppi_K2000", "nav2_mppi_controller::MPPIController", 2000};
+  const Config gpu{"gpu_mppi_K8192", "cuda_mppi_controller::CudaMppiController", 8192};
+  return {cpu, gpu};
+}
+
 std::vector<Config> esdfBenchmarkConfigs()
 {
   return {
@@ -389,7 +538,7 @@ void runScenario(
   });
   auto local_costmap = std::make_shared<nav2_costmap_2d::Costmap2DROS>(costmap_options);
   local_costmap->configure();
-  paintCostmap(*local_costmap->getCostmap(), scenario);
+  paintCostmap(*local_costmap->getCostmap(), scenario, 0.0);
 
   const std::string scenario_dir = out_dir + "/" + scenario.name;
   std::filesystem::create_directories(scenario_dir);
@@ -439,7 +588,7 @@ void runScenario(
     controller->activate();
 
     std::printf("=== %s / %s (%s) ===\n", scenario.name.c_str(), cfg.label.c_str(), cfg.plugin.c_str());
-    const RunResult r = runClosedLoop(*controller, node, scenario);
+    const RunResult r = runClosedLoop(*controller, node, *local_costmap->getCostmap(), scenario);
     std::printf(
       "  %s steps=%d sim=%.1fs solve mean=%.2fms p95=%.2fms max=%.2fms "
       "dist=%.2fm mean_v=%.2fm/s max_w=%.2frad/s exc=%d\n",
@@ -478,6 +627,7 @@ int main(int argc, char ** argv)
   rclcpp::init(argc, argv);
   const std::string out_dir = argc > 1 ? argv[1] : ".";
   const std::string scenario_arg = argc > 2 ? argv[2] : "wall_gap";
+  const std::string preset_arg = argc > 3 ? argv[3] : "full";
   std::filesystem::create_directories(out_dir);
 
   rclcpp::NodeOptions costmap_options;
@@ -517,6 +667,7 @@ int main(int argc, char ** argv)
   auto costmap_ros = std::make_shared<nav2_costmap_2d::Costmap2DROS>(costmap_options);
   costmap_ros->configure();
   auto tf = std::make_shared<tf2_ros::Buffer>(costmap_ros->get_clock());
+  tf->setUsingDedicatedThread(true);
 
   pluginlib::ClassLoader<nav2_core::Controller> loader(
     "nav2_core", "nav2_core::Controller");
@@ -524,13 +675,33 @@ int main(int argc, char ** argv)
   const bool motion_checks = scenario_arg == "all" || scenario_arg == "wall_gap";
   std::vector<Config> configs;
   if (path_angle_benchmark) {
+    if (preset_arg != "full") {
+      std::fprintf(stderr, "Preset '%s' is not supported for path_angle\n", preset_arg.c_str());
+      return 1;
+    }
     configs = pathAngleBenchmarkConfigs();
   } else if (curvature_speed_benchmark) {
+    if (preset_arg != "full") {
+      std::fprintf(
+        stderr, "Preset '%s' is not supported for curvature_speed\n", preset_arg.c_str());
+      return 1;
+    }
     configs = curvatureSpeedBenchmarkConfigs();
   } else if (esdf_benchmark) {
+    if (preset_arg != "full") {
+      std::fprintf(stderr, "Preset '%s' is not supported for esdf\n", preset_arg.c_str());
+      return 1;
+    }
     configs = esdfBenchmarkConfigs();
-  } else {
+  } else if (preset_arg == "full") {
     configs = benchmarkConfigs(motion_checks);
+  } else if (preset_arg == "quick") {
+    configs = quickBenchmarkConfigs();
+  } else if (preset_arg == "cpu_gpu") {
+    configs = cpuGpuBenchmarkConfigs();
+  } else {
+    std::fprintf(stderr, "Unknown preset '%s' (full | quick | cpu_gpu)\n", preset_arg.c_str());
+    return 1;
   }
 
   for (const auto & scenario : scenarios) {
diff --git a/ros2_ws/src/cuda_mppi_controller/test/mppi_gpu_standalone.cpp b/ros2_ws/src/cuda_mppi_controller/test/mppi_gpu_standalone.cpp
index dbf679aa..ed83657a 100644
--- a/ros2_ws/src/cuda_mppi_controller/test/mppi_gpu_standalone.cpp
+++ b/ros2_ws/src/cuda_mppi_controller/test/mppi_gpu_standalone.cpp
@@ -117,8 +117,10 @@ int main(int argc, char ** argv)
 
   float x = 1.0f, y = 5.0f, yaw = 0.0f;
   double total_ms = 0.0, max_ms = 0.0;
+  double min_valid_ratio = 1.0;
   double dist = 0.0;
   int wall_cross_step = -1, near_goal_step = -1;
+  int retreat_count = 0;
   int steps = 0;
   const int max_steps = 1200;
 
@@ -161,6 +163,10 @@ int main(int argc, char ** argv)
     const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
     total_ms += ms;
     max_ms = std::max(max_ms, ms);
+    min_valid_ratio = std::min(min_valid_ratio, static_cast<double>(res.valid_rollout_ratio));
+    if (res.retreating) {
+      ++retreat_count;
+    }
 
     if (res.all_colliding) {
       std::printf("FAIL: all sampled trajectories colliding at step %d\n", steps);
@@ -168,8 +174,11 @@ int main(int argc, char ** argv)
     }
 
     if (std::getenv("MPPI_TRACE") && steps % 20 == 0) {
-      std::fprintf(stderr, "t=%5.2f x=%.2f y=%.2f yaw=%6.2f v=%5.2f w=%6.2f\n",
-        steps * params.model_dt, x, y, yaw, res.v, res.w);
+      std::fprintf(
+        stderr,
+        "t=%5.2f x=%.2f y=%.2f yaw=%6.2f v=%5.2f w=%6.2f valid=%d/%d best=%.3f\n",
+        steps * params.model_dt, x, y, yaw, res.v, res.w,
+        res.valid_rollouts, res.sampled_rollouts, res.best_cost);
     }
     // apply first control to the plant (same model as the rollouts)
     const float px = x, py = y;
@@ -212,5 +221,8 @@ int main(int argc, char ** argv)
   std::printf(
     "solve time: mean %.2f ms, max %.2f ms (K=%d, T=%d, incl. costmap upload)\n",
     total_ms / (steps + 1), max_ms, params.batch_size, params.time_steps);
+  std::printf(
+    "diagnostics: min valid rollout ratio %.1f%% | retreat cycles %d\n",
+    100.0 * min_valid_ratio, retreat_count);
   return 0;
 }
diff --git a/ros2_ws/src/cuda_mppi_controller/test/parameter_validation_test.cpp b/ros2_ws/src/cuda_mppi_controller/test/parameter_validation_test.cpp
index d7164794..9779fee7 100644
--- a/ros2_ws/src/cuda_mppi_controller/test/parameter_validation_test.cpp
+++ b/ros2_ws/src/cuda_mppi_controller/test/parameter_validation_test.cpp
@@ -64,6 +64,8 @@ int main(int argc, char ** argv)
       rclcpp::Parameter("FollowPath.distance_field_cutoff", -0.1)},
     {"negative_lookahead", rclcpp::Parameter("FollowPath.lookahead_dist", -1.0)},
     {"negative_transform_tolerance", rclcpp::Parameter("FollowPath.transform_tolerance", -0.1)},
+    {"negative_diagnostics_log_period",
+      rclcpp::Parameter("FollowPath.diagnostics_log_period", -0.1)},
   };
 
   bool ok = true;
diff --git a/src/mppi_gpu.cu b/src/mppi_gpu.cu
index 046ee5dc..5439122b 100644
--- a/src/mppi_gpu.cu
+++ b/src/mppi_gpu.cu
@@ -707,7 +707,19 @@ MppiResult MppiGpu::computeInternal(
   }
 
   MppiResult res;
+  res.sampled_rollouts = K;
+  double cost_sum = 0.0;
+  int valid_rollouts = 0;
+  for (const float cost : im.h_costs) {
+    cost_sum += cost;
+    if (cost < mp.collision_cost) {
+      ++valid_rollouts;
+    }
+  }
   res.best_cost = min_cost;
+  res.mean_cost = static_cast<float>(cost_sum / static_cast<double>(K));
+  res.valid_rollouts = valid_rollouts;
+  res.valid_rollout_ratio = static_cast<float>(valid_rollouts) / static_cast<float>(K);
   res.all_colliding = min_cost >= mp.collision_cost;
 
   if (res.all_colliding) {