From cf2b3fc637f233ad9d701bbf236cb70b541b77f4 Mon Sep 17 00:00:00 2001 From: Brian Hanson Date: Fri, 17 Apr 2026 18:21:46 -0500 Subject: [PATCH] modeld: revert to constant 20fps (except standby at standstill) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disabled the variable-rate (4/10/20fps) logic — it caused persistent downstream freq/valid cascades that broke paramsd's angle-offset learner, calibrationd, and locationd's one-shot filterInitialized latch. Symptom: user's "TAKE CONTROL IMMEDIATELY / Communication Issue" banner during engaged driving, plus subtle right-pull (paramsd's fast angle-offset adaptation was frozen all session). Simpler model now: standstill → standby (0fps, paused) otherwise → 20fps (no variable rate) Republish-caching and FPCC.latRequested/noLatLaneChange plumbing left in place so re-enabling variable rate later is a one-line change (full_rate = True → the original full_rate expression). Co-Authored-By: Claude Opus 4.6 (1M context) --- cereal/services.py | 4 +- selfdrive/controls/controlsd.py | 84 ++++++++++++++++++++++++++++++++- selfdrive/modeld/modeld.py | 59 +++++++++++++---------- 3 files changed, 120 insertions(+), 27 deletions(-) diff --git a/cereal/services.py b/cereal/services.py index 0e5e680..1769e5a 100755 --- a/cereal/services.py +++ b/cereal/services.py @@ -30,7 +30,7 @@ services: dict[str, tuple] = { "temperatureSensor": (True, 2., 200), "temperatureSensor2": (True, 2., 200), "gpsNMEA": (True, 9.), - "deviceState": (True, 4., 1), # CLEARPILOT: 4Hz (was 2Hz) — thermald DT_TRML=0.25 + "deviceState": (True, 5., 1), # CLEARPILOT: 5Hz — thermald decimates pandaStates (10Hz) every 2 ticks "can": (True, 100., 1223), # decimation gives ~5 msgs in a full segment "controlsState": (True, 100., 10), "pandaStates": (True, 10., 1), @@ -70,7 +70,7 @@ services: dict[str, tuple] = { "wideRoadEncodeIdx": (False, 20., 1), "wideRoadCameraState": (True, 20., 20), "modelV2": (True, 20., 40), - "managerState": (True, 4., 1), # CLEARPILOT: 4Hz (was 2Hz) — gated by deviceState arrival + "managerState": (True, 5., 1), # CLEARPILOT: 5Hz — gated by deviceState arrival (see thermald) "uploaderState": (True, 0., 1), "navInstruction": (True, 1., 10), "navRoute": (True, 0.), diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index 7b44026..9ad6ee7 100755 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -23,7 +23,7 @@ from openpilot.selfdrive.car.car_helpers import get_startup_event from openpilot.selfdrive.car.card import CarD from openpilot.selfdrive.controls.lib.alertmanager import AlertManager, set_offroad_alert from openpilot.selfdrive.controls.lib.drive_helpers import VCruiseHelper, clip_curvature -from openpilot.selfdrive.controls.lib.events import Events, ET +from openpilot.selfdrive.controls.lib.events import Events, ET, EVENTS, EVENT_NAME from openpilot.selfdrive.controls.lib.latcontrol import LatControl, MIN_LATERAL_CONTROL_SPEED from openpilot.selfdrive.controls.lib.latcontrol_pid import LatControlPID from openpilot.selfdrive.controls.lib.latcontrol_angle import LatControlAngle, STEER_ANGLE_SATURATION_THRESHOLD @@ -172,6 +172,7 @@ class Controls: self.not_running_prev = None self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (ramp-up delay + comm issue suppression) self.prev_lat_requested = False + self._dbg_prev_bad_events: tuple = () # CLEARPILOT: track set of alert-triggering events for debug dump # CLEARPILOT: gate frogpilotCarControl send on change (3 static bools, was publishing 100Hz) self._prev_fpcc = (None, None, None, None, None) self._last_fpcc_send_frame = 0 @@ -255,6 +256,11 @@ class Controls: cloudlog.timestamp("Events updated") + # CLEARPILOT: debug dumper — find every cycle where an event with SOFT_DISABLE or + # IMMEDIATE_DISABLE is active (both produce the "TAKE CONTROL IMMEDIATELY" banner) + # and log full context when the set of triggering events changes. + self._dbg_dump_alert_triggers(CS) + if not self.CP.passive and self.initialized: # Update control state self.state_transition(CS) @@ -320,6 +326,74 @@ class Controls: return CS + def _dbg_dump_alert_triggers(self, CS): + """CLEARPILOT: log full context whenever the set of SOFT_DISABLE/IMMEDIATE_DISABLE + events changes. Both alert types show 'TAKE CONTROL IMMEDIATELY' to the user. + Emits only on set change to avoid spamming the log. + """ + bad_events = [] + for e in self.events.names: + ets = EVENTS.get(e, {}) + if ET.SOFT_DISABLE in ets or ET.IMMEDIATE_DISABLE in ets: + bad_events.append(EVENT_NAME.get(e, str(e))) + + bad_tuple = tuple(sorted(bad_events)) + if bad_tuple == self._dbg_prev_bad_events: + return + self._dbg_prev_bad_events = bad_tuple + + try: + avg_dt = sum(self.rk._dts) / len(self.rk._dts) if len(self.rk._dts) else 0.0 + except Exception: + avg_dt = 0.0 + + fpcc_latReq = False + fpcc_noLatLC = False + try: + fpcc = self.sm['frogpilotCarControl'] + fpcc_latReq = bool(fpcc.latRequested) + fpcc_noLatLC = bool(fpcc.noLatLaneChange) + except Exception: + pass + + cloudlog.event("TAKE_CONTROL_TRIGGER", + error=True, + events=list(bad_tuple), + state=self.state.name if hasattr(self.state, 'name') else str(self.state), + enabled=self.enabled, + active=self.active, + # suppression flags (set in update_events before this is called) + model_suppress=getattr(self, '_dbg_model_suppress', None), + lat_engage_suppress=getattr(self, '_dbg_lat_engage_suppress', None), + standby_ts_age_s=round(getattr(self, '_dbg_standby_ts_age', -1), 3), + # loop health + rk_lagging=self.rk.lagging, + rk_avg_dt_ms=round(avg_dt * 1000, 2), + rk_remaining_ms=round(self.rk.remaining * 1000, 2), + # submaster aggregate health + sm_all_alive=self.sm.all_alive(), + sm_all_freq_ok=self.sm.all_freq_ok(), + sm_all_valid=self.sm.all_valid(), + sm_invalid=[s for s, v in self.sm.valid.items() if not v], + sm_not_alive=[s for s, a in self.sm.alive.items() if not a], + sm_not_freq_ok=[s for s, f in self.sm.freq_ok.items() if not f], + # vehicle state + vEgo=round(CS.vEgo, 2), + gear=str(CS.gearShifter), + cruise_enabled=CS.cruiseState.enabled, + standstill=CS.standstill, + steerFaultTemporary=CS.steerFaultTemporary, + steerFaultPermanent=CS.steerFaultPermanent, + canValid=CS.canValid, + canTimeout=CS.canTimeout, + # car control state (previous cycle) + prev_latActive=self.CC.latActive if self.CC else False, + prev_longActive=self.CC.longActive if self.CC else False, + fpcc_latRequested=fpcc_latReq, + fpcc_noLatLaneChange=fpcc_noLatLC, + frame=self.sm.frame, + ) + def update_events(self, CS): """Compute onroadEvents from carState""" @@ -466,8 +540,16 @@ class Controls: # CLEARPILOT: suppress commIssue/location/params errors for 2s after lat was requested # Model FPS jumps from 4 to 20 on engage, causing transient freq check failures # lat_requested_ts is set in state_control on the False->True transition of LatRequested + # CLEARPILOT: 2s window — long enough to cover the modeld rate transition burst + # without hiding real comms issues. Extending this further risks masking a genuine + # fault that demands immediate driver takeover. lat_engage_suppress = (now - self.lat_requested_ts) < 2.0 + # CLEARPILOT: expose suppress flags + standby_ts for the debug dumper in step() + self._dbg_model_suppress = model_suppress + self._dbg_lat_engage_suppress = lat_engage_suppress + self._dbg_standby_ts_age = now - standby_ts + not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning} if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES): self.events.add(EventName.processNotRunning) diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index 61e829b..5d06c50 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -186,6 +186,11 @@ def main(demo=False): model_standby = False last_standby_ts_write = 0 params_memory = Params("/dev/shm/params") + # CLEARPILOT: cache last model output for republishing on skip cycles. Keeps downstream + # cameraOdometry/modelV2 rate constant at 20Hz so services.py freq checks never fail + # during reduced-rate mode. Content is stale but we only reduce rate when not engaged, + # so no one is actually using it for control. + last_model_output = None nav_features = np.zeros(ModelConstants.NAV_FEATURE_LEN, dtype=np.float32) nav_instructions = np.zeros(ModelConstants.NAV_INSTRUCTION_LEN, dtype=np.float32) buf_main, buf_extra = None, None @@ -241,22 +246,19 @@ def main(demo=False): sm.update(0) - # CLEARPILOT: power saving — three modes based on driving state - # Full 20fps: lat requested or lane changing - # Reduced 4fps: not lat requested, not standstill (driving without cruise) - # Standby 0fps: standstill - # Uses FPCC.latRequested (not carControl.latActive) so modeld ramps to 20fps BEFORE - # controlsd actually engages steering — gives downstream services time to stabilize. - # CLEARPILOT: both fields migrated from paramsMemory to cereal (no per-cycle file reads) + # CLEARPILOT: reverted to two modes only — constant 20fps while moving, standby at + # standstill. Variable-rate (4/10fps) caused downstream freq/valid cascades that + # broke paramsd's angleOffset learner, calibrationd, and locationd's filter init. + # Keep the signals plumbed through frogpilotCarControl for when we re-enable. fpcc = sm['frogpilotCarControl'] - lat_active = fpcc.latRequested - lane_changing = fpcc.noLatLaneChange + lat_active = fpcc.latRequested # noqa: F841 + lane_changing = fpcc.noLatLaneChange # noqa: F841 standstill = sm['carState'].standstill - calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated - full_rate = lat_active or lane_changing or calibrating + calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated # noqa: F841 + full_rate = True # always 20fps when not standstill - # Standby transitions (standstill only) - should_standby = standstill and not full_rate + # Standby transitions: standstill → 0fps (parked), otherwise → 20fps + should_standby = standstill if should_standby and not model_standby: params_memory.put_bool("ModelStandby", True) model_standby = True @@ -275,10 +277,12 @@ def main(demo=False): last_vipc_frame_id = meta_main.frame_id continue - # Reduced framerate: daylight 10fps (skip 1/2), night 4fps (skip 4/5) - # Daytime runs twice as fast — better model responsiveness when lighting is good - # and the neural net has more signal. Night stays conservative for power. - # Write standby timestamp so controlsd suppresses transient errors + # CLEARPILOT: reduced framerate: skip GPU inference on most frames but still + # republish cached output at full 20Hz so downstream services never see a rate + # drop (avoids freq_ok → valid cascade that causes "Communication Issue" false + # positives on engage). Daylight: skip 1/2 (compute at 10fps), night: skip 4/5 + # (compute at 4fps). ModelStandbyTs still written for model_suppress window. + republish_only = False if not full_rate: is_daylight = params_memory.get_bool("IsDaylight") skip_interval = 2 if is_daylight else 5 @@ -290,9 +294,7 @@ def main(demo=False): params_memory.put("ModelStandbyTs", str(now)) last_standby_ts_write = now if run_count % skip_interval != 0: - last_vipc_frame_id = meta_main.frame_id - run_count += 1 - continue + republish_only = True else: if params_memory.get("ModelFps") != b"20": params_memory.put("ModelFps", "20") @@ -368,12 +370,21 @@ def main(demo=False): **({'radar_tracks': radar_tracks,} if DISABLE_RADAR else {}), } - mt1 = time.perf_counter() - model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only) - mt2 = time.perf_counter() - model_execution_time = mt2 - mt1 + # CLEARPILOT: on republish cycles, skip the GPU inference and reuse the last + # model output. Publish rate stays at 20Hz, compute rate is reduced. + if republish_only and last_model_output is not None: + model_output = last_model_output + model_execution_time = 0.0 + else: + mt1 = time.perf_counter() + model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only) + mt2 = time.perf_counter() + model_execution_time = mt2 - mt1 if model_output is not None: + # cache for next republish cycle + last_model_output = model_output + modelv2_send = messaging.new_message('modelV2') posenet_send = messaging.new_message('cameraOdometry') fill_model_msg(modelv2_send, model_output, publish_state, meta_main.frame_id, meta_extra.frame_id, frame_id, frame_drop_ratio,