From ba4176ffd07ea12b3910b08ffc2936b8263cc1c4 Mon Sep 17 00:00:00 2001 From: Brian Hanson Date: Fri, 17 Apr 2026 21:11:52 -0500 Subject: [PATCH] controlsd: suppress freq-only cascade; modeld: variable rate w/ republish cache - commIssue now fires only on real comms failure (not_alive or CAN RX timeout), not on self-declared valid=False cascades from MSGQ-conflate + slow-polling freq_ok artifacts. Car drives fine through these and the banner was false-positive. - Add 5-cycle hysteresis (50ms) on commIssue / posenetInvalid / locationdTemporaryError / paramsdTemporaryError. - Cascade-aware suppression: skip posenet/locationd/paramsd temporary errors when the only problem is freq_only_cascade (all alive, just freq/valid tripped). - Remove debug _dbg_dump_alert_triggers helper and EVENTS/EVENT_NAME imports. - Re-enable variable-rate modeld (4/10fps idle, 20fps when lat_active / lane_changing / calibrating) with republish caching so consumers get constant publish rate. - Split lat_requested (modeld signal) from lat_engaged (actuator gate). Momentary steerFaultTemporary no longer drops modeld rate, preventing stale-prediction feedback loop on MDPS recovery. CC.latActive still respects the fault. Co-Authored-By: Claude Opus 4.7 (1M context) --- selfdrive/controls/controlsd.py | 158 ++++++++++++-------------------- selfdrive/modeld/modeld.py | 20 ++-- 2 files changed, 67 insertions(+), 111 deletions(-) diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index 9ad6ee7..5674a5b 100755 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -23,7 +23,7 @@ from openpilot.selfdrive.car.car_helpers import get_startup_event from openpilot.selfdrive.car.card import CarD from openpilot.selfdrive.controls.lib.alertmanager import AlertManager, set_offroad_alert from openpilot.selfdrive.controls.lib.drive_helpers import VCruiseHelper, clip_curvature -from openpilot.selfdrive.controls.lib.events import Events, ET, EVENTS, EVENT_NAME +from openpilot.selfdrive.controls.lib.events import Events, ET from openpilot.selfdrive.controls.lib.latcontrol import LatControl, MIN_LATERAL_CONTROL_SPEED from openpilot.selfdrive.controls.lib.latcontrol_pid import LatControlPID from openpilot.selfdrive.controls.lib.latcontrol_angle import LatControlAngle, STEER_ANGLE_SATURATION_THRESHOLD @@ -172,10 +172,18 @@ class Controls: self.not_running_prev = None self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (ramp-up delay + comm issue suppression) self.prev_lat_requested = False - self._dbg_prev_bad_events: tuple = () # CLEARPILOT: track set of alert-triggering events for debug dump # CLEARPILOT: gate frogpilotCarControl send on change (3 static bools, was publishing 100Hz) self._prev_fpcc = (None, None, None, None, None) self._last_fpcc_send_frame = 0 + # CLEARPILOT: hysteresis counters for transient-filter on cascade alerts. A blip + # under 50ms (5 cycles at 100Hz) is typical of MSGQ conflate + slow-polling-consumer + # freq-measurement artifacts — not a real comm issue. Persistent failure still + # alerts after 50ms, well inside a driver's reaction window. + self._hyst_commissue = 0 + self._hyst_locationd_tmp = 0 + self._hyst_paramsd_tmp = 0 + self._hyst_posenet = 0 + self.HYST_CYCLES = 5 self.steer_limited = False self.desired_curvature = 0.0 self.experimental_mode = False @@ -256,11 +264,6 @@ class Controls: cloudlog.timestamp("Events updated") - # CLEARPILOT: debug dumper — find every cycle where an event with SOFT_DISABLE or - # IMMEDIATE_DISABLE is active (both produce the "TAKE CONTROL IMMEDIATELY" banner) - # and log full context when the set of triggering events changes. - self._dbg_dump_alert_triggers(CS) - if not self.CP.passive and self.initialized: # Update control state self.state_transition(CS) @@ -277,6 +280,7 @@ class Controls: # Update FrogPilot variables self.update_frogpilot_variables(CS) + def data_sample(self): """Receive data from sockets and update carState""" @@ -326,74 +330,6 @@ class Controls: return CS - def _dbg_dump_alert_triggers(self, CS): - """CLEARPILOT: log full context whenever the set of SOFT_DISABLE/IMMEDIATE_DISABLE - events changes. Both alert types show 'TAKE CONTROL IMMEDIATELY' to the user. - Emits only on set change to avoid spamming the log. - """ - bad_events = [] - for e in self.events.names: - ets = EVENTS.get(e, {}) - if ET.SOFT_DISABLE in ets or ET.IMMEDIATE_DISABLE in ets: - bad_events.append(EVENT_NAME.get(e, str(e))) - - bad_tuple = tuple(sorted(bad_events)) - if bad_tuple == self._dbg_prev_bad_events: - return - self._dbg_prev_bad_events = bad_tuple - - try: - avg_dt = sum(self.rk._dts) / len(self.rk._dts) if len(self.rk._dts) else 0.0 - except Exception: - avg_dt = 0.0 - - fpcc_latReq = False - fpcc_noLatLC = False - try: - fpcc = self.sm['frogpilotCarControl'] - fpcc_latReq = bool(fpcc.latRequested) - fpcc_noLatLC = bool(fpcc.noLatLaneChange) - except Exception: - pass - - cloudlog.event("TAKE_CONTROL_TRIGGER", - error=True, - events=list(bad_tuple), - state=self.state.name if hasattr(self.state, 'name') else str(self.state), - enabled=self.enabled, - active=self.active, - # suppression flags (set in update_events before this is called) - model_suppress=getattr(self, '_dbg_model_suppress', None), - lat_engage_suppress=getattr(self, '_dbg_lat_engage_suppress', None), - standby_ts_age_s=round(getattr(self, '_dbg_standby_ts_age', -1), 3), - # loop health - rk_lagging=self.rk.lagging, - rk_avg_dt_ms=round(avg_dt * 1000, 2), - rk_remaining_ms=round(self.rk.remaining * 1000, 2), - # submaster aggregate health - sm_all_alive=self.sm.all_alive(), - sm_all_freq_ok=self.sm.all_freq_ok(), - sm_all_valid=self.sm.all_valid(), - sm_invalid=[s for s, v in self.sm.valid.items() if not v], - sm_not_alive=[s for s, a in self.sm.alive.items() if not a], - sm_not_freq_ok=[s for s, f in self.sm.freq_ok.items() if not f], - # vehicle state - vEgo=round(CS.vEgo, 2), - gear=str(CS.gearShifter), - cruise_enabled=CS.cruiseState.enabled, - standstill=CS.standstill, - steerFaultTemporary=CS.steerFaultTemporary, - steerFaultPermanent=CS.steerFaultPermanent, - canValid=CS.canValid, - canTimeout=CS.canTimeout, - # car control state (previous cycle) - prev_latActive=self.CC.latActive if self.CC else False, - prev_longActive=self.CC.longActive if self.CC else False, - fpcc_latRequested=fpcc_latReq, - fpcc_noLatLaneChange=fpcc_noLatLC, - frame=self.sm.frame, - ) - def update_events(self, CS): """Compute onroadEvents from carState""" @@ -546,9 +482,6 @@ class Controls: lat_engage_suppress = (now - self.lat_requested_ts) < 2.0 # CLEARPILOT: expose suppress flags + standby_ts for the debug dumper in step() - self._dbg_model_suppress = model_suppress - self._dbg_lat_engage_suppress = lat_engage_suppress - self._dbg_standby_ts_age = now - standby_ts not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning} if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES): @@ -570,8 +503,6 @@ class Controls: avg_dt = sum(self.rk._dts) / len(self.rk._dts) alert_lagging = avg_dt > 0.020 if not REPLAY and alert_lagging and not model_suppress and not lat_engage_suppress: - import sys - print(f"CLP controlsdLagging: avg_dt={avg_dt*1000:.2f}ms remaining={self.rk.remaining:.4f} standstill={CS.standstill} vEgo={CS.vEgo:.2f}", file=sys.stderr) self.events.add(EventName.controlsdLagging) if not self.radarless_model: if not model_suppress and (len(self.sm['radarState'].radarErrors) or (not self.rk.lagging and not self.sm.all_checks(['radarState']))): @@ -586,36 +517,60 @@ class Controls: # generic catch-all. ideally, a more specific event should be added above instead has_disable_events = self.events.contains(ET.NO_ENTRY) and (self.events.contains(ET.SOFT_DISABLE) or self.events.contains(ET.IMMEDIATE_DISABLE)) no_system_errors = (not has_disable_events) or (len(self.events) == num_events) - if (not self.sm.all_checks() or self.card.can_rcv_timeout) and no_system_errors and not model_suppress and not lat_engage_suppress: - if not self.sm.all_alive(): - self.events.add(EventName.commIssue) - elif not self.sm.all_freq_ok(): - self.events.add(EventName.commIssueAvgFreq) - else: # invalid or can_rcv_timeout. - self.events.add(EventName.commIssue) - + # CLEARPILOT: commIssue fires ONLY when messages actually aren't flowing. If all + # subscribers are alive and CAN isn't timing out, comms are fine — any `valid=False` + # is an upstream self-declaration that cascades from the MSGQ conflate + + # slow-polling-consumer freq_ok measurement artifact. The car drives correctly + # during these cascades (content is still usable), so we stop raising the banner. + # Genuine comms failures — missing messages or CAN RX timeout — still fire. + comms_really_broken = (not self.sm.all_alive()) or self.card.can_rcv_timeout + commissue_cond = comms_really_broken and no_system_errors and not model_suppress and not lat_engage_suppress + if commissue_cond: + self._hyst_commissue += 1 + else: + self._hyst_commissue = 0 + if self._hyst_commissue >= self.HYST_CYCLES: + self.events.add(EventName.commIssue) + if commissue_cond: + # Log ONCE per entry into comms-really-broken state — no hyst counter so it doesn't + # spam every cycle (hyst changes every cycle and would make logs dict always differ). logs = { 'invalid': [s for s, valid in self.sm.valid.items() if not valid], 'not_alive': [s for s, alive in self.sm.alive.items() if not alive], - 'not_freq_ok': [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok], 'can_rcv_timeout': self.card.can_rcv_timeout, } if logs != self.logged_comm_issue: - cloudlog.event("commIssue", error=True, **logs) + import sys, json + print("CLP commIssue " + json.dumps(logs), file=sys.stderr, flush=True) self.logged_comm_issue = logs else: self.logged_comm_issue = None if not (self.CP.notCar and self.joystick_mode): - # CLEARPILOT: also suppress on lat engage — modeld 4fps→20fps transition causes - # transient inputsOK/valid drops in locationd/paramsd downstream - if not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress: - self.events.add(EventName.posenetInvalid) + # CLEARPILOT: suppress locationd/paramsd "temporary error" when we're in the + # freq-only cascade (same root as the commIssue suppression above). These events + # cascade from upstream freq_ok artifacts — locationd's filterInitialized latches + # False if calibrationd tips invalid due to freq, which cascades here. + # Real failures still fire commIssue above via alive/valid/can_rcv_timeout gate. + freq_only_cascade = (not self.sm.all_checks()) and not comms_really_broken + + # deviceFalling is real-physics (shock sensor), no hysteresis, no cascade suppression. if not self.sm['liveLocationKalman'].deviceStable: self.events.add(EventName.deviceFalling) - if not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress: + + posenet_bad = not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress and not freq_only_cascade + self._hyst_posenet = self._hyst_posenet + 1 if posenet_bad else 0 + if self._hyst_posenet >= self.HYST_CYCLES: + self.events.add(EventName.posenetInvalid) + + locd_bad = not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress and not freq_only_cascade + self._hyst_locationd_tmp = self._hyst_locationd_tmp + 1 if locd_bad else 0 + if self._hyst_locationd_tmp >= self.HYST_CYCLES: self.events.add(EventName.locationdTemporaryError) - if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress: + + paramsd_bad = not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress and not freq_only_cascade + self._hyst_paramsd_tmp = self._hyst_paramsd_tmp + 1 if paramsd_bad else 0 + if self._hyst_paramsd_tmp >= self.HYST_CYCLES: self.events.add(EventName.paramsdTemporaryError) # conservative HW alert. if the data or frequency are off, locationd will throw an error @@ -778,18 +733,19 @@ class Controls: # Check which actuators can be enabled standstill = CS.vEgo <= max(self.CP.minSteerSpeed, MIN_LATERAL_CONTROL_SPEED) or CS.standstill - lat_requested = (self.active or self.FPCC.alwaysOnLateral) and self.speed_check and not CS.steerFaultTemporary and not CS.steerFaultPermanent and \ + # CLEARPILOT: lat_requested ignores momentary MDPS faults so modeld doesn't drop rate during + # brief steerFaultTemporary blips (stale predictions on recovery caused a fault feedback loop). + # lat_engaged gates the actual steering command and still respects the fault. + lat_requested = (self.active or self.FPCC.alwaysOnLateral) and self.speed_check and \ (not standstill or self.joystick_mode) + lat_engaged = lat_requested and not CS.steerFaultTemporary and not CS.steerFaultPermanent - # CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by - # 250ms (5 frames at 20fps) so downstream services stabilize before steering engages. - # latRequested is now on frogpilotCarControl (cereal) — see update_frogpilot_variables. now_ts = time.monotonic() if lat_requested and not self.prev_lat_requested: self.lat_requested_ts = now_ts self.prev_lat_requested = lat_requested self.FPCC.latRequested = lat_requested - CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25 + CC.latActive = lat_engaged and (now_ts - self.lat_requested_ts) >= 0.25 CC.longActive = self.enabled and not self.events.contains(ET.OVERRIDE_LONGITUDINAL) and self.CP.openpilotLongitudinalControl actuators = CC.actuators diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index 5d06c50..b00e8c8 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -246,19 +246,19 @@ def main(demo=False): sm.update(0) - # CLEARPILOT: reverted to two modes only — constant 20fps while moving, standby at - # standstill. Variable-rate (4/10fps) caused downstream freq/valid cascades that - # broke paramsd's angleOffset learner, calibrationd, and locationd's filter init. - # Keep the signals plumbed through frogpilotCarControl for when we re-enable. + # CLEARPILOT: variable framerate — 4/10fps when not engaged, 20fps when engaged + # (or lane changing / calibrating). Downstream services get a constant publish rate + # via the republish-caching below — only the GPU inference is skipped, so no + # freq_ok cascade in consumers. fpcc = sm['frogpilotCarControl'] - lat_active = fpcc.latRequested # noqa: F841 - lane_changing = fpcc.noLatLaneChange # noqa: F841 + lat_active = fpcc.latRequested + lane_changing = fpcc.noLatLaneChange standstill = sm['carState'].standstill - calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated # noqa: F841 - full_rate = True # always 20fps when not standstill + calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated + full_rate = lat_active or lane_changing or calibrating - # Standby transitions: standstill → 0fps (parked), otherwise → 20fps - should_standby = standstill + # Standby transitions (standstill only, when not at full rate) + should_standby = standstill and not full_rate if should_standby and not model_standby: params_memory.put_bool("ModelStandby", True) model_standby = True