controlsd: suppress freq-only cascade; modeld: variable rate w/ republish cache
- commIssue now fires only on real comms failure (not_alive or CAN RX timeout), not on self-declared valid=False cascades from MSGQ-conflate + slow-polling freq_ok artifacts. Car drives fine through these and the banner was false-positive. - Add 5-cycle hysteresis (50ms) on commIssue / posenetInvalid / locationdTemporaryError / paramsdTemporaryError. - Cascade-aware suppression: skip posenet/locationd/paramsd temporary errors when the only problem is freq_only_cascade (all alive, just freq/valid tripped). - Remove debug _dbg_dump_alert_triggers helper and EVENTS/EVENT_NAME imports. - Re-enable variable-rate modeld (4/10fps idle, 20fps when lat_active / lane_changing / calibrating) with republish caching so consumers get constant publish rate. - Split lat_requested (modeld signal) from lat_engaged (actuator gate). Momentary steerFaultTemporary no longer drops modeld rate, preventing stale-prediction feedback loop on MDPS recovery. CC.latActive still respects the fault. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,7 +23,7 @@ from openpilot.selfdrive.car.car_helpers import get_startup_event
|
||||
from openpilot.selfdrive.car.card import CarD
|
||||
from openpilot.selfdrive.controls.lib.alertmanager import AlertManager, set_offroad_alert
|
||||
from openpilot.selfdrive.controls.lib.drive_helpers import VCruiseHelper, clip_curvature
|
||||
from openpilot.selfdrive.controls.lib.events import Events, ET, EVENTS, EVENT_NAME
|
||||
from openpilot.selfdrive.controls.lib.events import Events, ET
|
||||
from openpilot.selfdrive.controls.lib.latcontrol import LatControl, MIN_LATERAL_CONTROL_SPEED
|
||||
from openpilot.selfdrive.controls.lib.latcontrol_pid import LatControlPID
|
||||
from openpilot.selfdrive.controls.lib.latcontrol_angle import LatControlAngle, STEER_ANGLE_SATURATION_THRESHOLD
|
||||
@@ -172,10 +172,18 @@ class Controls:
|
||||
self.not_running_prev = None
|
||||
self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (ramp-up delay + comm issue suppression)
|
||||
self.prev_lat_requested = False
|
||||
self._dbg_prev_bad_events: tuple = () # CLEARPILOT: track set of alert-triggering events for debug dump
|
||||
# CLEARPILOT: gate frogpilotCarControl send on change (3 static bools, was publishing 100Hz)
|
||||
self._prev_fpcc = (None, None, None, None, None)
|
||||
self._last_fpcc_send_frame = 0
|
||||
# CLEARPILOT: hysteresis counters for transient-filter on cascade alerts. A blip
|
||||
# under 50ms (5 cycles at 100Hz) is typical of MSGQ conflate + slow-polling-consumer
|
||||
# freq-measurement artifacts — not a real comm issue. Persistent failure still
|
||||
# alerts after 50ms, well inside a driver's reaction window.
|
||||
self._hyst_commissue = 0
|
||||
self._hyst_locationd_tmp = 0
|
||||
self._hyst_paramsd_tmp = 0
|
||||
self._hyst_posenet = 0
|
||||
self.HYST_CYCLES = 5
|
||||
self.steer_limited = False
|
||||
self.desired_curvature = 0.0
|
||||
self.experimental_mode = False
|
||||
@@ -256,11 +264,6 @@ class Controls:
|
||||
|
||||
cloudlog.timestamp("Events updated")
|
||||
|
||||
# CLEARPILOT: debug dumper — find every cycle where an event with SOFT_DISABLE or
|
||||
# IMMEDIATE_DISABLE is active (both produce the "TAKE CONTROL IMMEDIATELY" banner)
|
||||
# and log full context when the set of triggering events changes.
|
||||
self._dbg_dump_alert_triggers(CS)
|
||||
|
||||
if not self.CP.passive and self.initialized:
|
||||
# Update control state
|
||||
self.state_transition(CS)
|
||||
@@ -277,6 +280,7 @@ class Controls:
|
||||
# Update FrogPilot variables
|
||||
self.update_frogpilot_variables(CS)
|
||||
|
||||
|
||||
def data_sample(self):
|
||||
"""Receive data from sockets and update carState"""
|
||||
|
||||
@@ -326,74 +330,6 @@ class Controls:
|
||||
|
||||
return CS
|
||||
|
||||
def _dbg_dump_alert_triggers(self, CS):
|
||||
"""CLEARPILOT: log full context whenever the set of SOFT_DISABLE/IMMEDIATE_DISABLE
|
||||
events changes. Both alert types show 'TAKE CONTROL IMMEDIATELY' to the user.
|
||||
Emits only on set change to avoid spamming the log.
|
||||
"""
|
||||
bad_events = []
|
||||
for e in self.events.names:
|
||||
ets = EVENTS.get(e, {})
|
||||
if ET.SOFT_DISABLE in ets or ET.IMMEDIATE_DISABLE in ets:
|
||||
bad_events.append(EVENT_NAME.get(e, str(e)))
|
||||
|
||||
bad_tuple = tuple(sorted(bad_events))
|
||||
if bad_tuple == self._dbg_prev_bad_events:
|
||||
return
|
||||
self._dbg_prev_bad_events = bad_tuple
|
||||
|
||||
try:
|
||||
avg_dt = sum(self.rk._dts) / len(self.rk._dts) if len(self.rk._dts) else 0.0
|
||||
except Exception:
|
||||
avg_dt = 0.0
|
||||
|
||||
fpcc_latReq = False
|
||||
fpcc_noLatLC = False
|
||||
try:
|
||||
fpcc = self.sm['frogpilotCarControl']
|
||||
fpcc_latReq = bool(fpcc.latRequested)
|
||||
fpcc_noLatLC = bool(fpcc.noLatLaneChange)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
cloudlog.event("TAKE_CONTROL_TRIGGER",
|
||||
error=True,
|
||||
events=list(bad_tuple),
|
||||
state=self.state.name if hasattr(self.state, 'name') else str(self.state),
|
||||
enabled=self.enabled,
|
||||
active=self.active,
|
||||
# suppression flags (set in update_events before this is called)
|
||||
model_suppress=getattr(self, '_dbg_model_suppress', None),
|
||||
lat_engage_suppress=getattr(self, '_dbg_lat_engage_suppress', None),
|
||||
standby_ts_age_s=round(getattr(self, '_dbg_standby_ts_age', -1), 3),
|
||||
# loop health
|
||||
rk_lagging=self.rk.lagging,
|
||||
rk_avg_dt_ms=round(avg_dt * 1000, 2),
|
||||
rk_remaining_ms=round(self.rk.remaining * 1000, 2),
|
||||
# submaster aggregate health
|
||||
sm_all_alive=self.sm.all_alive(),
|
||||
sm_all_freq_ok=self.sm.all_freq_ok(),
|
||||
sm_all_valid=self.sm.all_valid(),
|
||||
sm_invalid=[s for s, v in self.sm.valid.items() if not v],
|
||||
sm_not_alive=[s for s, a in self.sm.alive.items() if not a],
|
||||
sm_not_freq_ok=[s for s, f in self.sm.freq_ok.items() if not f],
|
||||
# vehicle state
|
||||
vEgo=round(CS.vEgo, 2),
|
||||
gear=str(CS.gearShifter),
|
||||
cruise_enabled=CS.cruiseState.enabled,
|
||||
standstill=CS.standstill,
|
||||
steerFaultTemporary=CS.steerFaultTemporary,
|
||||
steerFaultPermanent=CS.steerFaultPermanent,
|
||||
canValid=CS.canValid,
|
||||
canTimeout=CS.canTimeout,
|
||||
# car control state (previous cycle)
|
||||
prev_latActive=self.CC.latActive if self.CC else False,
|
||||
prev_longActive=self.CC.longActive if self.CC else False,
|
||||
fpcc_latRequested=fpcc_latReq,
|
||||
fpcc_noLatLaneChange=fpcc_noLatLC,
|
||||
frame=self.sm.frame,
|
||||
)
|
||||
|
||||
def update_events(self, CS):
|
||||
"""Compute onroadEvents from carState"""
|
||||
|
||||
@@ -546,9 +482,6 @@ class Controls:
|
||||
lat_engage_suppress = (now - self.lat_requested_ts) < 2.0
|
||||
|
||||
# CLEARPILOT: expose suppress flags + standby_ts for the debug dumper in step()
|
||||
self._dbg_model_suppress = model_suppress
|
||||
self._dbg_lat_engage_suppress = lat_engage_suppress
|
||||
self._dbg_standby_ts_age = now - standby_ts
|
||||
|
||||
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
|
||||
if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
|
||||
@@ -570,8 +503,6 @@ class Controls:
|
||||
avg_dt = sum(self.rk._dts) / len(self.rk._dts)
|
||||
alert_lagging = avg_dt > 0.020
|
||||
if not REPLAY and alert_lagging and not model_suppress and not lat_engage_suppress:
|
||||
import sys
|
||||
print(f"CLP controlsdLagging: avg_dt={avg_dt*1000:.2f}ms remaining={self.rk.remaining:.4f} standstill={CS.standstill} vEgo={CS.vEgo:.2f}", file=sys.stderr)
|
||||
self.events.add(EventName.controlsdLagging)
|
||||
if not self.radarless_model:
|
||||
if not model_suppress and (len(self.sm['radarState'].radarErrors) or (not self.rk.lagging and not self.sm.all_checks(['radarState']))):
|
||||
@@ -586,36 +517,60 @@ class Controls:
|
||||
# generic catch-all. ideally, a more specific event should be added above instead
|
||||
has_disable_events = self.events.contains(ET.NO_ENTRY) and (self.events.contains(ET.SOFT_DISABLE) or self.events.contains(ET.IMMEDIATE_DISABLE))
|
||||
no_system_errors = (not has_disable_events) or (len(self.events) == num_events)
|
||||
if (not self.sm.all_checks() or self.card.can_rcv_timeout) and no_system_errors and not model_suppress and not lat_engage_suppress:
|
||||
if not self.sm.all_alive():
|
||||
# CLEARPILOT: commIssue fires ONLY when messages actually aren't flowing. If all
|
||||
# subscribers are alive and CAN isn't timing out, comms are fine — any `valid=False`
|
||||
# is an upstream self-declaration that cascades from the MSGQ conflate +
|
||||
# slow-polling-consumer freq_ok measurement artifact. The car drives correctly
|
||||
# during these cascades (content is still usable), so we stop raising the banner.
|
||||
# Genuine comms failures — missing messages or CAN RX timeout — still fire.
|
||||
comms_really_broken = (not self.sm.all_alive()) or self.card.can_rcv_timeout
|
||||
commissue_cond = comms_really_broken and no_system_errors and not model_suppress and not lat_engage_suppress
|
||||
if commissue_cond:
|
||||
self._hyst_commissue += 1
|
||||
else:
|
||||
self._hyst_commissue = 0
|
||||
if self._hyst_commissue >= self.HYST_CYCLES:
|
||||
self.events.add(EventName.commIssue)
|
||||
elif not self.sm.all_freq_ok():
|
||||
self.events.add(EventName.commIssueAvgFreq)
|
||||
else: # invalid or can_rcv_timeout.
|
||||
self.events.add(EventName.commIssue)
|
||||
|
||||
if commissue_cond:
|
||||
# Log ONCE per entry into comms-really-broken state — no hyst counter so it doesn't
|
||||
# spam every cycle (hyst changes every cycle and would make logs dict always differ).
|
||||
logs = {
|
||||
'invalid': [s for s, valid in self.sm.valid.items() if not valid],
|
||||
'not_alive': [s for s, alive in self.sm.alive.items() if not alive],
|
||||
'not_freq_ok': [s for s, freq_ok in self.sm.freq_ok.items() if not freq_ok],
|
||||
'can_rcv_timeout': self.card.can_rcv_timeout,
|
||||
}
|
||||
if logs != self.logged_comm_issue:
|
||||
cloudlog.event("commIssue", error=True, **logs)
|
||||
import sys, json
|
||||
print("CLP commIssue " + json.dumps(logs), file=sys.stderr, flush=True)
|
||||
self.logged_comm_issue = logs
|
||||
else:
|
||||
self.logged_comm_issue = None
|
||||
|
||||
if not (self.CP.notCar and self.joystick_mode):
|
||||
# CLEARPILOT: also suppress on lat engage — modeld 4fps→20fps transition causes
|
||||
# transient inputsOK/valid drops in locationd/paramsd downstream
|
||||
if not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress:
|
||||
self.events.add(EventName.posenetInvalid)
|
||||
# CLEARPILOT: suppress locationd/paramsd "temporary error" when we're in the
|
||||
# freq-only cascade (same root as the commIssue suppression above). These events
|
||||
# cascade from upstream freq_ok artifacts — locationd's filterInitialized latches
|
||||
# False if calibrationd tips invalid due to freq, which cascades here.
|
||||
# Real failures still fire commIssue above via alive/valid/can_rcv_timeout gate.
|
||||
freq_only_cascade = (not self.sm.all_checks()) and not comms_really_broken
|
||||
|
||||
# deviceFalling is real-physics (shock sensor), no hysteresis, no cascade suppression.
|
||||
if not self.sm['liveLocationKalman'].deviceStable:
|
||||
self.events.add(EventName.deviceFalling)
|
||||
if not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress:
|
||||
|
||||
posenet_bad = not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress and not freq_only_cascade
|
||||
self._hyst_posenet = self._hyst_posenet + 1 if posenet_bad else 0
|
||||
if self._hyst_posenet >= self.HYST_CYCLES:
|
||||
self.events.add(EventName.posenetInvalid)
|
||||
|
||||
locd_bad = not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress and not freq_only_cascade
|
||||
self._hyst_locationd_tmp = self._hyst_locationd_tmp + 1 if locd_bad else 0
|
||||
if self._hyst_locationd_tmp >= self.HYST_CYCLES:
|
||||
self.events.add(EventName.locationdTemporaryError)
|
||||
if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress:
|
||||
|
||||
paramsd_bad = not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress and not freq_only_cascade
|
||||
self._hyst_paramsd_tmp = self._hyst_paramsd_tmp + 1 if paramsd_bad else 0
|
||||
if self._hyst_paramsd_tmp >= self.HYST_CYCLES:
|
||||
self.events.add(EventName.paramsdTemporaryError)
|
||||
|
||||
# conservative HW alert. if the data or frequency are off, locationd will throw an error
|
||||
@@ -778,18 +733,19 @@ class Controls:
|
||||
|
||||
# Check which actuators can be enabled
|
||||
standstill = CS.vEgo <= max(self.CP.minSteerSpeed, MIN_LATERAL_CONTROL_SPEED) or CS.standstill
|
||||
lat_requested = (self.active or self.FPCC.alwaysOnLateral) and self.speed_check and not CS.steerFaultTemporary and not CS.steerFaultPermanent and \
|
||||
# CLEARPILOT: lat_requested ignores momentary MDPS faults so modeld doesn't drop rate during
|
||||
# brief steerFaultTemporary blips (stale predictions on recovery caused a fault feedback loop).
|
||||
# lat_engaged gates the actual steering command and still respects the fault.
|
||||
lat_requested = (self.active or self.FPCC.alwaysOnLateral) and self.speed_check and \
|
||||
(not standstill or self.joystick_mode)
|
||||
lat_engaged = lat_requested and not CS.steerFaultTemporary and not CS.steerFaultPermanent
|
||||
|
||||
# CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by
|
||||
# 250ms (5 frames at 20fps) so downstream services stabilize before steering engages.
|
||||
# latRequested is now on frogpilotCarControl (cereal) — see update_frogpilot_variables.
|
||||
now_ts = time.monotonic()
|
||||
if lat_requested and not self.prev_lat_requested:
|
||||
self.lat_requested_ts = now_ts
|
||||
self.prev_lat_requested = lat_requested
|
||||
self.FPCC.latRequested = lat_requested
|
||||
CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25
|
||||
CC.latActive = lat_engaged and (now_ts - self.lat_requested_ts) >= 0.25
|
||||
CC.longActive = self.enabled and not self.events.contains(ET.OVERRIDE_LONGITUDINAL) and self.CP.openpilotLongitudinalControl
|
||||
|
||||
actuators = CC.actuators
|
||||
|
||||
@@ -246,19 +246,19 @@ def main(demo=False):
|
||||
|
||||
sm.update(0)
|
||||
|
||||
# CLEARPILOT: reverted to two modes only — constant 20fps while moving, standby at
|
||||
# standstill. Variable-rate (4/10fps) caused downstream freq/valid cascades that
|
||||
# broke paramsd's angleOffset learner, calibrationd, and locationd's filter init.
|
||||
# Keep the signals plumbed through frogpilotCarControl for when we re-enable.
|
||||
# CLEARPILOT: variable framerate — 4/10fps when not engaged, 20fps when engaged
|
||||
# (or lane changing / calibrating). Downstream services get a constant publish rate
|
||||
# via the republish-caching below — only the GPU inference is skipped, so no
|
||||
# freq_ok cascade in consumers.
|
||||
fpcc = sm['frogpilotCarControl']
|
||||
lat_active = fpcc.latRequested # noqa: F841
|
||||
lane_changing = fpcc.noLatLaneChange # noqa: F841
|
||||
lat_active = fpcc.latRequested
|
||||
lane_changing = fpcc.noLatLaneChange
|
||||
standstill = sm['carState'].standstill
|
||||
calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated # noqa: F841
|
||||
full_rate = True # always 20fps when not standstill
|
||||
calibrating = sm['liveCalibration'].calStatus != log.LiveCalibrationData.Status.calibrated
|
||||
full_rate = lat_active or lane_changing or calibrating
|
||||
|
||||
# Standby transitions: standstill → 0fps (parked), otherwise → 20fps
|
||||
should_standby = standstill
|
||||
# Standby transitions (standstill only, when not at full rate)
|
||||
should_standby = standstill and not full_rate
|
||||
if should_standby and not model_standby:
|
||||
params_memory.put_bool("ModelStandby", True)
|
||||
model_standby = True
|
||||
|
||||
Reference in New Issue
Block a user