From dc7e0a2db75b9b94058367626590f381d58d58f8 Mon Sep 17 00:00:00 2001 From: Brian Hanson Date: Thu, 23 Apr 2026 11:00:45 -0500 Subject: [PATCH] controlsd+calibrationd: suppress commIssue from valid=False cascade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes that reinstate the pre-revert defenses against the "TAKE CONTROL IMMEDIATELY / Communication Issue" banner that fires when self-driving on the baseline modelrevert stack: calibrationd: publish valid based on calStatus == calibrated, not sm.all_checks(). Original gate cascaded upstream freq glitches into liveCalibration.valid=False, which kept locationd.filterInitialized False, which fed garbage into paramsd, which corrupted steerRatio (erratic steering). "valid" here is a question about convergence, not input freshness. controlsd: narrow the commIssue trigger to genuine comm failures — not_alive OR can_rcv_timeout. The `not sm.all_checks()` branch also picked up valid=False, but paramsd / torqued / plannerd / frogpilot_planner / dmonitoringd all propagate their sm.all_checks() into msg.valid via a polling-pattern artifact (freq_ok inside poll='...' subscribers tracks gaps between drain bursts rather than the publish rate), so the whole stack flaps valid and trips the banner during normal driving. Content and rate are fine; only the valid flag is wrong. Note that this also removes the commIssueAvgFreq path from this branch: a failing sm.all_freq_ok() on its own no longer raises any event here. Co-Authored-By: Claude Opus 4.7 (1M context) --- selfdrive/controls/controlsd.py | 13 ++++++++----- selfdrive/locationd/calibrationd.py | 9 ++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index 6400478..66ef835 100755 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -485,13 +485,16 @@ class Controls: # generic catch-all. 
ideally, a more specific event should be added above instead has_disable_events = self.events.contains(ET.NO_ENTRY) and (self.events.contains(ET.SOFT_DISABLE) or self.events.contains(ET.IMMEDIATE_DISABLE)) no_system_errors = (not has_disable_events) or (len(self.events) == num_events) - if (not self.sm.all_checks() or self.card.can_rcv_timeout) and no_system_errors and not model_suppress: + # CLEARPILOT: fire commIssue ONLY when messages actually aren't flowing (not_alive) + # or CAN RX is timing out. Don't fire on self-declared valid=False — that's the + # polling-pattern / all_checks cascade that paramsd/torqued/plannerd/frogpilot + # propagate even while their publish rate and content are fine. + comms_really_broken = (not self.sm.all_alive()) or self.card.can_rcv_timeout + if comms_really_broken and no_system_errors and not model_suppress: if not self.sm.all_alive(): self.events.add(EventName.commIssue) - elif not self.sm.all_freq_ok(): - self.events.add(EventName.commIssueAvgFreq) - else: # invalid or can_rcv_timeout. - self.events.add(EventName.commIssue) + else: + self.events.add(EventName.commIssue) # can_rcv_timeout path logs = { 'invalid': [s for s, valid in self.sm.valid.items() if not valid], diff --git a/selfdrive/locationd/calibrationd.py b/selfdrive/locationd/calibrationd.py index 6e154bf..1a23162 100755 --- a/selfdrive/locationd/calibrationd.py +++ b/selfdrive/locationd/calibrationd.py @@ -284,7 +284,14 @@ def main() -> NoReturn: # 4Hz driven by cameraOdometry if sm.frame % 5 == 0: - calibrator.send_data(pm, sm.all_checks()) + # CLEARPILOT: publish valid based on calibration status, not upstream sm.all_checks(). + # The original gate cascaded upstream freq glitches into liveCalibration.valid=False, + # which kept locationd.filterInitialized False, which fed garbage into paramsd, which + # corrupted steerRatio and caused erratic steering (and controlsd commIssue banners). 
+ # "valid" here semantically means "the calibration data is trustworthy" — a question + # about convergence, not input freshness. + cal_valid = calibrator.cal_status == log.LiveCalibrationData.Status.calibrated + calibrator.send_data(pm, cal_valid) if __name__ == "__main__":