diff --git a/cereal/services.py b/cereal/services.py index 1769e5a..3677501 100755 --- a/cereal/services.py +++ b/cereal/services.py @@ -30,7 +30,7 @@ services: dict[str, tuple] = { "temperatureSensor": (True, 2., 200), "temperatureSensor2": (True, 2., 200), "gpsNMEA": (True, 9.), - "deviceState": (True, 5., 1), # CLEARPILOT: 5Hz — thermald decimates pandaStates (10Hz) every 2 ticks + "deviceState": (True, 2., 1), # CLEARPILOT: matches DT_TRML=0.5 (thermald at 2Hz) "can": (True, 100., 1223), # decimation gives ~5 msgs in a full segment "controlsState": (True, 100., 10), "pandaStates": (True, 10., 1), @@ -70,7 +70,7 @@ services: dict[str, tuple] = { "wideRoadEncodeIdx": (False, 20., 1), "wideRoadCameraState": (True, 20., 20), "modelV2": (True, 20., 40), - "managerState": (True, 5., 1), # CLEARPILOT: 5Hz — gated by deviceState arrival (see thermald) + "managerState": (True, 2., 1), # CLEARPILOT: gated by deviceState arrival, so matches thermald rate "uploaderState": (True, 0., 1), "navInstruction": (True, 1., 10), "navRoute": (True, 0.), diff --git a/selfdrive/boardd/boardd.cc b/selfdrive/boardd/boardd.cc index c4db1ea..471e920 100755 --- a/selfdrive/boardd/boardd.cc +++ b/selfdrive/boardd/boardd.cc @@ -415,6 +415,7 @@ void panda_state_thread(std::vector pandas, bool spoofing_started) { Panda *peripheral_panda = pandas[0]; bool is_onroad = false; bool is_onroad_last = false; + bool ignition_last = false; std::future safety_future; std::vector connected_serials; @@ -472,8 +473,14 @@ void panda_state_thread(std::vector pandas, bool spoofing_started) { is_onroad = params.getBool("IsOnroad"); - // set new safety on onroad transition, after params are cleared - if (is_onroad && !is_onroad_last) { + // CLEARPILOT: trigger on ignition rising edge instead of IsOnroad rising edge. + // ClearPilot's parked-mode split breaks the stock assumption that IsOnroad + // rises with ignition: IsOnroad now requires `started`, which requires + // thermald to see carState != park, which requires controlsd_parked to + // finish CarD init, which requires this thread to ack OBD multiplexing. + // Firing on ignition restores the original "set safety as soon as the bus + // is alive" timing for both controlsd variants. + if (ignition && !ignition_last) { if (!safety_future.valid() || safety_future.wait_for(0ms) == std::future_status::ready) { safety_future = std::async(std::launch::async, safety_setter_thread, pandas); } else { @@ -482,6 +489,7 @@ void panda_state_thread(std::vector pandas, bool spoofing_started) { } is_onroad_last = is_onroad; + ignition_last = ignition; sm.update(0); const bool engaged = sm.allAliveAndValid({"controlsState"}) && sm["controlsState"].getControlsState().getEnabled(); diff --git a/selfdrive/controls/controlsd.py b/selfdrive/controls/controlsd.py index 3c43a02..b0d6137 100755 --- a/selfdrive/controls/controlsd.py +++ b/selfdrive/controls/controlsd.py @@ -686,12 +686,11 @@ class Controls: if model_v2.meta.laneChangeState == LaneChangeState.laneChangeStarting and clearpilot_disable_lat_on_lane_change: CC.latActive = False - self.params_memory.put_bool("no_lat_lane_change", True) - # CLEARPILOT: hyundai carcontroller reads this off frogpilot_variables (not Params) — keep both in sync self.frogpilot_variables.no_lat_lane_change = True + self.FPCC.noLatLaneChange = True else: - self.params_memory.put_bool("no_lat_lane_change", False) self.frogpilot_variables.no_lat_lane_change = False + self.FPCC.noLatLaneChange = False if CS.leftBlinker or CS.rightBlinker: self.last_blinker_frame = self.sm.frame diff --git a/selfdrive/locationd/locationd.cc b/selfdrive/locationd/locationd.cc index 0274b6e..2535c2c 100755 --- a/selfdrive/locationd/locationd.cc +++ b/selfdrive/locationd/locationd.cc @@ -308,12 +308,18 @@ void Localizer::input_fake_gps_observations(double current_time) { } void Localizer::handle_gps(double current_time, const cereal::GpsLocationData::Reader& log, const double sensor_time_offset) { + // CLEARPILOT: GPS disabled as a Kalman input. gpsd still publishes real GPS for + // UI / dashcam / clock / night-mode; only locationd ignores it. Falls through to + // determine_gps_mode() which is openpilot's existing no-GPS path (fake observations + // to bound position uncertainty). To re-enable, set this to false. + const bool clearpilot_disable_gps = true; + bool gps_unreasonable = (Vector2d(log.getHorizontalAccuracy(), log.getVerticalAccuracy()).norm() >= SANE_GPS_UNCERTAINTY); bool gps_accuracy_insane = ((log.getVerticalAccuracy() <= 0) || (log.getSpeedAccuracy() <= 0) || (log.getBearingAccuracyDeg() <= 0)); bool gps_lat_lng_alt_insane = ((std::abs(log.getLatitude()) > 90) || (std::abs(log.getLongitude()) > 180) || (std::abs(log.getAltitude()) > ALTITUDE_SANITY_CHECK)); bool gps_vel_insane = (floatlist2vector(log.getVNED()).norm() > TRANS_SANITY_CHECK); - if (!log.getHasFix() || gps_unreasonable || gps_accuracy_insane || gps_lat_lng_alt_insane || gps_vel_insane) { + if (clearpilot_disable_gps || !log.getHasFix() || gps_unreasonable || gps_accuracy_insane || gps_lat_lng_alt_insane || gps_vel_insane) { //this->gps_valid = false; this->determine_gps_mode(current_time); return; diff --git a/sessions/2026-04-26-1143-gps-disable-and-calibrationd-stale/README.md b/sessions/2026-04-26-1143-gps-disable-and-calibrationd-stale/README.md new file mode 100644 index 0000000..b8d606e --- /dev/null +++ b/sessions/2026-04-26-1143-gps-disable-and-calibrationd-stale/README.md @@ -0,0 +1,358 @@ +# Session: 2026-04-26 — GPS disabled in locationd; calibrationd-still-stale notes + +## Context + +Followup session to `2026-04-26-0914-baseline-revert-and-parked-mode`. +After the baseline restore, the manager wouldn't start cleanly and the car +exhibited a "drifting right on straight roads, model rescues us mid-curve" +symptom. This session unblocked the startup chain end-to-end so the car can +boot and run, disabled GPS as an input to locationd (the actual fix that +made the drift go away), and pinned down — but did **not** solve — why +`liveCalibration.valid` is still stuck at `False` and what that latently +breaks downstream. + +Pre-session tip: `27cad05`. Single combined commit covering four code +changes plus this README: + +- `selfdrive/boardd/boardd.cc` — `safety_setter_thread` on ignition edge +- `selfdrive/controls/controlsd.py` — drop unregistered + `no_lat_lane_change` Params write, wire `FPCC.noLatLaneChange` for UI +- `cereal/services.py` — deviceState/managerState back to 2Hz to match + restored `DT_TRML` +- `selfdrive/locationd/locationd.cc` — ignore GPS as Kalman input + (reversible flag) + +--- + +## Commit 1: boardd — safety_setter_thread on ignition edge + +### Symptom + +Manager started fine but `controlsd_parked` blocked indefinitely at: + +``` +set_obd_multiplexing (selfdrive/car/fw_versions.py:236) + fingerprint (selfdrive/car/car_helpers.py:149) + get_car (selfdrive/car/car_helpers.py:210) + __init__ (selfdrive/car/card.py:60) + main (selfdrive/controls/controlsd_parked.py:41) +``` + +`set_obd_multiplexing` does `params.get_bool("ObdMultiplexingChanged", block=True)` — +it waits for **boardd's `safety_setter_thread`** to ack. That thread spawns +only on the **rising edge of `IsOnroad`** (`boardd.cc:476`). `IsOnroad` is set +by manager from the `started` flag (`helpers.py:49`). With the parked-mode split +from the prior session, `started` requires `not_parked`, which requires thermald +to see `carState.gearShifter != park`, which requires `controlsd_parked` to +publish carState — which it can't do until OBD multiplexing is acked. + +Classic deadlock: ignition rising no longer implies IsOnroad rising. + +### Fix + +`selfdrive/boardd/boardd.cc:476` — change the trigger from IsOnroad rising +edge to **ignition rising edge**. Adds `bool ignition_last = false;` next to +`is_onroad_last`, swaps the gate variable, sets `ignition_last = ignition;` +after. Restores stock openpilot's intent: set safety as soon as the bus is +alive. Both controlsd variants need it; the thread's phase 2 (waiting on +`ControlsReady`) is harmless in parked mode — it just sits. + +`is_onroad`/`is_onroad_last` left in place; they're now unused beyond this +gate but the read of `params.getBool("IsOnroad")` still happens, in case any +future logic wants it. + +--- + +## Commit 2: controlsd — drop unregistered Params write + +### Symptom + +After the boardd fix, controlsd_parked progressed but full controlsd +crashed at first `state_control` cycle: + +``` +File "selfdrive/controls/controlsd.py", line 693, in state_control + self.params_memory.put_bool("no_lat_lane_change", False) +common.params_pyx.UnknownKeyName: b'no_lat_lane_change' +``` + +The baseline-restored controlsd unconditionally writes `no_lat_lane_change` +to memory params on every state_control cycle. That key was never +registered in `common/params.cc`. Pre-revert (`62a403d`) controlsd didn't +write the param at all — it set `self.FPCC.noLatLaneChange` (capnp field). +The baseline brought back a code path the fork's params.cc never matched. + +### Fix + +In `selfdrive/controls/controlsd.py:687-694`, remove the two +`params_memory.put_bool("no_lat_lane_change", ...)` calls and replace with +`self.FPCC.noLatLaneChange = True/False` (matches what the kept UI code in +`onroad.cc:897` and `ui.cc:122` is reading from cereal). The +`frogpilot_variables.no_lat_lane_change` writes were already there — those +are what the kept Hyundai carcontroller actually reads at 100Hz. + +No actuator change. Pure plumbing. + +--- + +## Commit 3: services.py — restore deviceState/managerState rates + +### Symptom + +After fixing the controlsd crash, controlsd booted but immediately fired +**continuous `commIssue`** with: + +``` +"not_freq_ok": ["deviceState", "managerState"] +``` + +### Diagnosis + +`common/realtime.py` was reverted to `DT_TRML = 0.5` (thermald → 2Hz) in +the baseline restore. But `cereal/services.py` still declared `deviceState` +and `managerState` at 5Hz from earlier 4Hz-fan-control work. The freq window +is `[0.8 × min, 1.2 × max]` — declared 5Hz means [4.0, 6.0]Hz. Thermald +publishing at 2Hz fell well below 4.0 → freq_ok=False every cycle → +commIssue every cycle. + +Already documented as a known footgun in `CLAUDE.md` ("Changing a Service's +Publish Rate"). + +### Fix + +`cereal/services.py:33,73` — set both back to 2Hz with a comment pointing +at `DT_TRML`. After this, `not_freq_ok=[]` and the continuous commIssue +stopped — only one transient commIssue at startup remained (warmup). + +--- + +## Commit 4: locationd — ignore GPS as Kalman input (reversible) + +### Why + +The car was drifting right on straight roads. Pre-revert, the user +reported this had been working for years; the only practically-new thing +is that `system/clearpilot/gpsd.py` (AT-command-based GPS) had recently +**started actually getting fixes**, where for a long time it wasn't. + +Two concrete data problems with the GPS feed for selfdrive purposes: + +- `gpsd.py:221` hard-codes `gps.vNED = [0.0, 0.0, 0.0]` while the user is + moving 28 m/s. locationd's `handle_gps` derives `OBSERVATION_ECEF_VEL` + from this; the Kalman gets "GPS says you're stopped" while accelerometer + says otherwise. +- `gpsd.py:216,222` populate horizontalAccuracy/verticalAccuracy from + `hdop * 5` (a rough conversion); `bearingAccuracyDeg = 10.0` and + `speedAccuracy = 1.0` are constants. None of these match what a real + GNSS chip reports. + +These flow into the latcontrol_torque pipeline indirectly through +`liveLocationKalman.angularVelocityCalibrated` (used as +`actual_curvature_llk = ... / CS.vEgo` blended with steering-angle-derived +curvature) and through `liveLocationKalman.calibratedOrientationNED` +(pitch). When the Kalman has wrong velocity observations, those derived +fields go wrong — and on a straight crowned road, the controller's +"actual_curvature" picture is off-center, biasing torque output. + +### What the user does NOT want disabled + +gpsd publishes are still consumed by: +- UI speed indicator and the ClearPilot status overlay +- `dashcamd` for `.srt` subtitle sidecars (lat/lon per segment) +- `timed.py` for system-clock setting via `unixTimestampMillis` +- `gpsd.py` itself for sunset/sunrise → `IsDaylight` (auto night mode) +- `telemetryd.py` for the `gps` group in the CSV + +So we cannot just stop publishing — only locationd should ignore. + +### Fix + +`selfdrive/locationd/locationd.cc:310-323` — add a `clearpilot_disable_gps` +const at the top of `Localizer::handle_gps` and OR it into the existing +reject condition. With it true, every gpsLocation message falls through to +`determine_gps_mode(current_time)` (openpilot's stock no-GPS path: +`input_fake_gps_observations` once position uncertainty grows past +`SANE_GPS_UNCERTAINTY`, otherwise nothing). `last_gps_msg` never updates, +`is_gps_ok()` returns False, `liveLocationKalman.gpsOK = false`. + +Effect verified by user: drift improved noticeably. The torque controller +is no longer fed contradictory GPS-vs-IMU velocity observations through +the Kalman. + +To re-enable GPS as a Kalman input: flip the `clearpilot_disable_gps` +constant to `false` and rebuild. Self-contained edit. + +--- + +## Calibrationd: still stale, root-cause partially understood + +### What is happening + +`calibrationd` publishes `liveCalibration.valid = sm.all_checks()` on every +cycle, where `sm` polls `cameraOdometry` and non-polls `carState` and +`carParams`. We measured: **120 publishes in 30 seconds, every single one +`valid=False`** with `calStatus=calibrated, calPerc=100, validBlocks=50`. +The body of the calibration is converged — the validity flag is stuck off. + +### Why this matters + +`liveCalibration.valid=False` cascades: + +1. `locationd.cc:715` — `filterInitialized = sm.allAliveAndValid()`. + liveCalibration is in the sub list and not in `ignore_alive` / + `ignore_valid`. So filterInitialized stays False forever. +2. With Kalman uninitialized, `liveLocationKalman` still publishes but the + body fields are empty/default. `liveLocationKalman.status = uninitialized`. +3. `paramsd.py` subscribes to `liveLocationKalman` with `poll='liveLocationKalman'` + and gates its update logic on `sm.all_checks()`. When liveLocationKalman + itself isn't in a sane state, paramsd's `roll`, `angleOffsetDeg`, + `steerRatio`, `stiffnessFactor` either never converge or converge to bad + values. +4. `latcontrol_torque.py:135-136` uses `params.angleOffsetDeg` to compute + `actual_curvature_vm` and `params.roll` for `roll_compensation`. With + `roll=0`, no compensation for a crowned/banked road. With wrong + `angleOffsetDeg`, the closed-loop "actual curvature" measurement is + biased. + +So the latent risk is: even with our GPS fix, the controller is running +**without learned roll compensation and without a learned steering-angle +offset**. Symptom-free on straight, level pavement; biased on banked roads. + +### Why `valid=False` + +Inside calibrationd's SubMaster, `sm.all_checks() = all_alive AND all_freq_ok +AND all_valid`. We measured each: + +- `cameraOdometry`: alive=True, valid=True, freq_ok=True ✓ +- `carState`: alive flickers True/False, valid=True, **freq_ok=False (every cycle)** +- `carParams`: alive=True after first arrival, valid=True, freq_ok=False + but excluded from `all_freq_ok` because `_check_avg_freq` skips services + with `frequency < 0.99` Hz (carParams is 0.02 Hz declared) — so it doesn't + fail the gate. + +The smoking gun is **`carState.freq_ok = False` from inside calibrationd's +`poll='cameraOdometry'` SubMaster**. + +Direct measurement (`/tmp/test_subs.py`): + +| poll arg | carState observed rate | freq_ok | +|---|---:|:-:| +| `None` (all polled) | 97.40 Hz | True | +| `'cameraOdometry'` | **2.24 Hz** | **False** | +| `'carState'` | 98.58 Hz | True | + +carState is published at 100 Hz to a 10MB shared-memory MSGQ queue. From +inside `poll='cameraOdometry'`, our non-blocking `recv_one_or_none(carState)` +returns None ~88% of the time. `/tmp/diag_recv.py`: + +``` +cameraOdometry msgs received: 121 +carState recv calls: 121, hits: 14 (11.6% hit rate) +avg cycle duration: 50.0ms +``` + +So we're calling `recv` 20× per second on carState's queue, and finding +the queue empty 9 out of 10 times — even though carState is being published +at 100 Hz to that same queue with a `conflate=True` socket option. + +### The MSGQ NUM_READERS = 12 hypothesis + +`cereal/messaging/msgq.h:9` defines `#define NUM_READERS 12`. When a 13th +subscriber tries to subscribe to a queue, `msgq.cc:182-197` **invalidates +ALL readers simultaneously** (`*q->read_valids[i] = false` for all i) +to "reset and re-register". On the next read, an invalidated reader's +`msgq_msg_recv` jumps to `msgq_reset_reader` (line 347) and ends up with +`read_pointer == write_pointer` (caught up to current), returning size 0. + +Subscribers to `carState` in our running system include: controlsd, +plannerd, locationd, calibrationd, paramsd, dmonitoringd, frogpilot_process, +telemetryd, statsd. That's already nine. The introspection scripts +(`/tmp/check_*.py`, `/tmp/measure_freq.py`, `/tmp/diag_recv.py`, +`/tmp/cal_view.py`) and any UI-side subscribers add more, can easily +push past 12 and trigger global eviction. Once evicted, a long-running +subscriber stays in the "find empty queue / reset / try again" loop, which +is what we measured. + +This is **partially confirmed** but not proven definitively. The +investigation was paused before instrumenting the queue header to count +slot churn. The smoking-gun would be: print `q->num_readers` from inside +calibrationd at boot vs. during steady state and watch it tick up to 12+. + +### Things to consider for the actual fix + +1. **`7ee923b` already solved this exact problem.** It changed + calibrationd's publish to: + ```python + # was: calibrator.send_data(pm, sm.all_checks()) + # to: calibrator.send_data(pm, calibrator.cal_status == log.LiveCalibrationData.Status.calibrated) + ``` + The commit message documented the exact failure mode (cascade through + locationd uninitialized → paramsd steerRatio≈0 / stiffnessFactor≈0 → + nonsense curvature commands). It was reverted by `47321e3` as part of + "restore driving logic to pre-variable-fps baseline" — but that revert + was about isolating the drift cause, and the calibrationd change here + is *not* in the variable-FPS family. **Re-applying `7ee923b` is + probably the right next move**, narrowly scoped to calibrationd.py. +2. Less attractive alternatives: bump `NUM_READERS`; switch + carState to `poll=None` in calibrationd (more cycles per update, + higher CPU); add `ignore_average_freq=['carState']` to calibrationd's + SubMaster (treats freq glitches as benign, but keeps the cascade for + alive/valid). + +--- + +## Steer-fault alert investigation (separate symptom) + +User saw "Steering Temporarily Unavailable" alerts during a test drive +even though we hadn't touched lateral-control code. Captured in +`realdata/00000081--528e2aa03a--0/rlog`: + +- Faults occur in **brief 50–100 ms pulses** clustered while moving slowly + (`vEgo` 2.7–5.2 m/s ≈ 6–12 mph). +- Each pulse correlates with **large driver wheel torque** (-250, +273, + -187, +119 Nm) — i.e. the user actively turning the wheel during a + parking-lot maneuver. +- `cruise.enabled = False` throughout — openpilot was not engaged. +- The car's MDPS sets `LKA_FAULT` at low speeds when torque is high; that + bit maps directly to `cs_out.steerFaultTemporary` (`carstate.py:259`), + which fires `steerTempUnavailableSilent` regardless of engagement + (`ET.WARNING` displays unconditionally). + +User reports they've "never had this issue before" — implying earlier +ClearPilot revisions either gated the fault on speed or used a different +no-lateral path. **Not confirmed which.** Open follow-up. + +--- + +## Open follow-ups (ordered by likely return) + +1. **Re-apply `7ee923b`** — gate `liveCalibration.valid` on `calStatus`, + not `sm.all_checks()`. Unblocks locationd init → paramsd convergence → + real `params.roll` and `params.angleOffsetDeg` for the torque + controller. Latent benefit beyond what GPS-disable alone gave us. +2. **Investigate the persistent low-speed `steerTempUnavailable` alert.** + Either (a) gate `steerFaultTemporary` on `vEgo > ~8 m/s` in + `carstate.py:259`, or (b) find what the previous fork did — possibly + stopped sending tester CAN messages on park, possibly suppressed the + alert specifically during a park transition window. +3. **Suppress LKAS fault display when shifting drive → park.** The user + reports the car shows an LKAS-fault icon when openpilot keeps publishing + tester-present CAN messages after entering park. Investigation needed + in `selfdrive/car/hyundai/carcontroller.py` and `hyundaicanfd.py` to + gate tester messages on gear ≠ park. +4. **Wheel-torque headroom edit.** User mentioned a community-known edit + that allows slightly higher steering torque on the Hyundai panda safety + model. Research target: panda safety code for HYUNDAI_CANFD safety + model and the `MAX_TORQUE` / per-cycle delta limits. +5. **Single startup `commIssue` event.** Even with all our fixes, controlsd + logs one transient commIssue right after `controlsd.initialized` + (timeout=true after 6s). The `invalid` set at that moment is + downstream services still warming up (liveCalibration, liveLocationKalman, + liveParameters, liveTorqueParameters, frogpilotPlan, longitudinalPlan, + driverMonitoringState). Most should clear once the calibrationd issue + is fixed; remaining ones are normal warmup. +6. **gpsd.py vNED / accuracy fields.** Out of scope for this session + (we disabled GPS in locationd instead), but if GPS is ever re-enabled, + `gpsd.py:216,221-224` need real values: vNED from + `speed × {cos(bearing), sin(bearing), 0}`, and accuracy fields from + actual modem reports rather than hard-coded constants.