perf: gate hot /dev/shm writes on change — controlsd 69%→28% CPU

py-spy showed per-cycle atomic param writes were the dominant cost. Each
put() is mkstemp+fsync+flock+rename+fsync_dir — fine when rare, ruinous at
100Hz. At park with no state changes, these writes were running anyway and
the flock contention was poisoning the whole system.

carstate.py (update + update_canfd): CarSpeedLimit, CarIsMetric,
  CarCruiseDisplayActual were written every CAN update. Now cached and
  written only on change.
controlsd.py: same fix for LatRequested and no_lat_lane_change. Also
  throttle the sentry crash-file stat() from 100Hz to 1Hz.

Also: suppress locationdTemporaryError/paramsdTemporaryError/posenetInvalid
on lat engage (same 2s window as commIssue), and tie suppression to the
LatRequested edge instead of CC.latActive (fires immediately, not after
the 250ms ramp-up delay).

Also: reset Ratekeeper when it falls >1s behind — the ~6s fingerprinting
stall at startup was poisoning the lag metric for the entire session.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 17:34:07 -05:00
parent 2d819c784b
commit 7a1e157c9c
2 changed files with 54 additions and 34 deletions

View File

@@ -48,6 +48,11 @@ class CarState(CarStateBase):
self.is_metric = False self.is_metric = False
self.buttons_counter = 0 self.buttons_counter = 0
# CLEARPILOT: cache to avoid per-cycle atomic writes to /dev/shm (eats CPU via fsync/flock)
self._prev_car_speed_limit = None
self._prev_car_is_metric = None
self._prev_car_cruise_display = None
self.cruise_info = {} self.cruise_info = {}
# On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz # On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
@@ -210,10 +215,18 @@ class CarState(CarStateBase):
self.lkas_previously_enabled = self.lkas_enabled self.lkas_previously_enabled = self.lkas_enabled
self.lkas_enabled = cp.vl["BCM_PO_11"]["LFA_Pressed"] self.lkas_enabled = cp.vl["BCM_PO_11"]["LFA_Pressed"]
# self.params_memory.put_int("CarSpeedLimitLiteral", self.calculate_speed_limit(cp, cp_cam)) # CLEARPILOT: gate on change — see same fix in update_canfd
self.params_memory.put_float("CarSpeedLimit", self.calculate_speed_limit(cp, cp_cam) * speed_conv) car_speed_limit = self.calculate_speed_limit(cp, cp_cam) * speed_conv
if car_speed_limit != self._prev_car_speed_limit:
self.params_memory.put_float("CarSpeedLimit", car_speed_limit)
self._prev_car_speed_limit = car_speed_limit
if self.is_metric != self._prev_car_is_metric:
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0") self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
self.params_memory.put_float("CarCruiseDisplayActual", cp_cruise.vl["SCC11"]["VSetDis"]) self._prev_car_is_metric = self.is_metric
car_cruise_display = cp_cruise.vl["SCC11"]["VSetDis"]
if car_cruise_display != self._prev_car_cruise_display:
self.params_memory.put_float("CarCruiseDisplayActual", car_cruise_display)
self._prev_car_cruise_display = car_cruise_display
return ret return ret
@@ -417,11 +430,14 @@ class CarState(CarStateBase):
# nonAdaptive = false, # nonAdaptive = false,
# speedCluster = 0 ) # speedCluster = 0 )
# print("Set limit") # CLEARPILOT: gate on change — these writes run 100Hz, each is an atomic fsync/flock transaction
# print(self.calculate_speed_limit(cp, cp_cam)) car_speed_limit = self.calculate_speed_limit(cp, cp_cam) * speed_factor
# self.params_memory.put_float("CarSpeedLimitLiteral", self.calculate_speed_limit(cp, cp_cam)) if car_speed_limit != self._prev_car_speed_limit:
self.params_memory.put_float("CarSpeedLimit", self.calculate_speed_limit(cp, cp_cam) * speed_factor) self.params_memory.put_float("CarSpeedLimit", car_speed_limit)
self._prev_car_speed_limit = car_speed_limit
if self.is_metric != self._prev_car_is_metric:
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0") self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
self._prev_car_is_metric = self.is_metric
# CLEARPILOT: telemetry logging — disabled, re-enable when needed # CLEARPILOT: telemetry logging — disabled, re-enable when needed
# speed_limit_bus = cp if self.CP.flags & HyundaiFlags.CANFD_HDA2 else cp_cam # speed_limit_bus = cp if self.CP.flags & HyundaiFlags.CANFD_HDA2 else cp_cam

View File

@@ -170,10 +170,9 @@ class Controls:
self.current_alert_types = [ET.PERMANENT] self.current_alert_types = [ET.PERMANENT]
self.logged_comm_issue = None self.logged_comm_issue = None
self.not_running_prev = None self.not_running_prev = None
self.last_lat_engage_ts = 0 # CLEARPILOT: timestamp of most recent lat engage transition self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (ramp-up delay + comm issue suppression)
self.prev_lat_active = False
self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (for ramp-up delay)
self.prev_lat_requested = False self.prev_lat_requested = False
self.prev_no_lat_lane_change = None # CLEARPILOT: gate no_lat_lane_change write on change
self.steer_limited = False self.steer_limited = False
self.desired_curvature = 0.0 self.desired_curvature = 0.0
self.experimental_mode = False self.experimental_mode = False
@@ -462,13 +461,10 @@ class Controls:
now = time.monotonic() now = time.monotonic()
model_suppress = (now - standby_ts) < 2.0 model_suppress = (now - standby_ts) < 2.0
# CLEARPILOT: detect lat engage rising edge and suppress commIssue for 2s after # CLEARPILOT: suppress commIssue/location/params errors for 2s after lat was requested
# Model FPS jumps from 4 to 20 on engage, causing transient freq check failures # Model FPS jumps from 4 to 20 on engage, causing transient freq check failures
lat_active_now = self.CC.latActive # lat_requested_ts is set in state_control on the False->True transition of LatRequested
if lat_active_now and not self.prev_lat_active: lat_engage_suppress = (now - self.lat_requested_ts) < 2.0
self.last_lat_engage_ts = now
self.prev_lat_active = lat_active_now
lat_engage_suppress = (now - self.last_lat_engage_ts) < 2.0
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning} not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES): if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
@@ -520,13 +516,15 @@ class Controls:
self.logged_comm_issue = None self.logged_comm_issue = None
if not (self.CP.notCar and self.joystick_mode): if not (self.CP.notCar and self.joystick_mode):
if not self.sm['liveLocationKalman'].posenetOK and not model_suppress: # CLEARPILOT: also suppress on lat engage — modeld 4fps→20fps transition causes
# transient inputsOK/valid drops in locationd/paramsd downstream
if not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress:
self.events.add(EventName.posenetInvalid) self.events.add(EventName.posenetInvalid)
if not self.sm['liveLocationKalman'].deviceStable: if not self.sm['liveLocationKalman'].deviceStable:
self.events.add(EventName.deviceFalling) self.events.add(EventName.deviceFalling)
if not self.sm['liveLocationKalman'].inputsOK and not model_suppress: if not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress:
self.events.add(EventName.locationdTemporaryError) self.events.add(EventName.locationdTemporaryError)
if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress: if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress:
self.events.add(EventName.paramsdTemporaryError) self.events.add(EventName.paramsdTemporaryError)
# conservative HW alert. if the data or frequency are off, locationd will throw an error # conservative HW alert. if the data or frequency are off, locationd will throw an error
@@ -702,9 +700,10 @@ class Controls:
# CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by # CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by
# 250ms (5 frames at 20fps) so downstream services stabilize before steering engages. # 250ms (5 frames at 20fps) so downstream services stabilize before steering engages.
self.params_memory.put_bool("LatRequested", lat_requested)
now_ts = time.monotonic() now_ts = time.monotonic()
if lat_requested and not self.prev_lat_requested: if lat_requested != self.prev_lat_requested:
self.params_memory.put_bool("LatRequested", lat_requested)
if lat_requested:
self.lat_requested_ts = now_ts self.lat_requested_ts = now_ts
self.prev_lat_requested = lat_requested self.prev_lat_requested = lat_requested
CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25 CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25
@@ -722,11 +721,13 @@ class Controls:
CC.leftBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.left CC.leftBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.left
CC.rightBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.right CC.rightBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.right
if model_v2.meta.laneChangeState == LaneChangeState.laneChangeStarting and clearpilot_disable_lat_on_lane_change: no_lat_lane_change = model_v2.meta.laneChangeState == LaneChangeState.laneChangeStarting and clearpilot_disable_lat_on_lane_change
if no_lat_lane_change:
CC.latActive = False CC.latActive = False
self.params_memory.put_bool("no_lat_lane_change", True) # CLEARPILOT: only write on change — per-cycle atomic writes were eating ~15% CPU
else: if no_lat_lane_change != self.prev_no_lat_lane_change:
self.params_memory.put_bool("no_lat_lane_change", False) self.params_memory.put_bool("no_lat_lane_change", no_lat_lane_change)
self.prev_no_lat_lane_change = no_lat_lane_change
if CS.leftBlinker or CS.rightBlinker: if CS.leftBlinker or CS.rightBlinker:
self.last_blinker_frame = self.sm.frame self.last_blinker_frame = self.sm.frame
@@ -1006,6 +1007,11 @@ class Controls:
while True: while True:
self.step() self.step()
self.rk.monitor_time() self.rk.monitor_time()
# CLEARPILOT: accumulated debt from startup/blocking calls never recovers.
# Reset the rate keeper when catastrophically behind — prevents a one-time
# ~8s fingerprinting stall from poisoning the lag metric for the whole session.
if self.rk.remaining < -1.0:
self.rk._next_frame_time = time.monotonic() + self.rk._interval
except SystemExit: except SystemExit:
e.set() e.set()
t.join() t.join()
@@ -1076,13 +1082,11 @@ class Controls:
elif self.sm['modelV2'].meta.turnDirection == Desire.turnRight: elif self.sm['modelV2'].meta.turnDirection == Desire.turnRight:
self.events.add(EventName.turningRight) self.events.add(EventName.turningRight)
if os.path.isfile(os.path.join(sentry.CRASHES_DIR, 'error.txt')) and self.crashed_timer < 10: # CLEARPILOT: check crash file once per second, not every cycle (stat syscall at 100Hz hurts)
if self.sm.frame % 100 == 0:
self._crashed_file_exists = os.path.isfile(os.path.join(sentry.CRASHES_DIR, 'error.txt'))
if getattr(self, '_crashed_file_exists', False) and self.crashed_timer < 10:
self.events.add(EventName.openpilotCrashed) self.events.add(EventName.openpilotCrashed)
# if self.random_events and not self.openpilot_crashed_triggered:
# self.events.add(EventName.openpilotCrashedRandomEvents)
# self.openpilot_crashed_triggered = True
self.crashed_timer += DT_CTRL self.crashed_timer += DT_CTRL
# if self.speed_limit_alert or self.speed_limit_confirmation: # if self.speed_limit_alert or self.speed_limit_confirmation: