perf: gate hot /dev/shm writes on change — controlsd 69%→28% CPU
py-spy showed per-cycle atomic param writes were the dominant cost. Each put() is mkstemp+fsync+flock+rename+fsync_dir — fine when rare, ruinous at 100Hz. At park with no state changes, these writes were running anyway and the flock contention was poisoning the whole system. carstate.py (update + update_canfd): CarSpeedLimit, CarIsMetric, CarCruiseDisplayActual were written every CAN update. Now cached and written only on change. controlsd.py: same fix for LatRequested and no_lat_lane_change. Also throttle the sentry crash-file stat() from 100Hz to 1Hz. Also: suppress locationdTemporaryError/paramsdTemporaryError/posenetInvalid on lat engage (same 2s window as commIssue), and tie suppression to the LatRequested edge instead of CC.latActive (fires immediately, not after the 250ms ramp-up delay). Also: reset Ratekeeper when it falls >1s behind — the ~6s fingerprinting stall at startup was poisoning the lag metric for the entire session. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,11 @@ class CarState(CarStateBase):
|
|||||||
self.is_metric = False
|
self.is_metric = False
|
||||||
self.buttons_counter = 0
|
self.buttons_counter = 0
|
||||||
|
|
||||||
|
# CLEARPILOT: cache to avoid per-cycle atomic writes to /dev/shm (eats CPU via fsync/flock)
|
||||||
|
self._prev_car_speed_limit = None
|
||||||
|
self._prev_car_is_metric = None
|
||||||
|
self._prev_car_cruise_display = None
|
||||||
|
|
||||||
self.cruise_info = {}
|
self.cruise_info = {}
|
||||||
|
|
||||||
# On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
|
# On some cars, CLU15->CF_Clu_VehicleSpeed can oscillate faster than the dash updates. Sample at 5 Hz
|
||||||
@@ -210,10 +215,18 @@ class CarState(CarStateBase):
|
|||||||
self.lkas_previously_enabled = self.lkas_enabled
|
self.lkas_previously_enabled = self.lkas_enabled
|
||||||
self.lkas_enabled = cp.vl["BCM_PO_11"]["LFA_Pressed"]
|
self.lkas_enabled = cp.vl["BCM_PO_11"]["LFA_Pressed"]
|
||||||
|
|
||||||
# self.params_memory.put_int("CarSpeedLimitLiteral", self.calculate_speed_limit(cp, cp_cam))
|
# CLEARPILOT: gate on change — see same fix in update_canfd
|
||||||
self.params_memory.put_float("CarSpeedLimit", self.calculate_speed_limit(cp, cp_cam) * speed_conv)
|
car_speed_limit = self.calculate_speed_limit(cp, cp_cam) * speed_conv
|
||||||
|
if car_speed_limit != self._prev_car_speed_limit:
|
||||||
|
self.params_memory.put_float("CarSpeedLimit", car_speed_limit)
|
||||||
|
self._prev_car_speed_limit = car_speed_limit
|
||||||
|
if self.is_metric != self._prev_car_is_metric:
|
||||||
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
|
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
|
||||||
self.params_memory.put_float("CarCruiseDisplayActual", cp_cruise.vl["SCC11"]["VSetDis"])
|
self._prev_car_is_metric = self.is_metric
|
||||||
|
car_cruise_display = cp_cruise.vl["SCC11"]["VSetDis"]
|
||||||
|
if car_cruise_display != self._prev_car_cruise_display:
|
||||||
|
self.params_memory.put_float("CarCruiseDisplayActual", car_cruise_display)
|
||||||
|
self._prev_car_cruise_display = car_cruise_display
|
||||||
|
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
@@ -417,11 +430,14 @@ class CarState(CarStateBase):
|
|||||||
# nonAdaptive = false,
|
# nonAdaptive = false,
|
||||||
# speedCluster = 0 )
|
# speedCluster = 0 )
|
||||||
|
|
||||||
# print("Set limit")
|
# CLEARPILOT: gate on change — these writes run 100Hz, each is an atomic fsync/flock transaction
|
||||||
# print(self.calculate_speed_limit(cp, cp_cam))
|
car_speed_limit = self.calculate_speed_limit(cp, cp_cam) * speed_factor
|
||||||
# self.params_memory.put_float("CarSpeedLimitLiteral", self.calculate_speed_limit(cp, cp_cam))
|
if car_speed_limit != self._prev_car_speed_limit:
|
||||||
self.params_memory.put_float("CarSpeedLimit", self.calculate_speed_limit(cp, cp_cam) * speed_factor)
|
self.params_memory.put_float("CarSpeedLimit", car_speed_limit)
|
||||||
|
self._prev_car_speed_limit = car_speed_limit
|
||||||
|
if self.is_metric != self._prev_car_is_metric:
|
||||||
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
|
self.params_memory.put("CarIsMetric", "1" if self.is_metric else "0")
|
||||||
|
self._prev_car_is_metric = self.is_metric
|
||||||
|
|
||||||
# CLEARPILOT: telemetry logging — disabled, re-enable when needed
|
# CLEARPILOT: telemetry logging — disabled, re-enable when needed
|
||||||
# speed_limit_bus = cp if self.CP.flags & HyundaiFlags.CANFD_HDA2 else cp_cam
|
# speed_limit_bus = cp if self.CP.flags & HyundaiFlags.CANFD_HDA2 else cp_cam
|
||||||
|
|||||||
@@ -170,10 +170,9 @@ class Controls:
|
|||||||
self.current_alert_types = [ET.PERMANENT]
|
self.current_alert_types = [ET.PERMANENT]
|
||||||
self.logged_comm_issue = None
|
self.logged_comm_issue = None
|
||||||
self.not_running_prev = None
|
self.not_running_prev = None
|
||||||
self.last_lat_engage_ts = 0 # CLEARPILOT: timestamp of most recent lat engage transition
|
self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (ramp-up delay + comm issue suppression)
|
||||||
self.prev_lat_active = False
|
|
||||||
self.lat_requested_ts = 0 # CLEARPILOT: timestamp when lat was first requested (for ramp-up delay)
|
|
||||||
self.prev_lat_requested = False
|
self.prev_lat_requested = False
|
||||||
|
self.prev_no_lat_lane_change = None # CLEARPILOT: gate no_lat_lane_change write on change
|
||||||
self.steer_limited = False
|
self.steer_limited = False
|
||||||
self.desired_curvature = 0.0
|
self.desired_curvature = 0.0
|
||||||
self.experimental_mode = False
|
self.experimental_mode = False
|
||||||
@@ -462,13 +461,10 @@ class Controls:
|
|||||||
now = time.monotonic()
|
now = time.monotonic()
|
||||||
model_suppress = (now - standby_ts) < 2.0
|
model_suppress = (now - standby_ts) < 2.0
|
||||||
|
|
||||||
# CLEARPILOT: detect lat engage rising edge and suppress commIssue for 2s after
|
# CLEARPILOT: suppress commIssue/location/params errors for 2s after lat was requested
|
||||||
# Model FPS jumps from 4 to 20 on engage, causing transient freq check failures
|
# Model FPS jumps from 4 to 20 on engage, causing transient freq check failures
|
||||||
lat_active_now = self.CC.latActive
|
# lat_requested_ts is set in state_control on the False->True transition of LatRequested
|
||||||
if lat_active_now and not self.prev_lat_active:
|
lat_engage_suppress = (now - self.lat_requested_ts) < 2.0
|
||||||
self.last_lat_engage_ts = now
|
|
||||||
self.prev_lat_active = lat_active_now
|
|
||||||
lat_engage_suppress = (now - self.last_lat_engage_ts) < 2.0
|
|
||||||
|
|
||||||
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
|
not_running = {p.name for p in self.sm['managerState'].processes if not p.running and p.shouldBeRunning}
|
||||||
if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
|
if self.sm.recv_frame['managerState'] and (not_running - IGNORE_PROCESSES):
|
||||||
@@ -520,13 +516,15 @@ class Controls:
|
|||||||
self.logged_comm_issue = None
|
self.logged_comm_issue = None
|
||||||
|
|
||||||
if not (self.CP.notCar and self.joystick_mode):
|
if not (self.CP.notCar and self.joystick_mode):
|
||||||
if not self.sm['liveLocationKalman'].posenetOK and not model_suppress:
|
# CLEARPILOT: also suppress on lat engage — modeld 4fps→20fps transition causes
|
||||||
|
# transient inputsOK/valid drops in locationd/paramsd downstream
|
||||||
|
if not self.sm['liveLocationKalman'].posenetOK and not model_suppress and not lat_engage_suppress:
|
||||||
self.events.add(EventName.posenetInvalid)
|
self.events.add(EventName.posenetInvalid)
|
||||||
if not self.sm['liveLocationKalman'].deviceStable:
|
if not self.sm['liveLocationKalman'].deviceStable:
|
||||||
self.events.add(EventName.deviceFalling)
|
self.events.add(EventName.deviceFalling)
|
||||||
if not self.sm['liveLocationKalman'].inputsOK and not model_suppress:
|
if not self.sm['liveLocationKalman'].inputsOK and not model_suppress and not lat_engage_suppress:
|
||||||
self.events.add(EventName.locationdTemporaryError)
|
self.events.add(EventName.locationdTemporaryError)
|
||||||
if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress:
|
if not self.sm['liveParameters'].valid and not TESTING_CLOSET and (not SIMULATION or REPLAY) and not model_suppress and not lat_engage_suppress:
|
||||||
self.events.add(EventName.paramsdTemporaryError)
|
self.events.add(EventName.paramsdTemporaryError)
|
||||||
|
|
||||||
# conservative HW alert. if the data or frequency are off, locationd will throw an error
|
# conservative HW alert. if the data or frequency are off, locationd will throw an error
|
||||||
@@ -702,9 +700,10 @@ class Controls:
|
|||||||
|
|
||||||
# CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by
|
# CLEARPILOT: tell modeld early so it can ramp to 20fps, then delay latActive by
|
||||||
# 250ms (5 frames at 20fps) so downstream services stabilize before steering engages.
|
# 250ms (5 frames at 20fps) so downstream services stabilize before steering engages.
|
||||||
self.params_memory.put_bool("LatRequested", lat_requested)
|
|
||||||
now_ts = time.monotonic()
|
now_ts = time.monotonic()
|
||||||
if lat_requested and not self.prev_lat_requested:
|
if lat_requested != self.prev_lat_requested:
|
||||||
|
self.params_memory.put_bool("LatRequested", lat_requested)
|
||||||
|
if lat_requested:
|
||||||
self.lat_requested_ts = now_ts
|
self.lat_requested_ts = now_ts
|
||||||
self.prev_lat_requested = lat_requested
|
self.prev_lat_requested = lat_requested
|
||||||
CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25
|
CC.latActive = lat_requested and (now_ts - self.lat_requested_ts) >= 0.25
|
||||||
@@ -722,11 +721,13 @@ class Controls:
|
|||||||
CC.leftBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.left
|
CC.leftBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.left
|
||||||
CC.rightBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.right
|
CC.rightBlinker = model_v2.meta.laneChangeDirection == LaneChangeDirection.right
|
||||||
|
|
||||||
if model_v2.meta.laneChangeState == LaneChangeState.laneChangeStarting and clearpilot_disable_lat_on_lane_change:
|
no_lat_lane_change = model_v2.meta.laneChangeState == LaneChangeState.laneChangeStarting and clearpilot_disable_lat_on_lane_change
|
||||||
|
if no_lat_lane_change:
|
||||||
CC.latActive = False
|
CC.latActive = False
|
||||||
self.params_memory.put_bool("no_lat_lane_change", True)
|
# CLEARPILOT: only write on change — per-cycle atomic writes were eating ~15% CPU
|
||||||
else:
|
if no_lat_lane_change != self.prev_no_lat_lane_change:
|
||||||
self.params_memory.put_bool("no_lat_lane_change", False)
|
self.params_memory.put_bool("no_lat_lane_change", no_lat_lane_change)
|
||||||
|
self.prev_no_lat_lane_change = no_lat_lane_change
|
||||||
|
|
||||||
if CS.leftBlinker or CS.rightBlinker:
|
if CS.leftBlinker or CS.rightBlinker:
|
||||||
self.last_blinker_frame = self.sm.frame
|
self.last_blinker_frame = self.sm.frame
|
||||||
@@ -1006,6 +1007,11 @@ class Controls:
|
|||||||
while True:
|
while True:
|
||||||
self.step()
|
self.step()
|
||||||
self.rk.monitor_time()
|
self.rk.monitor_time()
|
||||||
|
# CLEARPILOT: accumulated debt from startup/blocking calls never recovers.
|
||||||
|
# Reset the rate keeper when catastrophically behind — prevents a one-time
|
||||||
|
# ~8s fingerprinting stall from poisoning the lag metric for the whole session.
|
||||||
|
if self.rk.remaining < -1.0:
|
||||||
|
self.rk._next_frame_time = time.monotonic() + self.rk._interval
|
||||||
except SystemExit:
|
except SystemExit:
|
||||||
e.set()
|
e.set()
|
||||||
t.join()
|
t.join()
|
||||||
@@ -1076,13 +1082,11 @@ class Controls:
|
|||||||
elif self.sm['modelV2'].meta.turnDirection == Desire.turnRight:
|
elif self.sm['modelV2'].meta.turnDirection == Desire.turnRight:
|
||||||
self.events.add(EventName.turningRight)
|
self.events.add(EventName.turningRight)
|
||||||
|
|
||||||
if os.path.isfile(os.path.join(sentry.CRASHES_DIR, 'error.txt')) and self.crashed_timer < 10:
|
# CLEARPILOT: check crash file once per second, not every cycle (stat syscall at 100Hz hurts)
|
||||||
|
if self.sm.frame % 100 == 0:
|
||||||
|
self._crashed_file_exists = os.path.isfile(os.path.join(sentry.CRASHES_DIR, 'error.txt'))
|
||||||
|
if getattr(self, '_crashed_file_exists', False) and self.crashed_timer < 10:
|
||||||
self.events.add(EventName.openpilotCrashed)
|
self.events.add(EventName.openpilotCrashed)
|
||||||
|
|
||||||
# if self.random_events and not self.openpilot_crashed_triggered:
|
|
||||||
# self.events.add(EventName.openpilotCrashedRandomEvents)
|
|
||||||
# self.openpilot_crashed_triggered = True
|
|
||||||
|
|
||||||
self.crashed_timer += DT_CTRL
|
self.crashed_timer += DT_CTRL
|
||||||
|
|
||||||
# if self.speed_limit_alert or self.speed_limit_confirmation:
|
# if self.speed_limit_alert or self.speed_limit_confirmation:
|
||||||
|
|||||||
Reference in New Issue
Block a user