test(meter): add fault-count regression test for meter diagnosis

2026-03-16 16:32:21 +01:00
parent 3e9259735e
commit 99aae76404
1 changed files with 301 additions and 0 deletions
@@ -0,0 +1,301 @@
+/**
+ * @file test_meter_fault_count.cpp
+ * @brief Unit test: verifies that the meter fault counter increments once per
+ *        stale-data event, NOT once per catch-up tick.
+ *
+ * Regression test for the ~200 errors/hour bug where LoRa TX blocking caused
+ * the sampling catch-up loop to fire note_fault() for every missed 1s tick.
+ *
+ * Run on target with: pio test -e lilygo-t3-v1-6-1-test -f test_meter_fault_count
+ */
+
+#include <Arduino.h>
+#include <unity.h>
+
+#include "data_model.h"
+
+// ---------- Minimal stubs replicating the fixed fault-counting logic ----------
+
+// File-scope fault state; reset_test_faults() puts every field back to these values.
+static FaultCounters test_faults{};
+static FaultType test_last_error{FaultType::None};
+static uint32_t test_last_error_utc{0};
+static uint32_t test_last_error_ms{0};
+
+/**
+ * Minimal stub of the fault recorder used by the simulations in this file.
+ *
+ * Increments the counter that matches @p type (MeterRead, Decode or LoraTx; any
+ * other value leaves the counters untouched) and stores @p type together with a
+ * timestamp as the most recent error.
+ *
+ * @param counters     Fault counters to update (in/out).
+ * @param last_type    Receives @p type.
+ * @param last_ts_utc  Receives millis() / 1000; in this stub that is seconds since
+ *                     millis() started counting, not wall-clock time.
+ * @param last_ts_ms   Receives millis().
+ * @param type         Fault being recorded.
+ */
+static void note_fault_stub(FaultCounters &counters, FaultType &last_type,
+                             uint32_t &last_ts_utc, uint32_t &last_ts_ms, FaultType type) {
+  // Read the clock once so the seconds and milliseconds values describe the same
+  // instant; two separate millis() calls can straddle a tick and disagree.
+  const uint32_t now_ms = millis();
+
+  switch (type) {
+    case FaultType::MeterRead:
+      counters.meter_read_fail++;
+      break;
+    case FaultType::Decode:
+      counters.decode_fail++;
+      break;
+    case FaultType::LoraTx:
+      counters.lora_tx_fail++;
+      break;
+    default:
+      // Any other fault type has no dedicated counter here.
+      break;
+  }
+
+  last_type = type;
+  last_ts_utc = now_ms / 1000;
+  last_ts_ms = now_ms;
+}
+
+/**
+ * Resets the file-scope fault state: timestamps to 0, last error to
+ * FaultType::None and the counters to a value-initialized FaultCounters.
+ */
+static void reset_test_faults() {
+  test_last_error_ms = 0;
+  test_last_error_utc = 0;
+  test_last_error = FaultType::None;
+  test_faults = FaultCounters{};
+}
+
+// ---------- Simulate the FIXED sampling loop logic ----------
+
+/** Length of one sampling tick in milliseconds, shared by both simulations. */
+static constexpr uint32_t SAMPLE_INTERVAL_MS{1000};
+
+/**
+ * Model of the fixed sender_loop sampling step.
+ *
+ * At most one MeterRead fault is recorded per call, before any catch-up ticks
+ * are processed: a pending time-jump is consumed and counted, otherwise a stale
+ * meter is counted. Consecutive calls with a stale meter therefore each add one
+ * fault; the catch-up loop itself never records faults.
+ *
+ * NOTE(review): the fault is recorded even when no sample tick is due on this
+ * call - confirm that this matches the firmware being modelled.
+ *
+ * @param last_sample_ms     Timestamp of the last processed tick; advanced by
+ *                           SAMPLE_INTERVAL_MS per caught-up tick (in/out).
+ * @param now_ms             Current time in milliseconds.
+ * @param meter_ok           True when the meter snapshot is fresh.
+ * @param time_jump_pending  Pending time-jump flag; always false on return (in/out).
+ * @param faults             Fault counters (in/out).
+ * @return Number of ticks caught up during this call.
+ */
+static uint32_t simulate_fixed_sampling(
+    uint32_t &last_sample_ms, uint32_t now_ms, bool meter_ok,
+    bool &time_jump_pending, FaultCounters &faults) {
+
+  // Last-error bookkeeping is local and thrown away; only `faults` is observable.
+  FaultType discarded_type = FaultType::None;
+  uint32_t discarded_utc = 0;
+  uint32_t discarded_ms = 0;
+
+  // Consume the time-jump flag, remembering whether it was set.
+  const bool had_time_jump = time_jump_pending;
+  time_jump_pending = false;
+
+  // A single fault covers both causes: a time-jump suppresses the stale-meter fault.
+  if (had_time_jump || !meter_ok) {
+    note_fault_stub(faults, discarded_type, discarded_utc, discarded_ms, FaultType::MeterRead);
+  }
+
+  // Catch up every missed tick without touching the fault counters.
+  uint32_t ticks = 0;
+  for (; now_ms - last_sample_ms >= SAMPLE_INTERVAL_MS; last_sample_ms += SAMPLE_INTERVAL_MS) {
+    ++ticks;
+  }
+  return ticks;
+}
+
+/**
+ * Model of the OLD (buggy) sampling step, kept for comparison.
+ *
+ * Faults are recorded inside the catch-up loop: a stale meter adds one
+ * MeterRead fault for every caught-up tick, and a pending time-jump adds one
+ * more on the first tick, where the flag is cleared. When no tick is due,
+ * nothing is recorded and the flag is left untouched.
+ *
+ * @param last_sample_ms     Timestamp of the last processed tick (in/out).
+ * @param now_ms             Current time in milliseconds.
+ * @param meter_ok           True when the meter snapshot is fresh.
+ * @param time_jump_pending  Pending time-jump flag (in/out).
+ * @param faults             Fault counters (in/out).
+ * @return Number of ticks caught up during this call.
+ */
+static uint32_t simulate_buggy_sampling(
+    uint32_t &last_sample_ms, uint32_t now_ms, bool meter_ok,
+    bool &time_jump_pending, FaultCounters &faults) {
+
+  // Last-error bookkeeping is local and thrown away; only `faults` is observable.
+  FaultType discarded_type = FaultType::None;
+  uint32_t discarded_utc = 0;
+  uint32_t discarded_ms = 0;
+
+  uint32_t ticks = 0;
+  for (; now_ms - last_sample_ms >= SAMPLE_INTERVAL_MS; ++ticks) {
+    last_sample_ms += SAMPLE_INTERVAL_MS;
+
+    // The bug: the stale-meter fault is repeated for every caught-up tick.
+    if (!meter_ok) {
+      note_fault_stub(faults, discarded_type, discarded_utc, discarded_ms, FaultType::MeterRead);
+    }
+    if (time_jump_pending) {
+      time_jump_pending = false;
+      note_fault_stub(faults, discarded_type, discarded_utc, discarded_ms, FaultType::MeterRead);
+    }
+  }
+  return ticks;
+}
+
+// ---------- Tests ----------
+
+/**
+ * Normal operation: the meter is fresh and nothing blocks, so every call
+ * catches up exactly one tick and no fault is recorded.
+ */
+static void test_no_fault_when_meter_fresh() {
+  FaultCounters faults = {};
+  uint32_t last_sample_ms = 0;
+  bool time_jump = false;
+
+  // Simulate 60 consecutive 1s ticks with fresh meter data. The index is
+  // uint32_t so that i * 1000 cannot overflow where int is only 16 bits wide.
+  for (uint32_t i = 1; i <= 60; i++) {
+    const uint32_t samples =
+        simulate_fixed_sampling(last_sample_ms, i * 1000, true, time_jump, faults);
+    TEST_ASSERT_EQUAL_UINT32(1, samples);  // Exactly one tick per call.
+  }
+
+  TEST_ASSERT_EQUAL_UINT32(60000, last_sample_ms);
+  TEST_ASSERT_EQUAL_UINT32(0, faults.meter_read_fail);
+  TEST_ASSERT_FALSE(time_jump);
+}
+
+/**
+ * A 10 second stall (LoRa TX) with a stale meter must be reported as a single
+ * fault by the fixed logic, even though ten ticks are caught up.
+ */
+static void test_single_fault_after_blocking_stale() {
+  FaultCounters counters = {};
+  uint32_t last_tick_ms = 0;
+  bool jump_pending = false;
+
+  // Warm-up: five 1s ticks with fresh data record nothing.
+  for (int tick = 1; tick <= 5; ++tick) {
+    simulate_fixed_sampling(last_tick_ms, tick * 1000, true, jump_pending, counters);
+  }
+  TEST_ASSERT_EQUAL_UINT32(0, counters.meter_read_fail);
+
+  // Stall: the next call arrives at 15000 ms while the last tick was at 5000 ms,
+  // so ten ticks are due and the meter snapshot is stale.
+  const uint32_t caught_up =
+      simulate_fixed_sampling(last_tick_ms, 15000, false, jump_pending, counters);
+  TEST_ASSERT_EQUAL_UINT32(10, caught_up);
+  TEST_ASSERT_EQUAL_UINT32(1, counters.meter_read_fail);
+}
+
+/**
+ * Reference check of the OLD behavior: the same 10 second stall with a stale
+ * meter yields ten faults, one per caught-up tick.
+ */
+static void test_buggy_produces_many_faults() {
+  FaultCounters counters = {};
+  uint32_t last_tick_ms = 0;
+  bool jump_pending = false;
+
+  // Warm-up: five 1s ticks with fresh data record nothing.
+  for (int tick = 1; tick <= 5; ++tick) {
+    simulate_buggy_sampling(last_tick_ms, tick * 1000, true, jump_pending, counters);
+  }
+  TEST_ASSERT_EQUAL_UINT32(0, counters.meter_read_fail);
+
+  // One stalled call covering ten ticks inflates the counter to ten.
+  simulate_buggy_sampling(last_tick_ms, 15000, false, jump_pending, counters);
+  TEST_ASSERT_EQUAL_UINT32(10, counters.meter_read_fail);
+}
+
+/**
+ * A pending time-jump combined with a stale meter is reported as one fault in
+ * total, however many ticks are caught up, and the pending flag is cleared.
+ */
+static void test_time_jump_single_fault() {
+  FaultCounters counters = {};
+  uint32_t last_tick_ms = 0;
+  bool jump_pending = true;
+
+  // One call covering eight ticks, with both fault causes present.
+  const uint32_t caught_up =
+      simulate_fixed_sampling(last_tick_ms, 8000, false, jump_pending, counters);
+
+  TEST_ASSERT_EQUAL_UINT32(8, caught_up);
+  // The time-jump is counted; the stale-meter fault is suppressed for this call.
+  TEST_ASSERT_EQUAL_UINT32(1, counters.meter_read_fail);
+  TEST_ASSERT_FALSE(jump_pending);
+}
+
+/**
+ * One hour at 1 Hz with the meter stale on every call and no blocking.
+ *
+ * The fixed logic records one fault per call to the sampling function, so 3600
+ * calls produce exactly 3600 faults. The old logic would give the same number
+ * here because each call covers a single tick; the two only differ when a call
+ * has to catch up more than one tick.
+ */
+static void test_sustained_stale_1hz_no_blocking() {
+  FaultCounters counters = {};
+  uint32_t last_tick_ms = 0;
+  bool jump_pending = false;
+
+  uint32_t now_ms = 0;
+  for (uint32_t call = 0; call < 3600; ++call) {
+    now_ms += 1000;
+    simulate_fixed_sampling(last_tick_ms, now_ms, false, jump_pending, counters);
+  }
+
+  // 3600 separate evaluations, each of which saw a stale meter.
+  TEST_ASSERT_EQUAL_UINT32(3600, counters.meter_read_fail);
+}
+
+/**
+ * Worst case: one hour with the meter permanently stale and the main loop
+ * stalled for 10s out of every 30s (batch TX + ACK).
+ *
+ * Per 30s cycle there are 20 ordinary one-tick calls followed by one call that
+ * catches up 10 ticks; 120 cycles make up the hour.
+ *
+ * OLD:   (20 + 10) faults per cycle x 120 cycles = 3600 faults.
+ * FIXED: (20 + 1) faults per cycle x 120 cycles = 2520 faults, i.e. one per
+ *        call in which the meter was stale.
+ */
+static void test_periodic_blocking_reduces_faults() {
+  constexpr int kCycles = 120;
+  constexpr int kNormalCallsPerCycle = 20;
+  constexpr uint32_t kStallMs = 10000;
+
+  FaultCounters fixed_counters = {};
+  FaultCounters buggy_counters = {};
+  uint32_t fixed_last_ms = 0;
+  uint32_t buggy_last_ms = 0;
+  bool fixed_jump = false;
+  bool buggy_jump = false;
+  uint32_t now_ms = 0;
+
+  // Advances the clock and feeds the same stale-meter call to both models.
+  const auto advance_both = [&](uint32_t delta_ms) {
+    now_ms += delta_ms;
+    simulate_fixed_sampling(fixed_last_ms, now_ms, false, fixed_jump, fixed_counters);
+    simulate_buggy_sampling(buggy_last_ms, now_ms, false, buggy_jump, buggy_counters);
+  };
+
+  for (int cycle = 0; cycle < kCycles; ++cycle) {
+    // 20s of ordinary 1 Hz polling.
+    for (int call = 0; call < kNormalCallsPerCycle; ++call) {
+      advance_both(1000);
+    }
+    // One call after a 10s stall (LoRa TX + ACK).
+    advance_both(kStallMs);
+  }
+
+  // Old logic: 20*120 ordinary + 10*120 catch-up faults.
+  TEST_ASSERT_EQUAL_UINT32(3600, buggy_counters.meter_read_fail);
+  // Fixed logic: 20*120 ordinary + 1*120 catch-up faults.
+  TEST_ASSERT_EQUAL_UINT32(2520, fixed_counters.meter_read_fail);
+  // The fixed logic must report strictly fewer faults.
+  TEST_ASSERT_TRUE(fixed_counters.meter_read_fail < buggy_counters.meter_read_fail);
+}
+
+/**
+ * Realistic hour: the meter is fresh during normal polling and only goes stale
+ * while a LoRa TX stalls the loop.
+ *
+ * 120 cycles of 30s each: 20 one-tick calls with fresh data, then one call
+ * after a 10s stall with a stale meter.
+ *
+ * FIXED: 1 fault per stall  = 120 faults/h.
+ * OLD:   10 faults per stall = 1200 faults/h.
+ */
+static void test_realistic_scenario_mostly_fresh() {
+  constexpr int kCycles = 120;
+  constexpr int kFreshCallsPerCycle = 20;
+  constexpr uint32_t kStallMs = 10000;
+
+  FaultCounters fixed_counters = {};
+  FaultCounters buggy_counters = {};
+  uint32_t fixed_last_ms = 0;
+  uint32_t buggy_last_ms = 0;
+  bool fixed_jump = false;
+  bool buggy_jump = false;
+  uint32_t now_ms = 0;
+
+  // Advances the clock and feeds the same call to both models.
+  const auto advance_both = [&](uint32_t delta_ms, bool meter_fresh) {
+    now_ms += delta_ms;
+    simulate_fixed_sampling(fixed_last_ms, now_ms, meter_fresh, fixed_jump, fixed_counters);
+    simulate_buggy_sampling(buggy_last_ms, now_ms, meter_fresh, buggy_jump, buggy_counters);
+  };
+
+  for (int cycle = 0; cycle < kCycles; ++cycle) {
+    // 20s of polling with fresh meter data.
+    for (int call = 0; call < kFreshCallsPerCycle; ++call) {
+      advance_both(1000, true);
+    }
+    // 10s LoRa stall; the meter snapshot is stale on the next call.
+    advance_both(kStallMs, false);
+  }
+
+  // Fixed logic: nothing while fresh, one fault per stall.
+  TEST_ASSERT_EQUAL_UINT32(120, fixed_counters.meter_read_fail);
+  // Old logic: nothing while fresh, ten faults per stall.
+  TEST_ASSERT_EQUAL_UINT32(1200, buggy_counters.meter_read_fail);
+}
+
+void setup() {  // Runs the whole Unity suite once, in the order listed below.
+  UNITY_BEGIN();
+  RUN_TEST(test_no_fault_when_meter_fresh);          // fresh meter: no faults
+  RUN_TEST(test_single_fault_after_blocking_stale);  // fixed logic: 1 fault for a 10-tick stall
+  RUN_TEST(test_buggy_produces_many_faults);         // old logic: 10 faults for the same stall
+  RUN_TEST(test_time_jump_single_fault);             // time-jump + stale meter: 1 fault
+  RUN_TEST(test_sustained_stale_1hz_no_blocking);    // 3600 stale calls: 3600 faults
+  RUN_TEST(test_periodic_blocking_reduces_faults);   // always stale: 2520 (fixed) vs 3600 (old)
+  RUN_TEST(test_realistic_scenario_mostly_fresh);    // mostly fresh: 120 (fixed) vs 1200 (old)
+  UNITY_END();
+}
+
+void loop() {}  // Intentionally empty: every test is run from setup().