feat: operational status endpoint + reconciler/peer state tracking 📊

- ReconciliationWorker._last_run stores per-pass stats (da_servers_polled,
  zones_in_da/db, orphans_found/queued, hostnames_backfilled/migrated,
  zones_healed, duration_seconds, dry_run flag)
- ReconciliationWorker.get_status() exposes state for API/UI consumption
- _heal_backends() now returns healed count
- PeerSyncWorker.get_peer_status() serialises _peer_health to JSON-safe dict
  (url, healthy, consecutive_failures, last_seen) with summary totals
- WorkerManager tracks dead-letter count; queue_status() now returns nested
  reconciler/peer_sync dicts replacing flat reconciler_alive/peer_syncer_alive
- New GET /status endpoint (StatusAPI) aggregates queue depths, worker liveness,
  reconciler last-run, peer health, and live zone count; computes ok/degraded/error
- .gitignore: exclude .claude/, .vscode/, .env (always local)
- app.yml: add documented datastore section (SQLite default + MySQL commented)
- 164 tests passing (23 new tests added)
This commit is contained in:
2026-02-25 18:51:56 +13:00
parent 0f417da204
commit db60d808de
11 changed files with 491 additions and 19 deletions

View File

@@ -317,3 +317,83 @@ def test_heal_skipped_when_no_registry(delete_queue, patch_connect):
w._reconcile_all()
assert save_queue.empty()
# ---------------------------------------------------------------------------
# get_status — last-run state
# ---------------------------------------------------------------------------
def test_get_status_before_any_run(worker):
status = worker.get_status()
assert status["enabled"] is True
assert status["alive"] is False
assert status["last_run"] == {}
def test_get_status_after_run(worker, patch_connect):
with _patch_da(set()):
worker._reconcile_all()
s = worker.get_status()
assert s["enabled"] is True
lr = s["last_run"]
assert lr["status"] == "ok"
assert "started_at" in lr
assert "completed_at" in lr
assert "duration_seconds" in lr
assert lr["da_servers_polled"] == 1
assert lr["da_servers_unreachable"] == 0
assert lr["dry_run"] is False
def test_get_status_counts_unreachable_server(worker, patch_connect):
with _patch_da(None):
worker._reconcile_all()
lr = worker.get_status()["last_run"]
assert lr["da_servers_polled"] == 1
assert lr["da_servers_unreachable"] == 1
def test_get_status_counts_orphans(worker, delete_queue, patch_connect):
patch_connect.add(
Domain(domain="orphan.com", hostname="da1.example.com", username="admin")
)
patch_connect.commit()
with _patch_da(set()):
worker._reconcile_all()
lr = worker.get_status()["last_run"]
assert lr["orphans_found"] == 1
assert lr["orphans_queued"] == 1
def test_get_status_dry_run_orphans_not_queued_in_stats(dry_run_worker, patch_connect):
patch_connect.add(
Domain(domain="orphan.com", hostname="da1.example.com", username="admin")
)
patch_connect.commit()
with _patch_da(set()):
dry_run_worker._reconcile_all()
lr = dry_run_worker.get_status()["last_run"]
assert lr["dry_run"] is True
assert lr["orphans_found"] == 1
assert lr["orphans_queued"] == 0
def test_get_status_zones_in_db_counted(worker, patch_connect):
for d in ["a.com", "b.com", "c.com"]:
patch_connect.add(Domain(domain=d, hostname="da1.example.com", username="admin"))
patch_connect.commit()
with _patch_da({"a.com", "b.com", "c.com"}):
worker._reconcile_all()
lr = worker.get_status()["last_run"]
assert lr["zones_in_db"] == 3
assert lr["zones_in_da"] == 3
assert lr["orphans_found"] == 0