feat: persistent known_issues tracking in ASN registry

When the same field fails 2+ consecutive audit runs, a known_issue
entry is written into the ASN's registry profile with:
- field name, description of what's wrong
- first_seen / last_seen dates, occurrence count
- last auth vs PC values
- status: open (stays until PeerCortex data matches)

Report shows KNOWN ISSUES section (all open issues across registry).
Issues auto-resolve when the ASN passes, or partially resolve when
individual fields are fixed. Also stores ASN name in registry.
This commit is contained in:
Rene Fichtmueller 2026-03-28 14:02:33 +13:00
parent 87ce2ed36a
commit 711b89a09e

View File

@ -315,6 +315,82 @@ def _audit_asn(asn):
"passed": len(failures) == 0, "passed": len(failures) == 0,
} }
# ─── Known issue tracking ────────────────────────────────────────────────────
def _issue_description(field, auth_val, pc_val):
    """Generate a human-readable description of the data discrepancy.

    Args:
        field: Name of the failing field. The special value "TIMEOUT"
            produces a fixed timeout message and ignores the values.
        auth_val: Value from the authoritative source, or None.
        pc_val: Value returned by PeerCortex, or None.

    Returns:
        A description string. When either value is None, or the two values
        cannot be compared numerically, the generic "Comparison not
        possible" message is returned instead of raising.
    """
    if field == "TIMEOUT":
        return "PeerCortex did not respond within timeout — server may be overloaded"

    not_comparable = f"Comparison not possible (auth={auth_val}, pc={pc_val})"
    if auth_val is None or pc_val is None:
        return not_comparable

    try:
        # Authoritative source has data, PeerCortex reports nothing.
        if auth_val > 0 and (pc_val or 0) == 0:
            return (
                f"PeerCortex returns 0 but authoritative source shows {auth_val}. "
                f"Likely cause: PeeringDB lookup failing in server.js for this ASN "
                f"(net_id resolution or netixlan/netfac query failing)."
            )
        # PeerCortex reports data the authoritative source does not have.
        if (auth_val or 0) == 0 and pc_val > 0:
            return (
                f"PeerCortex returns {pc_val} but authoritative source shows 0. "
                f"Likely cause: PeerCortex using stale cached data or querying wrong endpoint."
            )
        delta = abs(auth_val - pc_val)
        # max(..., 1) avoids division by zero when auth_val is 0 or negative.
        pct = round(delta / max(auth_val, 1) * 100)
    except TypeError:
        # Non-numeric values (e.g. strings) cannot be ordered or subtracted;
        # describing the failure must never crash the audit run.
        return not_comparable

    return (
        f"Mismatch: PeerCortex={pc_val}, authoritative={auth_val} "
        f"(delta={delta}, {pct}% off). Exceeds tolerance."
    )
def _update_known_issues(entry, failures, date):
    """
    Update the known_issues dict for an ASN registry entry (in place).

    A known issue is created when the same field fails in 2+ consecutive runs.
    It stays open (status='open') until the field passes in a future run.
    When the ASN fully passes, all known_issues are cleared in the caller.

    Args:
        entry: Registry entry dict for one ASN; its "consecutive_errors"
            counter must already include the current run.
        failures: List of failure dicts for this run; each is read via the
            keys "field", "auth" and "pc".
        date: Date value stored as first_seen / last_seen / fixed_on.

    Notes:
        * Failures without a "field" key, and the pseudo-fields TIMEOUT and
          EXCEPTION, never become known issues.
        * A previously resolved issue that fails again is re-opened.
        * If the run contains a TIMEOUT/EXCEPTION failure, no issue is
          auto-resolved: the absence of a field failure does not prove the
          field was actually compared and passed.
    """
    if not failures:
        return
    # Only promote to known_issue after 2+ consecutive failures
    # (consecutive_errors was already incremented by caller)
    if entry.get("consecutive_errors", 0) < 2:
        return

    non_field_failures = ("TIMEOUT", "EXCEPTION")
    known = entry.setdefault("known_issues", {})
    inconclusive = any(f.get("field") in non_field_failures for f in failures)

    failing_fields = set()
    for f in failures:
        field = f.get("field")
        if not field or field in non_field_failures:
            continue
        failing_fields.add(field)

        auth_val = f.get("auth")
        pc_val = f.get("pc")
        description = _issue_description(field, auth_val, pc_val)

        issue = known.get(field)
        if issue is None:
            # Create new known issue
            known[field] = {
                "field": field,
                "first_seen": date,
                "last_seen": date,
                "occurrences": 1,
                "status": "open",
                "last_auth": auth_val,
                "last_pc": pc_val,
                "description": description,
            }
            continue

        # Update existing issue; re-open it if it had been marked resolved,
        # otherwise a regression would stay hidden from the report.
        issue["last_seen"] = date
        issue["occurrences"] = issue.get("occurrences", 1) + 1
        issue["last_auth"] = auth_val
        issue["last_pc"] = pc_val
        issue["description"] = description
        issue["status"] = "open"
        issue.pop("fixed_on", None)

    if inconclusive:
        return

    # Resolve issues for fields that are now passing (partial fix). Issues
    # already resolved keep their original fixed_on date.
    for field, issue in known.items():
        if field not in failing_fields and issue.get("status") != "resolved":
            issue["status"] = "resolved"
            issue["fixed_on"] = date
# ─── Main ───────────────────────────────────────────────────────────────────── # ─── Main ─────────────────────────────────────────────────────────────────────
def main(): def main():
REPORTS_DIR.mkdir(parents=True, exist_ok=True) REPORTS_DIR.mkdir(parents=True, exist_ok=True)
@ -379,10 +455,14 @@ def main():
entry["pass_count"] = entry.get("pass_count", 0) + 1 entry["pass_count"] = entry.get("pass_count", 0) + 1
entry["consecutive_errors"] = 0 entry["consecutive_errors"] = 0
entry["last_status"] = "pass" entry["last_status"] = "pass"
# Clear all known_issues when ASN fully passes
entry.pop("known_issues", None)
else: else:
entry["error_count"] = entry.get("error_count", 0) + 1 entry["error_count"] = entry.get("error_count", 0) + 1
entry["consecutive_errors"] = entry.get("consecutive_errors", 0) + 1 entry["consecutive_errors"] = entry.get("consecutive_errors", 0) + 1
entry["last_status"] = "fail" entry["last_status"] = "fail"
# ── Known issues: track persistent failures (2+ consecutive runs) ──
_update_known_issues(entry, r["failures"], date)
entry["last_failures"] = r["failures"] entry["last_failures"] = r["failures"]
@ -390,6 +470,8 @@ def main():
auth = r.get("auth") or {} auth = r.get("auth") or {}
if auth.get("pdb_id"): if auth.get("pdb_id"):
entry["peeringdb_id"] = auth["pdb_id"] entry["peeringdb_id"] = auth["pdb_id"]
if r.get("pc_name"):
entry["name"] = r["pc_name"]
total = len(results) total = len(results)
passed = sum(1 for r in results if r["passed"]) passed = sum(1 for r in results if r["passed"])
@ -456,16 +538,51 @@ def main():
f" {', '.join('AS'+str(a) for a in sorted(absent_asns))}" f" {', '.join('AS'+str(a) for a in sorted(absent_asns))}"
) )
# ── Known issues across entire registry (all ASNs, not just this batch) ───
all_entries = reg["asns"]
open_issues = {
k: v for k, v in all_entries.items()
if v.get("known_issues") and
any(i.get("status") == "open" for i in v["known_issues"].values())
}
if open_issues:
summary_lines.append(
f"\n{'!'*60}\n"
f"KNOWN ISSUES ({len(open_issues)} ASNs with persistent failures)\n"
f"These remain until the data is correct in PeerCortex.\n"
f"{'!'*60}"
)
for k in sorted(open_issues, key=lambda x: int(x)):
v = all_entries[k]
name = v.get("name", "")
streak = v.get("consecutive_errors", 0)
issues = {fld: i for fld, i in v["known_issues"].items()
if i.get("status") == "open"}
summary_lines.append(
f"\n AS{k} {name}"
f" [OPEN — {streak} consecutive failures, "
f"first seen: {next(iter(issues.values()))['first_seen']}]"
)
for fld, i in issues.items():
summary_lines.append(f"{fld}:")
summary_lines.append(f" {i['description']}")
summary_lines.append(
f" auth={i['last_auth']} pc={i['last_pc']} "
f"seen {i['occurrences']}x last: {i['last_seen']}"
)
summary_lines.append("")
else:
summary_lines.append("\nNo persistent known issues — all data is consistent.")
# Overall DB health # Overall DB health
all_entries = reg["asns"] ever_failed = sum(1 for v in all_entries.values() if v.get("error_count", 0) > 0)
ever_failed = sum(1 for v in all_entries.values() if v.get("error_count", 0) > 0) clean_streak = sum(1 for v in all_entries.values()
clean_streak = sum(1 for v in all_entries.values() if v.get("consecutive_errors", 0) == 0 and v.get("last_audited"))
if v.get("consecutive_errors", 0) == 0
and v.get("last_audited"))
summary_lines += [ summary_lines += [
f"\nDATABASE HEALTH:", f"\nDATABASE HEALTH:",
f" Total tracked ASNs : {len(all_entries)}", f" Total tracked ASNs : {len(all_entries)}",
f" Clean streak : {clean_streak} ASNs with 0 consecutive errors", f" Clean streak : {clean_streak} ASNs with 0 consecutive errors",
f" Open known issues : {len(open_issues)} ASNs",
f" Ever had errors : {ever_failed} ASNs", f" Ever had errors : {ever_failed} ASNs",
f"\nReport: {REPORTS_DIR}/{date}.json", f"\nReport: {REPORTS_DIR}/{date}.json",
] ]
@ -486,15 +603,23 @@ def main():
"pdb_absent": no_pdb, "pdb_absent": no_pdb,
"accuracy_pct": accuracy, "accuracy_pct": accuracy,
"pdb_key_active": bool(PEERINGDB_KEY), "pdb_key_active": bool(PEERINGDB_KEY),
"results": [ "known_issues_registry": {
k: v["known_issues"]
for k, v in all_entries.items()
if v.get("known_issues") and
any(i.get("status") == "open" for i in v["known_issues"].values())
},
"results": [
{ {
"asn": r["asn"], "asn": r["asn"],
"name": r.get("pc_name", ""), "name": r.get("pc_name", ""),
"pdb_absent": r["pdb_absent"], "pdb_absent": r.get("pdb_absent", False),
"passed": r["passed"], "pdb_unknown": r.get("pdb_unknown", False),
"failures": r["failures"], "passed": r["passed"],
"auth": {k: v for k, v in (r.get("auth") or {}).items() "failures": r["failures"],
if k not in ("pdb_ok", "ripe_ok")}, "known_issues": all_entries.get(str(r["asn"]), {}).get("known_issues"),
"auth": {fk: fv for fk, fv in (r.get("auth") or {}).items()
if fk not in ("pdb_ok", "ripe_ok")},
} }
for r in results for r in results
], ],