---
# Roadmap header: project identity and overall methodology.
project: renacer
# Quoted so every loader reads the version and date as strings, matching the
# quoted start_date/target_date values used further down this file.
version: "0.1.0"
start_date: "2025-11-16"
methodology: EXTREME TDD + Spec-Driven Development + Toyota Way
# Project-level objectives, each with its measurable success metrics.
objectives:
  - title: "Best-in-class Rust system call tracer with source correlation"
    description: "Pure Rust strace replacement focusing on Rust binary tracing with DWARF-based source mapping"
    metrics:
      - "2-5x faster than strace on typical workloads"
      - "95%+ DWARF accuracy on opt-level=1"
      - "90%+ test coverage"
      - "80%+ mutation score"
# Delivery milestones toward the 1.0 release, one entry per two-sprint block.
# Dates are quoted so they load as strings, like the dates elsewhere in this file.
milestones:
  - name: "Sprint 1-2: Minimal Viable Tracer (Weeks 1-4)"
    target_date: "2025-12-14"
    objectives:
      - "renacer -- ./hello-world works"
    deliverables:
      - "CLI accepting -- COMMAND"
      - "Ptrace attach to child process (x86_64)"
      - "Intercept write syscall only"
      - "Print: write(1, \"Hello\\n\", 6) = 6"
      - "90%+ test coverage"
    acceptance_criteria:
      - "Compares favorably with strace -e write"
      - "Zero crashes on 100 test programs"
      - "All unit tests pass"
      - "assert_cmd integration tests pass"
    risks:
      - "Ptrace API complexity"
      - "x86_64 register conventions"
  - name: "Sprint 3-4: Full Syscall Coverage (Weeks 5-8)"
    target_date: "2026-01-11"
    objectives:
      - "Trace all syscalls, not just write"
    deliverables:
      - "Syscall number → name resolution (x86_64)"
      - "Decode common args (openat, read, close, mmap)"
      - "Handle process exit gracefully"
    acceptance_criteria:
      - "renacer -- ls -la matches strace structurally"
      - "<2x slowdown vs strace"
      - "90%+ coverage maintained"
    risks:
      - "Syscall argument decoding complexity"
      - "Memory safety in ptrace operations"
  - name: "Sprint 5-6: DWARF Source Correlation (Weeks 9-12)"
    target_date: "2026-02-08"
    objectives:
      - "Show source file:line for Rust binaries"
    deliverables:
      - "ELF + DWARF .debug_line parser (gimli)"
      - "Instruction pointer → source mapping"
      - "Enhanced output with source annotations"
    acceptance_criteria:
      - "95%+ accuracy on opt-level=1"
      - "80%+ accuracy on opt-level=2"
      - "<10% DWARF lookup overhead"
    risks:
      - "DWARF accuracy on optimized code (RISK 1 from spec)"
      - "Performance impact of source lookups"
  - name: "Sprint 7-8: Multi-Architecture (Weeks 13-16)"
    target_date: "2026-03-08"
    objectives:
      - "Add aarch64 support"
    deliverables:
      - "Architecture-specific syscall tables"
      - "Register mapping abstraction"
      - "CI matrix testing (QEMU)"
    acceptance_criteria:
      - "aarch64 tests pass via QEMU"
      - "Code coverage >90%"
    risks:
      - "Cross-platform testing complexity (RISK 5 from spec)"
  # The only milestone carrying status/completion_date/achievements/deferred.
  - name: "Sprint 9-10: Advanced Features & Polish (Weeks 17-20)"
    target_date: "2026-04-05"
    status: "COMPLETED (5/6 features - 83%)"
    completion_date: "2025-11-17"
    objectives:
      - "strace feature parity"
    deliverables:
      - "✅ -p PID attach to running process (COMPLETED)"
      - "⚠️ -f follow forks (infrastructure only - deferred to v0.3.0)"
      - "✅ -e trace=FILE filtering (COMPLETED)"
      - "✅ -c statistics mode (COMPLETED)"
      - "✅ -T timing per syscall (COMPLETED)"
      - "✅ --format json output (COMPLETED)"
    achievements:
      - "24 new integration tests across 5 test suites"
      - "3 new production modules (filter.rs, stats.rs, json_output.rs)"
      - "Hash-based filtering with O(1) lookup"
      - "strace-compatible statistics output"
      - "JSON schema (renacer-json-v1) documented"
      - "TDG score: 92.6/100 (A grade)"
      - "Zero regressions maintained"
    acceptance_criteria:
      - "✅ 90% strace compatibility (achieved for implemented features)"
      - "✅ JSON schema documented (in json_output.rs)"
    risks:
      - "Fork following complexity (MITIGATED: deferred to v0.3.0 with GitHub Issue #2)"
    deferred:
      - "Fork following (-f) requires trace loop refactoring for multi-process tracking"
      - "See GitHub Issue #2: https://github.com/paiml/renacer/issues/2"
  - name: "Sprint 11-12: Hardening & 1.0 Release (Weeks 21-24)"
    target_date: "2026-05-03"
    objectives:
      - "Production-ready 1.0 release"
    deliverables:
      - "90%+ test coverage enforced"
      - "24hr fuzz runs (zero crashes)"
      - "Benchmark suite vs strace"
      - "Complete documentation"
      - "crates.io publication"
    acceptance_criteria:
      - "All quality gates pass"
      - "2-5x faster than strace measured"
      - "Security audit complete"
      - "3+ beta testers validated"
# Project-wide quality gates, grouped by the stage at which they apply.
quality_gates:
  pre_commit:
    - "cargo test --all-features"
    - "cargo clippy -- -D warnings"
    - "pmat analyze tdg (no regressions)"
  pre_release:
    - "coverage ≥90%"
    - "mutation score ≥80%"
    - "all tests pass"
    - "cargo bench (baseline established)"
    - "pmat repo-score ≥80/100"
    - "zero fuzzing crashes (24hr run)"
# Project-level risk register; each entry cites its section in the specification.
risks:
  - risk: "DWARF accuracy in optimized code"
    mitigation: "Test matrix across opt-levels, confidence flagging"
    status: "monitoring"
    reference: "Specification Section 5, Risk 1"
  - risk: "eBPF performance claims (future)"
    mitigation: "Test before claiming, honest benchmarks"
    status: "deferred (post-1.0)"
    reference: "Specification Section 5, Risk 2"
  - risk: "Async runtime brittleness"
    mitigation: "De-prioritized to post-1.0"
    status: "deferred"
    reference: "Specification Section 5, Risk 3"
  - risk: "WASM toolchain heterogeneity"
    mitigation: "Post-1.0, start with Rust-to-WASM only"
    status: "deferred"
    reference: "Specification Section 5, Risk 4"
  - risk: "Multi-architecture complexity"
    mitigation: "x86_64 + aarch64 only for 1.0"
    status: "monitoring"
    reference: "Specification Section 5, Risk 5"
# How each Toyota Way principle named in the methodology is applied here.
toyota_way_principles:
  jidoka: "Automated quality gates, continuous fuzzing, pre-commit hooks"
  andon_cord: "Quality gates block bad code, 90%+ coverage enforced"
  genchi_genbutsu: "Benchmark-driven performance claims, measured overhead"
  kaizen: "2-week sprints with validation, iterative improvement"
  zero_defects: "100% test pass rate, zero clippy warnings, zero fuzzing crashes"
# Planned work after the 1.0 release, keyed by target version.
post_1_0_roadmap:
  v1_1_ebpf_backend:
    sprints: "13-18 (3 months)"
    target: "<5% overhead on production workloads"
    risk: "High - requires extensive validation (Risk 2)"
  v1_2_wasm_analysis:
    sprints: "19-22 (2 months)"
    target: "Parse 1000+ public WASM modules"
    risk: "Medium - toolchain heterogeneity (Risk 4)"
  v1_3_async_support:
    sprints: "23-26 (2 months)"
    target: "Experimental Tokio task attribution"
    risk: "High - brittleness (Risk 3), may be deprecated"
  v1_4_ecosystem:
    # NOTE(review): master ticket RENACER-100 gives the same sprint range as
    # "8 weeks (5 sprints)"; the two durations differ — confirm which is intended.
    sprints: "27-31 (2.5 months)"
    features:
      - "Trueno export (structured traces)"
      - "Ruchy Ruby VM tracing"
      - "OpenTelemetry span export"
# Epic-level tickets. Each entry summarizes scope, gates, risks and success
# criteria for a multi-sprint effort.
master_tickets:
  - ticket_id: "RENACER-100"
    title: "Ruchy End-to-End Tracing Integration (Sprints 27-31)"
    type: "EPIC"
    status: "PLANNED"
    priority: "HIGH"
    specification: "docs/specifications/ruchy-tracing-support.md (v2.0.0)"
    start_date: "2025-12-01"
    target_date: "2026-01-26"
    duration: "8 weeks (5 sprints)"
    executive_summary: |
      Implement production-ready end-to-end tracing infrastructure for the Ruchy ecosystem
      that links transpiler decisions to runtime performance across local, Docker, and Lambda
      environments. This is the foundation for data-driven compiler optimization.
    objectives:
      - "Capture transpiler decisions via memory-mapped file (zero I/O blocking)"
      - "Implement <0.3% overhead runtime tracing with hash-based IDs"
      - "Enable causal trace correlation across 3 environments (local/Docker/Lambda)"
      - "Integrate with OpenTelemetry for industry-standard observability"
      - "Publish academic paper (OOPSLA/PLDI 2026)"
    deliverables:
      - "Transpiler decision capture (memory-mapped file, u64 hashes)"
      - "Runtime tracing with randomized sampling + circuit breaker"
      - "W3C Trace Context propagation + Lambda Extension API"
      - "OTLP exporters + Jaeger/Grafana integration"
      - "Flamegraph generator with Ruby source attribution"
      - "Decision impact dashboard (HTML)"
      - "End-to-end documentation (3 environments)"
      - "Academic paper submission"
    child_sprints:
      - "Sprint 27: Transpiler Decision Tracing (Phase 1)"
      - "Sprint 28: Runtime Tracing with Hash-Based IDs (Phase 2)"
      - "Sprint 29: Cross-Environment Correlation (Phase 3)"
      - "Sprint 30: OpenTelemetry Integration (Phase 4)"
      - "Sprint 31: Documentation and Publication (Phase 5)"
    acceptance_criteria:
      overall:
        - "All 5 child sprints completed with quality gates passed"
        - "<0.3% overhead validated on 3 benchmarks (fib, primes, array_sum)"
        - "End-to-end trace: local transpilation → Docker build → Lambda execution"
        - "Flamegraph accuracy: 95%+ Ruby source attribution"
        - "Lambda flush safety: 1000 invocations, zero data loss"
        - "Academic paper submitted to OOPSLA 2026 or PLDI 2026"
        - "17 peer-reviewed citations documented"
      # One gate per phase; phases map 1:1 onto child sprints 27-31.
      phase_gates:
        phase_1: "Transpiler decision capture (100+ decisions, zero I/O blocking)"
        phase_2: "<0.3% overhead + DoS protection (10M call burst handled)"
        phase_3: "Cross-environment correlation (causal ordering validated)"
        phase_4: "OTLP integration (Jaeger + Grafana Tempo working)"
        phase_5: "Documentation + paper submission"
    quality_gates:
      per_sprint:
        - "85%+ test coverage (enforced via cargo-llvm-cov)"
        - "85%+ mutation score (enforced via cargo-mutants)"
        - "Zero clippy warnings"
        - "pmat analyze tdg ≥85/100"
      epic_completion:
        - "All 5 sprint quality gates passed"
        - "End-to-end integration test passing (local → Docker → Lambda)"
        - "Performance benchmarks documented (<0.3% overhead)"
        - "Security review complete (no secrets in traces)"
        - "Documentation peer-reviewed by 2+ external reviewers"
    risks:
      critical:
        - risk: "Overhead creep above 0.3%"
          mitigation: "Continuous benchmarking, hash-based IDs, circuit breaker"
          status: "monitoring"
        - risk: "Lambda trace data loss"
          mitigation: "Extension API with lifecycle hooks"
          status: "mitigated"
        - risk: "Clock skew correlation errors"
          mitigation: "Causal ordering via span hierarchy"
          status: "mitigated"
      medium:
        - risk: "Hash collision in decision IDs"
          mitigation: "64-bit FNV-1a (2^64 space)"
          # NOTE(review): this entry uses "probability" where every sibling
          # risk uses "status" — confirm consumers handle both keys.
          probability: "low"
        - risk: "Moiré sampling patterns"
          mitigation: "Xorshift RNG"
          status: "mitigated"
        - risk: "DoS via cold function burst"
          mitigation: "10K/sec rate limiter"
          status: "mitigated"
    dependencies:
      upstream:
        - "Renacer Sprint 26 complete (--trace-transpiler-decisions flag) ✅"
        - "RuchyRuchy tracing infrastructure (lock-free buffers) ✅"
      downstream:
        - "Ruchy transpiler team for integration testing"
        - "Ruchy-Docker team for container integration"
        - "Ruchy-Lambda team for Extension API implementation"
    metrics:
      technical:
        - "Overhead: <0.3% with 0.1% sampling"
        - "Trace throughput: 10,000 events/sec max"
        - "Lambda flush rate: 100% (zero data loss)"
        - "Flamegraph accuracy: 95%+ source attribution"
        - "Test coverage: 85%+ maintained"
        - "Mutation score: 85%+ maintained"
      business:
        - "Academic paper submission: 1"
        - "Blog post: 1 (ruchy.dev)"
        - "GitHub stars target: 100+ in first month"
        - "External documentation reviews: 2+"
    toyota_way_principles:
      genchi_genbutsu: "Measure actual production workloads, not synthetic benchmarks"
      jidoka: "Quality gates block bad code, stop-the-line on regressions"
      kaizen: "Iterative improvement across 5 sprints"
      muda: "Zero allocation overhead (hash-based IDs, no strings)"
      zero_defects: "100% test pass, zero data loss, zero clippy warnings"
    academic_foundation:
      citations: 17
      key_papers:
        - "Lozi et al. (EuroSys 2016): Lock-free tracing"
        - "Lamport (CACM 1978): Causal ordering"
        - "Wang et al. (USENIX ATC 2018): Lambda freeze behavior"
        - "Mytkowicz et al. (ASPLOS 2009): Sampling bias"
      novelty: |
        First system to correlate transpiler decisions with runtime performance across
        local, containerized, and serverless environments using causal ordering and
        hash-based zero-allocation instrumentation.
    # Tiers are cumulative: "Above" in each entry refers to the previous tier.
    success_criteria:
      minimum_viable:
        - "All 5 sprints completed"
        - "<0.3% overhead validated"
        - "End-to-end trace correlation working"
      target:
        - "Above + academic paper accepted"
        - "Above + 100+ GitHub stars"
        - "Above + production deployment in Ruchy project"
      stretch:
        - "Above + second paper (performance optimization ML)"
        - "Above + integration with 3rd party observability platforms"
# Detailed plans for Sprints 27-31 (Phases 1-5 of the Ruchy tracing work).
sprints:
  - name: "Sprint 27: Ruchy Transpiler Decision Tracing (Phase 1)"
    sprint_number: 27
    target_date: "2025-12-01"
    status: "PLANNED"
    specification: "docs/specifications/ruchy-tracing-support.md (v2.0.0)"
    objectives:
      - "Integrate Renacer with Ruchy transpiler for decision capture"
    deliverables:
      - "Memory-mapped file output (.ruchy/decisions.msgpack) for transpiler decisions"
      - "Hash-based decision IDs (u64 via FNV-1a)"
      - "Decision manifest generation (JSON sidecar)"
      - "Renacer Sprint 26 integration complete (--trace-transpiler-decisions flag)"
      - "Test suite: 50+ unit tests for decision parsing"
      # NOTE(review): only 4 category names are listed for the "10 decision
      # categories" — confirm whether these are examples or the list is incomplete.
      - "10 decision categories implemented (type_inference, optimization, codegen, stdlib)"
    acceptance_criteria:
      - "Renacer captures 100+ transpiler decisions from Ruchy compilation"
      - "Zero I/O blocking (mmap validated vs stderr baseline)"
      - "Decision manifest maps all u64 hashes to human-readable descriptions"
      - "85%+ test coverage"
      - "85%+ mutation score"
      - "All quality gates pass (clippy, rustfmt, tests)"
    quality_gates:
      - "pmat analyze tdg ≥85/100"
      - "cargo test --all-features (100% pass)"
      - "cargo mutants (≥85% mutation score)"
      - "No performance regression (transpiler compilation time <5% slower)"
    risks:
      - "Ruchy transpiler integration complexity"
      - "Hash collision probability (mitigated: 64-bit hash space = 2^64 combinations)"
    toyota_way:
      genchi_genbutsu: "Measure actual transpiler I/O blocking vs mmap performance"
      jidoka: "Automated quality gates block merge if coverage drops"
      kaizen: "Iterative improvement based on transpiler team feedback"
  - name: "Sprint 28: Runtime Tracing with Hash-Based IDs (Phase 2)"
    sprint_number: 28
    target_date: "2025-12-15"
    status: "PLANNED"
    objectives:
      - "Implement zero-allocation runtime tracing in RuchyRuchy"
    deliverables:
      - "RuchyRuchy TraceEvent extension with DecisionImpact (u64 decision_id, span_id, parent_span_id)"
      - "Xorshift RNG for randomized sampling"
      - "Global rate limiter (10,000 traces/sec circuit breaker)"
      - "Lock-free per-thread trace buffers (existing infrastructure enhancement)"
      - "Generated Rust code with #[cfg(feature = \"trace\")] instrumentation"
      - "Overhead benchmarks: Fibonacci, Prime Sieve, Array Sum"
    acceptance_criteria:
      - "<0.3% overhead with 0.1% sampling rate (hot functions)"
      - "Zero allocation in trace path (validated via flamegraph)"
      - "Circuit breaker prevents DoS (stress test: 10M cold function calls)"
      - "Randomized sampling eliminates Moiré patterns (validated on 16-thread HTTP server)"
      - "85%+ test coverage"
      - "85%+ mutation score"
    quality_gates:
      - "Overhead benchmark: <0.3% on fib(35) = 9,227,465"
      - "DoS stress test: max 10K traces/sec under 10M call burst"
      - "Property-based tests: sampling rate invariants, trace ordering"
    risks:
      - "Overhead creep from instrumentation (mitigated: constant monitoring)"
      - "Sampling bias in production (mitigated: Xorshift RNG + circuit breaker)"
    toyota_way:
      muda: "Eliminate all allocation overhead (u64 IDs, no strings)"
      genchi_genbutsu: "Profile production workloads to validate <0.3% claim"
  - name: "Sprint 29: Cross-Environment Correlation (Phase 3)"
    sprint_number: 29
    target_date: "2025-12-29"
    status: "PLANNED"
    objectives:
      - "Enable causal trace correlation across local/Docker/Lambda"
    deliverables:
      - "W3C Trace Context propagation (TRACEPARENT header/env variable)"
      - "OpenTelemetry span hierarchy (parent → child relationships)"
      - "Ruchy-Docker integration (Docker build + runtime tracing)"
      - "Ruchy-Lambda integration (Lambda Extension API for flush safety)"
      - "Trace merging tool (merge_traces.py)"
      - "End-to-end correlation validation (3 environments)"
    acceptance_criteria:
      - "Single trace_id links local transpilation → Docker build → Lambda execution"
      - "Causal ordering preserved (span hierarchy, NOT timestamps)"
      - "Lambda Extension API prevents trace data loss (100% flush rate)"
      - "Trace merging tool produces unified OpenTelemetry JSON"
      - "85%+ test coverage"
      - "Integration tests: 20+ cross-environment scenarios"
    quality_gates:
      - "End-to-end test: trace local → Docker → Lambda → merged output"
      - "Lambda flush validation: 1000 invocations, zero data loss"
      - "Clock skew test: 10-second skew between environments, correct causal ordering"
    risks:
      - "Lambda environment freeze timing (mitigated: Extension API lifecycle hooks)"
      - "Docker layer caching breaks trace correlation (mitigated: TRACEPARENT in ENV)"
    toyota_way:
      genchi_genbutsu: "Test on real AWS Lambda, not just mocks"
      jidoka: "Stop deployment if trace flush validation fails"
  - name: "Sprint 30: OpenTelemetry Integration (Phase 4)"
    sprint_number: 30
    target_date: "2026-01-12"
    status: "PLANNED"
    objectives:
      - "Industry-standard trace export and visualization"
    deliverables:
      - "OTLP (OpenTelemetry Protocol) exporter for Renacer"
      - "OTLP exporter for RuchyRuchy"
      - "Jaeger integration (Docker Compose example)"
      - "Grafana Tempo integration (Docker Compose example)"
      - "Flamegraph generator with Ruby source annotations"
      - "Decision impact dashboard (HTML + Chart.js)"
    acceptance_criteria:
      - "Traces viewable in Jaeger UI with decision annotations"
      - "Flamegraphs show Ruby source locations (NOT Rust)"
      - "Decision impact dashboard sorts by overhead (ms)"
      - "100% OpenTelemetry spec compliance (OTLP v1.0)"
      # NOTE(review): Zipkin is named here but has no matching integration
      # deliverable in this sprint — confirm scope.
      - "Examples: 3 observability platforms (Jaeger, Grafana Tempo, Zipkin)"
    quality_gates:
      - "OTLP schema validation (opentelemetry-proto)"
      - "Jaeger end-to-end test (trace visible in UI)"
      - "Flamegraph accuracy: 95%+ Ruby source attribution"
    risks:
      - "OpenTelemetry schema evolution (mitigated: lock to v1.0 spec)"
      - "Visualization performance with 10K+ spans (mitigated: sampling)"
    toyota_way:
      genchi_genbutsu: "Validate with real production traces (not synthetic)"
  - name: "Sprint 31: Documentation and Publication (Phase 5)"
    sprint_number: 31
    target_date: "2026-01-26"
    status: "PLANNED"
    objectives:
      - "Public release and academic publication"
    deliverables:
      - "User guide: End-to-end tracing workflows (local, Docker, Lambda)"
      - "Performance tuning guide (sampling rates, overhead analysis)"
      - "Integration guide for each environment"
      - "Academic paper draft (OOPSLA/PLDI submission)"
      - "Blog post with case studies"
      - "Public GitHub repository with examples"
    acceptance_criteria:
      - "Documentation peer-reviewed by 2+ external reviewers"
      - "Reproducible examples for all 3 environments"
      - "Academic paper submitted to conference (OOPSLA 2026 or PLDI 2026)"
      - "Blog post published on ruchy.dev"
      - "100+ GitHub stars in first month (target)"
    quality_gates:
      - "Documentation completeness check (all features documented)"
      - "Example validation (all examples run successfully)"
      - "Academic paper peer review (internal)"
    risks:
      - "Conference acceptance (mitigated: 17 peer-reviewed citations strengthen paper)"
    toyota_way:
      genchi_genbutsu: "Get feedback from real users before final release"
      kaizen: "Continuous improvement based on community feedback"
# Quality gates for the Ruchy tracing work: gates applied to every sprint,
# plus the checks that mark each phase as complete.
ruchy_tracing_quality_gates:
  per_sprint:
    - "85%+ test coverage (enforced via cargo-llvm-cov)"
    - "85%+ mutation score (enforced via cargo-mutants)"
    - "Zero clippy warnings"
    - "pmat analyze tdg ≥85/100"
    - "All integration tests pass"
  phase_milestones:
    phase_1_complete:
      - "Transpiler decision capture validated (100+ decisions)"
      - "Zero I/O blocking (mmap performance validated)"
    phase_2_complete:
      - "<0.3% overhead validated (3 benchmarks)"
      - "DoS protection validated (10M call burst)"
    phase_3_complete:
      - "Cross-environment correlation validated (local → Docker → Lambda)"
      - "Lambda flush safety validated (1000 invocations, zero loss)"
    phase_4_complete:
      - "OTLP export validated (Jaeger, Grafana Tempo)"
      - "Flamegraph accuracy ≥95%"
    phase_5_complete:
      - "Documentation complete and peer-reviewed"
      - "Academic paper submitted"
# Risk register for the Ruchy tracing work; each entry names an owner and the
# relevant section of ruchy-tracing-support.md.
ruchy_tracing_risks:
  - risk: "Overhead creep above 0.3% target"
    mitigation: "Continuous benchmarking, hash-based IDs, randomized sampling"
    status: "monitoring"
    owner: "Performance team"
    reference: "ruchy-tracing-support.md Section 6"
  - risk: "Lambda trace data loss from environment freeze"
    mitigation: "Lambda Extension API with lifecycle hooks"
    status: "mitigated"
    owner: "Lambda integration team"
    reference: "ruchy-tracing-support.md Section 4.4"
  - risk: "Clock skew causing incorrect correlation"
    mitigation: "Causal ordering via span hierarchy (NOT timestamps)"
    status: "mitigated"
    owner: "Cross-environment team"
    reference: "ruchy-tracing-support.md Section 4.1"
  - risk: "Hash collision in decision IDs"
    mitigation: "64-bit hash space (2^64), FNV-1a algorithm"
    status: "low probability"
    owner: "Transpiler team"
    reference: "ruchy-tracing-support.md Section 2.1"
  - risk: "Moiré sampling patterns in multi-threaded workloads"
    mitigation: "Xorshift RNG for randomized sampling"
    status: "mitigated"
    owner: "Runtime team"
    reference: "ruchy-tracing-support.md Section 3.2"
  - risk: "DoS via cold function burst traffic"
    mitigation: "Global rate limiter (10K traces/sec circuit breaker)"
    status: "mitigated"
    owner: "Runtime team"
    reference: "ruchy-tracing-support.md Section 3.2"
# Bibliography backing the Ruchy tracing design (17 entries). Entries whose
# relevance is tagged "(v2.0)" are marked that way in the original data.
ruchy_tracing_citations:
  - citation: "Cooper & Torczon (2011): Engineering a Compiler"
    relevance: "Compiler optimization theory"
  - citation: "Lozi et al. (EuroSys 2016): The Linux Scheduler"
    relevance: "Lock-free tracing (<1% overhead)"
  - citation: "Mytkowicz et al. (ASPLOS 2009): Producing Wrong Data"
    relevance: "Measurement bias, sampling"
  - citation: "Ren et al. (IEEE Micro 2010): Google-Wide Profiling"
    relevance: "Production profiling"
  - citation: "Sigelman et al. (Google 2010): Dapper"
    relevance: "Distributed tracing"
  - citation: "Marr et al. (DLS 2016): Cross-Language Benchmarking"
    relevance: "Benchmarking methodology"
  - citation: "Blackburn et al. (CACM 2008): Wake up and Smell the Coffee"
    relevance: "Benchmark design"
  - citation: "Kalibera & Jones (ISMM 2013): Rigorous Benchmarking"
    relevance: "Statistical rigor"
  - citation: "Gregg (2019): BPF Performance Tools"
    relevance: "eBPF tracing"
  - citation: "Ball & Larus (MICRO 1996): Efficient Path Profiling"
    relevance: "Program instrumentation"
  - citation: "Zhao et al. (SOCC 2017): Log20"
    relevance: "Log placement optimization (v2.0)"
  - citation: "Lattner & Adve (CGO 2004): LLVM"
    relevance: "Compact metadata (v2.0)"
  - citation: "Moseley et al. (MEMOCODE 2006): Loop-Centric Profiling"
    relevance: "Sampling bias (v2.0)"
  - citation: "Dean & Barroso (CACM 2013): The Tail at Scale"
    relevance: "Circuit breakers (v2.0)"
  - citation: "Lamport (CACM 1978): Time, Clocks, and Ordering"
    relevance: "Causal ordering (v2.0)"
  - citation: "Mace et al. (SOSP 2015): Pivot Tracing"
    relevance: "Happen-before relationships (v2.0)"
  - citation: "Wang et al. (USENIX ATC 2018): Serverless Platforms"
    relevance: "Lambda freeze behavior (v2.0)"
# Work tickets: the METRICS-001 epic followed by its two child sprint tickets.
tickets:
  - id: "METRICS-001"
    title: "Core Metrics, Alerting, and Visualization"
    type: "EPIC"
    status: "todo"
    priority: "high"
    specification: "docs/specifications/core-metrics-alerts-viz.md"
    sprints: [56, 57]
    duration: "2 weeks"
    # Literal block scalar: the "- " lines below are part of the text, not a
    # YAML sequence, so they must stay indented under the key.
    description: |
      Extend Renacer from tracing-only to full observability platform with:
      - Metrics collection (Counter, Gauge, Histogram - OTLP compatible)
      - Alerting engine (threshold, rate, absence, anomaly-based)
      - Enhanced TUI visualization (metrics panels, alert status)
    objectives:
      - "Counter/Gauge/Histogram with SIMD-accelerated histograms (trueno)"
      - "OTLP metrics export (extend existing trace exporter)"
      - "Alert rule DSL in renacer.toml"
      - "Real-time TUI metrics dashboard"
      - "100-point Popper falsification QA checklist"
    deliverables:
      sprint_56:
        - "src/metrics/counter.rs - Atomic counter"
        - "src/metrics/gauge.rs - Atomic gauge"
        - "src/metrics/histogram.rs - SIMD bucket search"
        - "src/metrics/registry.rs - Thread-safe registration"
        - "src/otlp_exporter.rs - Extended for metrics"
        - "CLI: renacer --metrics, renacer metrics list"
      sprint_57:
        - "src/alerting/engine.rs - Alert evaluation loop"
        - "src/alerting/rule.rs - Alert rule parsing"
        - "src/alerting/state.rs - Pending/Firing/Resolved"
        - "src/visualize/panels/metrics.rs - TUI sparklines"
        - "src/visualize/panels/alerts.rs - Alert status"
        - "renacer.toml alerting configuration"
    acceptance_criteria:
      - "Counter.inc() <50ns (p99)"
      - "Histogram.observe() <200ns with SIMD"
      - "Alert fires within 100ms of threshold breach"
      - "TUI renders 50 metrics at 30fps"
      - "100/100 Popper QA checklist items pass"
    quality_gates:
      - "95%+ test coverage"
      - "85%+ mutation score"
      - "Zero clippy warnings"
      - "pmat rust-project-score ≥85/106"
      - "All 100 Popper falsification tests pass"
    peer_reviewed_citations:
      - "Dapper (Google 2010): Ring buffer pattern"
      - "Prometheus (SoundCloud 2012): Counter/Gauge/Histogram types"
      - "Linux perf_event (2008): Per-CPU ring buffers"
      - "Borgmon (Google 2003): Alert rule evaluation"
      - "Isolation Forest (Liu 2008): Anomaly detection"
    linux_kernel_patterns:
      - "kernel/events/ring_buffer.c: Lock-free per-CPU buffers"
      - "kernel/watchdog_perf.c: Threshold-based alerting"
    prometheus_patterns:
      - "model/histogram: Exponential bucket boundaries"
      - "Label cardinality control: Allowlist enforcement"
      - "Recording rules: Pre-computed aggregations"
    toyota_way:
      heijunka: "Lock-free ring buffer prevents I/O blocking"
      jidoka: "Automated alert evaluation, quality gates"
      genchi_genbutsu: "Real production workload validation"
      muda: "SIMD acceleration eliminates CPU waste"
  - id: "METRICS-001-SPRINT56"
    title: "Core Metrics + OTLP Export"
    type: "sprint"
    status: "todo"
    priority: "high"
    parent: "METRICS-001"
    sprint: 56
    specification: "docs/specifications/core-metrics-alerts-viz.md#sprint-56"
  # Blocked until the Sprint 56 ticket is done (see blocked_by).
  - id: "METRICS-001-SPRINT57"
    title: "Alerting + Visualization"
    type: "sprint"
    status: "blocked"
    blocked_by: "METRICS-001-SPRINT56"
    priority: "high"
    parent: "METRICS-001"
    sprint: 57
    specification: "docs/specifications/core-metrics-alerts-viz.md#sprint-57"