fallow_v8_coverage/lib.rs
1//! V8 `ScriptCoverage` JSON parser and UTF-16 source-offset mapper.
2//!
3//! This is the open-source layer of fallow's runtime-coverage pipeline. It
4//! provides the two things the `fallow` CLI consumes:
5//!
6//! 1. Serde input types for the V8 coverage dump format emitted by
7//! `node --experimental-test-coverage`, `c8`, the Inspector protocol, or
8//! any V8 isolate ([`V8CoverageDump`] and friends).
9//! 2. [`LineOffsetTable`], which converts V8 source offsets into 1-indexed
10//! line / 0-indexed column [`IstanbulPosition`]s.
11//!
12//! ## Offset semantics (load-bearing)
13//!
14//! V8 reports coverage offsets in **UTF-16 code units**, not UTF-8 bytes (V8
15//! strings are UTF-16). Verified against real Node: a function preceded by a
16//! `😀` (2 UTF-16 units / 4 UTF-8 bytes) on the same line is reported at the
17//! UTF-16 offset, not the byte offset. [`LineOffsetTable`] therefore stores
18//! line starts in UTF-16 units. This is the invariant the `line_table_*` tests
19//! pin, and the one a byte-offset implementation gets wrong.
20//!
21//! ## Relationship to `oxc_coverage_v8`
22//!
23//! `oxc_coverage_v8` (in `oxc-coverage-instrument`) solves the inverse problem:
24//! it takes an AST-built Istanbul `FileCoverage` and fills its statement /
25//! function / branch counts by converting Istanbul positions into **byte**
26//! offsets. The two crates are intentionally not consolidated: opposite
27//! directions, opposite unit spaces, and different producers (real Node V8
28//! dumps here vs. an instrumenter-controlled pipeline there). See
29//! `decisions/010-v8-coverage-vs-oxc-coverage-boundary.md`.
30//!
31//! The closed-source cross-reference, combined scoring, hot-path heuristics and
32//! verdict generation live in `fallow-cov` (private) and consume the CLI's
33//! remapped function output via the `fallow-cov-protocol` envelope.
34
35#![forbid(unsafe_code)]
36
37use serde::{Deserialize, Deserializer, Serialize};
38
39// -- V8 input types ---------------------------------------------------------
40
41/// Top-level shape emitted by Node's `NODE_V8_COVERAGE` directory: one file
42/// per worker / process containing a `result` array of [`ScriptCoverage`].
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct V8CoverageDump {
45 /// Per-script coverage entries.
46 pub result: Vec<ScriptCoverage>,
47 /// Optional source-map cache emitted by Node 13+.
48 #[serde(default, rename = "source-map-cache")]
49 pub source_map_cache: Option<serde_json::Value>,
50}
51
52/// V8's per-script coverage record. Field names mirror the V8 inspector
53/// protocol verbatim.
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct ScriptCoverage {
56 /// V8 script identifier.
57 #[serde(rename = "scriptId")]
58 pub script_id: String,
59 /// File URL — typically `file:///abs/path` for Node, `https://…` for
60 /// browsers. Callers normalize to absolute paths before merging.
61 pub url: String,
62 /// One entry per function (including the implicit module-level function).
63 pub functions: Vec<FunctionCoverage>,
64}
65
66/// V8 per-function coverage. Each function carries one or more
67/// [`CoverageRange`]s — block-level for instrumented coverage, function-level
68/// for `--coverage=best-effort`.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct FunctionCoverage {
71 /// Source-as-written function name. Empty for the module-level wrapper
72 /// and anonymous functions.
73 #[serde(rename = "functionName")]
74 pub function_name: String,
75 /// Coverage ranges, UTF-16 code-unit offsets relative to the script's
76 /// source text (see the crate-level "Offset semantics" note).
77 pub ranges: Vec<CoverageRange>,
78 /// True when V8 emitted block-level data for this function (instrumented
79 /// coverage). False when only the outer function range is reliable
80 /// (best-effort / runtime coverage).
81 #[serde(rename = "isBlockCoverage", default)]
82 pub is_block_coverage: bool,
83}
84
85/// A single coverage range. `count == 0` means the range was never hit.
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct CoverageRange {
88 /// Inclusive UTF-16 code-unit offset into the script's source.
89 #[serde(rename = "startOffset")]
90 pub start_offset: u32,
91 /// Exclusive UTF-16 code-unit offset into the script's source.
92 #[serde(rename = "endOffset")]
93 pub end_offset: u32,
94 /// Number of times the range was executed.
95 pub count: u64,
96}
97
98// -- Istanbul position type -------------------------------------------------
99
100/// 1-indexed line + 0-indexed column.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct IstanbulPosition {
103 /// 1-indexed line number.
104 pub line: u32,
105 /// 0-indexed column within the line.
106 ///
107 /// Some real Istanbul producers (including Vitest in certain transforms)
108 /// emit `null` for end columns. We normalize those to `0` at parse time
109 /// so downstream CRAP/prod-coverage consumers can still ingest the file.
110 #[serde(deserialize_with = "deserialize_nullable_u32")]
111 pub column: u32,
112}
113
114fn deserialize_nullable_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
115where
116 D: Deserializer<'de>,
117{
118 Ok(Option::<u32>::deserialize(deserializer)?.unwrap_or(0))
119}
120
121// -- V8 offset to line/column mapper ---------------------------------------
122
/// Line-start lookup table that turns V8 source offsets into Istanbul
/// line/column positions in O(log n) per lookup.
///
/// V8 offsets are JavaScript source positions, i.e. UTF-16 code units rather
/// than UTF-8 bytes. Istanbul columns follow the same 0-indexed
/// source-position model, so the table keeps its line starts in UTF-16 units.
///
/// The source is walked once when the table is built; lookups afterwards do
/// not allocate.
#[derive(Debug)]
pub struct LineOffsetTable {
    /// UTF-16 offset at which each line begins. The first entry is always `0`
    /// (the start of the file).
    line_starts: Vec<u32>,
}
138
139impl LineOffsetTable {
140 /// Build a table from the full source text. The source must be UTF-8 with
141 /// LF, CRLF, or CR line endings (mixed endings are tolerated).
142 #[must_use]
143 pub fn from_source(source: &str) -> Self {
144 let mut line_starts = Vec::with_capacity(source.lines().count() + 1);
145 line_starts.push(0);
146 let mut offset = 0u32;
147 let mut chars = source.chars().peekable();
148 while let Some(ch) = chars.next() {
149 match ch {
150 '\n' => {
151 offset = offset.saturating_add(1);
152 line_starts.push(offset);
153 }
154 '\r' => {
155 offset = offset.saturating_add(1);
156 if chars.peek() == Some(&'\n') {
157 chars.next();
158 offset = offset.saturating_add(1);
159 }
160 line_starts.push(offset);
161 }
162 _ => offset = offset.saturating_add(ch.len_utf16() as u32),
163 }
164 }
165 Self { line_starts }
166 }
167
168 /// Build a table from V8's `source-map-cache.lineLengths` data.
169 ///
170 /// `lineLengths` are already measured in JavaScript source positions. The
171 /// cache does not carry line-ending widths, so this preserves the existing
172 /// Node fallback behavior and advances one source position between lines.
173 #[must_use]
174 pub fn from_v8_line_lengths(line_lengths: &[u32]) -> Option<Self> {
175 if line_lengths.is_empty() {
176 return None;
177 }
178
179 let mut line_starts = Vec::with_capacity(line_lengths.len());
180 line_starts.push(0);
181 let mut offset = 0u32;
182 for length in line_lengths
183 .iter()
184 .take(line_lengths.len().saturating_sub(1))
185 {
186 offset = offset.saturating_add(*length).saturating_add(1);
187 line_starts.push(offset);
188 }
189 Some(Self { line_starts })
190 }
191
192 /// Convert a V8 source offset to a 1-indexed line + 0-indexed column.
193 ///
194 /// Offsets at or past the end of the source clamp to the last line +
195 /// remaining column.
196 #[must_use]
197 pub fn position(&self, source_offset: u32) -> IstanbulPosition {
198 // Binary search for the last line_start <= source_offset.
199 let line_zero_indexed = match self.line_starts.binary_search(&source_offset) {
200 Ok(exact) => exact,
201 Err(insertion_point) => insertion_point.saturating_sub(1),
202 };
203 let line_start = self.line_starts[line_zero_indexed];
204 IstanbulPosition {
205 line: (line_zero_indexed as u32) + 1,
206 column: source_offset.saturating_sub(line_start),
207 }
208 }
209}
210
211// Manual Copy impls: the CLI consumer `.copied()`s `CoverageRange` out of a
212// function's `ranges`, and `IstanbulPosition` is a small value type returned by
213// `LineOffsetTable::position`.
214impl Copy for CoverageRange {}
215impl Copy for IstanbulPosition {}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Look up `offset` in `table` and check both halves of the position.
    #[track_caller]
    fn assert_position(table: &LineOffsetTable, offset: u32, line: u32, column: u32) {
        let found = table.position(offset);
        assert_eq!(found.line, line);
        assert_eq!(found.column, column);
    }

    /// Byte offset and V8 (UTF-16) offset of the first `function` keyword.
    fn function_offsets(source: &str) -> (usize, u32) {
        let byte_offset = source
            .find("function")
            .expect("test source should contain function");
        let v8_offset = source[..byte_offset].encode_utf16().count() as u32;
        (byte_offset, v8_offset)
    }

    #[test]
    fn line_table_handles_lf() {
        let table = LineOffsetTable::from_source("a\nbb\nccc");
        assert_position(&table, 0, 1, 0);
        assert_position(&table, 2, 2, 0);
        assert_position(&table, 5, 3, 0);
    }

    #[test]
    fn line_table_handles_crlf() {
        let table = LineOffsetTable::from_source("a\r\nbb\r\nccc");
        assert_position(&table, 3, 2, 0);
    }

    #[test]
    fn line_table_handles_lone_cr() {
        let table = LineOffsetTable::from_source("a\rbb");
        assert_position(&table, 2, 2, 0);
    }

    #[test]
    fn line_table_uses_utf16_offsets_for_non_ascii_source() {
        let source = "const smile = \"😀\";\nfunction after_emoji() {}\n";
        let (function_byte_offset, function_v8_offset) = function_offsets(source);

        // The emoji must open a gap between the two unit spaces.
        assert_ne!(function_v8_offset, function_byte_offset as u32);

        let table = LineOffsetTable::from_source(source);
        assert_position(&table, function_v8_offset, 2, 0);
    }

    /// The discriminating case: the multibyte characters and the probed offset
    /// share ONE line, so the UTF-16-vs-byte difference surfaces in the
    /// *column* rather than only in line counting. `😀` is 2 UTF-16 units /
    /// 4 UTF-8 bytes, so two of them push the byte offset 4 past the V8
    /// (UTF-16) offset, and a byte-offset implementation would report a
    /// strictly larger column. This mirrors what real Node emits
    /// (`Profiler.takePreciseCoverage` reports the UTF-16 offset, e.g. 18
    /// rather than the byte offset 22 for this shape).
    #[test]
    fn line_table_maps_columns_in_utf16_units_within_a_line() {
        let source = "const e = \"😀😀\"; function f(){}\n";
        let (byte_offset, function_v8_offset) = function_offsets(source);
        let function_byte_offset = byte_offset as u32;

        // Without a real multibyte gap a byte-based mapper would pass this
        // test by accident.
        assert!(
            function_v8_offset < function_byte_offset,
            "fixture must place a multibyte char before the function",
        );

        let table = LineOffsetTable::from_source(source);
        let pos = table.position(function_v8_offset);

        // Line 1 begins at offset 0, so the column is the V8 (UTF-16) offset
        // itself. A byte model would yield `function_byte_offset` instead.
        assert_eq!(pos.line, 1);
        assert_eq!(pos.column, function_v8_offset);
        assert!(
            pos.column < function_byte_offset,
            "column must be measured in UTF-16 units, not bytes",
        );
    }

    #[test]
    fn line_table_builds_from_v8_line_lengths() {
        let table = LineOffsetTable::from_v8_line_lengths(&[20, 12])
            .expect("line lengths should build table");

        assert_position(&table, 20, 1, 20);
        assert_position(&table, 21, 2, 0);
    }

    #[test]
    fn line_table_clamps_past_end() {
        let table = LineOffsetTable::from_source("abc");
        assert_position(&table, 100, 1, 100);
    }

    #[test]
    fn parse_node_v8_coverage_dump() {
        let raw = serde_json::json!({
            "result": [{
                "scriptId": "42",
                "url": "file:///t/x.js",
                "functions": [{
                    "functionName": "a",
                    "ranges": [{"startOffset": 0, "endOffset": 10, "count": 3}],
                    "isBlockCoverage": false
                }]
            }]
        });
        let dump: V8CoverageDump = serde_json::from_value(raw).unwrap();
        assert_eq!(dump.result.len(), 1);
        let script = &dump.result[0];
        assert_eq!(script.functions[0].function_name, "a");
    }

    /// Some real Istanbul producers (e.g. Vitest under certain transforms)
    /// write `null` for end columns. [`IstanbulPosition`] absorbs that through
    /// `deserialize_nullable_u32`, turning `null` into `0` so a downstream
    /// consumer deserializing positions does not choke. The behavior is pinned
    /// on the position type itself because that is where the custom
    /// deserializer is attached.
    #[test]
    fn istanbul_position_normalizes_null_column_to_zero() {
        let null_column = serde_json::json!({ "line": 76, "column": null });
        let with_null: IstanbulPosition = serde_json::from_value(null_column).unwrap();
        assert_eq!(with_null.line, 76);
        assert_eq!(with_null.column, 0);

        let numeric_column = serde_json::json!({ "line": 66, "column": 4 });
        let with_value: IstanbulPosition = serde_json::from_value(numeric_column).unwrap();
        assert_eq!(with_value.column, 4);
    }

    /// Property tests for the UTF-16-offset-to-line/column mapper.
    ///
    /// Every Istanbul range fallow emits for runtime coverage goes through
    /// `position`, so its invariants are stated as properties instead of
    /// resting on hand-picked examples. The line-boundary tests assemble their
    /// input from known line bodies joined with a chosen ending, which keeps
    /// the expected offsets independent of the char-walking construction loop.
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        /// Strategy for one line body. The alphabet covers each UTF-16 width
        /// case: ASCII (1 unit), `€` (1 unit / 3 UTF-8 bytes), and `😀` (a
        /// surrogate pair, 2 units / 4 UTF-8 bytes). CR and LF are excluded,
        /// so the only line breaks are those the harness inserts on purpose.
        fn line_body() -> impl Strategy<Value = String> {
            let alphabet = prop::sample::select(vec!['a', 'b', ' ', '€', '😀']);
            prop::collection::vec(alphabet, 0..12)
                .prop_map(|picked: Vec<char>| picked.iter().collect::<String>())
        }

        /// Width of `s` in UTF-16 code units — the unit `LineOffsetTable` stores.
        fn utf16_len(s: &str) -> u32 {
            s.encode_utf16().count() as u32
        }

        proptest! {
            /// `position` is monotonic: raising the offset never moves the
            /// result to an earlier `(line, column)`. This guards the
            /// line-selection step and the saturating column subtraction
            /// against off-by-one regressions, for arbitrary sources and
            /// including offsets past the end.
            #[test]
            fn position_is_monotonic_in_offset(
                source in prop::collection::vec(any::<char>(), 0..200)
                    .prop_map(|chars| chars.into_iter().collect::<String>()),
                a in any::<u32>(),
                b in any::<u32>(),
            ) {
                let table = LineOffsetTable::from_source(&source);
                let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
                let p_lo = table.position(lo);
                let p_hi = table.position(hi);
                prop_assert!(p_lo.line >= 1, "line numbers are 1-indexed");
                prop_assert!(
                    (p_lo.line, p_lo.column) <= (p_hi.line, p_hi.column),
                    "position({lo}) = {p_lo:?} should not exceed position({hi}) = {p_hi:?}",
                );
            }

            /// Each real line boundary maps to column 0 of the right line, and
            /// offsets inside a line give back their column. The input is
            /// built from known bodies plus an ending, so the expectation does
            /// not depend on the mapper's own line splitting.
            #[test]
            fn line_starts_and_columns_round_trip(
                bodies in prop::collection::vec(line_body(), 1..8),
                ending in prop::sample::select(vec!["\n", "\r\n", "\r"]),
            ) {
                let source = bodies.join(ending);
                let table = LineOffsetTable::from_source(&source);
                let ending_units = utf16_len(ending);

                let mut line_start = 0u32;
                for (index, body) in bodies.iter().enumerate() {
                    let expected_line = index as u32 + 1;
                    let width = utf16_len(body);

                    // The line's first offset is its column 0.
                    let first = table.position(line_start);
                    prop_assert_eq!(first.line, expected_line);
                    prop_assert_eq!(first.column, 0);

                    // Every offset up to the line's width recovers its column.
                    for column in 0..=width {
                        let pos = table.position(line_start + column);
                        prop_assert_eq!(pos.line, expected_line);
                        prop_assert_eq!(pos.column, column);
                    }

                    // Only non-final lines are followed by an ending.
                    let is_last = index + 1 == bodies.len();
                    line_start += if is_last { width } else { width + ending_units };
                }
            }

            /// `from_v8_line_lengths` advances one source position per line.
            /// The cumulative line starts strictly increase and each maps to
            /// column 0 of its line; offsets inside a non-final line recover
            /// the column. Lengths are bounded so the running offset never
            /// saturates, which keeps the reference model exact.
            #[test]
            fn v8_line_lengths_build_consistent_table(
                lengths in prop::collection::vec(0u32..1000, 1..20),
            ) {
                let table = LineOffsetTable::from_v8_line_lengths(&lengths)
                    .expect("non-empty lengths build a table");

                // Reference line starts: each line adds its length plus one
                // separator position.
                let mut starts = vec![0u32];
                starts.extend(lengths[..lengths.len() - 1].iter().scan(0u32, |acc, length| {
                    *acc += length + 1;
                    Some(*acc)
                }));

                for (index, &start) in starts.iter().enumerate() {
                    if index > 0 {
                        prop_assert!(
                            start > starts[index - 1],
                            "line starts must strictly increase"
                        );
                    }

                    let expected_line = index as u32 + 1;
                    let first = table.position(start);
                    prop_assert_eq!(first.line, expected_line);
                    prop_assert_eq!(first.column, 0);

                    // Only a non-final line has a recorded length bounding its
                    // columns.
                    if index + 1 < lengths.len() {
                        for column in 0..=lengths[index] {
                            let pos = table.position(start + column);
                            prop_assert_eq!(pos.line, expected_line);
                            prop_assert_eq!(pos.column, column);
                        }
                    }
                }
            }
        }
    }
}