fallow_v8_coverage/lib.rs
1//! V8 `ScriptCoverage` JSON parser and UTF-16 source-offset mapper.
2//!
3//! This is the open-source layer of fallow's runtime-coverage pipeline. It
4//! provides the two things the `fallow` CLI consumes:
5//!
6//! 1. Serde input types for the V8 coverage dump format emitted by
7//! `node --experimental-test-coverage`, `c8`, the Inspector protocol, or
8//! any V8 isolate ([`V8CoverageDump`] and friends).
9//! 2. [`LineOffsetTable`], which converts V8 source offsets into 1-indexed
10//! line / 0-indexed column [`IstanbulPosition`]s.
11//!
12//! ## Offset semantics (load-bearing)
13//!
14//! V8 reports coverage offsets in **UTF-16 code units**, not UTF-8 bytes (V8
15//! strings are UTF-16). Verified against real Node: a function preceded by a
16//! `😀` (2 UTF-16 units / 4 UTF-8 bytes) on the same line is reported at the
17//! UTF-16 offset, not the byte offset. [`LineOffsetTable`] therefore stores
18//! line starts in UTF-16 units. This is the invariant the `line_table_*` tests
19//! pin, and the one a byte-offset implementation gets wrong.
20//!
21//! ## Relationship to `oxc_coverage_v8`
22//!
23//! `oxc_coverage_v8` (in `oxc-coverage-instrument`) solves the inverse problem:
24//! it takes an AST-built Istanbul `FileCoverage` and fills its statement /
25//! function / branch counts by converting Istanbul positions into **byte**
26//! offsets. The two crates are intentionally not consolidated: opposite
27//! directions, opposite unit spaces, and different producers (real Node V8
28//! dumps here vs. an instrumenter-controlled pipeline there). See
29//! `decisions/010-v8-coverage-vs-oxc-coverage-boundary.md`.
30//!
31//! The closed-source cross-reference, combined scoring, hot-path heuristics and
32//! verdict generation live in `fallow-cov` (private) and consume the CLI's
33//! remapped function output via the `fallow-cov-protocol` envelope.
34
35#![forbid(unsafe_code)]
36#![cfg_attr(
37 test,
38 allow(
39 clippy::unwrap_used,
40 clippy::expect_used,
41 reason = "tests use unwrap and expect to keep fixture setup concise"
42 )
43)]
44
45use serde::{Deserialize, Deserializer, Serialize};
46
47/// Top-level shape emitted by Node's `NODE_V8_COVERAGE` directory: one file
48/// per worker / process containing a `result` array of [`ScriptCoverage`].
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct V8CoverageDump {
51 /// Per-script coverage entries.
52 pub result: Vec<ScriptCoverage>,
53 /// Optional source-map cache emitted by Node 13+.
54 #[serde(default, rename = "source-map-cache")]
55 pub source_map_cache: Option<serde_json::Value>,
56}
57
58/// V8's per-script coverage record. Field names mirror the V8 inspector
59/// protocol verbatim.
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ScriptCoverage {
62 /// V8 script identifier.
63 #[serde(rename = "scriptId")]
64 pub script_id: String,
65 /// File URL — typically `file:///abs/path` for Node, `https://…` for
66 /// browsers. Callers normalize to absolute paths before merging.
67 pub url: String,
68 /// One entry per function (including the implicit module-level function).
69 pub functions: Vec<FunctionCoverage>,
70}
71
72/// V8 per-function coverage. Each function carries one or more
73/// [`CoverageRange`]s — block-level for instrumented coverage, function-level
74/// for `--coverage=best-effort`.
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct FunctionCoverage {
77 /// Source-as-written function name. Empty for the module-level wrapper
78 /// and anonymous functions.
79 #[serde(rename = "functionName")]
80 pub function_name: String,
81 /// Coverage ranges, UTF-16 code-unit offsets relative to the script's
82 /// source text (see the crate-level "Offset semantics" note).
83 pub ranges: Vec<CoverageRange>,
84 /// True when V8 emitted block-level data for this function (instrumented
85 /// coverage). False when only the outer function range is reliable
86 /// (best-effort / runtime coverage).
87 #[serde(rename = "isBlockCoverage", default)]
88 pub is_block_coverage: bool,
89}
90
91/// A single coverage range. `count == 0` means the range was never hit.
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct CoverageRange {
94 /// Inclusive UTF-16 code-unit offset into the script's source.
95 #[serde(rename = "startOffset")]
96 pub start_offset: u32,
97 /// Exclusive UTF-16 code-unit offset into the script's source.
98 #[serde(rename = "endOffset")]
99 pub end_offset: u32,
100 /// Number of times the range was executed.
101 pub count: u64,
102}
103
104/// 1-indexed line + 0-indexed column.
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct IstanbulPosition {
107 /// 1-indexed line number.
108 pub line: u32,
109 /// 0-indexed column within the line.
110 ///
111 /// Some real Istanbul producers (including Vitest in certain transforms)
112 /// emit `null` for end columns. We normalize those to `0` at parse time
113 /// so downstream CRAP/prod-coverage consumers can still ingest the file.
114 #[serde(deserialize_with = "deserialize_nullable_u32")]
115 pub column: u32,
116}
117
118fn deserialize_nullable_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
119where
120 D: Deserializer<'de>,
121{
122 Ok(Option::<u32>::deserialize(deserializer)?.unwrap_or(0))
123}
124
/// Sorted table of line starts used to turn a V8 source offset into an
/// Istanbul line/column pair with a binary search (O(log n) per lookup).
///
/// V8 offsets are JavaScript source positions — UTF-16 code units rather than
/// UTF-8 bytes — and Istanbul columns follow the same 0-indexed model, so
/// every entry in the table is a UTF-16 offset.
///
/// The table is built once from the source (or from V8's line lengths);
/// lookups afterwards only read it and never allocate.
#[derive(Debug)]
pub struct LineOffsetTable {
    /// UTF-16 offset at which each line begins, in line order. The first
    /// entry is always `0`, the start of the file, so the table is never
    /// empty.
    line_starts: Vec<u32>,
}
140
141impl LineOffsetTable {
142 /// Build a table from the full source text. The source must be UTF-8 with
143 /// LF, CRLF, or CR line endings (mixed endings are tolerated).
144 #[must_use]
145 pub fn from_source(source: &str) -> Self {
146 let mut line_starts = Vec::with_capacity(source.lines().count() + 1);
147 line_starts.push(0);
148 let mut offset = 0u32;
149 let mut chars = source.chars().peekable();
150 while let Some(ch) = chars.next() {
151 match ch {
152 '\n' => {
153 offset = offset.saturating_add(1);
154 line_starts.push(offset);
155 }
156 '\r' => {
157 offset = offset.saturating_add(1);
158 if chars.peek() == Some(&'\n') {
159 chars.next();
160 offset = offset.saturating_add(1);
161 }
162 line_starts.push(offset);
163 }
164 _ => offset = offset.saturating_add(ch.len_utf16() as u32),
165 }
166 }
167 Self { line_starts }
168 }
169
170 /// Build a table from V8's `source-map-cache.lineLengths` data.
171 ///
172 /// `lineLengths` are already measured in JavaScript source positions. The
173 /// cache does not carry line-ending widths, so this preserves the existing
174 /// Node fallback behavior and advances one source position between lines.
175 #[must_use]
176 pub fn from_v8_line_lengths(line_lengths: &[u32]) -> Option<Self> {
177 if line_lengths.is_empty() {
178 return None;
179 }
180
181 let mut line_starts = Vec::with_capacity(line_lengths.len());
182 line_starts.push(0);
183 let mut offset = 0u32;
184 for length in line_lengths
185 .iter()
186 .take(line_lengths.len().saturating_sub(1))
187 {
188 offset = offset.saturating_add(*length).saturating_add(1);
189 line_starts.push(offset);
190 }
191 Some(Self { line_starts })
192 }
193
194 /// Convert a V8 source offset to a 1-indexed line + 0-indexed column.
195 ///
196 /// Offsets at or past the end of the source clamp to the last line +
197 /// remaining column.
198 #[must_use]
199 pub fn position(&self, source_offset: u32) -> IstanbulPosition {
200 let line_zero_indexed = match self.line_starts.binary_search(&source_offset) {
201 Ok(exact) => exact,
202 Err(insertion_point) => insertion_point.saturating_sub(1),
203 };
204 let line_start = self.line_starts[line_zero_indexed];
205 IstanbulPosition {
206 line: (line_zero_indexed as u32) + 1,
207 column: source_offset.saturating_sub(line_start),
208 }
209 }
210}
211
212impl Copy for CoverageRange {}
213impl Copy for IstanbulPosition {}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Look `offset` up in `table` and flatten the answer to a
    /// `(line, column)` pair so a test can pin both coordinates at once.
    fn line_col(table: &LineOffsetTable, offset: u32) -> (u32, u32) {
        let IstanbulPosition { line, column } = table.position(offset);
        (line, column)
    }

    /// LF-terminated lines: each offset right after a `\n` starts a new line.
    #[test]
    fn line_table_handles_lf() {
        let table = LineOffsetTable::from_source("a\nbb\nccc");
        for (offset, expected_line) in [(0, 1), (2, 2), (5, 3)] {
            assert_eq!(line_col(&table, offset), (expected_line, 0));
        }
    }

    /// A CRLF pair is one terminator, so line 2 begins at offset 3.
    #[test]
    fn line_table_handles_crlf() {
        let table = LineOffsetTable::from_source("a\r\nbb\r\nccc");
        assert_eq!(line_col(&table, 3), (2, 0));
    }

    /// A bare CR also ends a line.
    #[test]
    fn line_table_handles_lone_cr() {
        let table = LineOffsetTable::from_source("a\rbb");
        assert_eq!(line_col(&table, 2), (2, 0));
    }

    /// A surrogate-pair character on an earlier line must shift later line
    /// starts by its UTF-16 width, not its UTF-8 width.
    #[test]
    fn line_table_uses_utf16_offsets_for_non_ascii_source() {
        let source = "const smile = \"😀\";\nfunction after_emoji() {}\n";
        let byte_offset = source
            .find("function")
            .expect("test source should contain function");
        let (before_function, _) = source.split_at(byte_offset);
        let v8_offset = before_function.encode_utf16().count() as u32;

        // The fixture is only meaningful if the two unit spaces disagree.
        assert_ne!(v8_offset, byte_offset as u32);

        let table = LineOffsetTable::from_source(source);
        assert_eq!(line_col(&table, v8_offset), (2, 0));
    }

    /// The discriminating case: a multibyte character and the offset live on the
    /// SAME line, so the UTF-16-vs-byte distinction shows up as the *column*,
    /// not just line counting. A byte-offset implementation would report a
    /// strictly larger column here. `😀` is 2 UTF-16 units / 4 UTF-8 bytes, so
    /// two of them put the byte offset 4 ahead of the V8 (UTF-16) offset. This
    /// mirrors what real Node emits (`Profiler.takePreciseCoverage` reports the
    /// UTF-16 offset, e.g. 18 rather than the byte offset 22 for this shape).
    #[test]
    fn line_table_maps_columns_in_utf16_units_within_a_line() {
        let source = "const e = \"😀😀\"; function f(){}\n";
        let byte_index = source
            .find("function")
            .expect("test source should contain function");
        let v8_offset = source[..byte_index].encode_utf16().count() as u32;
        let byte_offset = byte_index as u32;

        assert!(
            v8_offset < byte_offset,
            "fixture must place a multibyte char before the function",
        );

        let pos = LineOffsetTable::from_source(source).position(v8_offset);

        assert_eq!(pos.line, 1);
        assert_eq!(pos.column, v8_offset);
        assert!(
            pos.column < byte_offset,
            "column must be measured in UTF-16 units, not bytes",
        );
    }

    /// Two lines of length 20 and 12: line 2 starts one position past the end
    /// of line 1.
    #[test]
    fn line_table_builds_from_v8_line_lengths() {
        let table = LineOffsetTable::from_v8_line_lengths(&[20, 12])
            .expect("line lengths should build table");

        assert_eq!(line_col(&table, 20), (1, 20));
        assert_eq!(line_col(&table, 21), (2, 0));
    }

    /// Offsets beyond the source stay on the last line.
    #[test]
    fn line_table_clamps_past_end() {
        let table = LineOffsetTable::from_source("abc");
        assert_eq!(line_col(&table, 100), (1, 100));
    }

    /// Smoke test for the serde input types on a minimal Node-shaped dump.
    #[test]
    fn parse_node_v8_coverage_dump() {
        let fixture = serde_json::json!({
            "result": [{
                "scriptId": "42",
                "url": "file:///t/x.js",
                "functions": [{
                    "functionName": "a",
                    "ranges": [{"startOffset": 0, "endOffset": 10, "count": 3}],
                    "isBlockCoverage": false
                }]
            }]
        });
        let parsed: V8CoverageDump = serde_json::from_value(fixture).unwrap();
        assert_eq!(parsed.result.len(), 1);
        let first_function = &parsed.result[0].functions[0];
        assert_eq!(first_function.function_name, "a");
    }

    /// Some real Istanbul producers (e.g. Vitest under certain transforms) emit
    /// `null` for end columns. [`IstanbulPosition`] tolerates that via
    /// `deserialize_nullable_u32`, normalizing `null` to `0` so a downstream
    /// consumer deserializing positions does not choke. Pinned directly on the
    /// position type since that is where the custom deserializer lives.
    #[test]
    fn istanbul_position_normalizes_null_column_to_zero() {
        let null_column = serde_json::json!({ "line": 76, "column": null });
        let with_null: IstanbulPosition = serde_json::from_value(null_column).unwrap();
        assert_eq!(with_null.line, 76);
        assert_eq!(with_null.column, 0);

        let numeric_column = serde_json::json!({ "line": 66, "column": 4 });
        let with_value: IstanbulPosition = serde_json::from_value(numeric_column).unwrap();
        assert_eq!(with_value.column, 4);
    }

    /// Property tests for the UTF-16-offset-to-line/column mapper.
    ///
    /// The `position` mapper backs every Istanbul range fallow emits for runtime
    /// coverage, so its invariants are encoded as properties rather than relying
    /// on hand-picked examples. The line-boundary tests build their input from
    /// known line bodies and join them with a chosen ending, so the expected
    /// offsets are computed independently of the char-walking construction loop.
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        /// A line body drawn from an alphabet that exercises the UTF-16 width
        /// branch: ASCII (1 unit), `€` (1 unit / 3 UTF-8 bytes), and `😀` (a
        /// surrogate pair, 2 units / 4 UTF-8 bytes). Never contains CR or LF, so
        /// the only line breaks are the ones the harness inserts deliberately.
        fn line_body() -> impl Strategy<Value = String> {
            let alphabet = prop::sample::select(vec!['a', 'b', ' ', '€', '😀']);
            prop::collection::vec(alphabet, 0..12)
                .prop_map(|picked| picked.iter().collect::<String>())
        }

        /// UTF-16 length of a `str`, matching the units `LineOffsetTable` stores.
        fn utf16_len(s: &str) -> u32 {
            let units = s.encode_utf16().count();
            units as u32
        }

        proptest! {
            /// `position` is monotonic: a non-decreasing offset never yields an
            /// earlier `(line, column)`. Guards the `binary_search` Err-branch
            /// `saturating_sub(1)` and the saturating column subtraction against
            /// off-by-one regressions, for any source including past-end offsets.
            #[test]
            fn position_is_monotonic_in_offset(
                source in prop::collection::vec(any::<char>(), 0..200)
                    .prop_map(|chars| chars.into_iter().collect::<String>()),
                a in any::<u32>(),
                b in any::<u32>(),
            ) {
                let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
                let table = LineOffsetTable::from_source(&source);
                let p_lo = table.position(lo);
                let p_hi = table.position(hi);
                prop_assert!(p_lo.line >= 1, "line numbers are 1-indexed");
                prop_assert!(
                    (p_lo.line, p_lo.column) <= (p_hi.line, p_hi.column),
                    "position({lo}) = {p_lo:?} should not exceed position({hi}) = {p_hi:?}",
                );
            }

            /// Every true line boundary maps back to column 0 on the right line,
            /// and offsets within a line recover their column. Input is assembled
            /// from known bodies + ending, so the expectation is independent of
            /// the mapper's own line-splitting logic.
            #[test]
            fn line_starts_and_columns_round_trip(
                bodies in prop::collection::vec(line_body(), 1..8),
                ending in prop::sample::select(vec!["\n", "\r\n", "\r"]),
            ) {
                let table = LineOffsetTable::from_source(&bodies.join(ending));
                let ending_units = utf16_len(ending);
                let last_index = bodies.len() - 1;

                let mut expected_start = 0u32;
                for (index, body) in bodies.iter().enumerate() {
                    let expected_line = index as u32 + 1;
                    let body_units = utf16_len(body);

                    let at_start = table.position(expected_start);
                    prop_assert_eq!(at_start.line, expected_line);
                    prop_assert_eq!(at_start.column, 0);

                    // `..=` also probes the offset of the terminator itself,
                    // which still belongs to the line it ends.
                    for column in 0..=body_units {
                        let pos = table.position(expected_start + column);
                        prop_assert_eq!(pos.line, expected_line);
                        prop_assert_eq!(pos.column, column);
                    }

                    // No terminator follows the final body.
                    let separator_units = if index < last_index { ending_units } else { 0 };
                    expected_start += body_units + separator_units;
                }
            }

            /// `from_v8_line_lengths` advances one source position per line. The
            /// cumulative line starts are strictly increasing and each maps to
            /// column 0 on its line; offsets within a non-final line recover the
            /// column. Lengths are bounded so the cumulative offset never
            /// saturates, keeping the reference model exact.
            #[test]
            fn v8_line_lengths_build_consistent_table(
                lengths in prop::collection::vec(0u32..1000, 1..20),
            ) {
                let table = LineOffsetTable::from_v8_line_lengths(&lengths)
                    .expect("non-empty lengths build a table");

                // Reference model: 0, then a running sum of `length + 1` over
                // every line but the last.
                let starts: Vec<u32> = std::iter::once(0u32)
                    .chain(lengths[..lengths.len() - 1].iter().scan(0u32, |acc, &length| {
                        *acc += length + 1;
                        Some(*acc)
                    }))
                    .collect();

                for (index, &start) in starts.iter().enumerate() {
                    if index > 0 {
                        let prev = starts[index - 1];
                        prop_assert!(start > prev, "line starts must strictly increase");
                    }

                    let expected_line = index as u32 + 1;
                    let at_start = table.position(start);
                    prop_assert_eq!(at_start.line, expected_line);
                    prop_assert_eq!(at_start.column, 0);

                    if index + 1 < lengths.len() {
                        for column in 0..=lengths[index] {
                            let pos = table.position(start + column);
                            prop_assert_eq!(pos.line, expected_line);
                            prop_assert_eq!(pos.column, column);
                        }
                    }
                }
            }
        }
    }
}
469}