Skip to main content

fallow_v8_coverage/
lib.rs

1//! V8 `ScriptCoverage` JSON parser and UTF-16 source-offset mapper.
2//!
3//! This is the open-source layer of fallow's runtime-coverage pipeline. It
4//! provides the two things the `fallow` CLI consumes:
5//!
6//! 1. Serde input types for the V8 coverage dump format emitted by
7//!    `node --experimental-test-coverage`, `c8`, the Inspector protocol, or
8//!    any V8 isolate ([`V8CoverageDump`] and friends).
9//! 2. [`LineOffsetTable`], which converts V8 source offsets into 1-indexed
10//!    line / 0-indexed column [`IstanbulPosition`]s.
11//!
12//! ## Offset semantics (load-bearing)
13//!
14//! V8 reports coverage offsets in **UTF-16 code units**, not UTF-8 bytes (V8
15//! strings are UTF-16). Verified against real Node: a function preceded by a
16//! `😀` (2 UTF-16 units / 4 UTF-8 bytes) on the same line is reported at the
17//! UTF-16 offset, not the byte offset. [`LineOffsetTable`] therefore stores
18//! line starts in UTF-16 units. This is the invariant the `line_table_*` tests
19//! pin, and the one a byte-offset implementation gets wrong.
20//!
21//! ## Relationship to `oxc_coverage_v8`
22//!
23//! `oxc_coverage_v8` (in `oxc-coverage-instrument`) solves the inverse problem:
24//! it takes an AST-built Istanbul `FileCoverage` and fills its statement /
25//! function / branch counts by converting Istanbul positions into **byte**
26//! offsets. The two crates are intentionally not consolidated: opposite
27//! directions, opposite unit spaces, and different producers (real Node V8
28//! dumps here vs. an instrumenter-controlled pipeline there). See
29//! `decisions/010-v8-coverage-vs-oxc-coverage-boundary.md`.
30//!
31//! The closed-source cross-reference, combined scoring, hot-path heuristics and
32//! verdict generation live in `fallow-cov` (private) and consume the CLI's
33//! remapped function output via the `fallow-cov-protocol` envelope.
34
35#![forbid(unsafe_code)]
36
37use serde::{Deserialize, Deserializer, Serialize};
38
39// -- V8 input types ---------------------------------------------------------
40
41/// Top-level shape emitted by Node's `NODE_V8_COVERAGE` directory: one file
42/// per worker / process containing a `result` array of [`ScriptCoverage`].
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct V8CoverageDump {
45    /// Per-script coverage entries.
46    pub result: Vec<ScriptCoverage>,
47    /// Optional source-map cache emitted by Node 13+.
48    #[serde(default, rename = "source-map-cache")]
49    pub source_map_cache: Option<serde_json::Value>,
50}
51
52/// V8's per-script coverage record. Field names mirror the V8 inspector
53/// protocol verbatim.
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct ScriptCoverage {
56    /// V8 script identifier.
57    #[serde(rename = "scriptId")]
58    pub script_id: String,
59    /// File URL — typically `file:///abs/path` for Node, `https://…` for
60    /// browsers. Callers normalize to absolute paths before merging.
61    pub url: String,
62    /// One entry per function (including the implicit module-level function).
63    pub functions: Vec<FunctionCoverage>,
64}
65
66/// V8 per-function coverage. Each function carries one or more
67/// [`CoverageRange`]s — block-level for instrumented coverage, function-level
68/// for `--coverage=best-effort`.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct FunctionCoverage {
71    /// Source-as-written function name. Empty for the module-level wrapper
72    /// and anonymous functions.
73    #[serde(rename = "functionName")]
74    pub function_name: String,
75    /// Coverage ranges, UTF-16 code-unit offsets relative to the script's
76    /// source text (see the crate-level "Offset semantics" note).
77    pub ranges: Vec<CoverageRange>,
78    /// True when V8 emitted block-level data for this function (instrumented
79    /// coverage). False when only the outer function range is reliable
80    /// (best-effort / runtime coverage).
81    #[serde(rename = "isBlockCoverage", default)]
82    pub is_block_coverage: bool,
83}
84
85/// A single coverage range. `count == 0` means the range was never hit.
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct CoverageRange {
88    /// Inclusive UTF-16 code-unit offset into the script's source.
89    #[serde(rename = "startOffset")]
90    pub start_offset: u32,
91    /// Exclusive UTF-16 code-unit offset into the script's source.
92    #[serde(rename = "endOffset")]
93    pub end_offset: u32,
94    /// Number of times the range was executed.
95    pub count: u64,
96}
97
98// -- Istanbul position type -------------------------------------------------
99
100/// 1-indexed line + 0-indexed column.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct IstanbulPosition {
103    /// 1-indexed line number.
104    pub line: u32,
105    /// 0-indexed column within the line.
106    ///
107    /// Some real Istanbul producers (including Vitest in certain transforms)
108    /// emit `null` for end columns. We normalize those to `0` at parse time
109    /// so downstream CRAP/prod-coverage consumers can still ingest the file.
110    #[serde(deserialize_with = "deserialize_nullable_u32")]
111    pub column: u32,
112}
113
114fn deserialize_nullable_u32<'de, D>(deserializer: D) -> Result<u32, D::Error>
115where
116    D: Deserializer<'de>,
117{
118    Ok(Option::<u32>::deserialize(deserializer)?.unwrap_or(0))
119}
120
121// -- V8 offset to line/column mapper ---------------------------------------
122
/// Pre-computed line-start table for converting V8 source offsets into
/// Istanbul line/column positions in O(log n) per lookup.
///
/// V8 reports offsets in JavaScript source positions: UTF-16 code units, not
/// UTF-8 bytes. Istanbul columns use the same 0-indexed source-position model,
/// so this table stores line starts in UTF-16 units.
///
/// The source is consumed once at construction; subsequent lookups are
/// allocation-free.
///
/// Tables are only created through the constructors in the `impl` block,
/// each of which seeds the table with a line start of `0`, so a constructed
/// table always has at least one line.
#[derive(Debug)]
pub struct LineOffsetTable {
    /// UTF-16 offset of the first character of each line. `line_starts[0]`
    /// is always `0` (the start of the file). Entries are pushed in
    /// non-decreasing order, which is what lets lookups search by bisection.
    line_starts: Vec<u32>,
}
138
139impl LineOffsetTable {
140    /// Build a table from the full source text. The source must be UTF-8 with
141    /// LF, CRLF, or CR line endings (mixed endings are tolerated).
142    #[must_use]
143    pub fn from_source(source: &str) -> Self {
144        let mut line_starts = Vec::with_capacity(source.lines().count() + 1);
145        line_starts.push(0);
146        let mut offset = 0u32;
147        let mut chars = source.chars().peekable();
148        while let Some(ch) = chars.next() {
149            match ch {
150                '\n' => {
151                    offset = offset.saturating_add(1);
152                    line_starts.push(offset);
153                }
154                '\r' => {
155                    offset = offset.saturating_add(1);
156                    if chars.peek() == Some(&'\n') {
157                        chars.next();
158                        offset = offset.saturating_add(1);
159                    }
160                    line_starts.push(offset);
161                }
162                _ => offset = offset.saturating_add(ch.len_utf16() as u32),
163            }
164        }
165        Self { line_starts }
166    }
167
168    /// Build a table from V8's `source-map-cache.lineLengths` data.
169    ///
170    /// `lineLengths` are already measured in JavaScript source positions. The
171    /// cache does not carry line-ending widths, so this preserves the existing
172    /// Node fallback behavior and advances one source position between lines.
173    #[must_use]
174    pub fn from_v8_line_lengths(line_lengths: &[u32]) -> Option<Self> {
175        if line_lengths.is_empty() {
176            return None;
177        }
178
179        let mut line_starts = Vec::with_capacity(line_lengths.len());
180        line_starts.push(0);
181        let mut offset = 0u32;
182        for length in line_lengths
183            .iter()
184            .take(line_lengths.len().saturating_sub(1))
185        {
186            offset = offset.saturating_add(*length).saturating_add(1);
187            line_starts.push(offset);
188        }
189        Some(Self { line_starts })
190    }
191
192    /// Convert a V8 source offset to a 1-indexed line + 0-indexed column.
193    ///
194    /// Offsets at or past the end of the source clamp to the last line +
195    /// remaining column.
196    #[must_use]
197    pub fn position(&self, source_offset: u32) -> IstanbulPosition {
198        // Binary search for the last line_start <= source_offset.
199        let line_zero_indexed = match self.line_starts.binary_search(&source_offset) {
200            Ok(exact) => exact,
201            Err(insertion_point) => insertion_point.saturating_sub(1),
202        };
203        let line_start = self.line_starts[line_zero_indexed];
204        IstanbulPosition {
205            line: (line_zero_indexed as u32) + 1,
206            column: source_offset.saturating_sub(line_start),
207        }
208    }
209}
210
// Manual Copy impls: the CLI consumer `.copied()`s `CoverageRange` out of a
// function's `ranges`, and `IstanbulPosition` is a small value type returned by
// `LineOffsetTable::position`. Both structs derive `Clone` (the supertrait
// `Copy` requires) and hold only plain integer fields, so these empty impls
// are all that is needed.
impl Copy for CoverageRange {}
impl Copy for IstanbulPosition {}
216
// Unit tests for the offset mapper and the serde input types, followed by
// property tests (`proptests`) for the mapper's invariants.
#[cfg(test)]
mod tests {
    use super::*;

    /// LF-separated lines: offsets 0, 2 and 5 are the first code units of
    /// lines 1, 2 and 3 respectively.
    #[test]
    fn line_table_handles_lf() {
        let table = LineOffsetTable::from_source("a\nbb\nccc");
        assert_eq!(table.position(0).line, 1);
        assert_eq!(table.position(0).column, 0);
        assert_eq!(table.position(2).line, 2);
        assert_eq!(table.position(2).column, 0);
        assert_eq!(table.position(5).line, 3);
        assert_eq!(table.position(5).column, 0);
    }

    /// A CRLF pair is one line break that occupies two code units, so line 2
    /// starts at offset 3.
    #[test]
    fn line_table_handles_crlf() {
        let table = LineOffsetTable::from_source("a\r\nbb\r\nccc");
        assert_eq!(table.position(3).line, 2);
        assert_eq!(table.position(3).column, 0);
    }

    /// A bare CR (not followed by LF) also ends a line: line 2 starts at
    /// offset 2.
    #[test]
    fn line_table_handles_lone_cr() {
        let table = LineOffsetTable::from_source("a\rbb");
        assert_eq!(table.position(2).line, 2);
        assert_eq!(table.position(2).column, 0);
    }

    /// A multibyte character on the previous line makes the UTF-16 offset of
    /// `function` differ from its byte offset; the UTF-16 offset must still
    /// land on line 2, column 0.
    #[test]
    fn line_table_uses_utf16_offsets_for_non_ascii_source() {
        let source = "const smile = \"😀\";\nfunction after_emoji() {}\n";
        let function_byte_offset = source
            .find("function")
            .expect("test source should contain function");
        let function_v8_offset = source[..function_byte_offset].encode_utf16().count() as u32;

        // Guard: the fixture must actually separate the two unit spaces.
        assert_ne!(function_v8_offset, function_byte_offset as u32);

        let table = LineOffsetTable::from_source(source);
        let pos = table.position(function_v8_offset);

        assert_eq!(pos.line, 2);
        assert_eq!(pos.column, 0);
    }

    /// The discriminating case: a multibyte character and the offset live on the
    /// SAME line, so the UTF-16-vs-byte distinction shows up as the *column*,
    /// not just line counting. A byte-offset implementation would report a
    /// strictly larger column here. `😀` is 2 UTF-16 units / 4 UTF-8 bytes, so
    /// two of them put the byte offset 4 ahead of the V8 (UTF-16) offset. This
    /// mirrors what real Node emits (`Profiler.takePreciseCoverage` reports the
    /// UTF-16 offset, e.g. 18 rather than the byte offset 22 for this shape).
    #[test]
    fn line_table_maps_columns_in_utf16_units_within_a_line() {
        let source = "const e = \"😀😀\"; function f(){}\n";
        let function_byte_offset = source
            .find("function")
            .expect("test source should contain function")
            as u32;
        let function_v8_offset = source[..function_byte_offset as usize]
            .encode_utf16()
            .count() as u32;

        // The fixture must actually exercise the multibyte gap, else a byte
        // implementation would pass this test by accident.
        assert!(
            function_v8_offset < function_byte_offset,
            "fixture must place a multibyte char before the function",
        );

        let table = LineOffsetTable::from_source(source);
        let pos = table.position(function_v8_offset);

        // Line 1 starts at offset 0, so the column equals the V8 (UTF-16)
        // offset. A byte model would report `function_byte_offset` instead.
        assert_eq!(pos.line, 1);
        assert_eq!(pos.column, function_v8_offset);
        assert!(
            pos.column < function_byte_offset,
            "column must be measured in UTF-16 units, not bytes",
        );
    }

    /// With line lengths `[20, 12]`, offset 20 (the separator position after
    /// line 1's 20 units) still belongs to line 1, and offset 21 is the start
    /// of line 2.
    #[test]
    fn line_table_builds_from_v8_line_lengths() {
        let table = LineOffsetTable::from_v8_line_lengths(&[20, 12])
            .expect("line lengths should build table");

        assert_eq!(table.position(20).line, 1);
        assert_eq!(table.position(20).column, 20);
        assert_eq!(table.position(21).line, 2);
        assert_eq!(table.position(21).column, 0);
    }

    /// Offsets past the end of the source stay on the last line; the column
    /// keeps counting from that line's start rather than being capped.
    #[test]
    fn line_table_clamps_past_end() {
        let table = LineOffsetTable::from_source("abc");
        let pos = table.position(100);
        assert_eq!(pos.line, 1);
        assert_eq!(pos.column, 100);
    }

    /// A minimal inspector-shaped JSON document (no `source-map-cache` key)
    /// deserializes into [`V8CoverageDump`] with the camelCase keys mapped
    /// onto the Rust fields.
    #[test]
    fn parse_node_v8_coverage_dump() {
        let raw = serde_json::json!({
            "result": [{
                "scriptId": "42",
                "url": "file:///t/x.js",
                "functions": [{
                    "functionName": "a",
                    "ranges": [{"startOffset": 0, "endOffset": 10, "count": 3}],
                    "isBlockCoverage": false
                }]
            }]
        });
        let dump: V8CoverageDump = serde_json::from_value(raw).unwrap();
        assert_eq!(dump.result.len(), 1);
        assert_eq!(dump.result[0].functions[0].function_name, "a");
    }

    /// Some real Istanbul producers (e.g. Vitest under certain transforms) emit
    /// `null` for end columns. [`IstanbulPosition`] tolerates that via
    /// `deserialize_nullable_u32`, normalizing `null` to `0` so a downstream
    /// consumer deserializing positions does not choke. Pinned directly on the
    /// position type since that is where the custom deserializer lives.
    #[test]
    fn istanbul_position_normalizes_null_column_to_zero() {
        let with_null: IstanbulPosition =
            serde_json::from_value(serde_json::json!({ "line": 76, "column": null })).unwrap();
        assert_eq!(with_null.line, 76);
        assert_eq!(with_null.column, 0);

        // A present, non-null column passes through unchanged.
        let with_value: IstanbulPosition =
            serde_json::from_value(serde_json::json!({ "line": 66, "column": 4 })).unwrap();
        assert_eq!(with_value.column, 4);
    }

    /// Property tests for the UTF-16-offset-to-line/column mapper.
    ///
    /// The `position` mapper backs every Istanbul range fallow emits for runtime
    /// coverage, so its invariants are encoded as properties rather than relying
    /// on hand-picked examples. The line-boundary tests build their input from
    /// known line bodies and join them with a chosen ending, so the expected
    /// offsets are computed independently of the char-walking construction loop.
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        /// A line body drawn from an alphabet that exercises the UTF-16 width
        /// branch: ASCII (1 unit), `€` (1 unit / 3 UTF-8 bytes), and `😀` (a
        /// surrogate pair, 2 units / 4 UTF-8 bytes). Never contains CR or LF, so
        /// the only line breaks are the ones the harness inserts deliberately.
        fn line_body() -> impl Strategy<Value = String> {
            prop::collection::vec(prop::sample::select(vec!['a', 'b', ' ', '€', '😀']), 0..12)
                .prop_map(|chars| chars.into_iter().collect())
        }

        /// UTF-16 length of a `str`, matching the units `LineOffsetTable` stores.
        fn utf16_len(s: &str) -> u32 {
            s.encode_utf16().count() as u32
        }

        proptest! {
            /// `position` is monotonic: a non-decreasing offset never yields an
            /// earlier `(line, column)`. Guards the `binary_search` Err-branch
            /// `saturating_sub(1)` and the saturating column subtraction against
            /// off-by-one regressions, for any source including past-end offsets.
            #[test]
            fn position_is_monotonic_in_offset(
                source in prop::collection::vec(any::<char>(), 0..200)
                    .prop_map(|chars| chars.into_iter().collect::<String>()),
                a in any::<u32>(),
                b in any::<u32>(),
            ) {
                let table = LineOffsetTable::from_source(&source);
                // Order the two arbitrary offsets so `lo <= hi`.
                let (lo, hi) = (a.min(b), a.max(b));
                let p_lo = table.position(lo);
                let p_hi = table.position(hi);
                prop_assert!(p_lo.line >= 1, "line numbers are 1-indexed");
                // Tuple comparison is lexicographic: line first, then column.
                prop_assert!(
                    (p_lo.line, p_lo.column) <= (p_hi.line, p_hi.column),
                    "position({lo}) = {p_lo:?} should not exceed position({hi}) = {p_hi:?}",
                );
            }

            /// Every true line boundary maps back to column 0 on the right line,
            /// and offsets within a line recover their column. Input is assembled
            /// from known bodies + ending, so the expectation is independent of
            /// the mapper's own line-splitting logic.
            #[test]
            fn line_starts_and_columns_round_trip(
                bodies in prop::collection::vec(line_body(), 1..8),
                ending in prop::sample::select(vec!["\n", "\r\n", "\r"]),
            ) {
                let source = bodies.join(ending);
                let table = LineOffsetTable::from_source(&source);
                let ending_units = utf16_len(ending);

                // Expected UTF-16 offset of the current line's first unit.
                let mut line_start = 0u32;
                for (index, body) in bodies.iter().enumerate() {
                    let body_units = utf16_len(body);
                    // Column 0 of each line lands on the line's first offset.
                    let at_start = table.position(line_start);
                    prop_assert_eq!(at_start.line, index as u32 + 1);
                    prop_assert_eq!(at_start.column, 0);
                    // Offsets inside the line (up to its width) recover the column.
                    for column in 0..=body_units {
                        let pos = table.position(line_start + column);
                        prop_assert_eq!(pos.line, index as u32 + 1);
                        prop_assert_eq!(pos.column, column);
                    }
                    line_start += body_units;
                    // `join` only places an ending between bodies, not after the last.
                    if index + 1 < bodies.len() {
                        line_start += ending_units;
                    }
                }
            }

            /// `from_v8_line_lengths` advances one source position per line. The
            /// cumulative line starts are strictly increasing and each maps to
            /// column 0 on its line; offsets within a non-final line recover the
            /// column. Lengths are bounded so the cumulative offset never
            /// saturates, keeping the reference model exact.
            #[test]
            fn v8_line_lengths_build_consistent_table(
                lengths in prop::collection::vec(0u32..1000, 1..20),
            ) {
                let table = LineOffsetTable::from_v8_line_lengths(&lengths)
                    .expect("non-empty lengths build a table");

                // Reconstruct the expected line starts: +1 separator per line.
                let mut starts = vec![0u32];
                let mut acc = 0u32;
                for length in &lengths[..lengths.len() - 1] {
                    acc += length + 1;
                    starts.push(acc);
                }

                let mut previous: Option<u32> = None;
                for (index, &start) in starts.iter().enumerate() {
                    if let Some(prev) = previous {
                        prop_assert!(start > prev, "line starts must strictly increase");
                    }
                    previous = Some(start);

                    let at_start = table.position(start);
                    prop_assert_eq!(at_start.line, index as u32 + 1);
                    prop_assert_eq!(at_start.column, 0);

                    // Within a non-final line the recorded length bounds the columns.
                    if index + 1 < lengths.len() {
                        for column in 0..=lengths[index] {
                            let pos = table.position(start + column);
                            prop_assert_eq!(pos.line, index as u32 + 1);
                            prop_assert_eq!(pos.column, column);
                        }
                    }
                }
            }
        }
    }
}