Skip to main content

crw_diff/
lib.rs

1//! Stateless change-tracking diff engine for CRW monitors.
2//!
3//! Pure, synchronous, no I/O, no LLM. Given the current scrape (markdown +
4//! optionally extracted JSON) and a caller-supplied `previous` snapshot, it
5//! classifies the page (`same` / `changed`), computes the requested diff
6//! surfaces, and returns the current snapshot to persist as the next baseline.
7//!
8//! ## Caller-supplied JSON invariant
9//! `current_json` is the *already-extracted* structured JSON supplied by the
10//! orchestration layer. This crate NEVER extracts JSON itself and does not
11//! depend on `crw-extract` — the LLM/judge live upstream.
12//!
13//! ## Mode-aware hashing
14//! `content_hash` is the normalized-markdown hash in gitDiff/mixed mode, and
15//! the canonicalized tracked-JSON hash in json-only mode. The SaaS store-skip
16//! short-circuit keys off this hash.
17
18pub mod git_diff;
19pub mod json_diff;
20pub mod snapshot;
21
22use crw_core::types::{
23    ChangeDiff, ChangeStatus, ChangeTrackingMode, ChangeTrackingOptions, ChangeTrackingResult,
24    ChangeTrackingSnapshot,
25};
26use serde_json::Value;
27
/// Upper bound on AST change-lines; once exceeded, the diff AST is truncated.
pub const DEFAULT_MAX_DIFF_CHANGES: usize = 5000;

/// Knobs bounding how much output a diff computation may produce.
#[derive(Debug, Clone, Copy)]
pub struct DiffLimits {
    /// Maximum number of change-lines handed to the markdown diff before it
    /// truncates its AST.
    pub max_diff_changes: usize,
}

impl Default for DiffLimits {
    /// Limits initialised from [`DEFAULT_MAX_DIFF_CHANGES`].
    fn default() -> DiffLimits {
        DiffLimits {
            max_diff_changes: DEFAULT_MAX_DIFF_CHANGES,
        }
    }
}
44
45/// Compute change tracking with default limits. See module docs for the
46/// caller-supplied-JSON invariant.
47pub fn compute_change_tracking(
48    opts: &ChangeTrackingOptions,
49    current_markdown: &str,
50    current_json: Option<&Value>,
51    content_type: Option<&str>,
52) -> ChangeTrackingResult {
53    compute_change_tracking_with_limits(
54        opts,
55        current_markdown,
56        current_json,
57        content_type,
58        &DiffLimits::default(),
59    )
60}
61
62/// Compute change tracking with explicit limits.
63pub fn compute_change_tracking_with_limits(
64    opts: &ChangeTrackingOptions,
65    current_markdown: &str,
66    current_json: Option<&Value>,
67    content_type: Option<&str>,
68    limits: &DiffLimits,
69) -> ChangeTrackingResult {
70    let has_git = opts.modes.is_empty() || opts.modes.contains(&ChangeTrackingMode::GitDiff);
71    let has_json = opts.modes.contains(&ChangeTrackingMode::Json);
72    let json_only = has_json && !has_git;
73
74    // ---- Binary / non-text content: hash only, never diff or judge ----
75    if !is_text(content_type) {
76        return binary_result(opts, current_markdown);
77    }
78
79    // ---- Mode-aware current content hash ----
80    let content_hash = if json_only {
81        match current_json {
82            Some(j) => snapshot::hash_json(j),
83            None => snapshot::hash_str(""),
84        }
85    } else {
86        snapshot::hash_markdown(current_markdown)
87    };
88
89    // ---- Build the current snapshot to persist as next baseline ----
90    let current_snapshot = ChangeTrackingSnapshot {
91        markdown: if has_git {
92            Some(current_markdown.to_string())
93        } else {
94            None
95        },
96        json: if has_json {
97            current_json.cloned()
98        } else {
99            None
100        },
101        content_hash: content_hash.clone(),
102        captured_at: None,
103    };
104
105    // ---- First observation: no baseline to diff against ----
106    let Some(previous) = &opts.previous else {
107        return ChangeTrackingResult {
108            status: ChangeStatus::Changed,
109            first_observation: true,
110            content_hash,
111            snapshot: Some(current_snapshot),
112            diff: None,
113            judgment: None,
114            tag: opts.tag.clone(),
115            truncated: false,
116        };
117    };
118
119    // ---- Determine per-surface change ----
120    let prev_md_norm = previous
121        .markdown
122        .as_deref()
123        .map(snapshot::normalize_markdown);
124    let cur_md_norm = snapshot::normalize_markdown(current_markdown);
125    let markdown_changed = has_git
126        && prev_md_norm
127            .as_deref()
128            .map(|p| p != cur_md_norm)
129            .unwrap_or(true);
130
131    let empty_json = Value::Null;
132    let prev_json = previous.json.as_ref().unwrap_or(&empty_json);
133    let cur_json_val = current_json.unwrap_or(&empty_json);
134    let json_changed = has_json && json_diff::changed(prev_json, cur_json_val);
135
136    let changed = (has_git && markdown_changed) || (has_json && json_changed);
137
138    if !changed {
139        return ChangeTrackingResult {
140            status: ChangeStatus::Same,
141            first_observation: false,
142            content_hash,
143            snapshot: Some(current_snapshot),
144            diff: None,
145            judgment: None,
146            tag: opts.tag.clone(),
147            truncated: false,
148        };
149    }
150
151    // ---- Build the diff envelope ----
152    let mut text: Option<String> = None;
153    let mut ast_value: Option<Value> = None;
154    let mut truncated = false;
155
156    if has_git {
157        let g = git_diff::compute(
158            prev_md_norm.as_deref().unwrap_or(""),
159            &cur_md_norm,
160            limits.max_diff_changes,
161        );
162        truncated = g.ast.truncated;
163        text = Some(g.text);
164        // The AST occupies diff.json ONLY in gitDiff-only mode. In mixed mode
165        // the per-field json diff takes diff.json instead (Firecrawl parity).
166        if !has_json {
167            ast_value = Some(serde_json::to_value(&g.ast).unwrap_or(Value::Null));
168        }
169    }
170
171    let json_value: Option<Value> = if has_json {
172        Some(json_diff::compute(prev_json, cur_json_val))
173    } else {
174        None
175    };
176
177    // diff.json: per-field map (json/mixed) wins; else the AST (gitDiff-only).
178    let diff_json = json_value.or(ast_value);
179    let diff = ChangeDiff {
180        text,
181        json: diff_json,
182    };
183
184    ChangeTrackingResult {
185        status: ChangeStatus::Changed,
186        first_observation: false,
187        content_hash,
188        snapshot: Some(current_snapshot),
189        diff: Some(diff),
190        judgment: None,
191        tag: opts.tag.clone(),
192        truncated,
193    }
194}
195
196/// Binary / non-text content path: hash the extracted text for same/changed,
197/// emit no diff. The orchestration layer never judges these pages.
198fn binary_result(opts: &ChangeTrackingOptions, current_text: &str) -> ChangeTrackingResult {
199    let content_hash = snapshot::hash_str(current_text);
200    let snapshot = ChangeTrackingSnapshot {
201        markdown: None,
202        json: None,
203        content_hash: content_hash.clone(),
204        captured_at: None,
205    };
206    match &opts.previous {
207        None => ChangeTrackingResult {
208            status: ChangeStatus::Changed,
209            first_observation: true,
210            content_hash,
211            snapshot: Some(snapshot),
212            diff: None,
213            judgment: None,
214            tag: opts.tag.clone(),
215            truncated: false,
216        },
217        Some(prev) => {
218            let status = if prev.content_hash == content_hash {
219                ChangeStatus::Same
220            } else {
221                ChangeStatus::Changed
222            };
223            ChangeTrackingResult {
224                status,
225                first_observation: false,
226                content_hash,
227                snapshot: Some(snapshot),
228                diff: None,
229                judgment: None,
230                tag: opts.tag.clone(),
231                truncated: false,
232            }
233        }
234    }
235}
236
/// Whether a content type should be treated as diffable text. `None` => assume
/// text (the common HTML→markdown case). Binary types (PDF, images, octet
/// stream, Office documents) are hashed by extracted text only.
///
/// Only the media type itself is inspected: parameters after `;` (charset,
/// name, boundary, ...) are ignored so that e.g. a `name="data.json"`
/// parameter on `application/octet-stream` cannot make it look like text.
/// Matching is case-insensitive and tolerant of surrounding whitespace.
fn is_text(content_type: Option<&str>) -> bool {
    /// Substrings of a media type that mark it as textual.
    const TEXT_MARKERS: [&str; 7] = [
        "json",
        "xml",
        "html",
        "markdown",
        "javascript",
        "csv",
        "yaml",
    ];

    let Some(raw) = content_type else {
        return true;
    };

    // Drop parameters, then normalize whitespace and case.
    let essence = raw
        .split(';')
        .next()
        .unwrap_or(raw)
        .trim()
        .to_ascii_lowercase();

    if essence.starts_with("text/") {
        return true;
    }

    // Office Open XML documents (docx/xlsx/pptx) are ZIP containers whose
    // vendor prefix merely contains "xml"; only the genuine `+xml` parts of
    // that family are text.
    if essence.contains("openxmlformats-officedocument") && !essence.ends_with("+xml") {
        return false;
    }

    TEXT_MARKERS.iter().any(|marker| essence.contains(marker))
}
254
#[cfg(test)]
mod tests {
    use super::*;
    use crw_core::types::ChangeTrackingMode;
    use serde_json::json;

    /// Options carrying only the given modes and baseline; all else unset.
    fn opts(
        modes: Vec<ChangeTrackingMode>,
        previous: Option<ChangeTrackingSnapshot>,
    ) -> ChangeTrackingOptions {
        ChangeTrackingOptions {
            modes,
            previous,
            schema: None,
            prompt: None,
            tag: None,
            content_type: None,
        }
    }

    /// Markdown-only baseline hashed with the markdown hash.
    fn snap_md(md: &str) -> ChangeTrackingSnapshot {
        ChangeTrackingSnapshot {
            markdown: Some(md.to_string()),
            json: None,
            content_hash: snapshot::hash_markdown(md),
            captured_at: None,
        }
    }

    /// JSON-only baseline hashed with the tracked-JSON hash.
    fn snap_json(value: Value) -> ChangeTrackingSnapshot {
        ChangeTrackingSnapshot {
            markdown: None,
            content_hash: snapshot::hash_json(&value),
            json: Some(value),
            captured_at: None,
        }
    }

    #[test]
    fn first_observation_no_previous() {
        let o = opts(vec![ChangeTrackingMode::GitDiff], None);
        let r = compute_change_tracking(&o, "# Hi", None, None);

        assert!(r.first_observation);
        assert_eq!(r.status, ChangeStatus::Changed);
        assert!(r.diff.is_none());
        assert!(r.snapshot.is_some());
    }

    #[test]
    fn identical_markdown_is_same() {
        let baseline = snap_md("# Hi\n\nbody");
        let o = opts(vec![ChangeTrackingMode::GitDiff], Some(baseline));

        let r = compute_change_tracking(&o, "# Hi\n\nbody", None, None);

        assert_eq!(r.status, ChangeStatus::Same);
        assert!(r.diff.is_none());
    }

    #[test]
    fn whitespace_only_change_is_same() {
        let baseline = snap_md("# Hi\n\nbody");
        let o = opts(vec![ChangeTrackingMode::GitDiff], Some(baseline));

        let r = compute_change_tracking(&o, "# Hi   \n\n\n\nbody  \n", None, None);

        assert_eq!(r.status, ChangeStatus::Same);
    }

    #[test]
    fn markdown_change_emits_text_and_ast_in_git_mode() {
        let baseline = snap_md("Starter $19");
        let o = opts(vec![ChangeTrackingMode::GitDiff], Some(baseline));

        let r = compute_change_tracking(&o, "Starter $24", None, None);

        assert_eq!(r.status, ChangeStatus::Changed);
        let diff = r.diff.unwrap();
        assert!(diff.text.unwrap().contains("+Starter $24"));
        // gitDiff-only => diff.json holds the AST (has a `files` array)
        assert!(diff.json.unwrap().get("files").is_some());
    }

    #[test]
    fn json_mode_per_field_diff() {
        let baseline = snap_json(json!({"price": "$19"}));
        let o = opts(vec![ChangeTrackingMode::Json], Some(baseline));
        let cur = json!({"price": "$24"});

        let r = compute_change_tracking(&o, "ignored markdown", Some(&cur), None);

        assert_eq!(r.status, ChangeStatus::Changed);
        let diff = r.diff.unwrap();
        assert!(diff.text.is_none());
        assert_eq!(
            diff.json.unwrap()["price"],
            json!({"previous": "$19", "current": "$24"})
        );
    }

    #[test]
    fn json_mode_same_when_tracked_fields_unchanged_even_if_markdown_differs() {
        let baseline = snap_json(json!({"price": "$19"}));
        let o = opts(vec![ChangeTrackingMode::Json], Some(baseline));
        let cur = json!({"price": "$19"});

        let r = compute_change_tracking(&o, "totally different markdown", Some(&cur), None);

        assert_eq!(r.status, ChangeStatus::Same);
    }

    #[test]
    fn mixed_mode_either_surface_changes() {
        // Mixed baseline: markdown hash, with the tracked JSON stored alongside.
        let baseline = ChangeTrackingSnapshot {
            json: Some(json!({"price": "$19"})),
            ..snap_md("Starter $19")
        };
        let o = opts(
            vec![ChangeTrackingMode::Json, ChangeTrackingMode::GitDiff],
            Some(baseline),
        );
        let cur = json!({"price": "$24"});

        let r = compute_change_tracking(&o, "Starter $24", Some(&cur), None);

        assert_eq!(r.status, ChangeStatus::Changed);
        let diff = r.diff.unwrap();
        // mixed: text present AND diff.json is the per-field map (not the AST)
        assert!(diff.text.is_some());
        assert_eq!(
            diff.json.unwrap()["price"],
            json!({"previous": "$19", "current": "$24"})
        );
    }

    #[test]
    fn binary_content_hashes_no_diff() {
        let baseline = ChangeTrackingSnapshot {
            markdown: None,
            json: None,
            content_hash: snapshot::hash_str("old pdf text"),
            captured_at: None,
        };
        let mut o = opts(vec![ChangeTrackingMode::GitDiff], Some(baseline));
        o.content_type = Some("application/pdf".into());

        let r = compute_change_tracking(&o, "new pdf text", None, Some("application/pdf"));

        assert_eq!(r.status, ChangeStatus::Changed);
        assert!(r.diff.is_none());
    }
}