use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::Path;
use std::time::SystemTime;
use serde::{Deserialize, Serialize};
use crate::document::{DocumentTree, NodeId};
use crate::utils::fingerprint::{Fingerprint, Fingerprinter, NodeFingerprint};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeType {
Added,
Removed,
Modified,
Restructured,
}
impl std::fmt::Display for ChangeType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ChangeType::Added => write!(f, "added"),
ChangeType::Removed => write!(f, "removed"),
ChangeType::Modified => write!(f, "modified"),
ChangeType::Restructured => write!(f, "restructured"),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeChange {
pub node_id: Option<String>,
pub title: String,
pub change_type: ChangeType,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub fingerprint: Option<NodeFingerprint>,
}
impl NodeChange {
pub fn new(node_id: Option<String>, title: String, change_type: ChangeType) -> Self {
Self {
node_id,
title,
change_type,
fingerprint: None,
}
}
pub fn with_fingerprint(mut self, fp: NodeFingerprint) -> Self {
self.fingerprint = Some(fp);
self
}
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChangeSet {
pub added: Vec<NodeChange>,
pub removed: Vec<NodeChange>,
pub modified: Vec<NodeChange>,
pub restructured: Vec<NodeChange>,
}
impl ChangeSet {
pub fn new() -> Self {
Self::default()
}
pub fn is_empty(&self) -> bool {
self.added.is_empty()
&& self.removed.is_empty()
&& self.modified.is_empty()
&& self.restructured.is_empty()
}
pub fn total_changes(&self) -> usize {
self.added.len() + self.removed.len() + self.modified.len() + self.restructured.len()
}
pub fn merge(&mut self, other: ChangeSet) {
self.added.extend(other.added);
self.removed.extend(other.removed);
self.modified.extend(other.modified);
self.restructured.extend(other.restructured);
}
pub fn changed_node_ids(&self) -> Vec<&str> {
let mut ids: Vec<&str> = Vec::new();
for change in &self.added {
if let Some(ref id) = change.node_id {
ids.push(id.as_str());
}
}
for change in &self.modified {
if let Some(ref id) = change.node_id {
ids.push(id.as_str());
}
}
for change in &self.restructured {
if let Some(ref id) = change.node_id {
ids.push(id.as_str());
}
}
ids
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChangeInfo {
pub doc_id: String,
pub content_fp: Fingerprint,
pub node_fingerprints: HashMap<String, NodeFingerprint>,
pub modified_at: chrono::DateTime<chrono::Utc>,
pub processing_version: u32,
}
impl DocumentChangeInfo {
pub fn new(doc_id: &str) -> Self {
Self {
doc_id: doc_id.to_string(),
content_fp: Fingerprint::zero(),
node_fingerprints: HashMap::new(),
modified_at: chrono::Utc::now(),
processing_version: 1,
}
}
pub fn update_from_tree(&mut self, tree: &DocumentTree) {
self.content_fp = compute_tree_fingerprint(tree);
self.node_fingerprints = compute_all_node_fingerprints(tree);
self.modified_at = chrono::Utc::now();
}
}
pub struct ChangeDetector {
content_fps: HashMap<String, Fingerprint>,
node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
mtimes: HashMap<String, SystemTime>,
processing_versions: HashMap<String, u32>,
current_processing_version: u32,
}
impl ChangeDetector {
pub fn new() -> Self {
Self {
content_fps: HashMap::new(),
node_fps: HashMap::new(),
mtimes: HashMap::new(),
processing_versions: HashMap::new(),
current_processing_version: 1,
}
}
pub fn with_processing_version(mut self, version: u32) -> Self {
self.current_processing_version = version;
self
}
fn hash_content(content: &str) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
content.hash(&mut hasher);
hasher.finish()
}
pub fn needs_reindex_by_mtime(&self, doc_id: &str, path: &Path) -> bool {
let Some(recorded_mtime) = self.mtimes.get(doc_id) else {
return true; };
let Ok(metadata) = std::fs::metadata(path) else {
return true; };
let Ok(current_mtime) = metadata.modified() else {
return true;
};
current_mtime > *recorded_mtime
}
pub fn needs_reindex_by_hash(&self, doc_id: &str, content: &str) -> bool {
let current_fp = Fingerprint::from_str(content);
match self.content_fps.get(doc_id) {
Some(recorded_fp) => recorded_fp != ¤t_fp,
None => true,
}
}
pub fn needs_reindex_by_fingerprint(&self, doc_id: &str, new_fp: &Fingerprint) -> bool {
match self.content_fps.get(doc_id) {
Some(recorded_fp) => recorded_fp != new_fp,
None => true,
}
}
pub fn needs_reindex_by_version(&self, doc_id: &str) -> bool {
match self.processing_versions.get(doc_id) {
Some(recorded_version) => *recorded_version < self.current_processing_version,
None => true,
}
}
pub fn record(&mut self, doc_id: &str, content: &str, path: Option<&Path>) {
self.record_with_tree(doc_id, content, None, path);
}
pub fn record_with_tree(
&mut self,
doc_id: &str,
content: &str,
tree: Option<&DocumentTree>,
path: Option<&Path>,
) {
let content_fp = Fingerprint::from_str(content);
self.content_fps.insert(doc_id.to_string(), content_fp);
if let Some(tree) = tree {
let node_fps = compute_all_node_fingerprints(tree);
self.node_fps.insert(doc_id.to_string(), node_fps);
}
if let Some(path) = path {
if let Ok(metadata) = std::fs::metadata(path) {
if let Ok(mtime) = metadata.modified() {
self.mtimes.insert(doc_id.to_string(), mtime);
}
}
}
self.processing_versions
.insert(doc_id.to_string(), self.current_processing_version);
}
pub fn record_change_info(&mut self, info: &DocumentChangeInfo, path: Option<&Path>) {
self.content_fps
.insert(info.doc_id.clone(), info.content_fp);
self.node_fps
.insert(info.doc_id.clone(), info.node_fingerprints.clone());
self.processing_versions
.insert(info.doc_id.clone(), info.processing_version);
if let Some(path) = path {
if let Ok(metadata) = std::fs::metadata(path) {
if let Ok(mtime) = metadata.modified() {
self.mtimes.insert(info.doc_id.clone(), mtime);
}
}
}
}
pub fn detect_changes(&self, old_tree: &DocumentTree, new_tree: &DocumentTree) -> ChangeSet {
let mut changes = ChangeSet::new();
let old_fps = compute_all_node_fingerprints(old_tree);
let new_fps = compute_all_node_fingerprints(new_tree);
let old_by_title: HashMap<String, (String, NodeFingerprint)> = {
let mut map = HashMap::new();
for node_id in old_tree.traverse() {
if let Some(node) = old_tree.get(node_id) {
let key = node
.node_id
.clone()
.unwrap_or_else(|| format!("node_{:?}", node_id.0));
if let Some(fp) = old_fps.get(&key) {
map.insert(node.title.clone(), (key, fp.clone()));
}
}
}
map
};
let new_by_title: HashMap<String, (String, NodeFingerprint)> = {
let mut map = HashMap::new();
for node_id in new_tree.traverse() {
if let Some(node) = new_tree.get(node_id) {
let key = node
.node_id
.clone()
.unwrap_or_else(|| format!("node_{:?}", node_id.0));
if let Some(fp) = new_fps.get(&key) {
map.insert(node.title.clone(), (key, fp.clone()));
}
}
}
map
};
for (title, (node_key, fp)) in &new_by_title {
if !old_by_title.contains_key(title) {
changes.added.push(
NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Added)
.with_fingerprint(fp.clone()),
);
}
}
for (title, (node_key, fp)) in &old_by_title {
if !new_by_title.contains_key(title) {
changes.removed.push(
NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Removed)
.with_fingerprint(fp.clone()),
);
}
}
for (title, (new_key, new_fp)) in &new_by_title {
if let Some((_old_key, old_fp)) = old_by_title.get(title) {
if new_fp.content_changed(old_fp) {
changes.modified.push(
NodeChange::new(Some(new_key.clone()), title.clone(), ChangeType::Modified)
.with_fingerprint(new_fp.clone()),
);
} else if new_fp.subtree_changed(old_fp) {
changes.restructured.push(
NodeChange::new(
Some(new_key.clone()),
title.clone(),
ChangeType::Restructured,
)
.with_fingerprint(new_fp.clone()),
);
}
}
}
changes
}
pub fn get_nodes_needing_reprocess(
&self,
doc_id: &str,
new_tree: &DocumentTree,
) -> Option<Vec<String>> {
let old_fps = self.node_fps.get(doc_id)?;
let new_fps = compute_all_node_fingerprints(new_tree);
let mut needs_reprocess = Vec::new();
if self.needs_reindex_by_version(doc_id) {
return Some(new_fps.keys().cloned().collect());
}
for (node_key, new_fp) in &new_fps {
if let Some(old_fp) = old_fps.get(node_key) {
if new_fp.content_changed(old_fp) || new_fp.subtree_changed(old_fp) {
needs_reprocess.push(node_key.clone());
}
} else {
needs_reprocess.push(node_key.clone());
}
}
Some(needs_reprocess)
}
pub fn clear(&mut self, doc_id: &str) {
self.content_fps.remove(doc_id);
self.node_fps.remove(doc_id);
self.mtimes.remove(doc_id);
self.processing_versions.remove(doc_id);
}
pub fn get_content_fingerprint(&self, doc_id: &str) -> Option<&Fingerprint> {
self.content_fps.get(doc_id)
}
pub fn get_node_fingerprints(&self, doc_id: &str) -> Option<&HashMap<String, NodeFingerprint>> {
self.node_fps.get(doc_id)
}
pub fn to_state(&self) -> ChangeDetectorState {
ChangeDetectorState {
content_fps: self.content_fps.clone(),
node_fps: self.node_fps.clone(),
processing_versions: self.processing_versions.clone(),
}
}
pub fn from_state(state: ChangeDetectorState) -> Self {
Self {
content_fps: state.content_fps,
node_fps: state.node_fps,
mtimes: HashMap::new(),
processing_versions: state.processing_versions,
current_processing_version: 1,
}
}
}
impl Default for ChangeDetector {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChangeDetectorState {
pub content_fps: HashMap<String, Fingerprint>,
pub node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
pub processing_versions: HashMap<String, u32>,
}
pub fn compute_tree_fingerprint(tree: &DocumentTree) -> Fingerprint {
let root_fp = compute_node_fingerprint(tree, tree.root());
root_fp.subtree
}
fn compute_node_content_fp(tree: &DocumentTree, node_id: NodeId) -> Fingerprint {
let node = match tree.get(node_id) {
Some(n) => n,
None => return Fingerprint::zero(),
};
Fingerprinter::new()
.with_str(&node.title)
.with_str(&node.content)
.with_option_str(node.node_id.as_deref())
.into_fingerprint()
}
fn compute_node_fingerprint(tree: &DocumentTree, node_id: NodeId) -> NodeFingerprint {
let node = match tree.get(node_id) {
Some(n) => n,
None => return NodeFingerprint::zero(),
};
let content_fp = Fingerprinter::new()
.with_str(&node.title)
.with_str(&node.content)
.with_option_str(node.node_id.as_deref())
.into_fingerprint();
let children = tree.children(node_id);
if children.is_empty() {
return NodeFingerprint::leaf(content_fp);
}
let mut subtree_fp = Fingerprinter::new();
subtree_fp.write_fingerprint(&content_fp);
for child_id in children {
let child_fp = compute_node_fingerprint(tree, child_id);
subtree_fp.write_fingerprint(&child_fp.subtree);
}
NodeFingerprint::new(content_fp, subtree_fp.into_fingerprint())
}
pub fn compute_all_node_fingerprints(tree: &DocumentTree) -> HashMap<String, NodeFingerprint> {
let mut fingerprints = HashMap::new();
for node_id in tree.traverse() {
if let Some(node) = tree.get(node_id) {
let key = node
.node_id
.clone()
.unwrap_or_else(|| format!("node_{:?}", node_id.0));
let fp = compute_node_fingerprint(tree, node_id);
fingerprints.insert(key, fp);
}
}
fingerprints
}
#[cfg(test)]
mod tests {
use super::*;
use crate::document::DocumentTree;
#[test]
fn test_change_detector_new() {
let detector = ChangeDetector::new();
assert!(detector.content_fps.is_empty());
}
#[test]
fn test_needs_reindex_by_hash() {
let mut detector = ChangeDetector::new();
assert!(detector.needs_reindex_by_hash("doc1", "content"));
detector.record("doc1", "content", None);
assert!(!detector.needs_reindex_by_hash("doc1", "content"));
assert!(detector.needs_reindex_by_hash("doc1", "new content"));
}
#[test]
fn test_detect_changes() {
let detector = ChangeDetector::new();
let mut tree1 = DocumentTree::new("Root", "");
let child1 = tree1.add_child(tree1.root(), "Section 1", "Content 1");
tree1.add_child(tree1.root(), "Section 2", "Content 2");
let mut tree2 = DocumentTree::new("Root", "");
tree2.add_child(tree2.root(), "Section 1", "Content 1"); tree2.add_child(tree2.root(), "Section 2", "Content modified"); tree2.add_child(tree2.root(), "Section 3", "Content 3");
let changes = detector.detect_changes(&tree1, &tree2);
assert!(!changes.is_empty());
assert!(!changes.added.is_empty()); assert!(!changes.modified.is_empty()); }
#[test]
fn test_change_set() {
let mut changes = ChangeSet::new();
assert!(changes.is_empty());
changes.added.push(NodeChange::new(
Some("node1".to_string()),
"Title".to_string(),
ChangeType::Added,
));
assert!(!changes.is_empty());
assert_eq!(changes.total_changes(), 1);
}
#[test]
fn test_processing_version() {
let mut detector = ChangeDetector::new().with_processing_version(2);
detector.record("doc1", "content", None);
assert!(!detector.needs_reindex_by_version("doc1"));
let detector2 = ChangeDetector::new().with_processing_version(3);
assert!(detector2.needs_reindex_by_version("doc1"));
}
#[test]
fn test_node_fingerprint() {
let mut tree = DocumentTree::new("Root", "root content");
let child = tree.add_child(tree.root(), "Child", "child content");
let root_fp = compute_node_fingerprint(&tree, tree.root());
let child_fp = compute_node_fingerprint(&tree, child);
assert_eq!(child_fp.content, child_fp.subtree);
assert_ne!(root_fp.content, root_fp.subtree);
}
#[test]
fn test_fingerprint_serialization() {
let mut detector = ChangeDetector::new();
let mut tree = DocumentTree::new("Root", "content");
tree.add_child(tree.root(), "Section", "section content");
detector.record_with_tree("doc1", "content", Some(&tree), None);
let state = detector.to_state();
let json = serde_json::to_string(&state).unwrap();
let restored: ChangeDetectorState = serde_json::from_str(&json).unwrap();
assert_eq!(state.content_fps, restored.content_fps);
}
}