Skip to main content

infigraph_docs/
lib.rs

1pub mod chunk;
2pub mod embed;
3pub mod extract;
4pub mod search;
5pub mod store;
6pub mod watch;
7
8use std::path::{Path, PathBuf};
9use std::sync::atomic::{AtomicUsize, Ordering};
10
11use anyhow::{Context, Result};
12use rayon::prelude::*;
13use sha2::{Digest, Sha256};
14
15use chunk::{Chunk, ChunkStrategy};
16use extract::ExtractedDoc;
17use store::DocStore;
18
19pub mod links;
20
21pub struct DocIndex {
22    root: PathBuf,
23    db_path: PathBuf,
24    store: Option<DocStore>,
25}
26
/// Summary of one indexing run.
///
/// Plain counters only, so the type is cheap to copy, compare and print.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DocIndexResult {
    /// Number of document files found under the root.
    pub total_files: usize,
    /// Number of files that were new or changed and were (re)indexed this run.
    pub indexed_files: usize,
    /// Number of chunks produced from the (re)indexed files.
    pub total_chunks: usize,
}
32
33impl DocIndex {
34    pub fn open(root: &Path) -> Result<Self> {
35        let tg_dir = root.join(".infigraph");
36        std::fs::create_dir_all(&tg_dir)?;
37        let db_path = tg_dir.join("docs.kuzu");
38        Ok(Self {
39            root: root.to_path_buf(),
40            db_path,
41            store: None,
42        })
43    }
44
45    pub fn init(&mut self) -> Result<()> {
46        let store = DocStore::open(&self.db_path)?;
47        self.store = Some(store);
48        Ok(())
49    }
50
51    pub fn store(&self) -> Option<&DocStore> {
52        self.store.as_ref()
53    }
54
55    pub fn root(&self) -> &Path {
56        &self.root
57    }
58
59    pub fn clean(&mut self) -> Result<()> {
60        self.store = None;
61        let tg_dir = self.root.join(".infigraph");
62        if self.db_path.is_dir() {
63            let _ = std::fs::remove_dir_all(&self.db_path);
64        } else {
65            let _ = std::fs::remove_file(&self.db_path);
66        }
67        let _ = std::fs::remove_file(self.db_path.with_extension("wal"));
68        let _ = std::fs::remove_file(self.db_path.with_extension("lock"));
69        let _ = std::fs::remove_file(tg_dir.join("docs_embeddings.bin"));
70        let _ = std::fs::remove_file(tg_dir.join("docs_hnsw_index.usearch"));
71        let _ = std::fs::remove_file(tg_dir.join("docs_hnsw_index.meta"));
72        infigraph_core::embed::invalidate_embeddings_cache();
73        infigraph_core::embed::invalidate_hnsw_cache();
74        Ok(())
75    }
76
77    pub fn reindex(&mut self) -> Result<DocIndexResult> {
78        self.clean()?;
79        self.init()?;
80        self.index()
81    }
82
83    pub fn index(&self) -> Result<DocIndexResult> {
84        let store = self.store.as_ref().context("call init() first")?;
85
86        let files = self.collect_doc_files()?;
87        let total = files.len();
88
89        if total == 0 {
90            return Ok(DocIndexResult {
91                total_files: 0,
92                indexed_files: 0,
93                total_chunks: 0,
94            });
95        }
96
97        let existing_hashes = store.get_doc_hashes().unwrap_or_default();
98
99        let done = AtomicUsize::new(0);
100
101        let results: Vec<(ExtractedDoc, Vec<Chunk>)> = files
102            .par_iter()
103            .filter_map(|path| {
104                let rel = path
105                    .strip_prefix(&self.root)
106                    .ok()?
107                    .to_string_lossy()
108                    .replace('\\', "/");
109                let bytes = std::fs::read(path).ok()?;
110                let hash = {
111                    let mut h = Sha256::new();
112                    h.update(&bytes);
113                    format!("{:x}", h.finalize())
114                };
115
116                let n = done.fetch_add(1, Ordering::Relaxed) + 1;
117                let pct = n * 100 / total;
118                let prev_pct = (n - 1) * 100 / total;
119                if (pct / 25) > (prev_pct / 25) || n == total {
120                    eprintln!("Doc indexing: {}/{} ({}%)", n, total, pct);
121                }
122
123                if existing_hashes.get(&rel).map(|s| s.as_str()) == Some(hash.as_str()) {
124                    return None;
125                }
126
127                let ext = path.extension()?.to_string_lossy().to_lowercase();
128                let doc = extract::extract_document(path, &bytes, &ext).ok()?;
129                let strategy = ChunkStrategy::for_extension(&ext);
130                let chunks = chunk::chunk_document(&doc, &rel, &hash, strategy);
131                Some((
132                    ExtractedDoc {
133                        file: rel,
134                        content_hash: hash,
135                        ..doc
136                    },
137                    chunks,
138                ))
139            })
140            .collect();
141
142        let indexed = results.len();
143        let total_chunks: usize = results.iter().map(|(_, c)| c.len()).sum();
144
145        if !results.is_empty() {
146            let docs: Vec<&ExtractedDoc> = results.iter().map(|(d, _)| d).collect();
147            let chunks: Vec<&Chunk> = results.iter().flat_map(|(_, c)| c.iter()).collect();
148            store.upsert_all_parquet(&docs, &chunks)?;
149        }
150
151        if total_chunks > 0 {
152            let all_chunks: Vec<&Chunk> = results.iter().flat_map(|(_, c)| c.iter()).collect();
153            let changed_files: Vec<&str> = results.iter().map(|(d, _)| d.file.as_str()).collect();
154            embed::update_doc_embeddings(store, &self.root, &all_chunks, &changed_files)?;
155        }
156
157        // Extract links from indexed docs and create LINKS_TO edges
158        if !results.is_empty() {
159            let all_doc_ids: std::collections::HashSet<String> = {
160                let existing = store.get_doc_hashes().unwrap_or_default();
161                existing.keys().cloned().collect()
162            };
163            for (doc, _) in &results {
164                links::extract_and_link_doc(store, doc, &all_doc_ids);
165            }
166        }
167
168        Ok(DocIndexResult {
169            total_files: total,
170            indexed_files: indexed,
171            total_chunks,
172        })
173    }
174
175    fn collect_doc_files(&self) -> Result<Vec<PathBuf>> {
176        let mut files = Vec::new();
177        self.walk_doc_dir(&self.root, &mut files)?;
178        Ok(files)
179    }
180
181    fn walk_doc_dir(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<()> {
182        let ignore_dirs = [
183            ".infigraph",
184            ".git",
185            "node_modules",
186            "__pycache__",
187            ".venv",
188            "venv",
189            "target",
190            "build",
191            "dist",
192            ".tox",
193        ];
194
195        for entry in std::fs::read_dir(dir)? {
196            let entry = entry?;
197            let path = entry.path();
198            let name = entry.file_name();
199            let name_str = name.to_string_lossy();
200
201            if path.is_dir() {
202                if !ignore_dirs.contains(&name_str.as_ref()) && !name_str.starts_with('.') {
203                    self.walk_doc_dir(&path, files)?;
204                }
205            } else if path.is_file() && is_document_file(&path) {
206                files.push(path);
207            }
208        }
209        Ok(())
210    }
211}
212
/// Reports whether `path` has one of the document extensions this crate
/// indexes.
///
/// The extension is lowercased before the comparison, so `README.MD` and
/// `readme.md` are treated alike. Paths without an extension return `false`.
pub fn is_document_file(path: &Path) -> bool {
    const DOCUMENT_EXTENSIONS: [&str; 19] = [
        "md", "markdown", "txt", "rst", "adoc", "org", "pdf", "docx", "pptx", "xlsx", "rtf",
        "html", "htm", "epub", "xml", "xsl", "xsd", "svg", "plist",
    ];

    path.extension()
        .map(|raw| raw.to_string_lossy().to_lowercase())
        .map_or(false, |ext| DOCUMENT_EXTENSIONS.contains(&ext.as_str()))
}