lmm-agent 0.1.2

Equation-based autonomous agent framework for the lmm ecosystem.
// Copyright 2026 Mahmoud Harmouch.
//
// Licensed under the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! # `knowledge` - Knowledge Acquisition for Agents
//!
//! This module enables agents to build a queryable knowledge base from diverse
//! local and remote sources, then retrieve grounded, extractive answers to
//! natural-language questions, entirely offline and without any LLM.
//!
//! ## Sources
//!
//! | Variant | Description |
//! |---|---|
//! | `KnowledgeSource::File(path)` | A single `.txt`, `.md`, or `.pdf` file |
//! | `KnowledgeSource::Dir(path)` | All parseable files within a directory |
//! | `KnowledgeSource::Url(url)` | Fetch and parse a web URL (`net` feature) |
//! | `KnowledgeSource::RawText(text)` | Inline text string |
//!
//! ## Retrieval algorithm
//!
//! 1. Tokenise the question into lowercase content words (≥ 3 chars, stop-words removed).
//! 2. Score every indexed [`DocumentChunk`] with IDF-weighted token overlap (made precise below).
//! 3. Take the top-k chunks and concatenate their text.
//! 4. Feed the concatenated corpus + the original question into
//!    [`lmm::text::TextSummarizer::summarize_with_query`] to produce a short
//!    extractive answer.
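//!
//! Concretely, with `N` indexed chunks, a chunk's score is the sum over the
//! question tokens `t` it contains of `ln(N / df(t)) + 1`, where `df(t)` is
//! the number of chunks containing `t` (see [`KnowledgeIndex::query`]).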
//!
//! ## Quick example
//!
//! ```rust
//! use lmm_agent::cognition::knowledge::{KnowledgeIndex, KnowledgeSource};
//!
//! let mut index = KnowledgeIndex::new();
//! index.ingest_text("my-doc", "Rust gives you control over memory without a garbage collector. \
//!                              The borrow checker enforces safety at compile time.");
//! let answer = index.answer("How does Rust handle memory?", 3);
//! assert!(answer.is_some());
//! ```

use anyhow::{Result, anyhow};
use lmm::text::TextSummarizer;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::fs;
#[cfg(feature = "knowledge")]
use std::io::Cursor;
use std::path::{Path, PathBuf};

/// A source from which an agent can acquire knowledge.
#[derive(Debug, Clone)]
pub enum KnowledgeSource {
    /// A local file. Supported extensions: `.txt`, `.md`, and `.pdf`
    /// (`.pdf` requires the `knowledge` feature).
    File(PathBuf),
    /// All parseable files found directly inside this directory (non-recursive).
    Dir(PathBuf),
    /// Fetch plain text from a URL. Requires the `net` feature.
    Url(String),
    /// Inline text supplied directly by the caller.
    RawText(String),
}

/// A sentence-sized unit of ingested knowledge.
#[derive(Debug, Clone)]
pub struct DocumentChunk {
    /// Human-readable label for the source (filename or URL).
    pub source: String,
    /// The raw text of this chunk.
    pub text: String,
    /// Pre-tokenised, lowercase content words used for fast scoring.
    pub tokens: Vec<String>,
}

impl DocumentChunk {
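    /// Creates a chunk from a source label and raw text, tokenising eagerly.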
    pub fn new(source: impl Into<String>, text: impl Into<String>) -> Self {
        let text = text.into();
        let tokens = tokenise(&text);
        Self {
            source: source.into(),
            text,
            tokens,
        }
    }
}

/// In-memory, term-indexed knowledge base built from ingested documents.
///
/// Retrieval is based on IDF-weighted token overlap: content words that appear
/// rarely across the corpus are weighted more heavily, focusing results on
/// discriminative evidence.
///
/// # Examples
///
/// ```rust
/// use lmm_agent::cognition::knowledge::KnowledgeIndex;
///
/// let mut idx = KnowledgeIndex::new();
/// idx.ingest_text(
///     "rust-book",
///     "Rust prevents data races at compile time through its ownership model.",
/// );
/// let hits = idx.query("What prevents data races in Rust?", 3);
/// assert!(!hits.is_empty());
/// ```
#[derive(Debug, Clone, Default)]
pub struct KnowledgeIndex {
    /// All ingested sentence-level chunks, in insertion order.
    chunks: Vec<DocumentChunk>,
    /// Inverted index from content word to chunk indices. Maintained on
    /// ingest; `query` currently scores chunks with a linear scan instead.
    term_index: HashMap<String, Vec<usize>>,
    /// Number of chunks each content word appears in, used for IDF weighting.
    doc_freq: HashMap<String, usize>,
}

impl KnowledgeIndex {
    /// Creates an empty `KnowledgeIndex`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` if no documents have been ingested yet.
    pub fn is_empty(&self) -> bool {
        self.chunks.is_empty()
    }

    /// Returns the total number of indexed chunks.
    pub fn len(&self) -> usize {
        self.chunks.len()
    }

    /// Ingests a raw text string under the given `source` label.
    ///
    /// The text is split into sentence-level chunks via a lightweight sentence
    /// splitter; fragments shorter than four words are skipped. Returns the
    /// number of chunks created.
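    ///
    /// # Examples
    ///
    /// ```rust
    /// use lmm_agent::cognition::knowledge::KnowledgeIndex;
    ///
    /// let mut idx = KnowledgeIndex::new();
    /// // Two full sentences become two chunks; the fragment "Yes." is
    /// // dropped by the four-word minimum.
    /// let added = idx.ingest_text(
    ///     "notes",
    ///     "Rust enforces memory safety at compile time. Yes. \
    ///      The compiler rejects data races before the program runs.",
    /// );
    /// assert_eq!(added, 2);
    /// ```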
    pub fn ingest_text(&mut self, source: &str, text: &str) -> usize {
        let sentences = split_sentences(text);
        let start = self.chunks.len();
        for sentence in sentences {
            if sentence.split_whitespace().count() < 4 {
                continue;
            }
            let chunk = DocumentChunk::new(source, &sentence);
            let idx = self.chunks.len();
            for token in &chunk.tokens {
                self.term_index.entry(token.clone()).or_default().push(idx);
            }
            self.chunks.push(chunk);
        }
        let added = self.chunks.len() - start;
        self.rebuild_doc_freq();
        added
    }

    /// Returns the `top_k` most relevant [`DocumentChunk`]s for `question`.
    ///
    /// Chunks are scored by the sum of IDF weights for each question token that
    /// appears in the chunk.
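    ///
    /// # Examples
    ///
    /// Ranking is by exact token overlap (no stemming), so chunks sharing the
    /// question's content words surface first:
    ///
    /// ```rust
    /// use lmm_agent::cognition::knowledge::KnowledgeIndex;
    ///
    /// let mut idx = KnowledgeIndex::new();
    /// idx.ingest_text("a", "The borrow checker validates references at compile time.");
    /// idx.ingest_text("b", "Cargo builds and tests projects from a single manifest.");
    /// let hits = idx.query("How does the borrow checker handle references?", 2);
    /// assert_eq!(hits.len(), 1);
    /// assert_eq!(hits[0].source, "a");
    /// ```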
    pub fn query(&self, question: &str, top_k: usize) -> Vec<&DocumentChunk> {
        if self.chunks.is_empty() {
            return Vec::new();
        }
        let q_tokens = tokenise(question);
        let n = self.chunks.len() as f64;

        let mut scores: Vec<(usize, f64)> = (0..self.chunks.len())
            .map(|i| {
                let chunk = &self.chunks[i];
                let score: f64 = q_tokens
                    .iter()
                    .filter(|t| chunk.tokens.contains(t))
                    .map(|t| {
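                        // Smoothed IDF: rarer terms weigh more, and the "+ 1"
                        // keeps every matching term contributing positively.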
                        let df = *self.doc_freq.get(t).unwrap_or(&1) as f64;
                        (n / df).ln() + 1.0
                    })
                    .sum();
                (i, score)
            })
            .filter(|(_, s)| *s > 0.0)
            .collect();

        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
        scores
            .into_iter()
            .take(top_k)
            .map(|(i, _)| &self.chunks[i])
            .collect()
    }

    /// Returns an extractive answer to `question` from the knowledge base,
    /// or `None` if no relevant chunks are found.
    ///
    /// The `top_k` best-scoring chunks are concatenated and passed to
    /// [`lmm::text::TextSummarizer`] with the original question as a relevance
    /// hint; the summariser selects the sentences most likely to answer it.
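    ///
    /// # Examples
    ///
    /// As in the module-level example, a question whose content words overlap
    /// the corpus should yield an extractive answer:
    ///
    /// ```rust
    /// use lmm_agent::cognition::knowledge::KnowledgeIndex;
    ///
    /// let mut idx = KnowledgeIndex::new();
    /// idx.ingest_text("my-doc", "Rust gives you control over memory without a garbage collector. \
    ///                            The borrow checker enforces safety at compile time.");
    /// let answer = idx.answer("How does Rust handle memory?", 3);
    /// assert!(answer.is_some());
    /// ```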
    pub fn answer(&self, question: &str, top_k: usize) -> Option<String> {
        let hits = self.query(question, top_k);
        if hits.is_empty() {
            return None;
        }
        let corpus: String = hits
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        let summariser = TextSummarizer::new(3, 4, 2);
        summariser
            .summarize_with_query(&corpus, question)
            .ok()
            .map(|sentences| sentences.join(" "))
    }

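    /// Recounts, for each content word, how many chunks contain it, keeping
    /// IDF weights current after every ingest.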
    fn rebuild_doc_freq(&mut self) {
        self.doc_freq.clear();
        for chunk in &self.chunks {
            let mut seen = HashSet::new();
            for token in &chunk.tokens {
                if seen.insert(token) {
                    *self.doc_freq.entry(token.clone()).or_insert(0) += 1;
                }
            }
        }
    }
}

/// Trait for pluggable document parsers.
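///
/// # Examples
///
/// A minimal sketch of a custom parser (a hypothetical `LogParser`, not part
/// of this crate) that treats `.log` files as plain text:
///
/// ```rust
/// use anyhow::Result;
/// use lmm_agent::cognition::knowledge::DocumentParser;
///
/// struct LogParser;
///
/// impl DocumentParser for LogParser {
///     fn supports_extension(&self, ext: &str) -> bool {
///         ext.eq_ignore_ascii_case("log")
///     }
///
///     fn parse_bytes(&self, bytes: &[u8]) -> Result<String> {
///         Ok(String::from_utf8_lossy(bytes).into_owned())
///     }
/// }
/// ```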
pub trait DocumentParser: Send + Sync {
    /// Returns `true` when this parser can handle the given file extension.
    fn supports_extension(&self, ext: &str) -> bool;
    /// Parses `bytes` into a plain-text string.
    fn parse_bytes(&self, bytes: &[u8]) -> Result<String>;
}

/// Parses plain `.txt` files (UTF-8 assumed, with a lossy fallback).
#[derive(Debug, Default, Clone)]
pub struct PlainTextParser;

impl DocumentParser for PlainTextParser {
    fn supports_extension(&self, ext: &str) -> bool {
        ext.eq_ignore_ascii_case("txt")
    }

    fn parse_bytes(&self, bytes: &[u8]) -> Result<String> {
        Ok(String::from_utf8_lossy(bytes).into_owned())
    }
}

/// Parses `.md` (Markdown) files by stripping formatting markers and yielding plain text.
#[derive(Debug, Default, Clone)]
pub struct MarkdownParser;

impl DocumentParser for MarkdownParser {
    fn supports_extension(&self, ext: &str) -> bool {
        matches!(ext.to_ascii_lowercase().as_str(), "md" | "markdown")
    }

    fn parse_bytes(&self, bytes: &[u8]) -> Result<String> {
        let raw = String::from_utf8_lossy(bytes);
        Ok(strip_markdown(&raw))
    }
}

/// Parses `.pdf` files using `lopdf`. Requires the `knowledge` feature.
#[cfg(feature = "knowledge")]
#[derive(Debug, Default, Clone)]
pub struct PdfParser;

#[cfg(feature = "knowledge")]
impl DocumentParser for PdfParser {
    fn supports_extension(&self, ext: &str) -> bool {
        ext.eq_ignore_ascii_case("pdf")
    }

    fn parse_bytes(&self, bytes: &[u8]) -> Result<String> {
        // The surrounding `impl` block is already gated on the `knowledge`
        // feature, so `lopdf` is available whenever this compiles.
        use lopdf::Document;

        let doc =
            Document::load_from(Cursor::new(bytes)).map_err(|e| anyhow!("lopdf error: {e}"))?;
        let mut out = String::new();
        // Walk the page map's actual keys rather than assuming the page
        // numbers run contiguously from 1.
        for page_num in doc.get_pages().keys() {
            if let Ok(text) = doc.extract_text(&[*page_num]) {
                out.push_str(&text);
                out.push('\n');
            }
        }
        Ok(out)
    }
}

/// Returns a list of parsers active for the current feature set.
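///
/// # Examples
///
/// ```rust
/// use lmm_agent::cognition::knowledge::default_parsers;
///
/// let parsers = default_parsers();
/// // `.txt` and `.md` are always handled; `.pdf` only with `knowledge`.
/// assert!(parsers.iter().any(|p| p.supports_extension("txt")));
/// assert!(parsers.iter().any(|p| p.supports_extension("md")));
/// ```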
pub fn default_parsers() -> Vec<Box<dyn DocumentParser>> {
    #[allow(unused_mut)]
    let mut parsers: Vec<Box<dyn DocumentParser>> =
        vec![Box::new(PlainTextParser), Box::new(MarkdownParser)];
    #[cfg(feature = "knowledge")]
    parsers.push(Box::new(PdfParser));
    parsers
}

/// Ingest a [`KnowledgeSource`] into `index`, returning the number of new chunks.
///
/// This function is the top-level entry point used by [`LmmAgent::ingest`].
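///
/// # Examples
///
/// A minimal sketch; the surrounding application supplies the async executor
/// that drives this future:
///
/// ```rust
/// use lmm_agent::cognition::knowledge::{ingest, KnowledgeIndex, KnowledgeSource};
///
/// # async fn demo() -> anyhow::Result<()> {
/// let mut index = KnowledgeIndex::new();
/// let added = ingest(
///     &mut index,
///     KnowledgeSource::RawText("Agents ground their answers in ingested documents.".into()),
/// )
/// .await?;
/// assert_eq!(added, 1);
/// # Ok(())
/// # }
/// ```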
pub async fn ingest(index: &mut KnowledgeIndex, source: KnowledgeSource) -> Result<usize> {
    match source {
        KnowledgeSource::RawText(text) => Ok(index.ingest_text("inline", &text)),

        KnowledgeSource::File(path) => ingest_file(index, &path),

        KnowledgeSource::Dir(dir) => {
            let mut total = 0;
            let entries = fs::read_dir(&dir)
                .map_err(|e| anyhow!("Cannot read dir {}: {e}", dir.display()))?;
            for entry in entries.flatten() {
                let p = entry.path();
                if p.is_file() {
                    total += ingest_file(index, &p).unwrap_or(0);
                }
            }
            Ok(total)
        }

        KnowledgeSource::Url(url) => ingest_url(index, &url).await,
    }
}

fn ingest_file(index: &mut KnowledgeIndex, path: &Path) -> Result<usize> {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();

    let parsers = default_parsers();
    let parser = parsers
        .iter()
        .find(|p| p.supports_extension(&ext))
        .ok_or_else(|| anyhow!("No parser for '{}' (extension: .{})", path.display(), ext))?;

    let bytes = fs::read(path).map_err(|e| anyhow!("Cannot read {}: {e}", path.display()))?;
    let text = parser.parse_bytes(&bytes)?;
    let label = path.file_name().and_then(|n| n.to_str()).unwrap_or("file");
    Ok(index.ingest_text(label, &text))
}

#[cfg(feature = "net")]
async fn ingest_url(index: &mut KnowledgeIndex, url: &str) -> Result<usize> {
    let body = reqwest::get(url)
        .await
        .map_err(|e| anyhow!("HTTP error for {url}: {e}"))?
        .text()
        .await
        .map_err(|e| anyhow!("Body error for {url}: {e}"))?;
    let plain = strip_html(&body);
    Ok(index.ingest_text(url, &plain))
}

#[cfg(not(feature = "net"))]
async fn ingest_url(_index: &mut KnowledgeIndex, url: &str) -> Result<usize> {
    Err(anyhow!(
        "URL ingestion requires the `net` feature. Enable it with --features net. URL: {url}"
    ))
}

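/// Common English function words excluded from tokenisation.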
static STOP_WORDS: phf::Set<&'static str> = phf::phf_set! {
    "the","and","for","are","was","that","this","with","from","have",
    "not","but","had","has","its","were","they","will","been","their",
    "all","one","can","her","his","him","she","who","which","what",
    "into","then","than","when","also","more","some","out","about",
    "said","would","could","should","each","other","there","these",
    "those","such","any","our","you","your","very","just","now","may"
};

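/// Lowercases `text`, splits on non-alphabetic characters, and keeps words of
/// three or more letters that are not stop words.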
fn tokenise(text: &str) -> Vec<String> {
    text.split(|c: char| !c.is_ascii_alphabetic())
        .filter_map(|w| {
            let w = w.to_ascii_lowercase();
            if w.len() >= 3 && !STOP_WORDS.contains(w.as_str()) {
                Some(w)
            } else {
                None
            }
        })
        .collect()
}

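/// Splits `text` on `.`, `!`, and `?`. A period is treated as sentence-internal
/// when followed by a lowercase letter (likely an abbreviation) or when it sits
/// between two digits (a decimal point).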
fn split_sentences(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut cur = String::new();
    let chars: Vec<char> = text.chars().collect();
    for (i, &ch) in chars.iter().enumerate() {
        cur.push(ch);
        if matches!(ch, '.' | '!' | '?') {
            let next_lower = chars
                .get(i + 1)
                .map(|c| c.is_ascii_lowercase())
                .unwrap_or(false);
            let prev_digit = i > 0 && chars[i - 1].is_ascii_digit();
            let next_digit = chars
                .get(i + 1)
                .map(|c| c.is_ascii_digit())
                .unwrap_or(false);
            if ch == '.' && (next_lower || (prev_digit && next_digit)) {
                continue;
            }
            let s = cur.trim().to_string();
            if !s.is_empty() {
                out.push(s);
            }
            cur.clear();
        }
    }
    let tail = cur.trim().to_string();
    if !tail.is_empty() {
        out.push(tail);
    }
    out
}

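/// Reduces Markdown to plain text: drops fenced code blocks, headings, and
/// horizontal rules, and strips `*`, `_`, backtick, and `|` characters from
/// the remaining lines.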
fn strip_markdown(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut in_code = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code {
            continue;
        }
        if trimmed.starts_with('#') || trimmed.starts_with("---") || trimmed.starts_with("===") {
            continue;
        }
        let clean: String = trimmed
            .chars()
            .filter(|&c| c != '*' && c != '_' && c != '`' && c != '|')
            .collect();
        let clean = clean.trim();
        if !clean.is_empty() {
            out.push_str(clean);
            out.push(' ');
        }
    }
    out
}

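/// Minimal HTML-to-text pass: drops everything inside `<...>` tags and joins
/// the surviving text runs with single spaces (no entity decoding).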
#[cfg_attr(not(feature = "net"), allow(dead_code))]
fn strip_html(html: &str) -> String {
    let mut out = String::with_capacity(html.len() / 2);
    let mut in_tag = false;
    let mut buf = String::new();
    for ch in html.chars() {
        match ch {
            '<' => {
                if !buf.trim().is_empty() {
                    out.push_str(buf.trim());
                    out.push(' ');
                }
                buf.clear();
                in_tag = true;
            }
            '>' => {
                in_tag = false;
            }
            _ if !in_tag => {
                buf.push(ch);
            }
            _ => {}
        }
    }
    if !buf.trim().is_empty() {
        out.push_str(buf.trim());
    }
    out
}