// lepiter_core — lib.rs

1//! Core data model and parser for Lepiter knowledge bases stored as page JSON files.
2//!
3//! # Scope
4//! - Scans a Lepiter directory and builds a metadata index keyed by page id.
5//! - Loads and parses individual pages lazily by id.
6//! - Converts page snippet trees into a stable block-oriented node model.
7//! - Preserves unknown node types as [`Node::Unknown`] to keep consumers resilient.
8//!
9//! # Example
10//! ```no_run
11//! use lepiter_core::KnowledgeBase;
12//!
13//! # fn main() -> anyhow::Result<()> {
14//! let index = KnowledgeBase::open("./lepiter")?;
15//! for page in index.sorted_pages_by_title() {
16//!     println!("{} - {}", page.id, page.title);
17//! }
18//! # Ok(())
19//! # }
20//! ```
21
22use std::collections::HashMap;
23use std::fs::File;
24use std::io::BufReader;
25use std::path::{Path, PathBuf};
26
27use anyhow::{Context, Result};
28use chrono::{DateTime, FixedOffset};
29use serde::Deserialize;
30use serde_json::Value;
31use walkdir::WalkDir;
32
/// Canonical page identifier used throughout the API.
///
/// Currently a plain `String`: the page's `uid` (uuid/uidString) from the
/// source metadata, with the file stem used as a fallback during indexing.
pub type PageId = String;
35
/// Metadata for a page discovered during index scanning.
///
/// Built by [`KnowledgeBase::open`] without parsing page content; the full
/// page is loaded on demand via [`KnowledgeBaseIndex::load_page`].
#[derive(Debug, Clone)]
pub struct PageMeta {
    /// Canonical page id (preferred key over filename). Falls back to the
    /// file stem when the source metadata carries no uid.
    pub id: PageId,
    /// Human-readable page title. Falls back to the id when the source has
    /// no usable title.
    pub title: String,
    /// Absolute or relative path to the source page file.
    pub path: PathBuf,
    /// Last edit timestamp, if present and RFC 3339-parseable in source metadata.
    pub updated_at: Option<DateTime<FixedOffset>>,
    /// Optional page tags extracted from metadata.
    pub tags: Vec<String>,
}
50
/// Fully parsed page content.
///
/// Produced by [`KnowledgeBaseIndex::load_page`]; the metadata fields mirror
/// the corresponding [`PageMeta`] entry.
#[derive(Debug, Clone)]
pub struct Page {
    /// Canonical page id.
    pub id: PageId,
    /// Page title.
    pub title: String,
    /// Last edit timestamp, if present.
    pub updated_at: Option<DateTime<FixedOffset>>,
    /// Page tags.
    pub tags: Vec<String>,
    /// Parsed block-level content, flattened in pre-order.
    pub content: Vec<Node>,
}
65
/// Block-oriented normalized node model used by consumers (e.g. TUI).
///
/// Produced by the page parser from raw snippet JSON; source node kinds that
/// have no mapping are kept verbatim in [`Node::Unknown`].
#[derive(Debug, Clone)]
pub enum Node {
    /// Markdown-style heading. `level` is clamped to `1..=6` by the parser.
    Heading { level: u8, text: String },
    /// Paragraph text.
    Paragraph { text: String },
    /// Plain text line (also used for whitespace-only text snippets).
    Text { text: String },
    /// List with item nodes (one `Vec<Node>` per list item).
    List { items: Vec<Vec<Node>> },
    /// Code block with optional language.
    Code {
        language: Option<String>,
        code: String,
    },
    /// Link block (also used for picture and youtube snippets).
    Link { text: String, url: String },
    /// Quote block.
    Quote { text: String },
    /// Rewrite block (search/replace transformation).
    Rewrite {
        language: Option<String>,
        search: String,
        replace: String,
        scope: Option<String>,
        is_method_pattern: Option<bool>,
    },
    /// Unknown/unsupported source node type preserved losslessly.
    Unknown { typ: String, raw: Value },
}
97
/// Non-fatal parse/indexing issue associated with a source file.
///
/// Collected in [`KnowledgeBaseIndex::index_issues`] instead of aborting the
/// scan, so one bad file never hides the rest of the knowledge base.
#[derive(Debug, Clone)]
pub struct ParseIssue {
    /// File path where the issue occurred.
    pub path: PathBuf,
    /// Human-readable error description.
    pub message: String,
}
106
/// Match category for search results.
///
/// Metadata matches take precedence over content matches in
/// [`KnowledgeBaseIndex::search_hits`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchMatchKind {
    /// Match came from page metadata (title/id/tags).
    Meta,
    /// Match came from rendered page content.
    Content,
}
115
/// Search result entry for one page.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchHit {
    /// Canonical page id.
    pub id: PageId,
    /// How this page matched (metadata vs. rendered content).
    pub kind: SearchMatchKind,
}
124
/// Classification of a raw link target.
///
/// Produced by [`KnowledgeBaseIndex::classify_link_target`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LinkTargetKind {
    /// Resolved to an internal page id.
    InternalPage(PageId),
    /// Resolved to an attachment file path in the knowledge base
    /// (already joined onto the index root).
    AttachmentPath(PathBuf),
    /// Resolved to an external URL/scheme target.
    ExternalUrl(String),
    /// Could not classify target; carries the original text.
    Unknown(String),
}
137
/// Result of resolving a page by title.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TitleResolution {
    /// A unique page id was resolved.
    Unique(PageId),
    /// No matching title found.
    NotFound,
    /// Multiple candidate page ids matched.
    Ambiguous(Vec<PageId>),
}
148
/// Indexed knowledge base metadata with lazy page loading.
#[derive(Debug, Clone)]
pub struct KnowledgeBaseIndex {
    /// Directory the index was built from; attachment link targets are
    /// resolved relative to this path.
    root: PathBuf,
    /// Metadata map keyed by canonical page id.
    pub pages: HashMap<PageId, PageMeta>,
    /// Non-fatal issues encountered while scanning metadata.
    pub index_issues: Vec<ParseIssue>,
}
158
/// Entry point for opening a Lepiter knowledge base directory.
///
/// Stateless marker type; see [`KnowledgeBase::open`].
pub struct KnowledgeBase;
161
162impl KnowledgeBase {
163    /// Scans a knowledge base directory and builds a page metadata index.
164    ///
165    /// This operation only reads metadata and does not parse full page content.
166    /// Full parsing is done lazily via [`KnowledgeBaseIndex::load_page`].
167    pub fn open(path: impl AsRef<Path>) -> Result<KnowledgeBaseIndex> {
168        let root = path.as_ref().to_path_buf();
169        let mut pages = HashMap::new();
170        let mut issues = Vec::new();
171
172        for entry in WalkDir::new(&root)
173            .min_depth(1)
174            .max_depth(1)
175            .into_iter()
176            .filter_map(|e| e.ok())
177        {
178            let file_type = entry.file_type();
179            let file_path = entry.path();
180            if !file_type.is_file()
181                || file_path.extension().and_then(|e| e.to_str()) != Some("lepiter")
182            {
183                continue;
184            }
185
186            match parse_page_meta(file_path) {
187                Ok(mut meta) => {
188                    if meta.id.is_empty()
189                        && let Some(stem) = file_path.file_stem().and_then(|s| s.to_str())
190                    {
191                        meta.id = stem.to_string();
192                    }
193                    if meta.title.is_empty() {
194                        meta.title = meta.id.clone();
195                    }
196                    pages.insert(meta.id.clone(), meta);
197                }
198                Err(err) => issues.push(ParseIssue {
199                    path: file_path.to_path_buf(),
200                    message: format!("{err:#}"),
201                }),
202            }
203        }
204
205        Ok(KnowledgeBaseIndex {
206            root,
207            pages,
208            index_issues: issues,
209        })
210    }
211}
212
213impl KnowledgeBaseIndex {
214    /// Loads and parses a single page by canonical id.
215    ///
216    /// Returns an error if the id is missing from the index or if JSON parsing fails.
217    pub fn load_page(&self, id: &str) -> Result<Page> {
218        let meta = self
219            .pages
220            .get(id)
221            .with_context(|| format!("page id not found: {id}"))?;
222
223        let file = File::open(&meta.path)
224            .with_context(|| format!("failed to open page file {}", meta.path.display()))?;
225        let reader = BufReader::new(file);
226        let raw: Value =
227            serde_json::from_reader(reader).with_context(|| "failed to decode page JSON")?;
228
229        let mut content = Vec::new();
230        if let Some(items) = raw
231            .get("children")
232            .and_then(|v| v.get("items"))
233            .and_then(Value::as_array)
234        {
235            for item in items {
236                parse_item_recursive(item, &mut content);
237            }
238        }
239
240        Ok(Page {
241            id: meta.id.clone(),
242            title: meta.title.clone(),
243            updated_at: meta.updated_at,
244            tags: meta.tags.clone(),
245            content,
246        })
247    }
248
249    /// Returns metadata entries sorted case-insensitively by title.
250    pub fn sorted_pages_by_title(&self) -> Vec<&PageMeta> {
251        let mut pages = self.pages.values().collect::<Vec<_>>();
252        pages.sort_by(|a, b| a.title.to_lowercase().cmp(&b.title.to_lowercase()));
253        pages
254    }
255
256    /// Returns page ids filtered by metadata query (title/id/tags), sorted by title.
257    pub fn filter_page_ids(&self, query: &str) -> Vec<PageId> {
258        let needle = query.trim().to_lowercase();
259        let mut metas = self.sorted_pages_by_title();
260        if !needle.is_empty() {
261            metas.retain(|m| page_meta_matches(m, &needle));
262        }
263        metas.into_iter().map(|m| m.id.clone()).collect()
264    }
265
266    /// Searches pages by metadata and optionally content, returning sorted hits.
267    pub fn search_hits(&self, query: &str, include_content: bool) -> Vec<SearchHit> {
268        let needle = query.trim().to_lowercase();
269        if needle.is_empty() {
270            return Vec::new();
271        }
272
273        let mut by_id: HashMap<PageId, SearchMatchKind> = HashMap::new();
274        let metas = self.sorted_pages_by_title();
275
276        for meta in &metas {
277            if page_meta_matches(meta, &needle) {
278                by_id.insert(meta.id.clone(), SearchMatchKind::Meta);
279            }
280        }
281
282        if include_content {
283            for meta in &metas {
284                if by_id.contains_key(&meta.id) {
285                    continue;
286                }
287                let Ok(page) = self.load_page(&meta.id) else {
288                    continue;
289                };
290                if render_page_to_text(&page).to_lowercase().contains(&needle) {
291                    by_id.insert(meta.id.clone(), SearchMatchKind::Content);
292                }
293            }
294        }
295
296        let mut hits = Vec::new();
297        for meta in metas {
298            if let Some(kind) = by_id.get(&meta.id) {
299                hits.push(SearchHit {
300                    id: meta.id.clone(),
301                    kind: *kind,
302                });
303            }
304        }
305        hits
306    }
307
308    /// Resolves a page id from title using case-insensitive exact match, then partial match.
309    pub fn resolve_page_id_by_title(&self, title: &str) -> TitleResolution {
310        let needle = title.trim().to_lowercase();
311        if needle.is_empty() {
312            return TitleResolution::NotFound;
313        }
314
315        let exact = self
316            .sorted_pages_by_title()
317            .into_iter()
318            .filter(|m| m.title.to_lowercase() == needle)
319            .map(|m| m.id.clone())
320            .collect::<Vec<_>>();
321        match exact.len() {
322            1 => return TitleResolution::Unique(exact[0].clone()),
323            n if n > 1 => return TitleResolution::Ambiguous(exact),
324            _ => {}
325        }
326
327        let partial = self
328            .sorted_pages_by_title()
329            .into_iter()
330            .filter(|m| m.title.to_lowercase().contains(&needle))
331            .map(|m| m.id.clone())
332            .collect::<Vec<_>>();
333        match partial.len() {
334            1 => TitleResolution::Unique(partial[0].clone()),
335            0 => TitleResolution::NotFound,
336            _ => TitleResolution::Ambiguous(partial),
337        }
338    }
339
340    /// Classifies a raw link target for navigation/open behavior.
341    pub fn classify_link_target(&self, raw: &str) -> LinkTargetKind {
342        let target = raw.trim();
343        if target.is_empty() {
344            return LinkTargetKind::Unknown(raw.to_string());
345        }
346
347        if self.pages.contains_key(target) {
348            return LinkTargetKind::InternalPage(target.to_string());
349        }
350
351        if let Some(rest) = target.strip_prefix("page:") {
352            let id = rest.trim();
353            if self.pages.contains_key(id) {
354                return LinkTargetKind::InternalPage(id.to_string());
355            }
356        }
357        if let Some(rest) = target.strip_prefix("title:") {
358            return match self.resolve_page_id_by_title(rest.trim()) {
359                TitleResolution::Unique(id) => LinkTargetKind::InternalPage(id),
360                _ => LinkTargetKind::Unknown(target.to_string()),
361            };
362        }
363
364        if let Some(uuid) = extract_uuid_like(target)
365            && self.pages.contains_key(uuid)
366        {
367            return LinkTargetKind::InternalPage(uuid.to_string());
368        }
369
370        if is_external_target(target) {
371            return LinkTargetKind::ExternalUrl(target.to_string());
372        }
373
374        if let Some(rel) = attachment_relative_path(target) {
375            return LinkTargetKind::AttachmentPath(self.root.join(rel));
376        }
377
378        match self.resolve_page_id_by_title(target) {
379            TitleResolution::Unique(id) => LinkTargetKind::InternalPage(id),
380            _ => LinkTargetKind::Unknown(target.to_string()),
381        }
382    }
383
384    /// Returns the root path used to build this index.
385    pub fn root(&self) -> &Path {
386        &self.root
387    }
388}
389
390fn page_meta_matches(meta: &PageMeta, needle: &str) -> bool {
391    meta.title.to_lowercase().contains(needle)
392        || meta.id.to_lowercase().contains(needle)
393        || meta.tags.iter().any(|t| t.to_lowercase().contains(needle))
394}
395
/// Heuristically decides whether a link target points outside the knowledge
/// base: any `scheme://` URL, or a `mailto:` address.
fn is_external_target(target: &str) -> bool {
    let lower = target.to_lowercase();
    // `http://`, `https://` and `file://` are all covered by the generic
    // `://` scheme check; `mailto:` is the only scheme-less special case.
    lower.contains("://") || lower.starts_with("mailto:")
}
404
/// Extracts the `attachments/…`-relative portion of a link target, if any.
///
/// Returns a slice of `target` beginning at the `attachments/` segment, or
/// `None` when the target does not reference an attachment. The returned
/// slice always starts with `attachments/` so it can be joined onto the
/// knowledge base root.
fn attachment_relative_path(target: &str) -> Option<&str> {
    // Already relative to the knowledge base root. (The previous
    // `Some(rest).map(|_| target)` bound an unused `rest` just to return
    // `target`; this is the same behavior, stated directly.)
    if target.starts_with("attachments/") {
        return Some(target);
    }
    // A parent-qualified path such as `kb/attachments/img.png`.
    if let Some(pos) = target.find("/attachments/") {
        return target.get(pos + 1..);
    }
    // Last resort: any embedded `attachments/` segment.
    if let Some(pos) = target.find("attachments/") {
        return target.get(pos..);
    }
    None
}
418
/// Finds the first 36-character UUID-shaped substring (8-4-4-4-12 hex groups
/// separated by dashes) in `input`, if any.
///
/// The scan works on raw bytes so that inputs containing multi-byte UTF-8
/// characters cannot trigger a slice panic: the previous implementation
/// sliced `&input[i..i + 36]` at every byte offset, which panics when the
/// offset is not a character boundary.
fn extract_uuid_like(input: &str) -> Option<&str> {
    let bytes = input.as_bytes();
    if bytes.len() < 36 {
        return None;
    }

    for i in 0..=bytes.len() - 36 {
        let window = &bytes[i..i + 36];
        let ok = window.iter().enumerate().all(|(idx, &b)| match idx {
            8 | 13 | 18 | 23 => b == b'-',
            _ => b.is_ascii_hexdigit(),
        });
        if ok {
            // All 36 bytes are ASCII, so both slice ends fall on character
            // boundaries and this cannot panic.
            return Some(&input[i..i + 36]);
        }
    }
    None
}
437
/// Minimal top-level page JSON shape, deserialized for index metadata only.
#[derive(Debug, Deserialize)]
struct RawMeta {
    /// Page identity container (`uuid` / `uidString`).
    #[serde(default)]
    uid: Option<RawUid>,
    /// `pageType.title` is the preferred title source.
    #[serde(default)]
    #[serde(rename = "pageType")]
    page_type: Option<RawPageType>,
    /// Fallback title used when `pageType.title` is absent.
    #[serde(default)]
    title: Option<String>,
    /// Last edit timestamp container.
    #[serde(default)]
    #[serde(rename = "editTime")]
    edit_time: Option<RawEditTime>,
    /// Tags in one of several shapes; normalized by `parse_tags`.
    #[serde(default)]
    tags: Option<Value>,
}
453
/// Page identity: either `uuid` or the legacy `uidString` may be present.
#[derive(Debug, Deserialize)]
struct RawUid {
    #[serde(default)]
    uuid: Option<String>,
    #[serde(default)]
    #[serde(rename = "uidString")]
    uid_string: Option<String>,
}
462
/// `pageType` object; only its `title` is used.
#[derive(Debug, Deserialize)]
struct RawPageType {
    #[serde(default)]
    title: Option<String>,
}
468
/// `editTime` wrapper around the nested time value.
#[derive(Debug, Deserialize)]
struct RawEditTime {
    #[serde(default)]
    time: Option<RawTimeValue>,
}
474
/// Innermost time value; the string is expected to be RFC 3339-parseable
/// (non-parseable values are silently dropped by `parse_page_meta`).
#[derive(Debug, Deserialize)]
struct RawTimeValue {
    #[serde(default)]
    #[serde(rename = "dateAndTimeString")]
    date_and_time_string: Option<String>,
}
481
482fn parse_page_meta(path: &Path) -> Result<PageMeta> {
483    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
484    let reader = BufReader::new(file);
485    let raw: RawMeta =
486        serde_json::from_reader(reader).with_context(|| "failed to decode page metadata")?;
487
488    let id = raw
489        .uid
490        .as_ref()
491        .and_then(|u| u.uuid.clone().or_else(|| u.uid_string.clone()))
492        .unwrap_or_default();
493    let title = raw
494        .page_type
495        .and_then(|pt| pt.title)
496        .or(raw.title)
497        .unwrap_or_default();
498    let updated_at = raw
499        .edit_time
500        .and_then(|e| e.time)
501        .and_then(|t| t.date_and_time_string)
502        .and_then(|s| DateTime::parse_from_rfc3339(&s).ok());
503    let tags = parse_tags(raw.tags.as_ref());
504
505    Ok(PageMeta {
506        id,
507        title,
508        path: path.to_path_buf(),
509        updated_at,
510        tags,
511    })
512}
513
514fn parse_tags(value: Option<&Value>) -> Vec<String> {
515    let Some(value) = value else {
516        return Vec::new();
517    };
518    match value {
519        Value::Array(items) => items
520            .iter()
521            .filter_map(|item| {
522                item.as_str()
523                    .map(ToOwned::to_owned)
524                    .or_else(|| {
525                        item.get("name")
526                            .and_then(Value::as_str)
527                            .map(ToOwned::to_owned)
528                    })
529                    .or_else(|| {
530                        item.get("title")
531                            .and_then(Value::as_str)
532                            .map(ToOwned::to_owned)
533                    })
534            })
535            .collect(),
536        Value::Object(obj) => obj
537            .get("items")
538            .and_then(Value::as_array)
539            .map(|items| {
540                items
541                    .iter()
542                    .filter_map(|i| {
543                        i.get("title")
544                            .and_then(Value::as_str)
545                            .map(ToOwned::to_owned)
546                    })
547                    .collect::<Vec<_>>()
548            })
549            .unwrap_or_default(),
550        _ => Vec::new(),
551    }
552}
553
554fn parse_item_recursive(item: &Value, out: &mut Vec<Node>) {
555    let typ = extract_type(item);
556    out.push(parse_node(item));
557    if matches!(typ.as_deref(), Some("listSnippet")) {
558        // list snippets already materialize children into Node::List items.
559        return;
560    }
561    if let Some(children) = item
562        .get("children")
563        .and_then(|v| v.get("items"))
564        .and_then(Value::as_array)
565    {
566        for child in children {
567            parse_item_recursive(child, out);
568        }
569    }
570}
571
/// Converts one raw snippet into a normalized [`Node`].
///
/// Unrecognized or malformed snippets are preserved as [`Node::Unknown`]
/// so downstream consumers never lose data. Arm order matters: the guarded
/// link arms fall through to the `Some(t)` catch-all when no url is present.
fn parse_node(item: &Value) -> Node {
    let typ = extract_type(item);

    match typ.as_deref() {
        Some("textSnippet") => parse_text_like_node(item),
        Some("quoteSnippet") | Some("blockQuoteSnippet") | Some("commentSnippet") => Node::Quote {
            text: extract_text(item).unwrap_or_default(),
        },
        Some("listSnippet") => parse_list_node(item),
        Some("pictureSnippet") => parse_picture_node(item),
        Some("youtubeSnippet") => parse_youtube_node(item),
        Some("elementSnippet") => parse_element_node(item),
        Some("pharoRewrite") => parse_rewrite_node(item),
        Some("wordSnippet") => parse_word_node(item),
        // Code-bearing snippet kinds all render as code blocks; the language
        // label is derived from the type name via `infer_language`.
        Some(
            t @ ("pharoSnippet"
            | "pythonSnippet"
            | "javascriptSnippet"
            | "shellCommandSnippet"
            | "gemstoneSnippet"
            | "exampleSnippet"
            | "changesSnippet"
            | "robocoderMetamodelSnippet"),
        ) => Node::Code {
            language: infer_language(Some(t)),
            code: extract_code(item)
                .or_else(|| extract_text(item))
                .unwrap_or_default(),
        },
        // Link snippets without a url/href fall through to `Unknown` below.
        Some(t @ "pharoLinkSnippet") if has_link(item) => Node::Link {
            text: extract_text(item).unwrap_or_else(|| t.to_string()),
            url: extract_link(item).unwrap_or_default(),
        },
        Some("linkSnippet") if has_link(item) => Node::Link {
            text: extract_text(item).unwrap_or_else(|| "link".to_string()),
            url: extract_link(item).unwrap_or_default(),
        },
        Some(t) => Node::Unknown {
            typ: t.to_string(),
            raw: item.clone(),
        },
        None => Node::Unknown {
            typ: "<missing-type>".to_string(),
            raw: item.clone(),
        },
    }
}
619
620fn parse_text_like_node(item: &Value) -> Node {
621    let text = extract_text(item).unwrap_or_default();
622    if let Some((level, heading)) = parse_heading(&text) {
623        Node::Heading {
624            level,
625            text: heading,
626        }
627    } else if let Some(stripped) = text.strip_prefix("> ") {
628        Node::Quote {
629            text: stripped.to_string(),
630        }
631    } else if text.trim().is_empty() {
632        Node::Text { text }
633    } else {
634        Node::Paragraph { text }
635    }
636}
637
638fn parse_list_node(item: &Value) -> Node {
639    let mut items = Vec::new();
640    if let Some(children) = item
641        .get("children")
642        .and_then(|v| v.get("items"))
643        .and_then(Value::as_array)
644    {
645        for child in children {
646            items.push(vec![parse_node(child)]);
647        }
648    }
649    Node::List { items }
650}
651
652fn parse_picture_node(item: &Value) -> Node {
653    let url = item
654        .get("url")
655        .and_then(Value::as_str)
656        .map(ToOwned::to_owned)
657        .or_else(|| extract_link(item))
658        .unwrap_or_default();
659    let text = item
660        .get("caption")
661        .and_then(Value::as_str)
662        .map(ToOwned::to_owned)
663        .or_else(|| extract_text(item))
664        .unwrap_or_else(|| "picture".to_string());
665
666    if url.is_empty() {
667        Node::Unknown {
668            typ: "pictureSnippet".to_string(),
669            raw: item.clone(),
670        }
671    } else {
672        Node::Link { text, url }
673    }
674}
675
676fn parse_youtube_node(item: &Value) -> Node {
677    let url = item
678        .get("youtubeUrl")
679        .and_then(Value::as_str)
680        .map(ToOwned::to_owned)
681        .or_else(|| extract_link(item))
682        .unwrap_or_default();
683    let text = extract_text(item).unwrap_or_else(|| "youtube".to_string());
684
685    if url.is_empty() {
686        Node::Unknown {
687            typ: "youtubeSnippet".to_string(),
688            raw: item.clone(),
689        }
690    } else {
691        Node::Link { text, url }
692    }
693}
694
695fn parse_element_node(item: &Value) -> Node {
696    let code = extract_code(item).or_else(|| extract_text(item));
697    if let Some(code) = code.filter(|c| !c.trim().is_empty()) {
698        Node::Code {
699            language: Some("element".to_string()),
700            code,
701        }
702    } else {
703        Node::Unknown {
704            typ: "elementSnippet".to_string(),
705            raw: item.clone(),
706        }
707    }
708}
709
710fn parse_rewrite_node(item: &Value) -> Node {
711    let search = item
712        .get("search")
713        .and_then(Value::as_str)
714        .map(ToOwned::to_owned)
715        .unwrap_or_default();
716    let replace = item
717        .get("replace")
718        .and_then(Value::as_str)
719        .map(ToOwned::to_owned)
720        .unwrap_or_default();
721    let scope = item
722        .get("scope")
723        .and_then(Value::as_str)
724        .map(ToOwned::to_owned);
725    let is_method_pattern = item.get("isMethodPattern").and_then(Value::as_bool);
726
727    if search.is_empty() && replace.is_empty() {
728        Node::Unknown {
729            typ: "pharoRewrite".to_string(),
730            raw: item.clone(),
731        }
732    } else {
733        Node::Rewrite {
734            language: Some("pharo".to_string()),
735            search,
736            replace,
737            scope,
738            is_method_pattern,
739        }
740    }
741}
742
743fn parse_word_node(item: &Value) -> Node {
744    let mut lines = Vec::new();
745
746    if let Some(word) = item
747        .get("wordString")
748        .and_then(Value::as_str)
749        .map(str::trim)
750        .filter(|s| !s.is_empty())
751    {
752        lines.push(word.to_string());
753    }
754
755    if let Some(explanation) = item
756        .get("explanationAttachmentNameString")
757        .and_then(Value::as_str)
758        .map(str::trim)
759        .filter(|s| !s.is_empty())
760    {
761        lines.push(format!("explanation: {explanation}"));
762    }
763
764    if lines.is_empty() {
765        collect_text_fragments(item, &mut lines, 0, 12);
766    }
767
768    lines.retain(|s| !s.trim().is_empty());
769    lines.truncate(8);
770
771    if lines.is_empty() {
772        return Node::Unknown {
773            typ: "wordSnippet".to_string(),
774            raw: item.clone(),
775        };
776    }
777
778    let mut text = lines.join("\n");
779    if text.chars().count() > 1200 {
780        text = text.chars().take(1199).collect::<String>();
781        text.push('…');
782    }
783
784    Node::Paragraph { text }
785}
786
/// Best-effort harvest of human-readable strings from an arbitrary snippet
/// subtree, used as a fallback when no dedicated text field is present.
///
/// `remaining` is the maximum number of fragments `out` may hold (it is not
/// decremented across recursion — the cap is checked against `out.len()`),
/// and traversal stops below `depth` 4. Bookkeeping keys are skipped so ids
/// and timestamps do not leak into rendered text.
fn collect_text_fragments(value: &Value, out: &mut Vec<String>, depth: usize, remaining: usize) {
    // NOTE(review): `remaining == 0` is subsumed by `out.len() >= remaining`
    // when remaining is 0 (0 >= 0); kept as an explicit statement of intent.
    if remaining == 0 || out.len() >= remaining || depth > 4 {
        return;
    }

    match value {
        Value::String(s) => {
            // Only non-blank strings count as fragments.
            let trimmed = s.trim();
            if !trimmed.is_empty() {
                out.push(trimmed.to_string());
            }
        }
        Value::Array(items) => {
            for item in items {
                if out.len() >= remaining {
                    break;
                }
                collect_text_fragments(item, out, depth + 1, remaining);
            }
        }
        Value::Object(map) => {
            for (key, item) in map {
                // Skip structural/metadata keys that never hold content text.
                if matches!(
                    key.as_str(),
                    "__type"
                        | "children"
                        | "uid"
                        | "createEmail"
                        | "createTime"
                        | "editEmail"
                        | "editTime"
                        | "paragraphStyle"
                ) {
                    continue;
                }
                if out.len() >= remaining {
                    break;
                }
                collect_text_fragments(item, out, depth + 1, remaining);
            }
        }
        _ => {}
    }
}
831
/// Parses a markdown-style `# …` heading, returning `(level, text)`.
///
/// The level is capped at 6; a hash run with no following text is not a
/// heading. Leading/trailing whitespace around the whole line is ignored.
fn parse_heading(input: &str) -> Option<(u8, String)> {
    let trimmed = input.trim();
    // `#` is ASCII, so the byte count doubles as a valid slice offset.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes == 0 {
        return None;
    }

    let body = trimmed[hashes..].trim_start();
    if body.is_empty() {
        None
    } else {
        Some((hashes.min(6) as u8, body.to_string()))
    }
}
844
845fn extract_type(item: &Value) -> Option<String> {
846    item.get("type")
847        .and_then(Value::as_str)
848        .map(ToOwned::to_owned)
849        .or_else(|| {
850            item.get("__type")
851                .and_then(Value::as_str)
852                .map(ToOwned::to_owned)
853        })
854}
855
856fn extract_text(item: &Value) -> Option<String> {
857    item.get("string")
858        .and_then(Value::as_str)
859        .map(ToOwned::to_owned)
860        .or_else(|| {
861            item.get("text")
862                .and_then(Value::as_str)
863                .map(ToOwned::to_owned)
864        })
865        .or_else(|| {
866            item.get("content")
867                .and_then(Value::as_str)
868                .map(ToOwned::to_owned)
869        })
870}
871
872fn extract_code(item: &Value) -> Option<String> {
873    item.get("code")
874        .and_then(Value::as_str)
875        .map(ToOwned::to_owned)
876        .or_else(|| {
877            item.get("source")
878                .and_then(Value::as_str)
879                .map(ToOwned::to_owned)
880        })
881}
882
883fn extract_link(item: &Value) -> Option<String> {
884    item.get("url")
885        .and_then(Value::as_str)
886        .map(ToOwned::to_owned)
887        .or_else(|| {
888            item.get("href")
889                .and_then(Value::as_str)
890                .map(ToOwned::to_owned)
891        })
892}
893
894fn has_link(item: &Value) -> bool {
895    item.get("url").and_then(Value::as_str).is_some()
896        || item.get("href").and_then(Value::as_str).is_some()
897}
898
/// Maps a snippet type name to a display language.
///
/// Known snippet kinds get fixed names; any other `…Snippet` type is
/// lowercased with the suffix stripped; everything else has no language.
fn infer_language(typ: Option<&str>) -> Option<String> {
    match typ? {
        "pharoSnippet" => Some("pharo".to_string()),
        "pythonSnippet" => Some("python".to_string()),
        "javascriptSnippet" => Some("javascript".to_string()),
        "jsonSnippet" => Some("json".to_string()),
        "yamlSnippet" => Some("yaml".to_string()),
        // `trim_end_matches` (not `strip_suffix`) keeps the original
        // behavior of removing repeated trailing "Snippet" runs.
        other if other.ends_with("Snippet") => {
            Some(other.trim_end_matches("Snippet").to_lowercase())
        }
        _ => None,
    }
}
916
/// Renders a parsed page to plain text.
///
/// Thin wrapper over [`render_nodes_to_text`] applied to the page content;
/// the page title and metadata are not included in the output.
pub fn render_page_to_text(page: &Page) -> String {
    render_nodes_to_text(&page.content)
}
921
922/// Renders normalized nodes to plain text.
923pub fn render_nodes_to_text(nodes: &[Node]) -> String {
924    let mut out = String::new();
925    for node in nodes {
926        match node {
927            Node::Heading { level, text } => {
928                out.push_str(&"#".repeat((*level).max(1) as usize));
929                out.push(' ');
930                out.push_str(text);
931                out.push_str("\n\n");
932            }
933            Node::Paragraph { text } => {
934                out.push_str(text);
935                out.push_str("\n\n");
936            }
937            Node::Text { text } => {
938                out.push_str(text);
939                out.push('\n');
940            }
941            Node::List { items } => {
942                for item in items {
943                    out.push_str("- ");
944                    out.push_str(render_nodes_to_text(item).trim());
945                    out.push('\n');
946                }
947                out.push('\n');
948            }
949            Node::Code { language, code } => {
950                out.push_str("```");
951                if let Some(lang) = language {
952                    out.push_str(lang);
953                }
954                out.push('\n');
955                out.push_str(code);
956                out.push_str("\n```\n\n");
957            }
958            Node::Link { text, url } => {
959                out.push_str(&format!("[{text}]({url})\n\n"));
960            }
961            Node::Quote { text } => {
962                out.push_str(&format!("> {text}\n\n"));
963            }
964            Node::Rewrite {
965                language,
966                search,
967                replace,
968                scope,
969                is_method_pattern,
970            } => {
971                let lang = language.clone().unwrap_or_else(|| "rewrite".to_string());
972                out.push_str(&format!("```diff {lang}\n"));
973                if let Some(scope) = scope {
974                    out.push_str(&format!("# scope: {scope}\n"));
975                }
976                if let Some(is_method_pattern) = is_method_pattern {
977                    out.push_str(&format!("# method_pattern: {is_method_pattern}\n"));
978                }
979                for line in normalize_text(search).lines() {
980                    out.push('-');
981                    out.push_str(line);
982                    out.push('\n');
983                }
984                for line in normalize_text(replace).lines() {
985                    out.push('+');
986                    out.push_str(line);
987                    out.push('\n');
988                }
989                out.push_str("```\n\n");
990            }
991            Node::Unknown { typ, .. } => {
992                out.push_str(&format!("[[unknown: {typ}]]\n\n"));
993            }
994        }
995    }
996    out
997}
998
/// Normalizes line endings: CRLF and lone CR both become LF.
///
/// Single pass over the input, allocating the output buffer once instead of
/// performing two intermediate `replace` allocations.
fn normalize_text(input: &str) -> String {
    let mut normalized = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // Swallow the LF of a CRLF pair so it is not doubled.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            normalized.push('\n');
        } else {
            normalized.push(c);
        }
    }
    normalized
}
1002
1003/// Collects all observed `type`/`__type` values and their counts in one page file.
1004pub fn collect_node_types_in_file(path: &Path) -> Result<HashMap<String, usize>> {
1005    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
1006    let reader = BufReader::new(file);
1007    let raw: Value = serde_json::from_reader(reader).with_context(|| "failed to decode JSON")?;
1008
1009    let mut out = HashMap::new();
1010    collect_node_types_value(&raw, &mut out);
1011    Ok(out)
1012}
1013
1014fn collect_node_types_value(value: &Value, out: &mut HashMap<String, usize>) {
1015    match value {
1016        Value::Object(map) => {
1017            if let Some(typ) = map
1018                .get("type")
1019                .and_then(Value::as_str)
1020                .or_else(|| map.get("__type").and_then(Value::as_str))
1021            {
1022                *out.entry(typ.to_string()).or_insert(0) += 1;
1023            }
1024            for v in map.values() {
1025                collect_node_types_value(v, out);
1026            }
1027        }
1028        Value::Array(items) => {
1029            for item in items {
1030                collect_node_types_value(item, out);
1031            }
1032        }
1033        _ => {}
1034    }
1035}
1036
// Unit tests covering parsing, rendering, indexing, and link classification.
// File-backed tests write to a uniquely named temp file and clean up after
// themselves; everything else operates on in-memory JSON fixtures.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::fs;
    use std::time::{SystemTime, UNIX_EPOCH};

    // Builds a unique temp-file path; the nanosecond timestamp avoids
    // collisions when tests run in parallel.
    fn temp_file_path(name: &str) -> PathBuf {
        let ts = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("time")
            .as_nanos();
        std::env::temp_dir().join(format!("lepiter-core-{name}-{ts}.lepiter"))
    }

    // Markdown-style `##` prefixes become (level, text); plain text is None.
    #[test]
    fn parse_heading_detects_markdown_style() {
        assert_eq!(
            parse_heading("## Heading"),
            Some((2, "Heading".to_string()))
        );
        assert_eq!(parse_heading("No heading"), None);
    }

    // Tags may arrive as bare strings, objects with name/title, or wrapped
    // in an {"items": [...]} envelope.
    #[test]
    fn parse_tags_supports_array_and_object_items() {
        let arr = json!(["a", {"name": "b"}, {"title": "c"}]);
        assert_eq!(parse_tags(Some(&arr)), vec!["a", "b", "c"]);

        let obj = json!({"items": [{"title":"x"}, {"title":"y"}]});
        assert_eq!(parse_tags(Some(&obj)), vec!["x", "y"]);
    }

    // One case per snippet __type; unknown and missing types must map to
    // Node::Unknown rather than failing.
    #[test]
    fn parse_node_covers_known_and_unknown_types() {
        let heading = json!({"__type":"textSnippet","string":"# Title"});
        assert!(matches!(parse_node(&heading), Node::Heading { .. }));

        let quote = json!({"__type":"blockQuoteSnippet","string":"quoted"});
        assert!(matches!(parse_node(&quote), Node::Quote { .. }));

        let code = json!({"__type":"pythonSnippet","code":"print(1)"});
        assert!(matches!(parse_node(&code), Node::Code { .. }));

        let link = json!({"__type":"pharoLinkSnippet","string":"link","url":"page:abc"});
        assert!(matches!(parse_node(&link), Node::Link { .. }));

        // Pictures and videos degrade to plain links.
        let picture = json!({"__type":"pictureSnippet","url":"attachments/x.png","caption":"img"});
        assert!(matches!(parse_node(&picture), Node::Link { .. }));

        let youtube = json!({"__type":"youtubeSnippet","youtubeUrl":"https://youtu.be/abc"});
        assert!(matches!(parse_node(&youtube), Node::Link { .. }));

        let element = json!({"__type":"elementSnippet","code":"GtInspector newOn: 42"});
        assert!(matches!(parse_node(&element), Node::Code { .. }));

        let rewrite =
            json!({"__type":"pharoRewrite","search":"a","replace":"b","isMethodPattern":true});
        assert!(matches!(parse_node(&rewrite), Node::Rewrite { .. }));

        let word = json!({"__type":"wordSnippet","wordString":"refactoring"});
        assert!(matches!(parse_node(&word), Node::Paragraph { .. }));

        let list = json!({
            "__type":"listSnippet",
            "children":{"items":[{"__type":"textSnippet","string":"item"}]}
        });
        assert!(matches!(parse_node(&list), Node::List { .. }));

        let unknown = json!({"__type":"mysterySnippet","x":1});
        assert!(matches!(parse_node(&unknown), Node::Unknown { .. }));

        let missing = json!({"x":1});
        assert!(matches!(parse_node(&missing), Node::Unknown { .. }));
    }

    // The "<lang>Snippet" suffix convention maps to a lowercase language id,
    // including types not in the explicit mapping (customSnippet -> custom).
    #[test]
    fn infer_language_maps_common_snippet_types() {
        assert_eq!(
            infer_language(Some("pharoSnippet")),
            Some("pharo".to_string())
        );
        assert_eq!(
            infer_language(Some("javascriptSnippet")),
            Some("javascript".to_string())
        );
        assert_eq!(
            infer_language(Some("yamlSnippet")),
            Some("yaml".to_string())
        );
        assert_eq!(
            infer_language(Some("customSnippet")),
            Some("custom".to_string())
        );
        assert_eq!(infer_language(None), None);
    }

    // Rendering: rewrites emit diff-style +/- lines; unknown nodes keep a
    // visible placeholder instead of vanishing.
    #[test]
    fn render_nodes_outputs_unknown_placeholder() {
        let text = render_nodes_to_text(&[
            Node::Paragraph {
                text: "para".to_string(),
            },
            Node::Rewrite {
                language: Some("pharo".to_string()),
                search: "a".to_string(),
                replace: "b".to_string(),
                scope: None,
                is_method_pattern: Some(true),
            },
            Node::Unknown {
                typ: "weird".to_string(),
                raw: json!({"a":1}),
            },
        ]);
        assert!(text.contains("para"));
        assert!(text.contains("```diff pharo"));
        assert!(text.contains("-a"));
        assert!(text.contains("+b"));
        assert!(text.contains("[[unknown: weird]]"));
    }

    // Counts must include nested objects (the page itself, its children
    // container items) discovered anywhere in the JSON tree.
    #[test]
    fn collect_node_types_counts_nested_values() -> Result<()> {
        let path = temp_file_path("types");
        let content = json!({
            "__type":"page",
            "children":{"__type":"snippets","items":[
                {"__type":"textSnippet","children":{"__type":"snippets","items":[]}},
                {"__type":"pythonSnippet","code":"print(1)"}
            ]}
        });
        fs::write(&path, serde_json::to_vec(&content)?)?;
        let counts = collect_node_types_in_file(&path)?;
        fs::remove_file(&path)?;

        assert_eq!(counts.get("page"), Some(&1));
        assert_eq!(counts.get("textSnippet"), Some(&1));
        assert_eq!(counts.get("pythonSnippet"), Some(&1));
        Ok(())
    }

    // Metadata extraction: uid/pageType/editTime/tags map onto PageMeta.
    #[test]
    fn parse_page_meta_extracts_core_fields() -> Result<()> {
        let path = temp_file_path("meta");
        let content = json!({
            "uid":{"uuid":"id-123"},
            "pageType":{"title":"Title"},
            "editTime":{"time":{"dateAndTimeString":"2024-01-01T00:00:00+00:00"}},
            "tags":["t1","t2"]
        });
        fs::write(&path, serde_json::to_vec(&content)?)?;
        let meta = parse_page_meta(&path)?;
        fs::remove_file(&path)?;

        assert_eq!(meta.id, "id-123");
        assert_eq!(meta.title, "Title");
        assert_eq!(meta.tags, vec!["t1", "t2"]);
        assert!(meta.updated_at.is_some());
        Ok(())
    }

    // Parent and child snippets are flattened into one output list.
    #[test]
    fn parse_item_recursive_includes_children() {
        let root = json!({
            "__type":"textSnippet",
            "string":"parent",
            "children":{"items":[
                {"__type":"textSnippet","string":"child"}
            ]}
        });
        let mut out = Vec::new();
        parse_item_recursive(&root, &mut out);
        assert_eq!(out.len(), 2);
    }

    // Filtering matches case-insensitively on title, id, and tags; an empty
    // query returns every page id.
    #[test]
    fn filter_page_ids_matches_title_id_and_tags() {
        let mut pages = HashMap::new();
        pages.insert(
            "id-1".to_string(),
            PageMeta {
                id: "id-1".to_string(),
                title: "Alpha".to_string(),
                path: PathBuf::from("/tmp/a"),
                updated_at: None,
                tags: vec!["rust".to_string()],
            },
        );
        pages.insert(
            "id-2".to_string(),
            PageMeta {
                id: "id-2".to_string(),
                title: "Beta".to_string(),
                path: PathBuf::from("/tmp/b"),
                updated_at: None,
                tags: vec!["pharo".to_string()],
            },
        );
        let index = KnowledgeBaseIndex {
            root: PathBuf::from("/tmp"),
            pages,
            index_issues: Vec::new(),
        };

        assert_eq!(index.filter_page_ids("alpha"), vec!["id-1".to_string()]);
        assert_eq!(index.filter_page_ids("id-2"), vec!["id-2".to_string()]);
        assert_eq!(index.filter_page_ids("pharo"), vec!["id-2".to_string()]);
        assert_eq!(
            index.filter_page_ids(""),
            vec!["id-1".to_string(), "id-2".to_string()]
        );
    }

    // Title resolution: exact/unique match, ambiguous prefix, and no match.
    #[test]
    fn resolve_page_id_by_title_handles_unique_ambiguous_and_missing() {
        let mut pages = HashMap::new();
        pages.insert(
            "id-1".to_string(),
            PageMeta {
                id: "id-1".to_string(),
                title: "Alpha".to_string(),
                path: PathBuf::from("/tmp/a"),
                updated_at: None,
                tags: Vec::new(),
            },
        );
        pages.insert(
            "id-2".to_string(),
            PageMeta {
                id: "id-2".to_string(),
                title: "Alphabet".to_string(),
                path: PathBuf::from("/tmp/b"),
                updated_at: None,
                tags: Vec::new(),
            },
        );
        let index = KnowledgeBaseIndex {
            root: PathBuf::from("/tmp"),
            pages,
            index_issues: Vec::new(),
        };

        assert_eq!(
            index.resolve_page_id_by_title("Alpha"),
            TitleResolution::Unique("id-1".to_string())
        );
        // "alp" prefixes both Alpha and Alphabet.
        assert!(matches!(
            index.resolve_page_id_by_title("alp"),
            TitleResolution::Ambiguous(_)
        ));
        assert_eq!(
            index.resolve_page_id_by_title("zzz"),
            TitleResolution::NotFound
        );
    }

    // Link classification: UUID (bare, title:, or embedded in text) ->
    // internal page; attachments/ path -> attachment; http(s) -> external;
    // anything else -> unknown.
    #[test]
    fn classify_link_target_covers_internal_attachment_external_unknown() {
        let mut pages = HashMap::new();
        pages.insert(
            "8a505fa0-2222-3333-4444-555555555555".to_string(),
            PageMeta {
                id: "8a505fa0-2222-3333-4444-555555555555".to_string(),
                title: "Alpha".to_string(),
                path: PathBuf::from("/tmp/a"),
                updated_at: None,
                tags: Vec::new(),
            },
        );
        let index = KnowledgeBaseIndex {
            root: PathBuf::from("/kb"),
            pages,
            index_issues: Vec::new(),
        };

        assert!(matches!(
            index.classify_link_target("8a505fa0-2222-3333-4444-555555555555"),
            LinkTargetKind::InternalPage(_)
        ));
        assert!(matches!(
            index.classify_link_target("title:alpha"),
            LinkTargetKind::InternalPage(_)
        ));
        assert!(matches!(
            index.classify_link_target("go to 8a505fa0-2222-3333-4444-555555555555 now"),
            LinkTargetKind::InternalPage(_)
        ));
        assert!(matches!(
            index.classify_link_target("attachments/image.png"),
            LinkTargetKind::AttachmentPath(_)
        ));
        assert!(matches!(
            index.classify_link_target("https://example.com"),
            LinkTargetKind::ExternalUrl(_)
        ));
        assert!(matches!(
            index.classify_link_target("not a thing"),
            LinkTargetKind::Unknown(_)
        ));
    }

    // Word snippets surface both the word and its explanation attachment
    // reference in the rendered paragraph text.
    #[test]
    fn parse_word_node_extracts_primary_fields() {
        let item = json!({
            "__type":"wordSnippet",
            "wordString":"refactoring",
            "explanationAttachmentNameString":"attachments/x/explanation.json"
        });
        let node = parse_node(&item);
        match node {
            Node::Paragraph { text } => {
                assert!(text.contains("refactoring"));
                assert!(text.contains("attachments/x/explanation.json"));
            }
            other => panic!("expected paragraph, got {other:?}"),
        }
    }
}