use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::Path;
use chrono::Utc;
use super::storage::KnowledgeStore;
use super::types::*;
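/// Produces a 32-character hex digest from two 64-bit `DefaultHasher`
/// passes, one of them salted, to reduce collision risk beyond a single
/// 64-bit hash. Note that `DefaultHasher`'s algorithm is not guaranteed
/// stable across Rust releases, so these hashes are best treated as
/// build-local identifiers rather than portable fingerprints.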
fn content_hash(content: &str) -> String {
use std::collections::hash_map::DefaultHasher;
let mut h1 = DefaultHasher::new();
content.hash(&mut h1);
let v1 = h1.finish();
let mut h2 = DefaultHasher::new();
"salt".hash(&mut h2);
content.hash(&mut h2);
let v2 = h2.finish();
format!("{:016x}{:016x}", v1, v2)
}
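/// A source-format parser that turns input files into memory candidates.
/// Implementations may accept a single file or a whole directory tree;
/// `format_name` labels the format in import reports.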
pub trait Importer: Send + Sync {
fn parse(&self, path: &Path) -> Result<Vec<MemoryCandidate>, KcError>;
fn format_name(&self) -> &'static str;
}
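/// Imports plain Markdown, splitting each document into candidates
/// according to the configured [`SplitStrategy`]. Directories are walked
/// recursively and only `.md` files are picked up.
///
/// A minimal usage sketch (the path is hypothetical):
///
/// ```ignore
/// let importer = MarkdownImporter { split: SplitStrategy::ByHeading };
/// let candidates = importer.parse(Path::new("notes/design.md"))?;
/// ```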
pub struct MarkdownImporter {
pub split: SplitStrategy,
}
impl MarkdownImporter {
fn collect_md_files(dir: &Path) -> Result<Vec<std::path::PathBuf>, KcError> {
let mut files = Vec::new();
Self::walk_dir(dir, &mut files)?;
files.sort();
Ok(files)
}
fn walk_dir(dir: &Path, files: &mut Vec<std::path::PathBuf>) -> Result<(), KcError> {
let entries = std::fs::read_dir(dir)
.map_err(|e| KcError::ImportError(format!("read dir {}: {}", dir.display(), e)))?;
for entry in entries {
let entry = entry
.map_err(|e| KcError::ImportError(format!("dir entry: {}", e)))?;
let path = entry.path();
if path.is_dir() {
Self::walk_dir(&path, files)?;
} else if path.extension().and_then(|e| e.to_str()) == Some("md") {
files.push(path);
}
}
Ok(())
}
fn parse_file(&self, path: &Path) -> Result<Vec<MemoryCandidate>, KcError> {
let text = std::fs::read_to_string(path)
.map_err(|e| KcError::ImportError(format!("read {}: {}", path.display(), e)))?;
let source = path.display().to_string();
let sections = self.split_text(&text);
Ok(sections
.into_iter()
.filter(|s| !s.trim().is_empty())
.map(|content| {
let hash = content_hash(&content);
let mut metadata = HashMap::new();
metadata.insert("source_format".to_owned(), "markdown".to_owned());
metadata.insert("source_file".to_owned(), source.clone());
MemoryCandidate {
content,
source: source.clone(),
content_hash: hash,
metadata,
}
})
.collect())
}
fn split_text(&self, text: &str) -> Vec<String> {
match &self.split {
SplitStrategy::ByHeading => Self::split_by_heading(text),
SplitStrategy::ByParagraph => Self::split_by_paragraph(text),
SplitStrategy::ByTokenCount(n) => Self::split_by_token_count(text, *n),
            // Smart currently falls back to heading-based splitting.
            SplitStrategy::Smart => Self::split_by_heading(text),
}
}
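    /// Splits on level-2 (`## `) headings only; a leading `# ` title and
    /// any preamble stay attached to the first section.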
fn split_by_heading(text: &str) -> Vec<String> {
let mut sections = Vec::new();
let mut current = String::new();
for line in text.lines() {
if line.starts_with("## ") {
if !current.trim().is_empty() {
sections.push(current.trim().to_owned());
}
current = String::new();
}
current.push_str(line);
current.push('\n');
}
if !current.trim().is_empty() {
sections.push(current.trim().to_owned());
}
sections
}
fn split_by_paragraph(text: &str) -> Vec<String> {
text.split("\n\n")
.map(|s| s.trim().to_owned())
.filter(|s| !s.is_empty())
.collect()
}
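    /// Approximates token counts with the common ~4-characters-per-token
    /// heuristic. A single line longer than the limit is still emitted
    /// whole, so sections can overshoot the target.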
fn split_by_token_count(text: &str, n: usize) -> Vec<String> {
let char_limit = n * 4;
let mut sections = Vec::new();
let mut current = String::new();
for line in text.lines() {
if !current.is_empty() && current.len() + line.len() + 1 > char_limit {
sections.push(current.trim().to_owned());
current = String::new();
}
if !current.is_empty() {
current.push('\n');
}
current.push_str(line);
}
if !current.trim().is_empty() {
sections.push(current.trim().to_owned());
}
sections
}
}
impl Importer for MarkdownImporter {
fn parse(&self, path: &Path) -> Result<Vec<MemoryCandidate>, KcError> {
if path.is_file() {
self.parse_file(path)
} else if path.is_dir() {
let files = Self::collect_md_files(path)?;
let mut candidates = Vec::new();
for file in files {
candidates.extend(self.parse_file(&file)?);
}
Ok(candidates)
} else {
Err(KcError::ImportError(format!(
"path does not exist: {}",
path.display()
)))
}
}
fn format_name(&self) -> &'static str {
"markdown"
}
}
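/// Imports Obsidian vault notes: parses simple YAML frontmatter into
/// metadata, rewrites `[[wikilinks]]` to plain text, and splits the body
/// by headings the same way [`MarkdownImporter`] does.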
pub struct ObsidianImporter;
impl ObsidianImporter {
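    /// Parses a leading `---`-delimited frontmatter block into flat
    /// `key: value` pairs, returning the metadata and the remaining body.
    /// Values are kept as raw strings; nested YAML and multi-line values
    /// are not handled. Without a closing `---`, the whole input is
    /// treated as body.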
fn parse_frontmatter(text: &str) -> (HashMap<String, String>, String) {
let mut metadata = HashMap::new();
let trimmed = text.trim_start();
if !trimmed.starts_with("---") {
return (metadata, text.to_owned());
}
let after_first = &trimmed[3..];
if let Some(end_idx) = after_first.find("\n---") {
let frontmatter_block = &after_first[..end_idx];
            let body_start = 3 + end_idx + 4; // skip the opening "---" plus the "\n---" delimiter
            let body = trimmed[body_start..].trim_start_matches('\n').to_owned();
for line in frontmatter_block.lines() {
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
if let Some(colon_pos) = line.find(':') {
let key = line[..colon_pos].trim().to_owned();
let value = line[colon_pos + 1..].trim().to_owned();
if !key.is_empty() {
metadata.insert(key, value);
}
}
}
(metadata, body)
} else {
(metadata, text.to_owned())
}
}
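    /// Rewrites Obsidian wikilinks to plain text: `[[Target]]` becomes
    /// `Target` and `[[Target|shown]]` becomes `shown`. An unclosed `[[`
    /// is emitted unchanged.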
fn convert_wikilinks(text: &str) -> String {
let mut result = String::with_capacity(text.len());
let mut chars = text.chars().peekable();
while let Some(ch) = chars.next() {
if ch == '[' {
if chars.peek() == Some(&'[') {
                    chars.next(); // consume the second '['
                    let mut link_content = String::new();
let mut found_close = false;
while let Some(c) = chars.next() {
if c == ']' {
if chars.peek() == Some(&']') {
                                chars.next(); // consume the second ']'
                                found_close = true;
break;
} else {
link_content.push(c);
}
} else {
link_content.push(c);
}
}
if found_close {
if let Some(pipe_pos) = link_content.find('|') {
result.push_str(&link_content[pipe_pos + 1..]);
} else {
result.push_str(&link_content);
}
} else {
result.push_str("[[");
result.push_str(&link_content);
}
} else {
result.push(ch);
}
} else {
result.push(ch);
}
}
result
}
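    /// Extracts tags from an inline frontmatter value such as
    /// `[rust, "programming"]` or a bare comma-separated list. YAML
    /// block-style lists (one `- tag` per line) are not recognized.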
fn extract_tags(tags_value: &str) -> Vec<String> {
let cleaned = tags_value
.trim()
.trim_start_matches('[')
.trim_end_matches(']');
cleaned
.split(',')
.map(|t| t.trim().trim_matches('"').trim_matches('\'').to_owned())
.filter(|t| !t.is_empty())
.collect()
}
}
impl Importer for ObsidianImporter {
fn parse(&self, path: &Path) -> Result<Vec<MemoryCandidate>, KcError> {
let files = if path.is_file() {
vec![path.to_path_buf()]
} else if path.is_dir() {
MarkdownImporter::collect_md_files(path)?
} else {
return Err(KcError::ImportError(format!(
"path does not exist: {}",
path.display()
)));
};
let mut candidates = Vec::new();
for file in files {
let text = std::fs::read_to_string(&file)
.map_err(|e| KcError::ImportError(format!("read {}: {}", file.display(), e)))?;
let source = file.display().to_string();
let (frontmatter, body) = Self::parse_frontmatter(&text);
let body = Self::convert_wikilinks(&body);
let mut metadata = HashMap::new();
metadata.insert("source_format".to_owned(), "obsidian".to_owned());
metadata.insert("source_file".to_owned(), source.clone());
for (k, v) in &frontmatter {
metadata.insert(k.clone(), v.clone());
}
if let Some(tags_value) = frontmatter.get("tags") {
let tags = Self::extract_tags(tags_value);
if !tags.is_empty() {
metadata.insert("extracted_tags".to_owned(), tags.join(", "));
}
}
let sections = MarkdownImporter::split_by_heading(&body);
for section in sections {
if section.trim().is_empty() {
continue;
}
                let hash = content_hash(&section);
candidates.push(MemoryCandidate {
content: section,
source: source.clone(),
content_hash: hash,
metadata: metadata.clone(),
});
}
}
Ok(candidates)
}
fn format_name(&self) -> &'static str {
"obsidian"
}
}
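/// Imports a JSON array of objects shaped like
/// `{"content": "...", "source": "...", "metadata": {...}}`. Only the
/// string `content` field is required; non-string metadata values are
/// stringified from their JSON representation.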
pub struct JsonImporter;
impl Importer for JsonImporter {
fn parse(&self, path: &Path) -> Result<Vec<MemoryCandidate>, KcError> {
let text = std::fs::read_to_string(path)
.map_err(|e| KcError::ImportError(format!("read {}: {}", path.display(), e)))?;
let items: Vec<serde_json::Value> = serde_json::from_str(&text)
.map_err(|e| KcError::ImportError(format!("parse JSON: {}", e)))?;
let mut candidates = Vec::new();
for item in items {
let content = item
.get("content")
.and_then(|v| v.as_str())
.ok_or_else(|| {
KcError::ImportError("JSON object missing 'content' field".to_owned())
})?
.to_owned();
let source = item
.get("source")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_owned();
let hash = content_hash(&content);
let mut metadata = HashMap::new();
metadata.insert("source_format".to_owned(), "json".to_owned());
if let Some(source_str) = item.get("source").and_then(|v| v.as_str()) {
metadata.insert("source_file".to_owned(), source_str.to_owned());
}
if let Some(meta_obj) = item.get("metadata").and_then(|v| v.as_object()) {
for (k, v) in meta_obj {
metadata.insert(
k.clone(),
v.as_str()
.map(|s| s.to_owned())
.unwrap_or_else(|| v.to_string()),
);
}
}
candidates.push(MemoryCandidate {
content,
source,
content_hash: hash,
metadata,
});
}
Ok(candidates)
}
fn format_name(&self) -> &'static str {
"json"
}
}
pub struct ImportPipeline;
impl ImportPipeline {
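    /// Parses `path` with the given importer and writes the resulting
    /// candidates into the store, matching duplicates by content hash
    /// against existing pages. `Skip` ignores duplicates, `Replace`
    /// overwrites the existing page and bumps its version, `Append`
    /// always attempts a fresh page (duplicate ids surface as errors in
    /// the report), and `Ask` currently behaves like `Skip` because the
    /// pipeline runs non-interactively.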
pub fn run(
store: &dyn KnowledgeStore,
importer: &dyn Importer,
path: &Path,
config: &ImportConfig,
) -> Result<ImportReport, KcError> {
let start = std::time::Instant::now();
let candidates = importer.parse(path)?;
let total_processed = candidates.len();
let existing_pages = store.list_topic_pages()?;
let mut imported = 0usize;
let mut skipped = 0usize;
let mut errors: Vec<String> = Vec::new();
for candidate in &candidates {
let existing = existing_pages
.iter()
.find(|p| {
p.metadata
.source_memory_ids
.contains(&candidate.content_hash)
});
match config.duplicate_strategy {
DuplicateStrategy::Skip => {
if existing.is_some() {
skipped += 1;
continue;
}
match Self::create_page(store, candidate) {
Ok(()) => imported += 1,
Err(e) => errors.push(format!("{}", e)),
}
}
DuplicateStrategy::Replace => {
if let Some(existing_page) = existing {
let mut updated = existing_page.clone();
updated.content = candidate.content.clone();
updated.metadata.updated_at = Utc::now();
updated.version += 1;
match store.update_topic_page(&updated) {
Ok(()) => imported += 1,
Err(e) => errors.push(format!("{}", e)),
}
} else {
match Self::create_page(store, candidate) {
Ok(()) => imported += 1,
Err(e) => errors.push(format!("{}", e)),
}
}
}
DuplicateStrategy::Append => {
match Self::create_page(store, candidate) {
Ok(()) => imported += 1,
Err(e) => errors.push(format!("{}", e)),
}
}
                // No interactive prompt exists in this pipeline, so Ask
                // currently behaves like Skip.
                DuplicateStrategy::Ask => {
if existing.is_some() {
skipped += 1;
continue;
}
match Self::create_page(store, candidate) {
Ok(()) => imported += 1,
Err(e) => errors.push(format!("{}", e)),
}
}
}
}
let duration_ms = start.elapsed().as_millis() as u64;
Ok(ImportReport {
total_processed,
imported,
skipped,
errors,
duration_ms,
})
}
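    /// Builds a new page whose id is derived from the candidate's content
    /// hash, so re-importing identical content maps to the same page id.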
fn create_page(
store: &dyn KnowledgeStore,
candidate: &MemoryCandidate,
) -> Result<(), KcError> {
let now = Utc::now();
let id = format!("import-{}", candidate.content_hash);
let title = Self::derive_title(&candidate.content);
let page = TopicPage {
id: TopicId(id),
title,
content: candidate.content.clone(),
sections: Vec::new(),
summary: String::new(),
status: TopicStatus::Active,
version: 1,
metadata: TopicMetadata {
created_at: now,
updated_at: now,
compilation_count: 0,
source_memory_ids: vec![candidate.content_hash.clone()],
tags: Vec::new(),
quality_score: None,
},
};
store.create_topic_page(&page)
}
    /// Derives a page title from the first line of the content, stripping
    /// leading `#` markers and capping the result at 100 characters.
    fn derive_title(content: &str) -> String {
        let first_line = content.lines().next().unwrap_or("Imported");
        let title = first_line.trim().trim_start_matches('#').trim();
        if title.is_empty() {
            "Imported".to_owned()
        } else if title.chars().count() > 100 {
            // Count and truncate by characters, not bytes, so multi-byte
            // UTF-8 titles cannot panic from slicing mid-character.
            let truncated: String = title.chars().take(97).collect();
            format!("{}...", truncated)
        } else {
            title.to_owned()
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::compiler::storage::SqliteKnowledgeStore;
use std::io::Write;
use tempfile::TempDir;
fn write_temp_file(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
let path = dir.path().join(name);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).unwrap();
}
let mut f = std::fs::File::create(&path).unwrap();
f.write_all(content.as_bytes()).unwrap();
path
}
fn make_store() -> SqliteKnowledgeStore {
let store = SqliteKnowledgeStore::in_memory().unwrap();
store.init_schema().unwrap();
store
}
fn default_import_config(dup: DuplicateStrategy) -> ImportConfig {
ImportConfig {
default_policy: ImportPolicy::Skip,
split_strategy: SplitStrategy::ByHeading,
duplicate_strategy: dup,
max_document_size_bytes: 10_000_000,
}
}
#[test]
fn test_markdown_importer_single_file() {
let dir = TempDir::new().unwrap();
let content = "# Title\n\nIntro paragraph.\n\n## Section One\n\nContent of section one.\n\n## Section Two\n\nContent of section two.\n";
let path = write_temp_file(&dir, "test.md", content);
let importer = MarkdownImporter {
split: SplitStrategy::ByHeading,
};
let candidates = importer.parse(&path).unwrap();
assert_eq!(candidates.len(), 3);
assert!(candidates[0].content.contains("Title"));
assert!(candidates[0].content.contains("Intro paragraph"));
assert!(candidates[1].content.contains("Section One"));
assert!(candidates[1].content.contains("Content of section one"));
assert!(candidates[2].content.contains("Section Two"));
assert!(candidates[2].content.contains("Content of section two"));
for c in &candidates {
assert_eq!(c.metadata.get("source_format").unwrap(), "markdown");
assert!(c.metadata.get("source_file").unwrap().contains("test.md"));
assert!(!c.content_hash.is_empty());
}
}
#[test]
fn test_markdown_importer_by_paragraph() {
let dir = TempDir::new().unwrap();
let content = "First paragraph with some text.\n\nSecond paragraph here.\n\nThird paragraph too.\n";
let path = write_temp_file(&dir, "para.md", content);
let importer = MarkdownImporter {
split: SplitStrategy::ByParagraph,
};
let candidates = importer.parse(&path).unwrap();
assert_eq!(candidates.len(), 3);
assert!(candidates[0].content.contains("First paragraph"));
assert!(candidates[1].content.contains("Second paragraph"));
assert!(candidates[2].content.contains("Third paragraph"));
let hashes: Vec<&str> = candidates.iter().map(|c| c.content_hash.as_str()).collect();
assert_ne!(hashes[0], hashes[1]);
assert_ne!(hashes[1], hashes[2]);
}
#[test]
fn test_markdown_importer_directory() {
let dir = TempDir::new().unwrap();
write_temp_file(&dir, "a.md", "# Doc A\n\nContent A.\n");
write_temp_file(&dir, "subdir/b.md", "# Doc B\n\nContent B.\n");
write_temp_file(&dir, "not_md.txt", "Should be ignored.\n");
let importer = MarkdownImporter {
split: SplitStrategy::Smart,
};
let candidates = importer.parse(dir.path()).unwrap();
assert_eq!(candidates.len(), 2);
}
#[test]
fn test_markdown_importer_by_token_count() {
let dir = TempDir::new().unwrap();
let content = "Line one is here now.\nLine two is here too.\nLine three is great.\nLine four wow cool.\n";
let path = write_temp_file(&dir, "tokens.md", content);
let importer = MarkdownImporter {
split: SplitStrategy::ByTokenCount(10),
};
let candidates = importer.parse(&path).unwrap();
assert!(candidates.len() >= 2, "got {} candidates", candidates.len());
}
#[test]
fn test_obsidian_frontmatter() {
let dir = TempDir::new().unwrap();
let content = r#"---
title: My Note
tags: [rust, programming]
date: 2024-01-15
---
# My Note
This references [[Other Note]] and [[Display|shown text]].
## Details
More content about [[Topics]].
"#;
let path = write_temp_file(&dir, "note.md", content);
let importer = ObsidianImporter;
let candidates = importer.parse(&path).unwrap();
assert!(candidates.len() >= 2, "got {} candidates", candidates.len());
let meta = &candidates[0].metadata;
assert_eq!(meta.get("source_format").unwrap(), "obsidian");
assert_eq!(meta.get("title").unwrap(), "My Note");
assert!(meta.contains_key("tags"));
assert!(meta.get("extracted_tags").unwrap().contains("rust"));
assert!(meta.get("extracted_tags").unwrap().contains("programming"));
assert!(
candidates[0].content.contains("Other Note"),
"content: {}",
candidates[0].content
);
assert!(
!candidates[0].content.contains("[["),
"should not contain [["
);
assert!(
candidates[0].content.contains("shown text"),
"content: {}",
candidates[0].content
);
assert!(
!candidates[0].content.contains("Display|"),
"should not contain pipe syntax"
);
}
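    // A direct sketch of the hashing and wikilink helpers; assumes
    // DefaultHasher is deterministic within a single build.
    #[test]
    fn test_content_hash_and_wikilinks() {
        assert_eq!(content_hash("same input"), content_hash("same input"));
        assert_ne!(content_hash("one"), content_hash("two"));
        assert_eq!(
            ObsidianImporter::convert_wikilinks("see [[A|B]] and [[C]]"),
            "see B and C"
        );
        assert_eq!(
            ObsidianImporter::convert_wikilinks("unclosed [[link"),
            "unclosed [[link"
        );
    }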
#[test]
fn test_obsidian_no_frontmatter() {
let dir = TempDir::new().unwrap();
let content = "# Plain Note\n\nNo frontmatter here.\n";
let path = write_temp_file(&dir, "plain.md", content);
let importer = ObsidianImporter;
let candidates = importer.parse(&path).unwrap();
assert_eq!(candidates.len(), 1);
assert!(candidates[0].content.contains("Plain Note"));
assert_eq!(
candidates[0].metadata.get("source_format").unwrap(),
"obsidian"
);
}
#[test]
fn test_json_importer() {
let dir = TempDir::new().unwrap();
let json = r#"[
{"content": "First memory", "source": "test.json"},
{"content": "Second memory", "source": "test.json", "metadata": {"topic": "rust"}},
{"content": "Third memory"}
]"#;
let path = write_temp_file(&dir, "data.json", json);
let importer = JsonImporter;
let candidates = importer.parse(&path).unwrap();
assert_eq!(candidates.len(), 3);
assert_eq!(candidates[0].content, "First memory");
assert_eq!(candidates[0].source, "test.json");
assert_eq!(candidates[1].content, "Second memory");
assert_eq!(
candidates[1].metadata.get("topic").unwrap(),
"rust"
);
assert_eq!(candidates[2].content, "Third memory");
assert_eq!(candidates[2].source, "");
for c in &candidates {
assert!(!c.content_hash.is_empty());
assert_eq!(c.metadata.get("source_format").unwrap(), "json");
}
assert_ne!(candidates[0].content_hash, candidates[1].content_hash);
}
#[test]
fn test_json_importer_missing_content() {
let dir = TempDir::new().unwrap();
let json = r#"[{"source": "no content field"}]"#;
let path = write_temp_file(&dir, "bad.json", json);
let importer = JsonImporter;
let result = importer.parse(&path);
assert!(result.is_err());
}
#[test]
fn test_import_pipeline_skip_duplicates() {
let store = make_store();
let dir = TempDir::new().unwrap();
let content = "# Topic A\n\nSome unique content.\n\n## Section B\n\nMore content here.\n";
let path = write_temp_file(&dir, "doc.md", content);
let importer = MarkdownImporter {
split: SplitStrategy::ByHeading,
};
let config = default_import_config(DuplicateStrategy::Skip);
let report1 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report1.total_processed, 2);
assert_eq!(report1.imported, 2);
assert_eq!(report1.skipped, 0);
let report2 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report2.total_processed, 2);
assert_eq!(report2.imported, 0);
assert_eq!(report2.skipped, 2);
let pages = store.list_topic_pages().unwrap();
assert_eq!(pages.len(), 2);
}
#[test]
fn test_import_pipeline_replace() {
let store = make_store();
let dir = TempDir::new().unwrap();
let content1 = "# Topic\n\nOriginal content.\n";
let path = write_temp_file(&dir, "doc.md", content1);
let importer = MarkdownImporter {
split: SplitStrategy::ByHeading,
};
let config = default_import_config(DuplicateStrategy::Replace);
let report1 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report1.imported, 1);
let report2 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report2.total_processed, 1);
assert_eq!(report2.imported, 1);
assert_eq!(report2.skipped, 0);
let pages = store.list_topic_pages().unwrap();
assert_eq!(pages.len(), 1);
assert_eq!(pages[0].version, 2);
}
#[test]
fn test_import_pipeline_append() {
let store = make_store();
let dir = TempDir::new().unwrap();
let content = "# Topic\n\nSome content.\n";
let path = write_temp_file(&dir, "doc.md", content);
let importer = MarkdownImporter {
split: SplitStrategy::ByHeading,
};
let config = default_import_config(DuplicateStrategy::Append);
let report1 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report1.imported, 1);
let report2 = ImportPipeline::run(&store, &importer, &path, &config).unwrap();
assert_eq!(report2.total_processed, 1);
assert_eq!(report2.errors.len(), 1);
}
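    // Guards the UTF-8-safe truncation in derive_title: truncating a long
    // multi-byte title must count characters, not bytes, or slicing panics.
    #[test]
    fn test_derive_title_multibyte() {
        let long_title = "é".repeat(150);
        let title = ImportPipeline::derive_title(&long_title);
        assert!(title.ends_with("..."));
        assert_eq!(title.chars().count(), 100);
    }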
}