use std::collections::BTreeMap;
use std::path::Path;
use anyhow::{Context, Result, anyhow};
use serde::{Deserialize, Serialize};
use crate::project::ProjectLayout;
/// Kind label (`"research_source"`) associated with imported research sources.
///
/// NOTE(review): the consumers of this label are outside this file; presumably
/// it tags documents created from an import — confirm against the callers.
pub(super) const SOURCE_KIND: &str = "research_source";
/// One entry of the import ledger: the bookkeeping kept for a single imported
/// source file.
///
/// Fields marked `#[serde(default)]` may be absent from the serialized form and
/// then deserialize to their type's default value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(super) struct ImportedSource {
// Name of the source. Presumably the slug produced by `source_name` — TODO confirm.
pub name: String,
// Path of the source, stored as a string. NOTE(review): whether this is
// absolute or project-relative is not visible here.
pub path: String,
// Identifiers of the documents associated with this source; empty when absent.
#[serde(default)]
pub doc_ids: Vec<String>,
// Thread associated with this source; empty string when absent.
// NOTE(review): the meaning of "thread" is defined outside this file.
#[serde(default)]
pub thread: String,
// Import timestamp kept as a string. NOTE(review): format not visible here.
pub imported_at: String,
// Chunk count; 0 when absent. Presumably the number of chunks `chunk_text`
// produced for this source — TODO confirm.
#[serde(default)]
pub chunks: usize,
}
/// The import ledger: every recorded [`ImportedSource`], persisted as JSON in
/// `.inkhaven/research-sources.json` under the project root.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub(super) struct Imports {
// Ledger entries in sorted key order (`BTreeMap`); an absent field
// deserializes to an empty map. Presumably keyed by source name — TODO confirm.
#[serde(default)]
pub sources: BTreeMap<String, ImportedSource>,
}
impl Imports {
fn path(layout: &ProjectLayout) -> std::path::PathBuf {
layout.root.join(".inkhaven").join("research-sources.json")
}
pub(super) fn load(layout: &ProjectLayout) -> Imports {
match std::fs::read_to_string(Imports::path(layout)) {
Ok(raw) => serde_json::from_str(&raw).unwrap_or_default(),
Err(_) => Imports::default(),
}
}
pub(super) fn save(&self, layout: &ProjectLayout) -> Result<()> {
let dir = layout.root.join(".inkhaven");
std::fs::create_dir_all(&dir).with_context(|| format!("create {}", dir.display()))?;
let json = serde_json::to_string_pretty(self).context("serialise imports")?;
crate::io_atomic::write(&Imports::path(layout), json.as_bytes())
.context("write research-sources.json")?;
Ok(())
}
}
/// Derives a slug-style name for a source from its file name.
///
/// The file stem (file name without its extension) is passed through
/// `slug::slugify`. The literal `"source"` is used in place of the stem when
/// the path has no stem or the stem is not valid UTF-8, and is returned
/// outright when slugifying yields an empty string.
pub(super) fn source_name(path: &Path) -> String {
    const FALLBACK: &str = "source";

    let raw_stem = match path.file_stem().and_then(|os| os.to_str()) {
        Some(text) => text,
        None => FALLBACK,
    };

    let slugged = slug::slugify(raw_stem);
    if slugged.is_empty() {
        return FALLBACK.to_string();
    }
    slugged
}
/// Loads the textual content of a source file, choosing the reader by file
/// extension (compared case-insensitively).
///
/// * `md`, `markdown`, `txt`, `text`, or no extension at all: the file is read
///   as a UTF-8 string.
/// * `pdf`: text is extracted with `pdf_extract::extract_text`.
///
/// # Errors
///
/// Fails when the file cannot be read, when PDF extraction fails, or when the
/// extension is none of the supported ones.
pub(super) fn read_source(path: &Path) -> Result<String> {
    let ext = match path.extension().and_then(|e| e.to_str()) {
        Some(raw) => raw.to_ascii_lowercase(),
        // A missing or non-UTF-8 extension is handled like plain text below.
        None => String::new(),
    };

    if matches!(ext.as_str(), "md" | "markdown" | "txt" | "text" | "") {
        return std::fs::read_to_string(path).with_context(|| format!("read {}", path.display()));
    }

    if ext == "pdf" {
        return pdf_extract::extract_text(path)
            .map_err(|e| anyhow!("PDF text extraction failed for {}: {e}", path.display()));
    }

    let other = ext.as_str();
    Err(anyhow!("unsupported source format: .{other} (md / txt / pdf)"))
}
/// Splits `text` into chunks of at most `max_chars` characters, keeping whole
/// paragraphs together wherever they fit.
///
/// Rules:
///
/// * Paragraphs are separated by a blank line (`\n\n`). Windows line endings
///   (`\r\n`) are normalised to `\n` first, so CRLF files are split on their
///   paragraph breaks exactly like LF files.
/// * Every paragraph is trimmed; empty paragraphs are dropped.
/// * Consecutive paragraphs are packed into one chunk, joined by `\n\n`, for
///   as long as the joined length stays within the limit.
/// * A paragraph longer than the limit is cut into consecutive pieces of at
///   most the limit (cuts ignore word boundaries). Those pieces are never
///   packed with neighbouring paragraphs, and whitespace-only pieces are
///   skipped so that no blank chunk is ever returned.
/// * A `max_chars` below 200 is raised to 200.
///
/// All lengths are counted in `char`s, not bytes. Returns an empty vector when
/// `text` holds no non-whitespace content.
pub(super) fn chunk_text(text: &str, max_chars: usize) -> Vec<String> {
    // ASCII separator, so its byte length equals its character count.
    const SEPARATOR: &str = "\n\n";

    let max = max_chars.max(200);

    // CRLF text contains no "\n\n" at all; without normalisation a whole
    // Windows-authored file would be treated as one oversized paragraph.
    let normalized: std::borrow::Cow<'_, str> = if text.contains("\r\n") {
        std::borrow::Cow::Owned(text.replace("\r\n", "\n"))
    } else {
        std::borrow::Cow::Borrowed(text)
    };

    let mut chunks: Vec<String> = Vec::new();
    let mut cur = String::new();
    // Character count of `cur`, maintained incrementally instead of recounting.
    let mut cur_len = 0usize;

    for para in normalized
        .split(SEPARATOR)
        .map(str::trim)
        .filter(|p| !p.is_empty())
    {
        let para_len = para.chars().count();

        if para_len > max {
            // Flush what has been packed so far, then hard-split the paragraph.
            if !cur.is_empty() {
                chunks.push(std::mem::take(&mut cur));
                cur_len = 0;
            }
            let chars: Vec<char> = para.chars().collect();
            chunks.extend(
                chars
                    .chunks(max)
                    // A long whitespace run inside the paragraph could fill an
                    // entire piece; such a piece carries no content.
                    .filter(|piece| piece.iter().any(|c| !c.is_whitespace()))
                    .map(|piece| piece.iter().collect::<String>()),
            );
            continue;
        }

        // Start a new chunk when appending this paragraph would overflow.
        if !cur.is_empty() && cur_len + SEPARATOR.len() + para_len > max {
            chunks.push(std::mem::take(&mut cur));
            cur_len = 0;
        }
        if !cur.is_empty() {
            cur.push_str(SEPARATOR);
            cur_len += SEPARATOR.len();
        }
        cur.push_str(para);
        cur_len += para_len;
    }

    // `cur` only ever holds trimmed, non-empty paragraphs, so non-empty
    // implies non-blank.
    if !cur.is_empty() {
        chunks.push(cur);
    }
    chunks
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers the main chunking paths: packing, overflow, hard split, blank input.
    #[test]
    fn chunk_packs_and_splits() {
        // Short paragraphs all fit into a single chunk.
        let packed = chunk_text("Alpha para.\n\nBeta para.\n\nGamma para.", 1000);
        assert_eq!(packed.len(), 1);
        let only = &packed[0];
        assert!(only.contains("Alpha"));
        assert!(only.contains("Gamma"));

        // Three 150-character paragraphs cannot share one 200-character chunk.
        let paragraph = "w".repeat(150);
        let stacked = [paragraph.as_str(); 3].join("\n\n");
        let spilled = chunk_text(&stacked, 200);
        assert!(spilled.len() >= 2, "got {} chunks", spilled.len());

        // A single oversized paragraph is cut into several pieces.
        let oversized = "x".repeat(900);
        assert!(chunk_text(&oversized, 200).len() >= 4);

        // Whitespace-only input produces no chunks at all.
        assert!(chunk_text(" \n\n ", 500).is_empty());
    }

    /// Checks slug naming and the rejection of an unsupported extension.
    #[test]
    fn names_and_unsupported() {
        let named = source_name(Path::new("/a/Roman Aqueducts.md"));
        assert_eq!(named, "roman-aqueducts");

        let outcome = read_source(Path::new("/x/notes.docx"));
        assert!(outcome.is_err());
    }
}