pub mod audio;
#[cfg(feature = "documents")]
pub mod document;
pub mod file;
pub mod git;
pub mod image;
pub mod mediawiki;
pub mod pdf;
pub mod url;
pub mod video;
pub mod wayback;
use std::fs::File;
use std::io::Write;
use std::path::{Path, PathBuf};
use crate::WikiError;
use crate::indexer;
use crate::sources::SourceRecord;
use crate::store::WikiIndexStore;
/// Outcome of an ingest, as assembled by `write_raw_then_index`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestResult {
/// Source record the ingest was performed for.
pub record: SourceRecord,
/// Path of the raw markdown file; `write_raw_then_index` fills this with the
/// `raw/<id>.md` path returned by `write_raw_markdown` (relative to the vault root).
pub raw_path: PathBuf,
/// Path of an accompanying asset, when the caller supplied one.
pub asset_path: Option<PathBuf>,
}
/// Stores `markdown` as the immutable raw file `raw/<record.id>.md` under `vault_root`.
///
/// Returns the path of the written file relative to the vault root.
///
/// # Errors
///
/// Propagates whatever `write_immutable` reports.
pub(crate) fn write_raw_markdown(
    vault_root: &Path,
    record: &SourceRecord,
    markdown: &str,
) -> Result<PathBuf, WikiError> {
    let file_name = format!("{}.md", record.id);
    let relative = Path::new("raw").join(file_name);
    write_immutable(vault_root, &relative, markdown.as_bytes()).map(|()| relative)
}
/// Stores `bytes` as the immutable asset for `record`, at the location chosen by `asset_path`.
///
/// `file_name` only contributes its extension to the stored name. Returns the
/// asset path relative to the vault root.
///
/// # Errors
///
/// Propagates whatever `write_immutable` reports.
pub(crate) fn write_asset(
    vault_root: &Path,
    record: &SourceRecord,
    file_name: &str,
    bytes: &[u8],
) -> Result<PathBuf, WikiError> {
    let destination = asset_path(record, file_name);
    write_immutable(vault_root, &destination, bytes).map(|()| destination)
}
/// Stores `bytes` as an immutable asset named `raw/assets/<record.id>.<suffix>.<extension>`.
///
/// `suffix` is passed through `sanitize_asset_suffix` and the extension is derived
/// from `file_name` via `sanitized_extension_for_file_name`. Returns the asset
/// path relative to the vault root.
///
/// # Errors
///
/// Propagates whatever `write_immutable` reports.
pub(crate) fn write_asset_with_suffix(
    vault_root: &Path,
    record: &SourceRecord,
    suffix: &str,
    file_name: &str,
    bytes: &[u8],
) -> Result<PathBuf, WikiError> {
    let asset_name = format!(
        "{}.{}.{}",
        record.id,
        sanitize_asset_suffix(suffix),
        sanitized_extension_for_file_name(file_name),
    );

    let mut relative = PathBuf::from("raw");
    relative.push("assets");
    relative.push(asset_name);

    write_immutable(vault_root, &relative, bytes)?;
    Ok(relative)
}
/// Copies the file at `source_path` into the vault as the immutable asset for `record`.
///
/// The destination is chosen by `asset_path`; `content_hash` is forwarded to
/// `write_immutable_file`. Returns the asset path relative to the vault root.
///
/// # Errors
///
/// Propagates whatever `write_immutable_file` reports.
pub(crate) fn write_asset_from_path(
    vault_root: &Path,
    record: &SourceRecord,
    file_name: &str,
    source_path: &Path,
    content_hash: &str,
) -> Result<PathBuf, WikiError> {
    let destination = asset_path(record, file_name);
    write_immutable_file(vault_root, &destination, source_path, content_hash)
        .map(|()| destination)
}
/// Reduces `value` to a file-name-safe suffix.
///
/// Runs of ASCII alphanumerics and `_` are kept; every run of other characters
/// between them collapses into a single `-`, and leading/trailing separators are
/// dropped. When nothing survives, the literal `asset` is returned.
fn sanitize_asset_suffix(value: &str) -> String {
    let is_separator = |ch: char| !(ch.is_ascii_alphanumeric() || ch == '_');

    let joined = value
        .split(is_separator)
        .filter(|segment| !segment.is_empty())
        .collect::<Vec<_>>()
        .join("-");

    if joined.is_empty() {
        return "asset".to_string();
    }
    joined
}
/// Runs `indexer::index_vault` over `vault_root`, writing into `store`.
///
/// # Errors
///
/// An indexer failure is reported as `WikiError::InvalidInput` with field
/// `index` and the indexer error's display text as the message.
pub(crate) fn index_after_ingest(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
) -> Result<(), WikiError> {
    if let Err(failure) = indexer::index_vault(vault_root, store) {
        return Err(WikiError::InvalidInput {
            field: "index",
            message: failure.to_string(),
        });
    }
    Ok(())
}
/// Writes the raw markdown for `record` and then re-indexes the vault.
///
/// The raw file is written before `index_after_ingest` runs. `asset_path` is
/// not touched here; it is carried into the returned `IngestResult` as given.
///
/// # Errors
///
/// Returns the first error from `write_raw_markdown` or `index_after_ingest`.
pub(crate) fn write_raw_then_index(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    record: SourceRecord,
    markdown: &str,
    asset_path: Option<PathBuf>,
) -> Result<IngestResult, WikiError> {
    let raw_path = write_raw_markdown(vault_root, &record, markdown)?;
    index_after_ingest(vault_root, store).map(|()| IngestResult {
        record,
        raw_path,
        asset_path,
    })
}
/// Renders `fields` as a front-matter block delimited by `---` lines.
///
/// Each pair becomes one `key: value` line, with the value passed through
/// `yaml_safe_single_line_scalar`. The closing delimiter is followed by a blank line.
pub(crate) fn markdown_metadata(fields: &[(&str, String)]) -> String {
    let entries: String = fields
        .iter()
        .map(|(key, value)| format!("{key}: {}\n", yaml_safe_single_line_scalar(value)))
        .collect();
    format!("---\n{entries}---\n\n")
}
/// Collapses `value` onto one line and quotes it when it is not safe as a plain scalar.
///
/// Safe values are returned as-is; all others are serialized as a JSON string
/// literal, falling back to `""` if serialization fails.
fn yaml_safe_single_line_scalar(value: &str) -> String {
    let flattened = single_line(value);
    if !yaml_plain_scalar_is_safe(&flattened) {
        return serde_json::to_string(&flattened).unwrap_or_else(|_| "\"\"".to_string());
    }
    flattened
}
/// Reports whether `value` can be written as an unquoted (plain) YAML scalar.
///
/// A value is rejected when it:
/// - is empty,
/// - contains `": "` or ends with `:` (mapping indicator),
/// - contains `" #"` (start of an inline comment),
/// - contains a single or double quote,
/// - starts with a YAML indicator character, including `#`, which would turn
///   the whole value into a comment.
///
/// Type-like words such as `true` are deliberately still considered safe.
fn yaml_plain_scalar_is_safe(value: &str) -> bool {
    // Characters that act as YAML indicators when they open a scalar.
    const LEADING_INDICATORS: [char; 16] = [
        '-', '?', '{', '}', '[', ']', ',', '&', '*', '!', '|', '>', '%', '@', '`', '#',
    ];

    if value.is_empty() {
        return false;
    }

    // `:` is a mapping indicator both before a space and at the end of the line.
    let has_mapping_indicator = value.contains(": ") || value.ends_with(':');
    let has_inline_comment = value.contains(" #");
    let has_quote = value.contains(['"', '\'']);

    !(has_mapping_indicator
        || has_inline_comment
        || has_quote
        || value.starts_with(LEADING_INDICATORS))
}
/// Collapses every run of whitespace in `value` into a single space and trims both ends.
pub(crate) fn single_line(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Turns `value` into a one-line title: whitespace is collapsed via `single_line`,
/// `#` characters are stripped from both ends, and the remainder is trimmed.
pub(crate) fn markdown_title(value: &str) -> String {
    let flattened = single_line(value);
    let without_hashes = flattened.trim_matches('#');
    without_hashes.trim().to_owned()
}
/// Decodes `bytes` as UTF-8, substituting U+FFFD for invalid sequences, and
/// rewrites every `\r\n` pair to `\n`. A lone `\r` is left untouched.
pub(crate) fn text_from_utf8_lossy(bytes: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(bytes);
    decoded.split("\r\n").collect::<Vec<_>>().join("\n")
}
/// Renders `path` as a string (lossily for non-UTF-8 content) with every
/// backslash replaced by a forward slash.
pub(crate) fn path_to_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Writes `bytes` to `vault_root/relative` without ever overwriting an existing file.
///
/// The parent directory is created on demand. If the target already exists
/// (either up front or because it appeared before the temp file could be
/// persisted), the call succeeds only when `validate_existing_raw_bytes`
/// accepts the existing content. Otherwise the bytes are written to a temp
/// file in the same directory, synced, persisted with no-clobber semantics,
/// and the parent directory is synced.
///
/// # Errors
///
/// Returns `WikiError::Io` for directory, temp-file, sync, or persist failures,
/// and forwards the validation error when an existing file does not match.
fn write_immutable(vault_root: &Path, relative: &Path, bytes: &[u8]) -> Result<(), WikiError> {
    let target = vault_root.join(relative);

    if let Some(dir) = target.parent() {
        if let Err(source) = std::fs::create_dir_all(dir) {
            return Err(WikiError::Io {
                action: "create raw source directory",
                path: Some(dir.to_path_buf()),
                source,
            });
        }
    }

    if target.exists() {
        return validate_existing_raw_bytes(&target, relative, bytes);
    }

    let mut staged = create_raw_temp_file(&target)?;
    staged.write_all(bytes).map_err(|source| WikiError::Io {
        action: "write raw source temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;
    staged.as_file().sync_all().map_err(|source| WikiError::Io {
        action: "sync raw source temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;

    match staged.persist_noclobber(&target) {
        Ok(_) => sync_parent_dir(&target),
        Err(failure) => {
            // Losing the race to another writer is fine as long as the content matches.
            if failure.error.kind() == std::io::ErrorKind::AlreadyExists {
                validate_existing_raw_bytes(&target, relative, bytes)
            } else {
                Err(WikiError::Io {
                    action: "write raw source",
                    path: Some(target),
                    source: failure.error,
                })
            }
        }
    }
}
/// Copies the file at `source_path` to `vault_root/relative` without ever
/// overwriting an existing file.
///
/// The parent directory is created on demand. If the target already exists
/// (either up front or because it appeared before the temp file could be
/// persisted), the call succeeds only when `validate_existing_raw_file`
/// accepts it. Otherwise the source is streamed into a temp file in the same
/// directory, synced, persisted with no-clobber semantics, and the parent
/// directory is synced. The `_content_hash` argument is not consulted.
///
/// # Errors
///
/// Returns `WikiError::Io` for directory, open, copy, sync, or persist failures,
/// and forwards the validation error when an existing file does not match.
fn write_immutable_file(
    vault_root: &Path,
    relative: &Path,
    source_path: &Path,
    _content_hash: &str,
) -> Result<(), WikiError> {
    let target = vault_root.join(relative);

    if let Some(dir) = target.parent() {
        if let Err(source) = std::fs::create_dir_all(dir) {
            return Err(WikiError::Io {
                action: "create raw source directory",
                path: Some(dir.to_path_buf()),
                source,
            });
        }
    }

    if target.exists() {
        return validate_existing_raw_file(&target, relative, source_path);
    }

    let mut staged = create_raw_temp_file(&target)?;
    let mut reader = match std::fs::File::open(source_path) {
        Ok(file) => file,
        Err(source) => {
            return Err(WikiError::Io {
                action: "open raw source",
                path: Some(source_path.to_path_buf()),
                source,
            });
        }
    };

    std::io::copy(&mut reader, &mut staged).map_err(|source| WikiError::Io {
        action: "write raw source temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;
    staged.as_file().sync_all().map_err(|source| WikiError::Io {
        action: "sync raw source temp file",
        path: Some(staged.path().to_path_buf()),
        source,
    })?;

    match staged.persist_noclobber(&target) {
        Ok(_) => sync_parent_dir(&target),
        Err(failure) => {
            // Losing the race to another writer is fine as long as the content matches.
            if failure.error.kind() == std::io::ErrorKind::AlreadyExists {
                validate_existing_raw_file(&target, relative, source_path)
            } else {
                Err(WikiError::Io {
                    action: "write raw source",
                    path: Some(target),
                    source: failure.error,
                })
            }
        }
    }
}
/// Accepts an already-present raw file only if its content hash equals the hash of `bytes`.
///
/// # Errors
///
/// Returns `WikiError::Io` when the existing file cannot be hashed, and the
/// `immutable_exists_error` for `relative` when the hashes differ.
fn validate_existing_raw_bytes(
    path: &Path,
    relative: &Path,
    bytes: &[u8],
) -> Result<(), WikiError> {
    let on_disk = match gobby_core::indexing::file_content_hash(path) {
        Ok(hash) => hash,
        Err(source) => {
            return Err(WikiError::Io {
                action: "hash existing raw source",
                path: Some(path.to_path_buf()),
                source,
            });
        }
    };

    if on_disk == gobby_core::indexing::content_hash(bytes) {
        Ok(())
    } else {
        Err(immutable_exists_error(relative))
    }
}
/// Accepts an already-present raw file only if its content hash equals the
/// hash of the file at `source_path`.
///
/// # Errors
///
/// Returns `WikiError::Io` when either file cannot be hashed, and the
/// `immutable_exists_error` for `relative` when the hashes differ.
fn validate_existing_raw_file(
    path: &Path,
    relative: &Path,
    source_path: &Path,
) -> Result<(), WikiError> {
    let on_disk = match gobby_core::indexing::file_content_hash(path) {
        Ok(hash) => hash,
        Err(source) => {
            return Err(WikiError::Io {
                action: "hash existing raw source",
                path: Some(path.to_path_buf()),
                source,
            });
        }
    };
    let incoming = match gobby_core::indexing::file_content_hash(source_path) {
        Ok(hash) => hash,
        Err(source) => {
            return Err(WikiError::Io {
                action: "hash raw source",
                path: Some(source_path.to_path_buf()),
                source,
            });
        }
    };

    if on_disk == incoming {
        Ok(())
    } else {
        Err(immutable_exists_error(relative))
    }
}
/// Builds the `InvalidInput` error (field `raw_path`) reported when a raw file
/// already exists at `relative`.
fn immutable_exists_error(relative: &Path) -> WikiError {
    let message = format!(
        "immutable raw source already exists at {}",
        relative.display()
    );
    WikiError::InvalidInput {
        field: "raw_path",
        message,
    }
}
/// Creates a named temp file next to `path`, i.e. in `path`'s parent directory.
///
/// The temp name is `.<file name>.` + random part + `.tmp`; `source` stands in
/// for the file name when `path` has none that is valid UTF-8.
///
/// # Errors
///
/// Returns `WikiError::Io` when `path` has no (non-empty) parent directory or
/// when the temp file cannot be created.
fn create_raw_temp_file(path: &Path) -> Result<tempfile::NamedTempFile, WikiError> {
    let parent = match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir,
        _ => {
            return Err(WikiError::Io {
                action: "create raw source temp file",
                path: Some(path.to_path_buf()),
                source: std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "raw source target has no parent directory",
                ),
            });
        }
    };

    let target_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("source");
    let prefix = format!(".{target_name}.");

    let created = tempfile::Builder::new()
        .prefix(&prefix)
        .suffix(".tmp")
        .tempfile_in(parent);
    created.map_err(|source| WikiError::Io {
        action: "create raw source temp file",
        path: Some(parent.to_path_buf()),
        source,
    })
}
/// Syncs the directory containing `path` on Unix; a no-op on other platforms.
///
/// # Errors
///
/// On Unix, returns `WikiError::Io` when the parent directory cannot be
/// opened or synced. A path without a parent succeeds without doing anything.
fn sync_parent_dir(path: &Path) -> Result<(), WikiError> {
    #[cfg(not(unix))]
    {
        let _ = path;
        Ok(())
    }
    #[cfg(unix)]
    {
        match path.parent() {
            None => Ok(()),
            Some(dir) => {
                let synced = File::open(dir).and_then(|handle| handle.sync_all());
                synced.map_err(|source| WikiError::Io {
                    action: "sync raw source directory",
                    path: Some(dir.to_path_buf()),
                    source,
                })
            }
        }
    }
}
/// Computes the asset location for `record`: `raw/assets/<record.id>.<extension>`,
/// where the extension comes from `sanitized_extension_for_file_name(file_name)`.
pub(crate) fn asset_path(record: &SourceRecord, file_name: &str) -> PathBuf {
    let asset_name = format!(
        "{}.{}",
        record.id,
        sanitized_extension_for_file_name(file_name)
    );

    let mut relative = PathBuf::from("raw");
    relative.push("assets");
    relative.push(asset_name);
    relative
}
/// Extracts a sanitized extension from `file_name`, looking only at its final
/// path component.
///
/// Falls back to `bin` when there is no extension or nothing of it survives
/// `sanitize_extension`.
fn sanitized_extension_for_file_name(file_name: &str) -> String {
    // Only the last component matters, so directory parts cannot leak into the result.
    let last_component = Path::new(file_name)
        .file_name()
        .and_then(|value| value.to_str())
        .unwrap_or(file_name);

    let cleaned = Path::new(last_component)
        .extension()
        .and_then(|value| value.to_str())
        .map(sanitize_extension);

    match cleaned {
        Some(extension) if !extension.is_empty() => extension,
        _ => "bin".to_string(),
    }
}
/// Keeps only the ASCII alphanumeric characters of `extension`, lowercased.
fn sanitize_extension(extension: &str) -> String {
    let mut cleaned = String::with_capacity(extension.len());
    for ch in extension.chars() {
        if ch.is_ascii_alphanumeric() {
            cleaned.push(ch.to_ascii_lowercase());
        }
    }
    cleaned
}
// Unit tests for the shared ingest helpers, plus two connector-level tests
// (file ingest and wayback ingest) that check raw files are written before indexing.
#[cfg(test)]
mod tests {
use std::path::{Path, PathBuf};
use super::{asset_path, markdown_metadata, write_asset_from_path};
use gobby_core::ai_context::AiContext;
use gobby_core::config::EnvOnlySource;
use gobby_core::indexing::content_hash;
use crate::ScopeIdentity;
use crate::api::IngestFileOptions;
use crate::ingest::file;
use crate::ingest::wayback::{self, WaybackCaptureSnapshot};
use crate::sources::{
CompileStatus, IngestionMethod, SourceDraft, SourceKind, SourceManifest, SourceRecord,
};
use crate::store::{
MemoryWikiStore, StoreError, WikiChunk, WikiDocument, WikiIndexStore, WikiIngestion,
WikiIngestionEvent, WikiLink, WikiSource,
};
/// Builds an `AiContext` resolved from `EnvOnlySource` and then adjusted by
/// `IngestFileOptions` with `no_ai` set.
fn no_ai_context() -> AiContext {
let mut source = EnvOnlySource;
let mut context = AiContext::resolve(None, &mut source);
IngestFileOptions {
no_ai: true,
..IngestFileOptions::default()
}
.apply_to_ai_context(&mut context);
context
}
/// Writes `contents` to `root/relative`, creating parent directories as needed.
fn write_file(root: &std::path::Path, relative: &str, contents: &str) {
let path = root.join(relative);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).expect("create parent");
}
std::fs::write(path, contents).expect("write file");
}
/// Fixture record with id `source-1` and kind `Pdf`; the optional fields are `None`.
fn test_source_record() -> SourceRecord {
SourceRecord {
id: "source-1".to_string(),
location: "/tmp/report.pdf".to_string(),
canonical_location: "/tmp/report.pdf".to_string(),
kind: SourceKind::Pdf,
fetched_at: "2026-05-29T17:00:00Z".to_string(),
content_hash: "hash".to_string(),
title: None,
citation: None,
license: None,
ingestion_method: IngestionMethod::Manual,
compile_status: CompileStatus::Pending,
replay: None,
}
}
/// Directory components of the supplied file name must not reach the asset
/// path; only the lowercased extension (or `bin`) is used.
#[test]
fn asset_path_uses_basename_before_extension_extraction() {
let record = test_source_record();
assert_eq!(
asset_path(&record, "../nested/report.PDF"),
PathBuf::from("raw/assets/source-1.pdf")
);
assert_eq!(
asset_path(&record, "../nested/no-extension"),
PathBuf::from("raw/assets/source-1.bin")
);
}
/// Values containing `": "` or `" #"` are emitted quoted, while simple values
/// (including `true`) stay plain.
#[test]
fn markdown_metadata_quotes_yaml_sensitive_scalars() {
let metadata = markdown_metadata(&[
("source_kind", "pdf".to_string()),
("source_location", "https://example.com/a: b #c".to_string()),
("draft", "true".to_string()),
]);
assert!(metadata.contains("source_kind: pdf\n"));
assert!(metadata.contains("source_location: \"https://example.com/a: b #c\"\n"));
assert!(metadata.contains("draft: true\n"));
}
/// Re-writing an existing asset is decided by the bytes at the source path:
/// the hash strings passed in are deliberately wrong in every call.
#[test]
fn immutable_file_existing_match_hashes_actual_source_path() {
let temp = tempfile::tempdir().expect("tempdir");
let record = test_source_record();
let first_source = temp.path().join("first.txt");
let second_source = temp.path().join("second.txt");
let different_source = temp.path().join("different.txt");
std::fs::write(&first_source, "same bytes").expect("write first");
std::fs::write(&second_source, "same bytes").expect("write second");
std::fs::write(&different_source, "different bytes").expect("write different");
// First write creates the asset.
let asset_path = write_asset_from_path(
temp.path(),
&record,
"source.txt",
&first_source,
"intentionally-stale-hash",
)
.expect("first asset write");
// Same content from a different file is accepted.
write_asset_from_path(
temp.path(),
&record,
"source.txt",
&second_source,
"another-stale-hash",
)
.expect("matching existing asset is idempotent");
// Different content for the same destination is rejected.
let error = write_asset_from_path(
temp.path(),
&record,
"source.txt",
&different_source,
"stale-hash",
)
.expect_err("different source rejected");
assert_eq!(asset_path, PathBuf::from("raw/assets/source-1.txt"));
assert_eq!(error.code(), "invalid_input");
}
/// Ingesting a file must leave an existing wiki page byte-for-byte unchanged
/// while the raw file and `raw/INDEX.md` get written and indexed.
#[test]
fn ingest_indexes_raw_without_wiki_rewrite() {
let temp = tempfile::tempdir().expect("tempdir");
let wiki_body = "# Existing Topic\n\nUser-authored notes stay intact.\n";
write_file(temp.path(), "wiki/topics/existing.md", wiki_body);
let source_path = temp.path().join("capture.txt");
std::fs::write(&source_path, "captured source text\n").expect("write source");
let mut store = MemoryWikiStore::default();
// Pre-seed the stored hash of the wiki page with its current content hash;
// presumably this makes the indexer treat the page as already indexed — TODO confirm.
store.file_hashes.insert(
PathBuf::from("wiki/topics/existing.md"),
content_hash(wiki_body.as_bytes()),
);
let before = std::fs::read_to_string(temp.path().join("wiki/topics/existing.md"))
.expect("read existing before");
let scope = ScopeIdentity::global();
let ai_context = no_ai_context();
let options = IngestFileOptions {
no_ai: true,
..IngestFileOptions::default()
};
let result = file::ingest_path(
temp.path(),
&mut store,
&scope,
&ai_context,
&options,
&source_path,
"2026-05-29T17:00:00Z",
)
.expect("ingest path");
let after = std::fs::read_to_string(temp.path().join("wiki/topics/existing.md"))
.expect("read existing after");
assert_eq!(after, before);
assert!(temp.path().join(&result.raw_path).is_file());
assert!(store.documents.contains_key(&PathBuf::from("raw/INDEX.md")));
assert!(
!store
.documents
.contains_key(&PathBuf::from("wiki/topics/existing.md"))
);
assert!(store.ingestions.iter().any(|ingestion| {
ingestion.path == Path::new("raw/INDEX.md")
&& ingestion.event == WikiIngestionEvent::Added
}));
}
/// Store wrapper that asserts, on every forwarded write, that the expected
/// raw file already exists on disk.
#[derive(Debug, Default)]
struct RawFirstStore {
vault_root: PathBuf,
expected_raw_path: PathBuf,
inner: MemoryWikiStore,
observed_index_write: bool,
}
impl RawFirstStore {
fn new(vault_root: &Path, expected_raw_path: impl Into<PathBuf>) -> Self {
Self {
vault_root: vault_root.to_path_buf(),
expected_raw_path: expected_raw_path.into(),
inner: MemoryWikiStore::default(),
observed_index_write: false,
}
}
// Records that a store write happened and panics if the raw file is missing.
fn assert_raw_exists_before_index(&mut self) {
self.observed_index_write = true;
assert!(
self.vault_root.join(&self.expected_raw_path).is_file(),
"external connector must write raw source before derived index rows"
);
}
}
// Every method except `indexed_hashes` runs the raw-file check before
// delegating to the in-memory store.
impl WikiIndexStore for RawFirstStore {
fn indexed_hashes(
&mut self,
) -> Result<std::collections::BTreeMap<PathBuf, String>, StoreError> {
self.inner.indexed_hashes()
}
fn upsert_document(&mut self, document: WikiDocument) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.upsert_document(document)
}
fn replace_chunks(
&mut self,
path: &Path,
chunks: Vec<WikiChunk>,
) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.replace_chunks(path, chunks)
}
fn replace_links(&mut self, path: &Path, links: Vec<WikiLink>) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.replace_links(path, links)
}
fn upsert_source(&mut self, source: WikiSource) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.upsert_source(source)
}
fn record_ingestion(&mut self, ingestion: WikiIngestion) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.record_ingestion(ingestion)
}
fn record_file_hash(
&mut self,
path: PathBuf,
content_hash: String,
) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.record_file_hash(path, content_hash)
}
fn delete_derived_rows(&mut self, path: &Path) -> Result<(), StoreError> {
self.assert_raw_exists_before_index();
self.inner.delete_derived_rows(path)
}
}
/// The wayback connector must have written its raw file by the time it
/// touches the index store.
#[test]
fn external_connectors_write_raw_first() {
let temp = tempfile::tempdir().expect("tempdir");
let snapshot = WaybackCaptureSnapshot {
original_url: "https://example.com/reference".to_string(),
capture_url: "https://web.archive.org/web/20260529120000/https://example.com/reference"
.to_string(),
capture_timestamp: "20260529120000".to_string(),
fetched_at: "2026-05-29T18:30:00Z".to_string(),
body: b"<html><body>Archived reference.</body></html>".to_vec(),
content_type: Some("text/html".to_string()),
};
// Register an equivalent draft up front to learn the record id, and with it
// the raw path, that the connector is expected to produce.
let expected_record = SourceManifest::register(
temp.path(),
SourceDraft {
location: snapshot.capture_url.clone(),
kind: SourceKind::Wayback,
fetched_at: snapshot.fetched_at.clone(),
content: snapshot.body.clone(),
title: Some(snapshot.original_url.clone()),
citation: Some(snapshot.capture_url.clone()),
license: None,
ingestion_method: IngestionMethod::Manual,
compile_status: CompileStatus::Pending,
},
)
.expect("predict wayback record");
let expected_raw_path = PathBuf::from("raw").join(format!("{}.md", expected_record.id));
// Drop the manifest written by the prediction step; presumably so the real
// ingest starts from a vault without it — TODO confirm.
std::fs::remove_file(temp.path().join("raw/INDEX.md")).expect("remove predicted manifest");
let mut store = RawFirstStore::new(temp.path(), expected_raw_path);
wayback::ingest_capture(temp.path(), &mut store, snapshot).expect("ingest wayback");
// Guards against the test passing vacuously when no store write occurs.
assert!(store.observed_index_write);
}
}