gobby-wiki 0.3.0

Gobby wiki CLI shell
use std::path::{Path, PathBuf};

use crate::document::DocumentDegradation;
use crate::ingest::{
    IngestResult, index_after_ingest, lowercase_extension, markdown_title, write_asset,
    write_raw_markdown,
};
use crate::sources::{CompileStatus, IngestionMethod, SourceDraftRef, SourceKind, SourceManifest};
use crate::store::WikiIndexStore;
use crate::{ScopeIdentity, WikiError};

mod html;
mod office;
mod render;

use html::*;
use office::*;
use render::*;

/// A captured document handed to the ingest pipeline.
///
/// The snapshot carries the original bytes together with the metadata that is
/// recorded when the source is registered in the vault.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentSnapshot {
    /// Where the document came from; recorded both as the source location and
    /// as its citation when the source is registered.
    pub location: String,
    /// File name of the document; used when writing the asset and as the
    /// fallback input for the title when extraction yields none.
    pub file_name: String,
    /// Fetch timestamp, passed through verbatim to the source draft.
    pub fetched_at: String,
    /// Raw document content.
    pub bytes: Vec<u8>,
    /// Source kind; the local extractor dispatches on this value.
    pub kind: SourceKind,
}

/// Everything produced by ingesting a single document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentIngestResult {
    /// Source record returned by the manifest registration.
    pub record: crate::sources::SourceRecord,
    /// Path returned by the raw markdown write.
    pub raw_path: PathBuf,
    /// Path returned by the asset write (the original document bytes).
    pub asset_path: PathBuf,
    /// Path returned by the derived markdown write.
    pub derived_path: PathBuf,
    /// Degradation reported by the extractor, synthesized from an extraction
    /// error, or supplied by an unavailable endpoint; `None` otherwise.
    pub document_degradation: Option<DocumentDegradation>,
}

impl From<DocumentIngestResult> for IngestResult {
    fn from(result: DocumentIngestResult) -> Self {
        Self {
            record: result.record,
            raw_path: result.raw_path,
            asset_path: Some(result.asset_path),
        }
    }
}

/// Borrowed view of a document passed to a [`DocumentExtractor`].
///
/// All fields are references, so the request is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub struct DocumentRequest<'a> {
    /// File name of the document being extracted.
    pub file_name: &'a str,
    /// Source kind of the document.
    pub kind: &'a SourceKind,
    /// Raw document content.
    pub bytes: &'a [u8],
}

/// Output of a successful document extraction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentExtraction {
    /// Title found in the document, if any; when absent the ingest pipeline
    /// falls back to a title derived from the file name.
    pub title: Option<String>,
    /// Extracted content rendered as markdown.
    pub markdown: String,
    /// Label for the unit counted in `units_count`.
    // NOTE(review): presumably something like pages/slides/sections — confirm
    // against the extractors in the submodules.
    pub units_label: &'static str,
    /// Number of units reported by the extractor.
    pub units_count: usize,
    /// Degradation reported by the extractor even though extraction succeeded;
    /// copied into the ingest result.
    pub degradation: Option<DocumentDegradation>,
}

/// Converts a document into markdown plus metadata.
pub trait DocumentExtractor {
    /// Extracts the content of `request`.
    ///
    /// # Errors
    ///
    /// Returns a [`WikiError`] when the document cannot be extracted. The
    /// ingest pipeline in this module converts such an error into a
    /// [`DocumentDegradation`] instead of aborting the ingest.
    fn extract(&self, request: &DocumentRequest<'_>) -> Result<DocumentExtraction, WikiError>;
}

/// Selects how (or whether) extraction is attempted during ingest.
pub enum DocumentEndpoint<'a> {
    /// An extractor is available and will be invoked on the document.
    Available(&'a dyn DocumentExtractor),
    /// No extractor is available; extraction is skipped and the given
    /// degradation is recorded on the ingest result.
    Unavailable(DocumentDegradation),
}

/// Stateless extractor that dispatches on the source kind to the extraction
/// functions available in this module.
struct LocalDocumentExtractor;

/// Ingests a document with the built-in extractor and then refreshes the index.
///
/// # Errors
///
/// Returns any error from the ingest itself or from the subsequent indexing
/// step. Indexing only runs when the ingest succeeded.
pub fn ingest_document(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
) -> Result<DocumentIngestResult, WikiError> {
    ingest_document_without_index(vault_root, scope, snapshot).and_then(|ingested| {
        index_after_ingest(vault_root, store)?;
        Ok(ingested)
    })
}

/// Ingests a document with the built-in [`LocalDocumentExtractor`], leaving
/// the index untouched.
///
/// # Errors
///
/// Propagates any error from the endpoint-based ingest it delegates to.
pub(crate) fn ingest_document_without_index(
    vault_root: &Path,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
) -> Result<DocumentIngestResult, WikiError> {
    // The extractor is a stateless unit struct, so a borrowed instance is all
    // the endpoint needs.
    let endpoint = DocumentEndpoint::Available(&LocalDocumentExtractor);
    ingest_document_with_endpoint_without_index(vault_root, scope, snapshot, endpoint)
}

/// Ingests a document through the supplied endpoint and then refreshes the
/// index.
///
/// # Errors
///
/// Returns any error from the ingest itself or from the subsequent indexing
/// step. Indexing only runs when the ingest succeeded.
pub fn ingest_document_with_endpoint(
    vault_root: &Path,
    store: &mut impl WikiIndexStore,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
    endpoint: DocumentEndpoint<'_>,
) -> Result<DocumentIngestResult, WikiError> {
    ingest_document_with_endpoint_without_index(vault_root, scope, snapshot, endpoint).and_then(
        |ingested| {
            index_after_ingest(vault_root, store)?;
            Ok(ingested)
        },
    )
}

pub(crate) fn ingest_document_with_endpoint_without_index(
    vault_root: &Path,
    scope: ScopeIdentity,
    snapshot: DocumentSnapshot,
    endpoint: DocumentEndpoint<'_>,
) -> Result<DocumentIngestResult, WikiError> {
    let request = DocumentRequest {
        file_name: &snapshot.file_name,
        kind: &snapshot.kind,
        bytes: &snapshot.bytes,
    };
    let (extraction, degradation) = match endpoint {
        DocumentEndpoint::Available(extractor) => match extractor.extract(&request) {
            Ok(extraction) => {
                let degradation = extraction.degradation.clone();
                (Some(extraction), degradation)
            }
            Err(error) => (
                None,
                Some(document_degradation_for_error(&request, error.to_string())),
            ),
        },
        DocumentEndpoint::Unavailable(degradation) => (None, Some(degradation)),
    };
    let title = extraction
        .as_ref()
        .and_then(|value| value.title.clone())
        .unwrap_or_else(|| markdown_title(&snapshot.file_name));
    let record = SourceManifest::register_borrowed(
        vault_root,
        SourceDraftRef {
            location: snapshot.location.clone(),
            kind: snapshot.kind.clone(),
            fetched_at: snapshot.fetched_at.clone(),
            content: &snapshot.bytes,
            title: Some(title.clone()),
            citation: Some(snapshot.location.clone()),
            license: None,
            ingestion_method: IngestionMethod::Manual,
            compile_status: CompileStatus::Pending,
        },
    )?;
    let asset_path = write_asset(vault_root, &record, &snapshot.file_name, &snapshot.bytes)?;
    let raw_markdown = render_raw_document_markdown(&snapshot, &record.content_hash, &asset_path);
    let raw_path = match write_raw_markdown(vault_root, &record, &raw_markdown) {
        Ok(path) => path,
        Err(error) => {
            remove_document_asset_after_failure(vault_root, &asset_path, "raw markdown write");
            return Err(error);
        }
    };
    let derived_path = match write_document_derived_markdown(
        vault_root,
        &scope,
        &record,
        &snapshot,
        &title,
        &asset_path,
        extraction.as_ref(),
        degradation.as_ref(),
    ) {
        Ok(path) => path,
        Err(error) => {
            remove_document_asset_after_failure(vault_root, &asset_path, "derived markdown write");
            return Err(error);
        }
    };

    Ok(DocumentIngestResult {
        record,
        raw_path,
        asset_path,
        derived_path,
        document_degradation: degradation,
    })
}

/// Best-effort removal of an asset written earlier in a failed ingest.
///
/// `asset_path` is joined onto `vault_root` before removal. A removal failure
/// is only logged as a warning, naming the step given in `context`; it never
/// replaces the error that triggered the cleanup.
fn remove_document_asset_after_failure(vault_root: &Path, asset_path: &Path, context: &str) {
    let target = vault_root.join(asset_path);
    let Err(error) = std::fs::remove_file(&target) else {
        return;
    };
    log::warn!(
        "failed to remove document asset {} after {context} failure: {error}",
        target.display()
    );
}

impl DocumentExtractor for LocalDocumentExtractor {
    fn extract(&self, request: &DocumentRequest<'_>) -> Result<DocumentExtraction, WikiError> {
        match request.kind {
            SourceKind::Html => extract_html_document(request.bytes),
            SourceKind::Office => extract_office_document(request.file_name, request.bytes),
            _ => Err(document_error(format!(
                "unsupported document kind: {:?}",
                request.kind
            ))),
        }
    }
}

/// Returns the result of [`lowercase_extension`] for `file_name`.
///
/// Thin module-local alias; all behaviour is defined by the ingest helper.
fn extension(file_name: &str) -> Option<String> {
    lowercase_extension(file_name)
}

/// Decodes character entities in `text` and returns the result as an owned
/// `String`.
// NOTE(review): decoding is delegated to the HTML entity decoder, which
// presumably accepts more named entities than XML itself defines — confirm
// that this is acceptable for the XML inputs passed here.
fn decode_xml_entities(text: &str) -> String {
    html_escape::decode_html_entities(text).into_owned()
}

/// Builds the invalid-input error used for document problems in this module.
///
/// The error always names the `"document"` field and carries `message` as its
/// description.
fn document_error(message: impl Into<String>) -> WikiError {
    let message = message.into();
    WikiError::InvalidInput {
        field: "document",
        message,
    }
}

#[cfg(test)]
mod tests;