reasonkit/lib.rs
1#![doc = include_str!("../README.md")]
2// doc_auto_cfg was merged into doc_cfg in Rust 1.92
3#![cfg_attr(docsrs, feature(doc_cfg))]
4
5//! # ReasonKit Core
6//!
7//! AI Thinking Enhancement System - Turn Prompts into Protocols
8//!
9//! ReasonKit Core is a **pure reasoning engine** that improves AI thinking patterns
10//! through structured reasoning protocols called ThinkTools. It transforms ad-hoc
11//! LLM prompting into auditable, reproducible reasoning chains.
12//!
13//! ## Philosophy
14//!
15//! **"Designed, Not Dreamed"** - Structure beats raw intelligence. By imposing
16//! systematic reasoning protocols, ReasonKit helps AI models produce more reliable,
17//! verifiable, and explainable outputs.
18//!
19//! ## Quick Start
20//!
21//! ### Rust Usage
22//!
23//! ```rust,ignore
24//! use reasonkit::thinktool::{ProtocolExecutor, ProtocolInput};
25//!
26//! #[tokio::main]
27//! async fn main() -> anyhow::Result<()> {
28//! // Create executor (auto-detects LLM from environment)
29//! let executor = ProtocolExecutor::new()?;
30//!
31//! // Run GigaThink for multi-perspective analysis
32//! let result = executor.execute(
33//! "gigathink",
34//! ProtocolInput::query("Should we use microservices?")
35//! ).await?;
36//!
37//! println!("Confidence: {:.2}", result.confidence);
38//! for perspective in result.perspectives() {
39//! println!("- {}", perspective);
40//! }
41//! Ok(())
42//! }
43//! ```
44//!
45//! ### Python Usage
46//!
47//! ```python
48//! from reasonkit import Reasoner, Profile, run_gigathink
49//!
50//! # Quick usage with convenience functions
51//! result = run_gigathink("What factors drive startup success?")
52//! print(result.perspectives)
53//!
54//! # Full control with Reasoner class
55//! r = Reasoner()
56//! result = r.think_with_profile(Profile.Balanced, "Should we pivot?")
57//! print(f"Confidence: {result.confidence:.1%}")
58//! ```
59//!
60//! ## ThinkTools (Core Reasoning Protocols)
61//!
62//! ReasonKit provides five core ThinkTools, each implementing a specific reasoning strategy:
63//!
64//! | Tool | Code | Purpose | Output |
65//! |------|------|---------|--------|
66//! | **GigaThink** | `gt` | Expansive creative thinking | 10+ diverse perspectives |
67//! | **LaserLogic** | `ll` | Precision deductive reasoning | Validity assessment, fallacy detection |
68//! | **BedRock** | `br` | First principles decomposition | Core axioms, rebuilt foundations |
69//! | **ProofGuard** | `pg` | Multi-source verification | Triangulated evidence (3+ sources) |
70//! | **BrutalHonesty** | `bh` | Adversarial self-critique | Flaws, weaknesses, counter-arguments |
71//!
72//! ## Reasoning Profiles
73//!
74//! Profiles chain multiple ThinkTools together for comprehensive analysis:
75//!
76//! | Profile | ThinkTools | Min Confidence | Use Case |
77//! |---------|------------|----------------|----------|
78//! | `quick` | GT, LL | 70% | Fast initial analysis |
79//! | `balanced` | GT, LL, BR, PG | 80% | Standard decision-making |
80//! | `deep` | All 5 | 85% | Complex problems |
81//! | `paranoid` | All 5 + validation | 95% | High-stakes decisions |
82//!
83//! ## Feature Flags
84//!
//! - `memory` - Enable memory layer integration via `reasonkit-mem`
//! - `python` - Enable Python bindings via PyO3
//! - `aesthetic` - Enable UI/UX assessment capabilities
//! - `vibe` - Enable VIBE protocol validation system
//! - `code-intelligence` - Enable multi-language code analysis
//! - `arf` - Enable Autonomous Reasoning Framework
//! - `glm46` - Enable GLM-4.6 model integration
//! - `minimax` - Enable MiniMax M2 model integration
91//!
92//! ## Supported LLM Providers
93//!
94//! ReasonKit supports 18+ LLM providers out of the box:
95//!
96//! - **Major Cloud**: Anthropic, OpenAI, Google Gemini, Vertex AI, Azure OpenAI, AWS Bedrock
97//! - **Specialized**: xAI (Grok), Groq, Mistral, DeepSeek, Cohere, Perplexity, Cerebras
98//! - **Inference**: Together AI, Fireworks AI, Alibaba Qwen
99//! - **Aggregation**: OpenRouter (300+ models), Cloudflare AI Gateway
100//!
101//! ## Architecture
102//!
103//! ```text
104//! +------------------+ +------------------+ +------------------+
105//! | User Query | --> | Protocol Engine | --> | Auditable Output|
106//! +------------------+ +------------------+ +------------------+
107//! |
108//! +-------------+-------------+
109//! | | |
110//! +----v----+ +-----v-----+ +----v----+
111//! | LLM | | ThinkTool | | Profile |
112//! | Client | | Modules | | System |
113//! +---------+ +-----------+ +---------+
114//! ```
115//!
116//! ## Modules
117//!
118//! - [`thinktool`] - Core ThinkTool protocols and execution engine
119//! - [`engine`] - High-level async reasoning loop with streaming
120//! - [`orchestration`] - Long-horizon task orchestration (100+ tool calls)
121//! - [`error`] - Error types and result aliases
122//! - [`telemetry`] - Metrics and observability
123//!
124//! ## Optional Modules (Feature-Gated)
125//!
//! - \[`bindings`\] - Python bindings via PyO3 (requires `python`)
//! - \[`rag`\] - Full RAG engine with LLM integration (requires `memory`)
//! - \[`aesthetic`\] - UI/UX assessment system (requires `aesthetic`)
//! - \[`vibe`\] - VIBE protocol validation (requires `vibe`)
//! - \[`code_intelligence`\] - Multi-language code analysis (requires `code-intelligence`)
//! - \[`arf`\] - Autonomous Reasoning Framework (requires `arf`)
//! - \[`glm46`\] - GLM-4.6 model integration (requires `glm46`)
131
132// TRACKED: Enable `#![warn(missing_docs)]` before v1.0 release
133// Status: All public APIs need documentation first (tracked in QA plan)
134#![allow(missing_docs)]
135#![warn(clippy::all)]
136#![deny(unsafe_code)]
137
138// ============================================================================
// CORE MODULES (always available unless gated by a `#[cfg(feature = ...)]` attribute)
140// ============================================================================
141
142/// Python bindings via PyO3 for using ReasonKit from Python.
143///
144/// Build with `maturin develop --release` for development or
145/// `maturin build --release` for distribution.
146///
147/// See module documentation for Python usage examples.
148#[cfg(feature = "python")]
149pub mod bindings;
150
151/// Global constants and configuration defaults.
152pub mod constants;
153
154/// High-performance async reasoning engine with streaming support.
155///
156/// The engine module provides [`ReasoningLoop`](engine::ReasoningLoop) for
157/// orchestrating ThinkTool execution with memory integration and concurrent
158/// processing.
159pub mod engine;
160
161/// Error types and result aliases for ReasonKit operations.
162///
163/// All ReasonKit functions return [`Result<T>`](Result) which is an alias
164/// for `std::result::Result<T, Error>`.
165pub mod error;
166
167/// Evaluation and benchmarking utilities.
168pub mod evaluation;
169
170/// Provider-neutral LLM clients (e.g. Ollama `/api/chat`).
171pub mod llm;
172
173/// Document ingestion and processing pipeline.
174pub mod ingestion;
175
176/// MiniMax M2 model integration for 100+ tool calling.
177///
178/// Provides protocol generation, benchmarking, and long-horizon execution
179/// capabilities leveraging M2's exceptional tool-use performance.
180pub mod m2;
181
182/// MCP (Model Context Protocol) server implementations.
183///
184/// ReasonKit implements MCP servers in Rust (no Node.js) for tool integration.
185pub mod mcp;
186
187/// Long-horizon task orchestration system.
188///
189/// Coordinates complex multi-step operations across ReasonKit components
190/// with state persistence, error recovery, and performance monitoring.
191pub mod orchestration;
192
193/// Document processing and transformation utilities.
194pub mod processing;
195
196/// Telemetry, metrics, and observability infrastructure.
197///
198/// Provides OpenTelemetry integration for tracing, metrics collection,
199/// and privacy-preserving data export.
200pub mod telemetry;
201
202/// ThinkTool protocol engine - the core of ReasonKit.
203///
204/// This module provides the structured reasoning protocols that transform
205/// ad-hoc LLM prompting into auditable, reproducible reasoning chains.
206///
207/// # Key Types
208///
209/// - [`ProtocolExecutor`](thinktool::ProtocolExecutor) - Executes protocols with LLM integration
210/// - [`ProtocolInput`](thinktool::ProtocolInput) - Input data for protocol execution
211/// - [`ProtocolOutput`](thinktool::ProtocolOutput) - Results with confidence scores
212///
213/// # Example
214///
215/// ```rust,ignore
216/// use reasonkit::thinktool::{ProtocolExecutor, ProtocolInput};
217///
218/// let executor = ProtocolExecutor::new()?;
219/// let result = executor.execute(
220/// "gigathink",
221/// ProtocolInput::query("Analyze market trends")
222/// ).await?;
223/// ```
224pub mod thinktool;
225
226/// Verification and validation utilities.
227pub mod verification;
228
229/// Web interface and HTTP API components.
230pub mod web;
231
232/// Web interface handlers and routes.
233pub mod web_interface;
234
235/// Core trait definitions for cross-crate integration.
236///
237/// Provides trait contracts used by optional companion crates:
238/// - reasonkit-mem
239/// - reasonkit-web
240pub mod traits;
241
242/// Aesthetic Expression Mastery System - M2-Enhanced UI/UX Assessment.
243///
244/// Leverages VIBE Benchmark Excellence (91.5% Web, 89.7% Android, 88.0% iOS)
245/// for automated UI/UX quality assessment.
246#[cfg(feature = "aesthetic")]
247pub mod aesthetic;
248
249/// VIBE Protocol Validation System.
250///
251/// Implements the revolutionary "Agent-as-a-Verifier" paradigm for
252/// validating AI outputs against structured protocols.
253#[cfg(feature = "vibe")]
254pub mod vibe;
255
256/// Multi-Language Code Intelligence Enhancement.
257///
258/// Provides code parsing, analysis, and understanding capabilities
259/// across multiple programming languages.
260#[cfg(feature = "code-intelligence")]
261pub mod code_intelligence;
262
263// ============================================================================
// MEMORY AND OTHER OPTIONAL MODULES (feature-gated where marked with `#[cfg(feature = ...)]`)
265// ============================================================================
266
267/// Memory interface trait for reasonkit-mem integration.
268///
269/// Defines how reasonkit-core communicates with the reasonkit-mem
270/// crate for storage, retrieval, and embedding operations.
271pub mod memory_interface;
272
273/// Re-export reasonkit-mem types when memory feature is enabled.
274#[cfg(feature = "memory")]
275pub use reasonkit_mem;
276
277/// Re-export commonly used types from reasonkit-mem for convenience.
278#[cfg(feature = "memory")]
279pub use reasonkit_mem::{
280 embedding, indexing, raptor, retrieval, storage, Error as MemError, Result as MemResult,
281};
282
283/// RAG (Retrieval-Augmented Generation) engine with LLM integration.
284///
285/// Provides the full RAG pipeline including document retrieval,
286/// context augmentation, and LLM-powered generation.
287#[cfg(feature = "memory")]
288pub mod rag;
289
290/// Autonomous Reasoning Framework for self-directed AI operations.
291#[cfg(feature = "arf")]
292pub mod arf;
293
294/// GLM-4.6 model integration for agentic coordination and cost-efficient reasoning.
295#[cfg(feature = "glm46")]
296pub mod glm46;
297
298// ============================================================================
299// RE-EXPORTS
300// ============================================================================
301
302pub use error::{Error, Result};
303
304/// Crate version string for runtime logging and API responses.
305///
306/// # Example
307///
308/// ```rust
309/// println!("ReasonKit Core v{}", reasonkit::VERSION);
310/// ```
311pub const VERSION: &str = env!("CARGO_PKG_VERSION");
312
313// Re-export orchestration system types
314pub use orchestration::{
315 ComponentCoordinator, ErrorRecovery, LongHorizonConfig, LongHorizonOrchestrator,
316 LongHorizonResult, PerformanceTracker, StateManager, TaskGraph, TaskNode, TaskPriority,
317 TaskStatus,
318};
319
320// Re-export engine module types
321pub use engine::{
322 Decision, MemoryContext, Profile as ReasoningProfile, ReasoningConfig, ReasoningError,
323 ReasoningEvent, ReasoningLoop, ReasoningLoopBuilder, ReasoningSession, ReasoningStep, StepKind,
324 StreamHandle, ThinkToolResult,
325};
326
327// Re-export Python bindings types for convenience
328#[cfg(feature = "python")]
329pub use bindings::{
330 Profile as PyProfile, Reasoner as PyReasoner, ThinkToolOutput as PyThinkToolOutput,
331};
332
333use chrono::{DateTime, Utc};
334use serde::{Deserialize, Serialize};
335use uuid::Uuid;
336
// PyO3 entry point; this module only exists when the `python` feature is on.
#[cfg(feature = "python")]
mod python_module {
    // Nothing below is guaranteed to use the glob import, so the
    // unused-import lint is silenced for it.
    #[allow(unused_imports)]
    use super::*;
    use pyo3::prelude::*;

    /// Entry point of the `reasonkit` Python extension module.
    ///
    /// Python invokes this when the module is imported; all classes,
    /// functions, and exceptions are registered through
    /// `crate::bindings::register_bindings`.
    ///
    /// # Building
    ///
    /// ```bash
    /// cd reasonkit-core
    /// maturin develop --release  # Development install
    /// maturin build --release    # Build wheel for distribution
    /// ```
    ///
    /// # Python Usage
    ///
    /// ```python
    /// from reasonkit import Reasoner, Profile, ReasonerError
    /// from reasonkit import run_gigathink, run_laserlogic, run_bedrock
    /// from reasonkit import run_proofguard, run_brutalhonesty
    /// from reasonkit import quick_think, balanced_think, deep_think, paranoid_think
    /// from reasonkit import version
    ///
    /// # Check version
    /// print(f"ReasonKit v{version()}")
    ///
    /// # Create reasoner (auto-detects LLM from environment)
    /// r = Reasoner(use_mock=False)
    ///
    /// # Run individual ThinkTools
    /// result = r.run_gigathink("What factors drive startup success?")
    /// for perspective in result.perspectives():
    ///     print(f"- {perspective}")
    ///
    /// # Run with profile for comprehensive analysis
    /// result = r.think_with_profile(Profile.Balanced, "Should we use microservices?")
    /// print(f"Confidence: {result.confidence:.1%}")
    ///
    /// # Convenience functions (no Reasoner instantiation needed)
    /// result = run_gigathink("Analyze market trends", use_mock=True)
    /// result = balanced_think("Complex decision to make")
    /// ```
    #[pymodule]
    #[pyo3(name = "reasonkit")]
    fn reasonkit(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
        // Delegate the whole registration to the bindings module.
        crate::bindings::register_bindings(m)?;
        Ok(())
    }
}
393
394// ============================================================================
395// CORE TYPES (always available - needed by ingestion, processing, etc.)
396// ============================================================================
397
398/// Document type categorization for the knowledge base.
399///
400/// Determines how documents are processed, indexed, and retrieved.
401///
402/// # Example
403///
404/// ```rust
405/// use reasonkit::DocumentType;
406///
407/// let doc_type = DocumentType::Paper;
408/// assert!(matches!(doc_type, DocumentType::Paper));
409/// ```
410#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
411#[serde(rename_all = "snake_case")]
412pub enum DocumentType {
413 /// Academic paper or research article (arXiv, journals, etc.)
414 Paper,
415 /// Technical documentation (API docs, guides, manuals)
416 Documentation,
417 /// Source code or code snippets
418 Code,
419 /// Personal notes or annotations
420 Note,
421 /// Transcript of audio/video content
422 Transcript,
423 /// Benchmark results or performance data
424 Benchmark,
425}
426
427/// Source type enumeration for document provenance.
428///
429/// Tracks where documents originated for citation and verification.
430///
431/// # Example
432///
433/// ```rust
434/// use reasonkit::SourceType;
435///
436/// let source = SourceType::Github;
437/// assert!(matches!(source, SourceType::Github));
438/// ```
439#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
440#[serde(rename_all = "snake_case")]
441pub enum SourceType {
442 /// arXiv preprint server
443 Arxiv,
444 /// GitHub repository
445 Github,
446 /// General website
447 Website,
448 /// Local file system
449 Local,
450 /// External API
451 Api,
452}
453
454/// Source information for a document.
455///
456/// Contains provenance data including URLs, timestamps, and version information
457/// for proper citation and retrieval tracking.
458///
459/// # Example
460///
461/// ```rust
462/// use reasonkit::{Source, SourceType};
463/// use chrono::Utc;
464///
465/// let source = Source {
466/// source_type: SourceType::Github,
467/// url: Some("https://github.com/org/repo".to_string()),
468/// path: None,
469/// arxiv_id: None,
470/// github_repo: Some("org/repo".to_string()),
471/// retrieved_at: Utc::now(),
472/// version: Some("v1.0.0".to_string()),
473/// };
474/// ```
475#[derive(Debug, Clone, Serialize, Deserialize)]
476pub struct Source {
477 /// Type of source (determines how to interpret other fields)
478 #[serde(rename = "type")]
479 pub source_type: SourceType,
480
481 /// URL of the source document (if applicable)
482 pub url: Option<String>,
483
484 /// Local file path (for local sources)
485 pub path: Option<String>,
486
487 /// arXiv paper ID (e.g., "2301.12345")
488 pub arxiv_id: Option<String>,
489
490 /// GitHub repository identifier (e.g., "owner/repo")
491 pub github_repo: Option<String>,
492
493 /// Timestamp when the document was retrieved
494 pub retrieved_at: DateTime<Utc>,
495
496 /// Version or commit hash of the source
497 pub version: Option<String>,
498}
499
500/// Author information for document metadata.
501///
502/// # Example
503///
504/// ```rust
505/// use reasonkit::Author;
506///
507/// let author = Author {
508/// name: "Jane Doe".to_string(),
509/// affiliation: Some("University of AI".to_string()),
510/// email: Some("jane@example.com".to_string()),
511/// };
512/// ```
513#[derive(Debug, Clone, Serialize, Deserialize)]
514pub struct Author {
515 /// Full name of the author
516 pub name: String,
517
518 /// Institutional affiliation
519 pub affiliation: Option<String>,
520
521 /// Contact email
522 pub email: Option<String>,
523}
524
525/// Document metadata for indexing and retrieval.
526///
527/// Contains bibliographic information, tags, and categorization data
528/// for rich document search and filtering.
529///
530/// # Example
531///
532/// ```rust
533/// use reasonkit::Metadata;
534///
535/// let metadata = Metadata {
536/// title: Some("Understanding AI Reasoning".to_string()),
537/// authors: vec![],
538/// abstract_text: Some("This paper explores...".to_string()),
539/// tags: vec!["ai".to_string(), "reasoning".to_string()],
540/// ..Default::default()
541/// };
542/// ```
543#[derive(Debug, Clone, Default, Serialize, Deserialize)]
544pub struct Metadata {
545 /// Document title
546 pub title: Option<String>,
547
548 /// List of authors
549 pub authors: Vec<Author>,
550
551 /// Abstract or summary text
552 #[serde(rename = "abstract")]
553 pub abstract_text: Option<String>,
554
555 /// Publication date (ISO 8601 format)
556 pub date: Option<String>,
557
558 /// Publication venue (journal, conference, etc.)
559 pub venue: Option<String>,
560
561 /// Citation count (if available)
562 pub citations: Option<i32>,
563
564 /// User-defined tags
565 pub tags: Vec<String>,
566
567 /// Subject categories
568 pub categories: Vec<String>,
569
570 /// Extracted keywords
571 pub keywords: Vec<String>,
572
573 /// Digital Object Identifier
574 pub doi: Option<String>,
575
576 /// License information
577 pub license: Option<String>,
578}
579
580/// References to different embedding types for a chunk.
581///
582/// Supports hybrid retrieval by tracking multiple embedding representations
583/// (dense, sparse, ColBERT) for each text chunk.
584#[derive(Debug, Clone, Default, Serialize, Deserialize)]
585pub struct EmbeddingIds {
586 /// Dense embedding ID (e.g., from OpenAI, Cohere)
587 pub dense: Option<String>,
588
589 /// Sparse embedding ID (e.g., BM25, SPLADE)
590 pub sparse: Option<String>,
591
592 /// ColBERT multi-vector embedding ID
593 pub colbert: Option<String>,
594}
595
596/// A chunk of text from a document.
597///
598/// Documents are split into chunks for embedding and retrieval.
599/// Each chunk maintains positional information and embedding references.
600///
601/// # Example
602///
603/// ```rust
604/// use reasonkit::{Chunk, EmbeddingIds};
605/// use uuid::Uuid;
606///
607/// let chunk = Chunk {
608/// id: Uuid::new_v4(),
609/// text: "This is a chunk of text...".to_string(),
610/// index: 0,
611/// start_char: 0,
612/// end_char: 26,
613/// token_count: Some(7),
614/// section: Some("Introduction".to_string()),
615/// page: Some(1),
616/// embedding_ids: EmbeddingIds::default(),
617/// };
618/// ```
619#[derive(Debug, Clone, Serialize, Deserialize)]
620pub struct Chunk {
621 /// Unique identifier for this chunk
622 pub id: Uuid,
623
624 /// The text content of the chunk
625 pub text: String,
626
627 /// Position index within the document
628 pub index: usize,
629
630 /// Starting character position in the original document
631 pub start_char: usize,
632
633 /// Ending character position in the original document
634 pub end_char: usize,
635
636 /// Estimated token count for the chunk
637 pub token_count: Option<usize>,
638
639 /// Section or heading this chunk belongs to
640 pub section: Option<String>,
641
642 /// Page number (for paginated documents)
643 pub page: Option<usize>,
644
645 /// References to stored embeddings
646 pub embedding_ids: EmbeddingIds,
647}
648
649/// Processing state enumeration for documents.
650///
651/// Tracks the current state of a document in the processing pipeline.
652#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
653#[serde(rename_all = "snake_case")]
654pub enum ProcessingState {
655 /// Document is queued for processing
656 #[default]
657 Pending,
658 /// Document is currently being processed
659 Processing,
660 /// Processing completed successfully
661 Completed,
662 /// Processing failed with errors
663 Failed,
664}
665
666/// Processing status for a document.
667///
668/// Tracks which processing stages have been completed and any errors encountered.
669#[derive(Debug, Clone, Default, Serialize, Deserialize)]
670pub struct ProcessingStatus {
671 /// Current processing state
672 pub status: ProcessingState,
673
674 /// Whether the document has been chunked
675 pub chunked: bool,
676
677 /// Whether embeddings have been generated
678 pub embedded: bool,
679
680 /// Whether the document has been indexed
681 pub indexed: bool,
682
683 /// Whether RAPTOR summarization has been applied
684 pub raptor_processed: bool,
685
686 /// List of error messages (if any)
687 pub errors: Vec<String>,
688}
689
690/// Content format enumeration.
691///
692/// Identifies the format of document content for proper parsing.
693#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
694#[serde(rename_all = "snake_case")]
695pub enum ContentFormat {
696 /// Plain text
697 #[default]
698 Text,
699 /// Markdown format
700 Markdown,
701 /// HTML content
702 Html,
703 /// LaTeX source
704 Latex,
705}
706
707/// Document content container.
708///
709/// Stores the raw content along with format and statistical information.
710#[derive(Debug, Clone, Default, Serialize, Deserialize)]
711pub struct DocumentContent {
712 /// Raw content string
713 pub raw: String,
714
715 /// Content format
716 pub format: ContentFormat,
717
718 /// Primary language code (e.g., "en", "zh")
719 pub language: String,
720
721 /// Word count
722 pub word_count: usize,
723
724 /// Character count
725 pub char_count: usize,
726}
727
728/// A document in the knowledge base.
729///
730/// The primary data structure for storing and managing documents.
731/// Contains content, metadata, processing status, and chunks.
732///
733/// # Example
734///
735/// ```rust
736/// use reasonkit::{Document, DocumentType, Source, SourceType};
737/// use chrono::Utc;
738///
739/// let source = Source {
740/// source_type: SourceType::Local,
741/// url: None,
742/// path: Some("/path/to/doc.md".to_string()),
743/// arxiv_id: None,
744/// github_repo: None,
745/// retrieved_at: Utc::now(),
746/// version: None,
747/// };
748///
749/// let doc = Document::new(DocumentType::Documentation, source)
750/// .with_content("# My Document\n\nContent here...".to_string());
751///
752/// assert_eq!(doc.doc_type, DocumentType::Documentation);
753/// assert!(doc.content.word_count > 0);
754/// ```
755#[derive(Debug, Clone, Serialize, Deserialize)]
756pub struct Document {
757 /// Unique document identifier
758 pub id: Uuid,
759
760 /// Document type categorization
761 #[serde(rename = "type")]
762 pub doc_type: DocumentType,
763
764 /// Source information for provenance
765 pub source: Source,
766
767 /// Document content
768 pub content: DocumentContent,
769
770 /// Document metadata
771 pub metadata: Metadata,
772
773 /// Processing status
774 pub processing: ProcessingStatus,
775
776 /// Text chunks for retrieval
777 pub chunks: Vec<Chunk>,
778
779 /// Creation timestamp
780 pub created_at: DateTime<Utc>,
781
782 /// Last update timestamp
783 pub updated_at: Option<DateTime<Utc>>,
784}
785
786impl Document {
787 /// Create a new document with the given type and source.
788 ///
789 /// # Arguments
790 ///
791 /// * `doc_type` - The type of document
792 /// * `source` - Source information for provenance
793 ///
794 /// # Example
795 ///
796 /// ```rust
797 /// use reasonkit::{Document, DocumentType, Source, SourceType};
798 /// use chrono::Utc;
799 ///
800 /// let source = Source {
801 /// source_type: SourceType::Local,
802 /// url: None,
803 /// path: Some("/path/to/file.txt".to_string()),
804 /// arxiv_id: None,
805 /// github_repo: None,
806 /// retrieved_at: Utc::now(),
807 /// version: None,
808 /// };
809 ///
810 /// let doc = Document::new(DocumentType::Note, source);
811 /// assert_eq!(doc.doc_type, DocumentType::Note);
812 /// ```
813 pub fn new(doc_type: DocumentType, source: Source) -> Self {
814 Self {
815 id: Uuid::new_v4(),
816 doc_type,
817 source,
818 content: DocumentContent::default(),
819 metadata: Metadata::default(),
820 processing: ProcessingStatus::default(),
821 chunks: Vec::new(),
822 created_at: Utc::now(),
823 updated_at: None,
824 }
825 }
826
827 /// Set the document content and compute statistics.
828 ///
829 /// # Arguments
830 ///
831 /// * `raw` - The raw content string
832 ///
833 /// # Example
834 ///
835 /// ```rust
836 /// use reasonkit::{Document, DocumentType, Source, SourceType};
837 /// use chrono::Utc;
838 ///
839 /// let source = Source {
840 /// source_type: SourceType::Local,
841 /// url: None,
842 /// path: None,
843 /// arxiv_id: None,
844 /// github_repo: None,
845 /// retrieved_at: Utc::now(),
846 /// version: None,
847 /// };
848 ///
849 /// let doc = Document::new(DocumentType::Note, source)
850 /// .with_content("Hello world".to_string());
851 ///
852 /// assert_eq!(doc.content.word_count, 2);
853 /// assert_eq!(doc.content.char_count, 11);
854 /// ```
855 pub fn with_content(mut self, raw: String) -> Self {
856 let word_count = raw.split_whitespace().count();
857 let char_count = raw.len();
858 self.content = DocumentContent {
859 raw,
860 format: ContentFormat::Text,
861 language: "en".to_string(),
862 word_count,
863 char_count,
864 };
865 self
866 }
867
868 /// Set the document metadata.
869 ///
870 /// # Arguments
871 ///
872 /// * `metadata` - The metadata to set
873 ///
874 /// # Example
875 ///
876 /// ```rust
877 /// use reasonkit::{Document, DocumentType, Source, SourceType, Metadata};
878 /// use chrono::Utc;
879 ///
880 /// let source = Source {
881 /// source_type: SourceType::Local,
882 /// url: None,
883 /// path: None,
884 /// arxiv_id: None,
885 /// github_repo: None,
886 /// retrieved_at: Utc::now(),
887 /// version: None,
888 /// };
889 ///
890 /// let metadata = Metadata {
891 /// title: Some("My Document".to_string()),
892 /// ..Default::default()
893 /// };
894 ///
895 /// let doc = Document::new(DocumentType::Note, source)
896 /// .with_metadata(metadata);
897 ///
898 /// assert_eq!(doc.metadata.title, Some("My Document".to_string()));
899 /// ```
900 pub fn with_metadata(mut self, metadata: Metadata) -> Self {
901 self.metadata = metadata;
902 self
903 }
904}
905
// Conversion from the core `Document` into the `reasonkit-mem` document type
// (only compiled with the `memory` feature).
#[cfg(feature = "memory")]
impl From<Document> for reasonkit_mem::Document {
    /// Moves every field of `doc` into the matching `reasonkit-mem` field,
    /// mapping each enum variant to the variant of the same name.
    fn from(doc: Document) -> Self {
        use reasonkit_mem::types::{
            Author as MemAuthor, Chunk as MemChunk, ContentFormat as MemContentFormat,
            DocumentContent as MemDocumentContent, DocumentType as MemDocumentType,
            EmbeddingIds as MemEmbeddingIds, Metadata as MemMetadata,
            ProcessingState as MemProcessingState, ProcessingStatus as MemProcessingStatus,
            Source as MemSource, SourceType as MemSourceType,
        };

        // Take the input apart once so each part can be converted on its own.
        let Document {
            id,
            doc_type,
            source,
            content,
            metadata,
            processing,
            chunks,
            created_at,
            updated_at,
        } = doc;

        let doc_type = match doc_type {
            DocumentType::Paper => MemDocumentType::Paper,
            DocumentType::Documentation => MemDocumentType::Documentation,
            DocumentType::Code => MemDocumentType::Code,
            DocumentType::Note => MemDocumentType::Note,
            DocumentType::Transcript => MemDocumentType::Transcript,
            DocumentType::Benchmark => MemDocumentType::Benchmark,
        };

        let source = MemSource {
            source_type: match source.source_type {
                SourceType::Arxiv => MemSourceType::Arxiv,
                SourceType::Github => MemSourceType::Github,
                SourceType::Website => MemSourceType::Website,
                SourceType::Local => MemSourceType::Local,
                SourceType::Api => MemSourceType::Api,
            },
            url: source.url,
            path: source.path,
            arxiv_id: source.arxiv_id,
            github_repo: source.github_repo,
            retrieved_at: source.retrieved_at,
            version: source.version,
        };

        let content = MemDocumentContent {
            raw: content.raw,
            format: match content.format {
                ContentFormat::Text => MemContentFormat::Text,
                ContentFormat::Markdown => MemContentFormat::Markdown,
                ContentFormat::Html => MemContentFormat::Html,
                ContentFormat::Latex => MemContentFormat::Latex,
            },
            language: content.language,
            word_count: content.word_count,
            char_count: content.char_count,
        };

        let metadata = MemMetadata {
            title: metadata.title,
            authors: metadata
                .authors
                .into_iter()
                .map(|author| MemAuthor {
                    name: author.name,
                    affiliation: author.affiliation,
                    email: author.email,
                })
                .collect(),
            abstract_text: metadata.abstract_text,
            date: metadata.date,
            venue: metadata.venue,
            citations: metadata.citations,
            tags: metadata.tags,
            categories: metadata.categories,
            keywords: metadata.keywords,
            doi: metadata.doi,
            license: metadata.license,
        };

        let processing = MemProcessingStatus {
            status: match processing.status {
                ProcessingState::Pending => MemProcessingState::Pending,
                ProcessingState::Processing => MemProcessingState::Processing,
                ProcessingState::Completed => MemProcessingState::Completed,
                ProcessingState::Failed => MemProcessingState::Failed,
            },
            chunked: processing.chunked,
            embedded: processing.embedded,
            indexed: processing.indexed,
            raptor_processed: processing.raptor_processed,
            errors: processing.errors,
        };

        let chunks = chunks
            .into_iter()
            .map(|chunk| MemChunk {
                id: chunk.id,
                text: chunk.text,
                index: chunk.index,
                start_char: chunk.start_char,
                end_char: chunk.end_char,
                token_count: chunk.token_count,
                section: chunk.section,
                page: chunk.page,
                embedding_ids: MemEmbeddingIds {
                    dense: chunk.embedding_ids.dense,
                    sparse: chunk.embedding_ids.sparse,
                    colbert: chunk.embedding_ids.colbert,
                },
            })
            .collect();

        reasonkit_mem::Document {
            id,
            doc_type,
            source,
            content,
            metadata,
            processing,
            chunks,
            created_at,
            updated_at,
        }
    }
}
1048
1049/// Source of a search match for hybrid retrieval.
1050///
1051/// Indicates which retrieval method produced a search result,
1052/// enabling score fusion and result explanation.
1053#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
1054#[serde(rename_all = "snake_case")]
1055pub enum MatchSource {
1056 /// Dense vector retrieval (semantic similarity)
1057 Dense,
1058 /// Sparse retrieval (BM25, keyword matching)
1059 Sparse,
1060 /// Hybrid retrieval (combined dense + sparse)
1061 Hybrid,
1062 /// RAPTOR hierarchical retrieval
1063 Raptor,
1064}
1065
1066/// Search result from a query.
1067///
1068/// Contains the matched chunk, relevance score, and source information.
1069///
1070/// # Example
1071///
1072/// ```rust
1073/// use reasonkit::{SearchResult, MatchSource, Chunk, EmbeddingIds};
1074/// use uuid::Uuid;
1075///
1076/// let chunk = Chunk {
1077/// id: Uuid::new_v4(),
1078/// text: "Relevant content...".to_string(),
1079/// index: 0,
1080/// start_char: 0,
1081/// end_char: 19,
1082/// token_count: Some(2),
1083/// section: None,
1084/// page: None,
1085/// embedding_ids: EmbeddingIds::default(),
1086/// };
1087///
1088/// let result = SearchResult {
1089/// score: 0.95,
1090/// document_id: Uuid::new_v4(),
1091/// chunk,
1092/// match_source: MatchSource::Dense,
1093/// };
1094///
1095/// assert!(result.score > 0.9);
1096/// ```
1097#[derive(Debug, Clone, Serialize, Deserialize)]
1098pub struct SearchResult {
1099 /// Relevance score (higher is more relevant)
1100 pub score: f32,
1101
1102 /// ID of the document containing the match
1103 pub document_id: Uuid,
1104
1105 /// The matched chunk
1106 pub chunk: Chunk,
1107
1108 /// Which retrieval method produced this match
1109 pub match_source: MatchSource,
1110}
1111
1112// ============================================================================
// RETRIEVAL CONFIGURATION (re-exported from `reasonkit-mem` with the `memory` feature; local fallback otherwise)
1114// ============================================================================
1115
1116#[cfg(feature = "memory")]
1117pub use reasonkit_mem::RetrievalConfig;
1118
1119/// Simple retrieval configuration (available without memory feature).
1120///
1121/// Provides basic retrieval parameters when the full memory layer is not enabled.
1122#[cfg(not(feature = "memory"))]
1123#[derive(Debug, Clone, Serialize, Deserialize)]
1124pub struct RetrievalConfig {
1125 /// Maximum number of results to return
1126 pub top_k: usize,
1127
1128 /// Minimum relevance score threshold
1129 pub min_score: f32,
1130
1131 /// Weight for dense retrieval in hybrid mode (0.0-1.0)
1132 pub alpha: f32,
1133
1134 /// Whether to use RAPTOR hierarchical retrieval
1135 pub use_raptor: bool,
1136
1137 /// Whether to rerank results
1138 pub rerank: bool,
1139}
1140
1141#[cfg(not(feature = "memory"))]
1142impl Default for RetrievalConfig {
1143 fn default() -> Self {
1144 Self {
1145 top_k: 10,
1146 min_score: 0.0,
1147 alpha: 0.7,
1148 use_raptor: false,
1149 rerank: false,
1150 }
1151 }
1152}
1153
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SourceType::Local` source stamped with the current time.
    ///
    /// Only `path` differs between the tests below; every other optional
    /// field is left as `None`.
    fn local_source(path: Option<&str>) -> Source {
        Source {
            source_type: SourceType::Local,
            url: None,
            path: path.map(String::from),
            arxiv_id: None,
            github_repo: None,
            retrieved_at: Utc::now(),
            version: None,
        }
    }

    #[test]
    fn test_core_compiles() {
        // Intentionally empty: if this test runs at all, the crate compiled.
    }

    #[test]
    fn test_document_creation() {
        let doc = Document::new(DocumentType::Note, local_source(Some("/test.txt")));

        // The constructor must keep the document type it was given.
        assert_eq!(doc.doc_type, DocumentType::Note);
    }

    #[test]
    fn test_document_with_content() {
        let text = "Hello world test".to_string();
        let doc = Document::new(DocumentType::Note, local_source(None)).with_content(text);

        // Expected statistics for the literal above: 3 words, 16 characters.
        let content = &doc.content;
        assert_eq!(content.word_count, 3);
        assert_eq!(content.char_count, 16);
    }

    #[test]
    fn test_version_available() {
        // The crate must expose a non-empty version string.
        assert!(!VERSION.is_empty());
    }
}