reasonkit/
lib.rs

1#![doc = include_str!("../README.md")]
2// doc_auto_cfg was merged into doc_cfg in Rust 1.92
3#![cfg_attr(docsrs, feature(doc_cfg))]
4
5//! # ReasonKit Core
6//!
7//! AI Thinking Enhancement System - Turn Prompts into Protocols
8//!
9//! ReasonKit Core is a **pure reasoning engine** that improves AI thinking patterns
10//! through structured reasoning protocols called ThinkTools. It transforms ad-hoc
11//! LLM prompting into auditable, reproducible reasoning chains.
12//!
13//! ## Philosophy
14//!
15//! **"Designed, Not Dreamed"** - Structure beats raw intelligence. By imposing
16//! systematic reasoning protocols, ReasonKit helps AI models produce more reliable,
17//! verifiable, and explainable outputs.
18//!
19//! ## Quick Start
20//!
21//! ### Rust Usage
22//!
23//! ```rust,ignore
24//! use reasonkit::thinktool::{ProtocolExecutor, ProtocolInput};
25//!
26//! #[tokio::main]
27//! async fn main() -> anyhow::Result<()> {
28//!     // Create executor (auto-detects LLM from environment)
29//!     let executor = ProtocolExecutor::new()?;
30//!
31//!     // Run GigaThink for multi-perspective analysis
32//!     let result = executor.execute(
33//!         "gigathink",
34//!         ProtocolInput::query("Should we use microservices?")
35//!     ).await?;
36//!
37//!     println!("Confidence: {:.2}", result.confidence);
38//!     for perspective in result.perspectives() {
39//!         println!("- {}", perspective);
40//!     }
41//!     Ok(())
42//! }
43//! ```
44//!
45//! ### Python Usage
46//!
47//! ```python
48//! from reasonkit import Reasoner, Profile, run_gigathink
49//!
50//! # Quick usage with convenience functions
51//! result = run_gigathink("What factors drive startup success?")
52//! print(result.perspectives)
53//!
54//! # Full control with Reasoner class
55//! r = Reasoner()
56//! result = r.think_with_profile(Profile.Balanced, "Should we pivot?")
57//! print(f"Confidence: {result.confidence:.1%}")
58//! ```
59//!
60//! ## ThinkTools (Core Reasoning Protocols)
61//!
62//! ReasonKit provides five core ThinkTools, each implementing a specific reasoning strategy:
63//!
64//! | Tool | Code | Purpose | Output |
65//! |------|------|---------|--------|
66//! | **GigaThink** | `gt` | Expansive creative thinking | 10+ diverse perspectives |
67//! | **LaserLogic** | `ll` | Precision deductive reasoning | Validity assessment, fallacy detection |
68//! | **BedRock** | `br` | First principles decomposition | Core axioms, rebuilt foundations |
69//! | **ProofGuard** | `pg` | Multi-source verification | Triangulated evidence (3+ sources) |
70//! | **BrutalHonesty** | `bh` | Adversarial self-critique | Flaws, weaknesses, counter-arguments |
71//!
72//! ## Reasoning Profiles
73//!
74//! Profiles chain multiple ThinkTools together for comprehensive analysis:
75//!
76//! | Profile | ThinkTools | Min Confidence | Use Case |
77//! |---------|------------|----------------|----------|
78//! | `quick` | GT, LL | 70% | Fast initial analysis |
79//! | `balanced` | GT, LL, BR, PG | 80% | Standard decision-making |
80//! | `deep` | All 5 | 85% | Complex problems |
81//! | `paranoid` | All 5 + validation | 95% | High-stakes decisions |
82//!
83//! ## Feature Flags
84//!
85//! - `memory` - Enable memory layer integration via `reasonkit-mem`
86//! - `aesthetic` - Enable UI/UX assessment capabilities
87//! - `vibe` - Enable VIBE protocol validation system
88//! - `code-intelligence` - Enable multi-language code analysis
89//! - `arf` - Enable Autonomous Reasoning Framework
//! - `minimax` - Enable MiniMax M2 model integration
//! - `glm46` - Enable GLM-4.6 model integration
//! - `python` - Enable Python bindings via PyO3
91//!
92//! ## Supported LLM Providers
93//!
94//! ReasonKit supports 18+ LLM providers out of the box:
95//!
96//! - **Major Cloud**: Anthropic, OpenAI, Google Gemini, Vertex AI, Azure OpenAI, AWS Bedrock
97//! - **Specialized**: xAI (Grok), Groq, Mistral, DeepSeek, Cohere, Perplexity, Cerebras
98//! - **Inference**: Together AI, Fireworks AI, Alibaba Qwen
99//! - **Aggregation**: OpenRouter (300+ models), Cloudflare AI Gateway
100//!
101//! ## Architecture
102//!
103//! ```text
104//! +------------------+     +------------------+     +------------------+
105//! |   User Query     | --> | Protocol Engine  | --> |  Auditable Output|
106//! +------------------+     +------------------+     +------------------+
107//!                                  |
108//!                    +-------------+-------------+
109//!                    |             |             |
110//!               +----v----+  +-----v-----+  +----v----+
111//!               | LLM     |  | ThinkTool |  | Profile |
112//!               | Client  |  | Modules   |  | System  |
113//!               +---------+  +-----------+  +---------+
114//! ```
115//!
116//! ## Modules
117//!
118//! - [`thinktool`] - Core ThinkTool protocols and execution engine
119//! - [`engine`] - High-level async reasoning loop with streaming
120//! - [`orchestration`] - Long-horizon task orchestration (100+ tool calls)
121//! - [`error`] - Error types and result aliases
122//! - [`telemetry`] - Metrics and observability
123//!
124//! ## Optional Modules (Feature-Gated)
125//!
126//! - \[`bindings`\] - Python bindings via PyO3 (requires `python`)
127//! - \[`rag`\] - Full RAG engine with LLM integration (requires `memory`)
128//! - \[`aesthetic`\] - UI/UX assessment system (requires `aesthetic`)
129//! - \[`vibe`\] - VIBE protocol validation (requires `vibe`)
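//! - \[`code_intelligence`\] - Multi-language code analysis (requires `code-intelligence`)
//! - \[`arf`\] - Autonomous Reasoning Framework (requires `arf`)
//! - \[`glm46`\] - GLM-4.6 model integration (requires `glm46`)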
131
132// TRACKED: Enable `#![warn(missing_docs)]` before v1.0 release
133// Status: All public APIs need documentation first (tracked in QA plan)
134#![allow(missing_docs)]
135#![warn(clippy::all)]
136#![deny(unsafe_code)]
137
138// ============================================================================
// CORE MODULES (always available unless gated by a `#[cfg(feature = ...)]` attribute)
140// ============================================================================
141
142/// Python bindings via PyO3 for using ReasonKit from Python.
143///
144/// Build with `maturin develop --release` for development or
145/// `maturin build --release` for distribution.
146///
147/// See module documentation for Python usage examples.
148#[cfg(feature = "python")]
149pub mod bindings;
150
151/// Global constants and configuration defaults.
152pub mod constants;
153
154/// High-performance async reasoning engine with streaming support.
155///
156/// The engine module provides [`ReasoningLoop`](engine::ReasoningLoop) for
157/// orchestrating ThinkTool execution with memory integration and concurrent
158/// processing.
159pub mod engine;
160
161/// Error types and result aliases for ReasonKit operations.
162///
/// Fallible ReasonKit operations return [`Result<T>`](Result), which is an alias
/// for `std::result::Result<T, Error>`.
165pub mod error;
166
167/// Evaluation and benchmarking utilities.
168pub mod evaluation;
169
170/// Provider-neutral LLM clients (e.g. Ollama `/api/chat`).
171pub mod llm;
172
173/// Document ingestion and processing pipeline.
174pub mod ingestion;
175
176/// MiniMax M2 model integration for 100+ tool calling.
177///
178/// Provides protocol generation, benchmarking, and long-horizon execution
179/// capabilities leveraging M2's exceptional tool-use performance.
180pub mod m2;
181
182/// MCP (Model Context Protocol) server implementations.
183///
184/// ReasonKit implements MCP servers in Rust (no Node.js) for tool integration.
185pub mod mcp;
186
187/// Long-horizon task orchestration system.
188///
189/// Coordinates complex multi-step operations across ReasonKit components
190/// with state persistence, error recovery, and performance monitoring.
191pub mod orchestration;
192
193/// Document processing and transformation utilities.
194pub mod processing;
195
196/// Telemetry, metrics, and observability infrastructure.
197///
198/// Provides OpenTelemetry integration for tracing, metrics collection,
199/// and privacy-preserving data export.
200pub mod telemetry;
201
202/// ThinkTool protocol engine - the core of ReasonKit.
203///
204/// This module provides the structured reasoning protocols that transform
205/// ad-hoc LLM prompting into auditable, reproducible reasoning chains.
206///
207/// # Key Types
208///
209/// - [`ProtocolExecutor`](thinktool::ProtocolExecutor) - Executes protocols with LLM integration
210/// - [`ProtocolInput`](thinktool::ProtocolInput) - Input data for protocol execution
211/// - [`ProtocolOutput`](thinktool::ProtocolOutput) - Results with confidence scores
212///
213/// # Example
214///
215/// ```rust,ignore
216/// use reasonkit::thinktool::{ProtocolExecutor, ProtocolInput};
217///
218/// let executor = ProtocolExecutor::new()?;
219/// let result = executor.execute(
220///     "gigathink",
221///     ProtocolInput::query("Analyze market trends")
222/// ).await?;
223/// ```
224pub mod thinktool;
225
226/// Verification and validation utilities.
227pub mod verification;
228
229/// Web interface and HTTP API components.
230pub mod web;
231
232/// Web interface handlers and routes.
233pub mod web_interface;
234
235/// Core trait definitions for cross-crate integration.
236///
237/// Provides trait contracts used by optional companion crates:
238/// - reasonkit-mem
239/// - reasonkit-web
240pub mod traits;
241
242/// Aesthetic Expression Mastery System - M2-Enhanced UI/UX Assessment.
243///
244/// Leverages VIBE Benchmark Excellence (91.5% Web, 89.7% Android, 88.0% iOS)
245/// for automated UI/UX quality assessment.
246#[cfg(feature = "aesthetic")]
247pub mod aesthetic;
248
249/// VIBE Protocol Validation System.
250///
251/// Implements the revolutionary "Agent-as-a-Verifier" paradigm for
252/// validating AI outputs against structured protocols.
253#[cfg(feature = "vibe")]
254pub mod vibe;
255
256/// Multi-Language Code Intelligence Enhancement.
257///
258/// Provides code parsing, analysis, and understanding capabilities
259/// across multiple programming languages.
260#[cfg(feature = "code-intelligence")]
261pub mod code_intelligence;
262
263// ============================================================================
// OPTIONAL MODULES (`memory_interface` is always available; the rest require `memory`, `arf` or `glm46`)
265// ============================================================================
266
267/// Memory interface trait for reasonkit-mem integration.
268///
269/// Defines how reasonkit-core communicates with the reasonkit-mem
270/// crate for storage, retrieval, and embedding operations.
271pub mod memory_interface;
272
273/// Re-export reasonkit-mem types when memory feature is enabled.
274#[cfg(feature = "memory")]
275pub use reasonkit_mem;
276
277/// Re-export commonly used types from reasonkit-mem for convenience.
278#[cfg(feature = "memory")]
279pub use reasonkit_mem::{
280    embedding, indexing, raptor, retrieval, storage, Error as MemError, Result as MemResult,
281};
282
283/// RAG (Retrieval-Augmented Generation) engine with LLM integration.
284///
285/// Provides the full RAG pipeline including document retrieval,
286/// context augmentation, and LLM-powered generation.
287#[cfg(feature = "memory")]
288pub mod rag;
289
290/// Autonomous Reasoning Framework for self-directed AI operations.
291#[cfg(feature = "arf")]
292pub mod arf;
293
294/// GLM-4.6 model integration for agentic coordination and cost-efficient reasoning.
295#[cfg(feature = "glm46")]
296pub mod glm46;
297
298// ============================================================================
299// RE-EXPORTS
300// ============================================================================
301
302pub use error::{Error, Result};
303
304/// Crate version string for runtime logging and API responses.
305///
306/// # Example
307///
308/// ```rust
309/// println!("ReasonKit Core v{}", reasonkit::VERSION);
310/// ```
311pub const VERSION: &str = env!("CARGO_PKG_VERSION");
312
313// Re-export orchestration system types
314pub use orchestration::{
315    ComponentCoordinator, ErrorRecovery, LongHorizonConfig, LongHorizonOrchestrator,
316    LongHorizonResult, PerformanceTracker, StateManager, TaskGraph, TaskNode, TaskPriority,
317    TaskStatus,
318};
319
320// Re-export engine module types
321pub use engine::{
322    Decision, MemoryContext, Profile as ReasoningProfile, ReasoningConfig, ReasoningError,
323    ReasoningEvent, ReasoningLoop, ReasoningLoopBuilder, ReasoningSession, ReasoningStep, StepKind,
324    StreamHandle, ThinkToolResult,
325};
326
327// Re-export Python bindings types for convenience
328#[cfg(feature = "python")]
329pub use bindings::{
330    Profile as PyProfile, Reasoner as PyReasoner, ThinkToolOutput as PyThinkToolOutput,
331};
332
333use chrono::{DateTime, Utc};
334use serde::{Deserialize, Serialize};
335use uuid::Uuid;
336
// PyO3 entry point; this module is compiled only when the `python` feature is on.
#[cfg(feature = "python")]
mod python_module {
    // The glob import is kept as-is. Nothing visible in this module names a
    // crate-root item through it, hence the lint suppression.
    #[allow(unused_imports)]
    use super::*;
    use pyo3::prelude::*;

    // NOTE: PyO3 applies the doc comment on a `#[pymodule]` function as the
    // Python module's docstring, so the text below is runtime-visible and is
    // kept verbatim.

    /// Python module entry point for ReasonKit.
    ///
    /// This is the main entry point for the Python bindings, automatically
    /// called when the module is imported in Python.
    ///
    /// # Building
    ///
    /// ```bash
    /// cd reasonkit-core
    /// maturin develop --release   # Development install
    /// maturin build --release     # Build wheel for distribution
    /// ```
    ///
    /// # Python Usage
    ///
    /// ```python
    /// from reasonkit import Reasoner, Profile, ReasonerError
    /// from reasonkit import run_gigathink, run_laserlogic, run_bedrock
    /// from reasonkit import run_proofguard, run_brutalhonesty
    /// from reasonkit import quick_think, balanced_think, deep_think, paranoid_think
    /// from reasonkit import version
    ///
    /// # Check version
    /// print(f"ReasonKit v{version()}")
    ///
    /// # Create reasoner (auto-detects LLM from environment)
    /// r = Reasoner(use_mock=False)
    ///
    /// # Run individual ThinkTools
    /// result = r.run_gigathink("What factors drive startup success?")
    /// for perspective in result.perspectives():
    ///     print(f"- {perspective}")
    ///
    /// # Run with profile for comprehensive analysis
    /// result = r.think_with_profile(Profile.Balanced, "Should we use microservices?")
    /// print(f"Confidence: {result.confidence:.1%}")
    ///
    /// # Convenience functions (no Reasoner instantiation needed)
    /// result = run_gigathink("Analyze market trends", use_mock=True)
    /// result = balanced_think("Complex decision to make")
    /// ```
    #[pymodule]
    #[pyo3(name = "reasonkit")]
    fn reasonkit(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
        // All classes, functions and exceptions are registered by the bindings module.
        crate::bindings::register_bindings(m)?;
        Ok(())
    }
}
393
394// ============================================================================
395// CORE TYPES (always available - needed by ingestion, processing, etc.)
396// ============================================================================
397
398/// Document type categorization for the knowledge base.
399///
400/// Determines how documents are processed, indexed, and retrieved.
401///
402/// # Example
403///
404/// ```rust
405/// use reasonkit::DocumentType;
406///
407/// let doc_type = DocumentType::Paper;
408/// assert!(matches!(doc_type, DocumentType::Paper));
409/// ```
410#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
411#[serde(rename_all = "snake_case")]
412pub enum DocumentType {
413    /// Academic paper or research article (arXiv, journals, etc.)
414    Paper,
415    /// Technical documentation (API docs, guides, manuals)
416    Documentation,
417    /// Source code or code snippets
418    Code,
419    /// Personal notes or annotations
420    Note,
421    /// Transcript of audio/video content
422    Transcript,
423    /// Benchmark results or performance data
424    Benchmark,
425}
426
427/// Source type enumeration for document provenance.
428///
429/// Tracks where documents originated for citation and verification.
430///
431/// # Example
432///
433/// ```rust
434/// use reasonkit::SourceType;
435///
436/// let source = SourceType::Github;
437/// assert!(matches!(source, SourceType::Github));
438/// ```
439#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
440#[serde(rename_all = "snake_case")]
441pub enum SourceType {
442    /// arXiv preprint server
443    Arxiv,
444    /// GitHub repository
445    Github,
446    /// General website
447    Website,
448    /// Local file system
449    Local,
450    /// External API
451    Api,
452}
453
454/// Source information for a document.
455///
456/// Contains provenance data including URLs, timestamps, and version information
457/// for proper citation and retrieval tracking.
458///
459/// # Example
460///
461/// ```rust
462/// use reasonkit::{Source, SourceType};
463/// use chrono::Utc;
464///
465/// let source = Source {
466///     source_type: SourceType::Github,
467///     url: Some("https://github.com/org/repo".to_string()),
468///     path: None,
469///     arxiv_id: None,
470///     github_repo: Some("org/repo".to_string()),
471///     retrieved_at: Utc::now(),
472///     version: Some("v1.0.0".to_string()),
473/// };
474/// ```
475#[derive(Debug, Clone, Serialize, Deserialize)]
476pub struct Source {
477    /// Type of source (determines how to interpret other fields)
478    #[serde(rename = "type")]
479    pub source_type: SourceType,
480
481    /// URL of the source document (if applicable)
482    pub url: Option<String>,
483
484    /// Local file path (for local sources)
485    pub path: Option<String>,
486
487    /// arXiv paper ID (e.g., "2301.12345")
488    pub arxiv_id: Option<String>,
489
490    /// GitHub repository identifier (e.g., "owner/repo")
491    pub github_repo: Option<String>,
492
493    /// Timestamp when the document was retrieved
494    pub retrieved_at: DateTime<Utc>,
495
496    /// Version or commit hash of the source
497    pub version: Option<String>,
498}
499
500/// Author information for document metadata.
501///
502/// # Example
503///
504/// ```rust
505/// use reasonkit::Author;
506///
507/// let author = Author {
508///     name: "Jane Doe".to_string(),
509///     affiliation: Some("University of AI".to_string()),
510///     email: Some("jane@example.com".to_string()),
511/// };
512/// ```
513#[derive(Debug, Clone, Serialize, Deserialize)]
514pub struct Author {
515    /// Full name of the author
516    pub name: String,
517
518    /// Institutional affiliation
519    pub affiliation: Option<String>,
520
521    /// Contact email
522    pub email: Option<String>,
523}
524
525/// Document metadata for indexing and retrieval.
526///
527/// Contains bibliographic information, tags, and categorization data
528/// for rich document search and filtering.
529///
530/// # Example
531///
532/// ```rust
533/// use reasonkit::Metadata;
534///
535/// let metadata = Metadata {
536///     title: Some("Understanding AI Reasoning".to_string()),
537///     authors: vec![],
538///     abstract_text: Some("This paper explores...".to_string()),
539///     tags: vec!["ai".to_string(), "reasoning".to_string()],
540///     ..Default::default()
541/// };
542/// ```
543#[derive(Debug, Clone, Default, Serialize, Deserialize)]
544pub struct Metadata {
545    /// Document title
546    pub title: Option<String>,
547
548    /// List of authors
549    pub authors: Vec<Author>,
550
551    /// Abstract or summary text
552    #[serde(rename = "abstract")]
553    pub abstract_text: Option<String>,
554
555    /// Publication date (ISO 8601 format)
556    pub date: Option<String>,
557
558    /// Publication venue (journal, conference, etc.)
559    pub venue: Option<String>,
560
561    /// Citation count (if available)
562    pub citations: Option<i32>,
563
564    /// User-defined tags
565    pub tags: Vec<String>,
566
567    /// Subject categories
568    pub categories: Vec<String>,
569
570    /// Extracted keywords
571    pub keywords: Vec<String>,
572
573    /// Digital Object Identifier
574    pub doi: Option<String>,
575
576    /// License information
577    pub license: Option<String>,
578}
579
580/// References to different embedding types for a chunk.
581///
582/// Supports hybrid retrieval by tracking multiple embedding representations
583/// (dense, sparse, ColBERT) for each text chunk.
584#[derive(Debug, Clone, Default, Serialize, Deserialize)]
585pub struct EmbeddingIds {
586    /// Dense embedding ID (e.g., from OpenAI, Cohere)
587    pub dense: Option<String>,
588
589    /// Sparse embedding ID (e.g., BM25, SPLADE)
590    pub sparse: Option<String>,
591
592    /// ColBERT multi-vector embedding ID
593    pub colbert: Option<String>,
594}
595
596/// A chunk of text from a document.
597///
598/// Documents are split into chunks for embedding and retrieval.
599/// Each chunk maintains positional information and embedding references.
600///
601/// # Example
602///
603/// ```rust
604/// use reasonkit::{Chunk, EmbeddingIds};
605/// use uuid::Uuid;
606///
607/// let chunk = Chunk {
608///     id: Uuid::new_v4(),
609///     text: "This is a chunk of text...".to_string(),
610///     index: 0,
611///     start_char: 0,
612///     end_char: 26,
613///     token_count: Some(7),
614///     section: Some("Introduction".to_string()),
615///     page: Some(1),
616///     embedding_ids: EmbeddingIds::default(),
617/// };
618/// ```
619#[derive(Debug, Clone, Serialize, Deserialize)]
620pub struct Chunk {
621    /// Unique identifier for this chunk
622    pub id: Uuid,
623
624    /// The text content of the chunk
625    pub text: String,
626
627    /// Position index within the document
628    pub index: usize,
629
630    /// Starting character position in the original document
631    pub start_char: usize,
632
633    /// Ending character position in the original document
634    pub end_char: usize,
635
636    /// Estimated token count for the chunk
637    pub token_count: Option<usize>,
638
639    /// Section or heading this chunk belongs to
640    pub section: Option<String>,
641
642    /// Page number (for paginated documents)
643    pub page: Option<usize>,
644
645    /// References to stored embeddings
646    pub embedding_ids: EmbeddingIds,
647}
648
649/// Processing state enumeration for documents.
650///
651/// Tracks the current state of a document in the processing pipeline.
652#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
653#[serde(rename_all = "snake_case")]
654pub enum ProcessingState {
655    /// Document is queued for processing
656    #[default]
657    Pending,
658    /// Document is currently being processed
659    Processing,
660    /// Processing completed successfully
661    Completed,
662    /// Processing failed with errors
663    Failed,
664}
665
666/// Processing status for a document.
667///
668/// Tracks which processing stages have been completed and any errors encountered.
669#[derive(Debug, Clone, Default, Serialize, Deserialize)]
670pub struct ProcessingStatus {
671    /// Current processing state
672    pub status: ProcessingState,
673
674    /// Whether the document has been chunked
675    pub chunked: bool,
676
677    /// Whether embeddings have been generated
678    pub embedded: bool,
679
680    /// Whether the document has been indexed
681    pub indexed: bool,
682
683    /// Whether RAPTOR summarization has been applied
684    pub raptor_processed: bool,
685
686    /// List of error messages (if any)
687    pub errors: Vec<String>,
688}
689
690/// Content format enumeration.
691///
692/// Identifies the format of document content for proper parsing.
693#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
694#[serde(rename_all = "snake_case")]
695pub enum ContentFormat {
696    /// Plain text
697    #[default]
698    Text,
699    /// Markdown format
700    Markdown,
701    /// HTML content
702    Html,
703    /// LaTeX source
704    Latex,
705}
706
707/// Document content container.
708///
709/// Stores the raw content along with format and statistical information.
710#[derive(Debug, Clone, Default, Serialize, Deserialize)]
711pub struct DocumentContent {
712    /// Raw content string
713    pub raw: String,
714
715    /// Content format
716    pub format: ContentFormat,
717
718    /// Primary language code (e.g., "en", "zh")
719    pub language: String,
720
721    /// Word count
722    pub word_count: usize,
723
724    /// Character count
725    pub char_count: usize,
726}
727
728/// A document in the knowledge base.
729///
730/// The primary data structure for storing and managing documents.
731/// Contains content, metadata, processing status, and chunks.
732///
733/// # Example
734///
735/// ```rust
736/// use reasonkit::{Document, DocumentType, Source, SourceType};
737/// use chrono::Utc;
738///
739/// let source = Source {
740///     source_type: SourceType::Local,
741///     url: None,
742///     path: Some("/path/to/doc.md".to_string()),
743///     arxiv_id: None,
744///     github_repo: None,
745///     retrieved_at: Utc::now(),
746///     version: None,
747/// };
748///
749/// let doc = Document::new(DocumentType::Documentation, source)
750///     .with_content("# My Document\n\nContent here...".to_string());
751///
752/// assert_eq!(doc.doc_type, DocumentType::Documentation);
753/// assert!(doc.content.word_count > 0);
754/// ```
755#[derive(Debug, Clone, Serialize, Deserialize)]
756pub struct Document {
757    /// Unique document identifier
758    pub id: Uuid,
759
760    /// Document type categorization
761    #[serde(rename = "type")]
762    pub doc_type: DocumentType,
763
764    /// Source information for provenance
765    pub source: Source,
766
767    /// Document content
768    pub content: DocumentContent,
769
770    /// Document metadata
771    pub metadata: Metadata,
772
773    /// Processing status
774    pub processing: ProcessingStatus,
775
776    /// Text chunks for retrieval
777    pub chunks: Vec<Chunk>,
778
779    /// Creation timestamp
780    pub created_at: DateTime<Utc>,
781
782    /// Last update timestamp
783    pub updated_at: Option<DateTime<Utc>>,
784}
785
786impl Document {
787    /// Create a new document with the given type and source.
788    ///
789    /// # Arguments
790    ///
791    /// * `doc_type` - The type of document
792    /// * `source` - Source information for provenance
793    ///
794    /// # Example
795    ///
796    /// ```rust
797    /// use reasonkit::{Document, DocumentType, Source, SourceType};
798    /// use chrono::Utc;
799    ///
800    /// let source = Source {
801    ///     source_type: SourceType::Local,
802    ///     url: None,
803    ///     path: Some("/path/to/file.txt".to_string()),
804    ///     arxiv_id: None,
805    ///     github_repo: None,
806    ///     retrieved_at: Utc::now(),
807    ///     version: None,
808    /// };
809    ///
810    /// let doc = Document::new(DocumentType::Note, source);
811    /// assert_eq!(doc.doc_type, DocumentType::Note);
812    /// ```
813    pub fn new(doc_type: DocumentType, source: Source) -> Self {
814        Self {
815            id: Uuid::new_v4(),
816            doc_type,
817            source,
818            content: DocumentContent::default(),
819            metadata: Metadata::default(),
820            processing: ProcessingStatus::default(),
821            chunks: Vec::new(),
822            created_at: Utc::now(),
823            updated_at: None,
824        }
825    }
826
827    /// Set the document content and compute statistics.
828    ///
829    /// # Arguments
830    ///
831    /// * `raw` - The raw content string
832    ///
833    /// # Example
834    ///
835    /// ```rust
836    /// use reasonkit::{Document, DocumentType, Source, SourceType};
837    /// use chrono::Utc;
838    ///
839    /// let source = Source {
840    ///     source_type: SourceType::Local,
841    ///     url: None,
842    ///     path: None,
843    ///     arxiv_id: None,
844    ///     github_repo: None,
845    ///     retrieved_at: Utc::now(),
846    ///     version: None,
847    /// };
848    ///
849    /// let doc = Document::new(DocumentType::Note, source)
850    ///     .with_content("Hello world".to_string());
851    ///
852    /// assert_eq!(doc.content.word_count, 2);
853    /// assert_eq!(doc.content.char_count, 11);
854    /// ```
855    pub fn with_content(mut self, raw: String) -> Self {
856        let word_count = raw.split_whitespace().count();
857        let char_count = raw.len();
858        self.content = DocumentContent {
859            raw,
860            format: ContentFormat::Text,
861            language: "en".to_string(),
862            word_count,
863            char_count,
864        };
865        self
866    }
867
868    /// Set the document metadata.
869    ///
870    /// # Arguments
871    ///
872    /// * `metadata` - The metadata to set
873    ///
874    /// # Example
875    ///
876    /// ```rust
877    /// use reasonkit::{Document, DocumentType, Source, SourceType, Metadata};
878    /// use chrono::Utc;
879    ///
880    /// let source = Source {
881    ///     source_type: SourceType::Local,
882    ///     url: None,
883    ///     path: None,
884    ///     arxiv_id: None,
885    ///     github_repo: None,
886    ///     retrieved_at: Utc::now(),
887    ///     version: None,
888    /// };
889    ///
890    /// let metadata = Metadata {
891    ///     title: Some("My Document".to_string()),
892    ///     ..Default::default()
893    /// };
894    ///
895    /// let doc = Document::new(DocumentType::Note, source)
896    ///     .with_metadata(metadata);
897    ///
898    /// assert_eq!(doc.metadata.title, Some("My Document".to_string()));
899    /// ```
900    pub fn with_metadata(mut self, metadata: Metadata) -> Self {
901        self.metadata = metadata;
902        self
903    }
904}
905
// Bridge from the core `Document` model to the `reasonkit-mem` Document type.
#[cfg(feature = "memory")]
impl From<Document> for reasonkit_mem::Document {
    /// Consumes a core `Document` and rebuilds it as a `reasonkit_mem::Document`.
    ///
    /// Each enum is mapped variant-for-variant onto its `reasonkit_mem::types`
    /// counterpart and each struct field is moved across as-is; no value is
    /// cloned or recomputed along the way.
    fn from(doc: Document) -> Self {
        use reasonkit_mem::types as mem_types;

        // Take the document apart once so each piece can be moved independently.
        let Document {
            id,
            doc_type,
            source,
            content,
            metadata,
            processing,
            chunks,
            created_at,
            updated_at,
            ..
        } = doc;

        // Document kind.
        let doc_type = match doc_type {
            DocumentType::Paper => mem_types::DocumentType::Paper,
            DocumentType::Documentation => mem_types::DocumentType::Documentation,
            DocumentType::Code => mem_types::DocumentType::Code,
            DocumentType::Note => mem_types::DocumentType::Note,
            DocumentType::Transcript => mem_types::DocumentType::Transcript,
            DocumentType::Benchmark => mem_types::DocumentType::Benchmark,
        };

        // Provenance, including the source-kind enum.
        let source = mem_types::Source {
            source_type: match source.source_type {
                SourceType::Arxiv => mem_types::SourceType::Arxiv,
                SourceType::Github => mem_types::SourceType::Github,
                SourceType::Website => mem_types::SourceType::Website,
                SourceType::Local => mem_types::SourceType::Local,
                SourceType::Api => mem_types::SourceType::Api,
            },
            url: source.url,
            path: source.path,
            arxiv_id: source.arxiv_id,
            github_repo: source.github_repo,
            retrieved_at: source.retrieved_at,
            version: source.version,
        };

        // Raw content plus its format tag and precomputed counts.
        let content = mem_types::DocumentContent {
            raw: content.raw,
            format: match content.format {
                ContentFormat::Text => mem_types::ContentFormat::Text,
                ContentFormat::Markdown => mem_types::ContentFormat::Markdown,
                ContentFormat::Html => mem_types::ContentFormat::Html,
                ContentFormat::Latex => mem_types::ContentFormat::Latex,
            },
            language: content.language,
            word_count: content.word_count,
            char_count: content.char_count,
        };

        // Bibliographic metadata; authors are converted element by element.
        let metadata = mem_types::Metadata {
            title: metadata.title,
            authors: metadata
                .authors
                .into_iter()
                .map(|author| mem_types::Author {
                    name: author.name,
                    affiliation: author.affiliation,
                    email: author.email,
                })
                .collect(),
            abstract_text: metadata.abstract_text,
            date: metadata.date,
            venue: metadata.venue,
            citations: metadata.citations,
            tags: metadata.tags,
            categories: metadata.categories,
            keywords: metadata.keywords,
            doi: metadata.doi,
            license: metadata.license,
        };

        // Pipeline progress flags and the overall processing state.
        let processing = mem_types::ProcessingStatus {
            status: match processing.status {
                ProcessingState::Pending => mem_types::ProcessingState::Pending,
                ProcessingState::Processing => mem_types::ProcessingState::Processing,
                ProcessingState::Completed => mem_types::ProcessingState::Completed,
                ProcessingState::Failed => mem_types::ProcessingState::Failed,
            },
            chunked: processing.chunked,
            embedded: processing.embedded,
            indexed: processing.indexed,
            raptor_processed: processing.raptor_processed,
            errors: processing.errors,
        };

        // Chunks, each with its nested embedding-id record.
        let chunks = chunks
            .into_iter()
            .map(|chunk| mem_types::Chunk {
                id: chunk.id,
                text: chunk.text,
                index: chunk.index,
                start_char: chunk.start_char,
                end_char: chunk.end_char,
                token_count: chunk.token_count,
                section: chunk.section,
                page: chunk.page,
                embedding_ids: mem_types::EmbeddingIds {
                    dense: chunk.embedding_ids.dense,
                    sparse: chunk.embedding_ids.sparse,
                    colbert: chunk.embedding_ids.colbert,
                },
            })
            .collect();

        // Assemble the reasonkit-mem document from the converted parts.
        Self {
            id,
            doc_type,
            source,
            content,
            metadata,
            processing,
            chunks,
            created_at,
            updated_at,
        }
    }
}
1048
1049/// Source of a search match for hybrid retrieval.
1050///
1051/// Indicates which retrieval method produced a search result,
1052/// enabling score fusion and result explanation.
1053#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
1054#[serde(rename_all = "snake_case")]
1055pub enum MatchSource {
1056    /// Dense vector retrieval (semantic similarity)
1057    Dense,
1058    /// Sparse retrieval (BM25, keyword matching)
1059    Sparse,
1060    /// Hybrid retrieval (combined dense + sparse)
1061    Hybrid,
1062    /// RAPTOR hierarchical retrieval
1063    Raptor,
1064}
1065
1066/// Search result from a query.
1067///
1068/// Contains the matched chunk, relevance score, and source information.
1069///
1070/// # Example
1071///
1072/// ```rust
1073/// use reasonkit::{SearchResult, MatchSource, Chunk, EmbeddingIds};
1074/// use uuid::Uuid;
1075///
1076/// let chunk = Chunk {
1077///     id: Uuid::new_v4(),
1078///     text: "Relevant content...".to_string(),
1079///     index: 0,
1080///     start_char: 0,
1081///     end_char: 19,
1082///     token_count: Some(2),
1083///     section: None,
1084///     page: None,
1085///     embedding_ids: EmbeddingIds::default(),
1086/// };
1087///
1088/// let result = SearchResult {
1089///     score: 0.95,
1090///     document_id: Uuid::new_v4(),
1091///     chunk,
1092///     match_source: MatchSource::Dense,
1093/// };
1094///
1095/// assert!(result.score > 0.9);
1096/// ```
1097#[derive(Debug, Clone, Serialize, Deserialize)]
1098pub struct SearchResult {
1099    /// Relevance score (higher is more relevant)
1100    pub score: f32,
1101
1102    /// ID of the document containing the match
1103    pub document_id: Uuid,
1104
1105    /// The matched chunk
1106    pub chunk: Chunk,
1107
1108    /// Which retrieval method produced this match
1109    pub match_source: MatchSource,
1110}
1111
// ============================================================================
// MEMORY-SPECIFIC TYPES (re-exported with the `memory` feature, local fallback otherwise)
// ============================================================================
1115
1116#[cfg(feature = "memory")]
1117pub use reasonkit_mem::RetrievalConfig;
1118
1119/// Simple retrieval configuration (available without memory feature).
1120///
1121/// Provides basic retrieval parameters when the full memory layer is not enabled.
1122#[cfg(not(feature = "memory"))]
1123#[derive(Debug, Clone, Serialize, Deserialize)]
1124pub struct RetrievalConfig {
1125    /// Maximum number of results to return
1126    pub top_k: usize,
1127
1128    /// Minimum relevance score threshold
1129    pub min_score: f32,
1130
1131    /// Weight for dense retrieval in hybrid mode (0.0-1.0)
1132    pub alpha: f32,
1133
1134    /// Whether to use RAPTOR hierarchical retrieval
1135    pub use_raptor: bool,
1136
1137    /// Whether to rerank results
1138    pub rerank: bool,
1139}
1140
1141#[cfg(not(feature = "memory"))]
1142impl Default for RetrievalConfig {
1143    fn default() -> Self {
1144        Self {
1145            top_k: 10,
1146            min_score: 0.0,
1147            alpha: 0.7,
1148            use_raptor: false,
1149            rerank: false,
1150        }
1151    }
1152}
1153
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Local` source stamped with the current time; only the path varies.
    fn local_source(path: Option<&str>) -> Source {
        Source {
            source_type: SourceType::Local,
            url: None,
            path: path.map(String::from),
            arxiv_id: None,
            github_repo: None,
            retrieved_at: Utc::now(),
            version: None,
        }
    }

    #[test]
    fn test_core_compiles() {
        // Intentionally empty: reaching this test at all proves the crate
        // compiled, which is the only thing being verified here.
    }

    #[test]
    fn test_document_creation() {
        let created = Document::new(DocumentType::Note, local_source(Some("/test.txt")));

        assert_eq!(created.doc_type, DocumentType::Note);
    }

    #[test]
    fn test_document_with_content() {
        let body = "Hello world test".to_string();
        let filled = Document::new(DocumentType::Note, local_source(None)).with_content(body);

        // Three whitespace-separated words, sixteen characters in total.
        assert_eq!(filled.content.word_count, 3);
        assert_eq!(filled.content.char_count, 16);
    }

    #[test]
    fn test_version_available() {
        assert!(!VERSION.is_empty());
    }
}
reasonkit/lib.rs

reasonkit/
lib.rs