use std::sync::{Arc, Mutex};
use serde::{Deserialize, Serialize};
use tower::util::BoxCloneService;
use crate::{ExtractionConfig, KreuzbergError, service::ExtractionRequest, types::ExtractionResult};
/// Size limits for incoming API requests, expressed in bytes.
///
/// This type only carries the configured values; where and how they are
/// enforced is decided by the code that consumes it.
#[derive(Debug, Clone, Copy)]
pub struct ApiSizeLimits {
    /// Upper bound for a complete request body, in bytes.
    pub max_request_body_bytes: usize,
    /// Upper bound for a single multipart field, in bytes.
    pub max_multipart_field_bytes: usize,
}

impl Default for ApiSizeLimits {
    /// Returns limits of 100 MB (100 * 1024 * 1024 bytes) for both fields.
    fn default() -> Self {
        Self::from_mb(100, 100)
    }
}

impl ApiSizeLimits {
    /// Bytes per megabyte as used by [`ApiSizeLimits::from_mb`].
    const BYTES_PER_MB: usize = 1024 * 1024;

    /// Creates limits from explicit byte counts.
    ///
    /// # Arguments
    ///
    /// * `max_request_body_bytes` - limit for the whole request body, in bytes.
    /// * `max_multipart_field_bytes` - limit for one multipart field, in bytes.
    pub fn new(max_request_body_bytes: usize, max_multipart_field_bytes: usize) -> Self {
        Self {
            max_request_body_bytes,
            max_multipart_field_bytes,
        }
    }

    /// Creates limits from sizes given in megabytes (1 MB = 1024 * 1024 bytes).
    ///
    /// The conversion saturates at `usize::MAX` instead of overflowing. A plain
    /// multiplication would panic in debug builds and wrap around to a much
    /// smaller limit in release builds when given a very large value.
    ///
    /// # Arguments
    ///
    /// * `max_request_body_mb` - limit for the whole request body, in megabytes.
    /// * `max_multipart_field_mb` - limit for one multipart field, in megabytes.
    pub fn from_mb(max_request_body_mb: usize, max_multipart_field_mb: usize) -> Self {
        Self {
            max_request_body_bytes: max_request_body_mb.saturating_mul(Self::BYTES_PER_MB),
            max_multipart_field_bytes: max_multipart_field_mb.saturating_mul(Self::BYTES_PER_MB),
        }
    }
}
/// Plugin summary carried in the optional `plugins` field of [`HealthResponse`].
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct PluginStatus {
/// Number of OCR backends.
pub ocr_backends_count: usize,
/// OCR backend identifiers (presumably backend names; the exact format is set by the producer).
pub ocr_backends: Vec<String>,
/// Number of extractors.
pub extractors_count: usize,
/// Number of post-processors.
pub post_processors_count: usize,
}
/// Health report payload.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct HealthResponse {
/// Health status string; the schema example is `"healthy"`.
#[cfg_attr(feature = "api", schema(example = "healthy"))]
pub status: String,
/// Version string; the schema example is `"0.8.0"`.
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
pub version: String,
/// Optional plugin summary; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub plugins: Option<PluginStatus>,
}
/// Server information payload.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct InfoResponse {
/// Version string; the schema example is `"0.8.0"`.
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
pub version: String,
/// Rust-backend flag. NOTE(review): exact meaning is inferred from the name only; confirm with the producer.
pub rust_backend: bool,
}
/// Alias for a list of [`ExtractionResult`] values (presumably one per extracted input; confirm against the handler).
pub type ExtractResponse = Vec<ExtractionResult>;
/// Error payload.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ErrorResponse {
/// Error category name; the schema example is `"ValidationError"`.
#[cfg_attr(feature = "api", schema(example = "ValidationError"))]
pub error_type: String,
/// Human-readable error message; the schema example is `"Invalid input provided"`.
#[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
pub message: String,
/// Optional traceback text; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub traceback: Option<String>,
/// Numeric status code; the schema example is `400` (presumably the HTTP status of the response).
#[cfg_attr(feature = "api", schema(example = 400))]
pub status_code: u16,
}
/// Shared state holding the default extraction configuration and the extraction service.
///
/// Both fields are `Arc`s, so cloning this struct only bumps reference counts.
#[derive(Clone)]
pub struct ApiState {
/// Default extraction configuration, shared behind an `Arc`.
pub default_config: Arc<ExtractionConfig>,
/// Boxed, cloneable tower service mapping an `ExtractionRequest` to an
/// `ExtractionResult` or a `KreuzbergError`, shared behind `Arc<Mutex<..>>`.
///
/// NOTE(review): `Mutex` here is `std::sync::Mutex` (see the file's imports), so a
/// guard should not be held across an `.await`; presumably callers lock, clone the
/// service, and release the lock before calling it. Confirm at the call sites.
pub extraction_service: Arc<Mutex<BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError>>>,
}
/// Cache statistics payload.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct CacheStatsResponse {
/// Cache directory path; the schema example is `"/tmp/kreuzberg-cache"`.
#[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
pub directory: String,
/// Number of files in the cache.
pub total_files: usize,
/// Total cache size (megabytes, going by the `_mb` suffix).
pub total_size_mb: f64,
/// Available space (megabytes, going by the `_mb` suffix).
pub available_space_mb: f64,
/// Age of the oldest file (days, going by the `_days` suffix).
pub oldest_file_age_days: f64,
/// Age of the newest file (days, going by the `_days` suffix).
pub newest_file_age_days: f64,
}
/// Result payload for a cache clear operation.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct CacheClearResponse {
/// Cache directory path; the schema example is `"/tmp/kreuzberg-cache"`.
#[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
pub directory: String,
/// Number of files removed.
pub removed_files: usize,
/// Space freed (megabytes, going by the `_mb` suffix).
pub freed_mb: f64,
}
/// Request payload for generating embeddings.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct EmbedRequest {
/// Input texts. The schema declares `min_items = 1`; this struct itself does no
/// runtime validation, so an empty list must be rejected elsewhere.
#[cfg_attr(feature = "api", schema(min_items = 1))]
pub texts: Vec<String>,
/// Optional embedding configuration; left out of the serialized output when `None`.
/// Shown as an optional generic object in the schema (`value_type = Option<Object>`).
#[serde(skip_serializing_if = "Option::is_none")]
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
pub config: Option<crate::core::config::EmbeddingConfig>,
}
/// Response payload carrying generated embeddings.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct EmbedResponse {
/// Embedding vectors (presumably one per input text, in input order; confirm with the producer).
pub embeddings: Vec<Vec<f32>>,
/// Model identifier; the schema example is `"all-MiniLM-L6-v2"`.
#[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
pub model: String,
/// Embedding dimensionality (presumably the length of each vector in `embeddings`).
pub dimensions: usize,
/// Number of embeddings (presumably equal to `embeddings.len()`).
pub count: usize,
}
/// Serde default for the `chunker_type` field of [`ChunkRequest`]: yields `"text"`.
fn default_chunker_type() -> String {
    String::from("text")
}
/// Request payload for chunking a piece of text.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkRequest {
/// Text to chunk. The schema declares `min_length = 1`; this struct itself does
/// no runtime validation.
#[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
pub text: String,
/// Optional chunking overrides; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub config: Option<ChunkingConfigRequest>,
/// Chunker selector. Falls back to `default_chunker_type()` (`"text"`) when absent
/// from the input. The schema pattern allows `text` or `markdown`; since the field
/// is a plain `String`, other values still deserialize.
#[serde(default = "default_chunker_type")]
#[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
pub chunker_type: String,
}
/// Optional chunking parameters supplied by the client.
///
/// Every field is optional and the derived `Default` sets them all to `None`.
/// The fallback values used for `None` are not defined in this struct.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkingConfigRequest {
/// Maximum characters per chunk. Schema: `minimum = 101`, example `2000`.
/// NOTE(review): the bound is schema metadata only; nothing here enforces it.
#[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
pub max_characters: Option<usize>,
/// Overlap between chunks. Schema: `minimum = 0`, `maximum = 1999`, example `100`.
/// NOTE(review): presumably must stay below `max_characters`; confirm where it is validated.
#[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
pub overlap: Option<usize>,
/// Trim flag (presumably whether chunk whitespace is trimmed; confirm with the chunker).
pub trim: Option<bool>,
}
/// Response payload for a chunking request.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkResponse {
/// The produced chunks.
pub chunks: Vec<ChunkItem>,
/// Number of chunks (presumably equal to `chunks.len()`).
pub chunk_count: usize,
/// Chunking configuration reported back to the client (presumably the effective values used).
pub config: ChunkingConfigResponse,
/// Input size in bytes.
pub input_size_bytes: usize,
/// Chunker type string; the schema example is `"text"`.
#[cfg_attr(feature = "api", schema(example = "text"))]
pub chunker_type: String,
}
/// One chunk inside a [`ChunkResponse`].
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkItem {
/// Chunk text.
pub content: String,
/// Start byte offset (presumably into the original input text).
pub byte_start: usize,
/// End byte offset. NOTE(review): whether it is inclusive or exclusive is not visible here.
pub byte_end: usize,
/// Index of this chunk. NOTE(review): presumably zero-based; confirm with the producer.
pub chunk_index: usize,
/// Total number of chunks.
pub total_chunks: usize,
/// Optional first page covered by the chunk; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub first_page: Option<usize>,
/// Optional last page covered by the chunk; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub last_page: Option<usize>,
}
/// Payload carrying only a version string.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct VersionResponse {
/// Version string; the schema example is `"0.8.0"`.
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
pub version: String,
}
/// Payload reporting a detected MIME type.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct DetectResponse {
/// MIME type string; the schema example is `"application/pdf"`.
#[cfg_attr(feature = "api", schema(example = "application/pdf"))]
pub mime_type: String,
/// Optional file name; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub filename: Option<String>,
}
/// One model entry inside a [`ManifestResponse`].
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ManifestEntryResponse {
/// Relative path of the file; the schema example is `"paddle-ocr/det/model.onnx"`.
/// NOTE(review): the base directory it is relative to is not visible here.
#[cfg_attr(feature = "api", schema(example = "paddle-ocr/det/model.onnx"))]
pub relative_path: String,
/// SHA-256 digest as a string (presumably hex-encoded; confirm with the producer).
pub sha256: String,
/// File size in bytes.
pub size_bytes: u64,
/// URL the file comes from.
pub source_url: String,
}
/// Model manifest payload.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ManifestResponse {
/// Kreuzberg version string; the schema example is `"0.8.0"`.
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
pub kreuzberg_version: String,
/// Total size in bytes (presumably the sum of `size_bytes` over `models`).
pub total_size_bytes: u64,
/// Number of models (presumably equal to `models.len()`).
pub model_count: usize,
/// Manifest entries.
pub models: Vec<ManifestEntryResponse>,
}
/// Request payload for warming (pre-fetching) models.
///
/// The derived `Default` gives `all_embeddings = false` and `embedding_model = None`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct WarmRequest {
/// All-embeddings flag; deserializes to `false` when the field is absent.
/// NOTE(review): presumably requests every embedding model; confirm with the handler.
#[serde(default)]
pub all_embeddings: bool,
/// Optional embedding model name; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub embedding_model: Option<String>,
}
/// Result payload for a warm request.
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct WarmResponse {
/// Cache directory path.
pub cache_dir: String,
/// Items that were downloaded (presumably model names; confirm with the producer).
pub downloaded: Vec<String>,
/// Items that were already cached (same identifier format as `downloaded`, presumably).
pub already_cached: Vec<String>,
}
/// Document payload made of page content plus metadata.
///
/// NOTE(review): judging by the name, this mirrors the document shape expected by
/// Open WebUI; confirm against that integration.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct OpenWebDocumentResponse {
/// Document text content.
pub page_content: String,
/// Metadata attached to the document.
pub metadata: OpenWebDocumentMetadata,
}
/// Metadata carried by [`OpenWebDocumentResponse`].
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct OpenWebDocumentMetadata {
/// Source identifier; the schema example is `"document.pdf"` (presumably the file name).
#[cfg_attr(feature = "api", schema(example = "document.pdf"))]
pub source: String,
}
/// Response payload wrapping a document and a status string.
///
/// NOTE(review): judging by the name, this mirrors a Docling-compatible response
/// shape; confirm against that integration.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct DoclingCompatResponse {
/// The document content.
pub document: DoclingCompatDocument,
/// Status string; the schema example is `"success"`.
#[cfg_attr(feature = "api", schema(example = "success"))]
pub status: String,
}
/// Document body carried by [`DoclingCompatResponse`].
///
/// Derives `utoipa::ToSchema` only when the `api` feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct DoclingCompatDocument {
/// Document content (presumably Markdown, going by the `md_` prefix).
pub md_content: String,
}
/// Chunking configuration reported in [`ChunkResponse`].
///
/// Unlike [`ChunkingConfigRequest`], every field here is required (no `Option`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ChunkingConfigResponse {
/// Maximum characters per chunk.
pub max_characters: usize,
/// Overlap between chunks.
pub overlap: usize,
/// Trim flag (presumably whether chunk whitespace was trimmed; confirm with the chunker).
pub trim: bool,
/// Chunker type string; the schema example is `"text"`.
#[cfg_attr(feature = "api", schema(example = "text"))]
pub chunker_type: String,
}