Skip to main content

text_core/
surface.rs

1//! Library-owned runtime surface for `text-core`.
2
3use runtime_core::{
4    describe_surface_response, structured_surface_response, surface_operation, PackageSurface,
5    RuntimeCapabilities, SurfaceRequest, SurfaceResponse,
6};
7use serde::Deserialize;
8
9use crate::{
10    detailed_text_stats, detect_script_profile, normalize_text, normalize_whitespace,
11    operations::analyze_text_statistics, segment_graphemes, segment_words, split_paragraphs,
12    split_sentence_spans, tokenize, TextBoundaryOptions, TextProcessingOptions,
13};
14
15/// Returns the package surface exposed by every transport wrapper.
16pub fn package_surface() -> PackageSurface {
17    PackageSurface {
18        library: env!("CARGO_PKG_NAME").to_string(),
19        version: env!("CARGO_PKG_VERSION").to_string(),
20        capabilities: RuntimeCapabilities::pure_rust(),
21        operations: vec![
22            surface_operation(
23                "describe",
24                "Inspect package metadata",
25                "Shared text documents, tokenization, spans, and statistics for video-analysis.",
26                serde_json::json!({"includeOperations": true}),
27            ),
28            surface_operation(
29                "text.statistics",
30                "Text statistics",
31                "Counts bytes, characters, words, lines, and sentences.",
32                serde_json::json!({"text": "Hello world. Again."}),
33            ),
34            surface_operation(
35                "text.normalize",
36                "Normalize text",
37                "Normalizes Unicode, casing, and whitespace with before/after statistics.",
38                serde_json::json!({"text": "  Hello   WORLD  ", "lowercase": true, "normalizeWhitespace": true}),
39            ),
40            surface_operation(
41                "text.tokenize",
42                "Tokenize text",
43                "Returns span-aware tokens, script profile, and detailed text statistics.",
44                serde_json::json!({"text": "Hello, Berlin 2026.", "includePunctuation": true}),
45            ),
46            surface_operation(
47                "text.boundaries",
48                "Text boundaries",
49                "Returns Unicode-safe word, sentence, paragraph, and grapheme boundaries.",
50                serde_json::json!({"text": "Hello world. Second paragraph."}),
51            ),
52        ],
53    }
54}
55
56/// Runs one library-owned operation.
57pub fn run_surface_operation(request: SurfaceRequest) -> Result<SurfaceResponse, String> {
58    let operation = request.operation.clone();
59    let value = match request.operation.as_str() {
60        "describe" => return Ok(describe_surface_response(&package_surface(), request)),
61        "text.statistics" => {
62            let result = analyze_text_statistics(parse_input(request.input)?);
63            serde_json::to_value(result).map_err(|error| error.to_string())?
64        }
65        "text.normalize" => normalize_value(parse_input(request.input)?)?,
66        "text.tokenize" => tokenize_value(parse_input(request.input)?)?,
67        "text.boundaries" => boundaries_value(parse_input(request.input)?)?,
68        operation => {
69            return Err(runtime_core::SurfaceError::unsupported_operation(
70                operation,
71                env!("CARGO_PKG_NAME"),
72            )
73            .to_error_string());
74        }
75    };
76    Ok(structured_surface_response(
77        operation.clone(),
78        workflow_title(operation.as_str()),
79        workflow_message(operation.as_str()),
80        workflow_summary(operation.as_str(), &value),
81        value,
82    ))
83}
84
/// Returns the human-readable title for an operation id.
///
/// Ids without a dedicated entry fall back to `"Text core result"`.
fn workflow_title(operation: &str) -> &'static str {
    const TITLES: [(&str, &str); 4] = [
        ("text.statistics", "Text statistics"),
        ("text.normalize", "Normalized text"),
        ("text.tokenize", "Tokenized text"),
        ("text.boundaries", "Text boundaries"),
    ];

    TITLES
        .iter()
        .find(|&&(id, _)| id == operation)
        .map_or("Text core result", |&(_, title)| title)
}
94
/// Returns the one-sentence completion message for an operation id.
///
/// Ids without a dedicated entry fall back to a generic package-level message.
fn workflow_message(operation: &str) -> &'static str {
    const MESSAGES: [(&str, &str); 4] = [
        (
            "text.statistics",
            "Computed deterministic byte, character, word, line, and sentence statistics.",
        ),
        (
            "text.normalize",
            "Normalized text with explicit before and after statistics.",
        ),
        (
            "text.tokenize",
            "Tokenized the supplied text with spans, script profile, and text statistics.",
        ),
        (
            "text.boundaries",
            "Extracted Unicode-safe word, sentence, paragraph, and grapheme boundaries.",
        ),
    ];

    MESSAGES
        .iter()
        .find(|&&(id, _)| id == operation)
        .map_or("Ran a text-core package operation.", |&(_, message)| message)
}
110
111fn workflow_summary(operation: &str, value: &serde_json::Value) -> serde_json::Value {
112    match operation {
113        "text.statistics" => serde_json::json!({
114            "status": "ok",
115            "words": value["value"]["wordCount"],
116            "sentences": value["value"]["sentenceCount"]
117        }),
118        "text.normalize" => serde_json::json!({
119            "status": "ok",
120            "inputWords": value["before"]["basic"]["words"],
121            "outputWords": value["after"]["basic"]["words"]
122        }),
123        "text.tokenize" => serde_json::json!({
124            "status": "ok",
125            "tokenCount": value["tokens"].as_array().map(Vec::len).unwrap_or(0),
126            "dominantScript": value["scriptProfile"]["dominantScript"]
127        }),
128        "text.boundaries" => serde_json::json!({
129            "status": "ok",
130            "wordCount": value["words"].as_array().map(Vec::len).unwrap_or(0),
131            "sentenceCount": value["sentences"].as_array().map(Vec::len).unwrap_or(0),
132            "paragraphCount": value["paragraphs"].as_array().map(Vec::len).unwrap_or(0)
133        }),
134        _ => serde_json::json!({"status": "ok"}),
135    }
136}
137
/// Input payload for the `text.normalize` operation.
///
/// Deserialized from the request's JSON input with camelCase keys, so the multi-word fields are
/// spelled `stripDiacritics` and `normalizeWhitespace` on the wire.
138#[derive(Debug, Deserialize)]
139#[serde(rename_all = "camelCase")]
140struct NormalizeRequest {
    /// Text to normalize. The only field without a serde default, so it must be present.
141    text: String,
    /// Forwarded as the `lowercase` processing option. Defaults to `true` when omitted.
142    #[serde(default = "default_true")]
143    lowercase: bool,
    /// Enables the diacritic-stripping step in `normalize_value`. Defaults to `false` when omitted.
144    #[serde(default)]
145    strip_diacritics: bool,
    /// Enables the final `normalize_whitespace` pass. Defaults to `true` when omitted.
146    #[serde(default = "default_true")]
147    normalize_whitespace: bool,
148}
149
/// Input payload for the `text.tokenize` operation.
///
/// Deserialized from the request's JSON input with camelCase keys, so the multi-word fields are
/// spelled `includeWhitespace` and `includePunctuation` on the wire.
150#[derive(Debug, Deserialize)]
151#[serde(rename_all = "camelCase")]
152struct TokenizeRequest {
    /// Text to tokenize. The only field without a serde default, so it must be present.
153    text: String,
    /// Defaults to `false` when omitted.
    ///
    /// NOTE(review): `tokenize_value` ORs this flag into the `include_punctuation` option rather
    /// than passing a separate whitespace option — confirm that is the intended meaning.
154    #[serde(default)]
155    include_whitespace: bool,
    /// Requests punctuation tokens in the output. Defaults to `false` when omitted.
156    #[serde(default)]
157    include_punctuation: bool,
    /// Forwarded as the `lowercase` processing option. Defaults to `true` when omitted.
158    #[serde(default = "default_true")]
159    lowercase: bool,
160}
161
/// Input payload for the `text.boundaries` operation.
///
/// Deserialized from the request's JSON input with camelCase keys, so `keep_apostrophes` is
/// spelled `keepApostrophes` on the wire.
162#[derive(Debug, Deserialize)]
163#[serde(rename_all = "camelCase")]
164struct BoundariesRequest {
    /// Text to segment. The only field without a serde default, so it must be present.
165    text: String,
    /// Forwarded as the `keep_apostrophes` processing option that `boundaries_value` passes to
    /// sentence splitting. Defaults to `true` when omitted.
166    #[serde(default = "default_true")]
167    keep_apostrophes: bool,
168}
169
170fn normalize_value(request: NormalizeRequest) -> Result<serde_json::Value, String> {
171    let before = detailed_text_stats(&request.text, &TextProcessingOptions::default());
172    let mut normalized = normalize_text(
173        &request.text,
174        &TextProcessingOptions {
175            lowercase: request.lowercase,
176            ..TextProcessingOptions::default()
177        },
178    );
179    if request.strip_diacritics {
180        normalized = normalized.chars().filter(|ch| ch.is_ascii()).collect();
181    }
182    if request.normalize_whitespace {
183        normalized = normalize_whitespace(&normalized);
184    }
185    let after = detailed_text_stats(&normalized, &TextProcessingOptions::default());
186    Ok(serde_json::json!({
187        "text": normalized,
188        "before": before,
189        "after": after
190    }))
191}
192
193fn tokenize_value(request: TokenizeRequest) -> Result<serde_json::Value, String> {
194    let options = TextProcessingOptions {
195        lowercase: request.lowercase,
196        include_punctuation: request.include_punctuation || request.include_whitespace,
197        ..TextProcessingOptions::default()
198    };
199    Ok(serde_json::json!({
200        "tokens": tokenize(&request.text, &options),
201        "scriptProfile": detect_script_profile(&request.text),
202        "stats": detailed_text_stats(&request.text, &options)
203    }))
204}
205
206fn boundaries_value(request: BoundariesRequest) -> Result<serde_json::Value, String> {
207    let processing = TextProcessingOptions {
208        keep_apostrophes: request.keep_apostrophes,
209        include_punctuation: true,
210        ..TextProcessingOptions::default()
211    };
212    let boundary_options = TextBoundaryOptions {
213        include_punctuation: false,
214        ..TextBoundaryOptions::default()
215    };
216    Ok(serde_json::json!({
217        "words": segment_words(&request.text, &boundary_options),
218        "sentences": split_sentence_spans(&request.text, &processing),
219        "paragraphs": split_paragraphs(&request.text),
220        "graphemes": segment_graphemes(&request.text)
221    }))
222}
223
224fn parse_input<T: for<'de> Deserialize<'de>>(input: serde_json::Value) -> Result<T, String> {
225    runtime_core::parse_surface_input(None, input)
226}
227
/// Serde default for boolean request fields that are enabled unless the caller sets them to
/// `false`; referenced by the `#[serde(default = "default_true")]` attributes in this file.
228fn default_true() -> bool {
229    true
230}
231
#[cfg(test)]
mod tests {
    use super::*;
    use runtime_core::OperationId;

    /// The advertised surface includes the statistics and tokenize operations.
    #[test]
    fn package_surface_lists_text_operations() {
        let surface = package_surface();
        let lists = |wanted: &str| {
            surface
                .operations
                .iter()
                .any(|operation| operation.id.0 == wanted)
        };

        assert!(lists("text.statistics"));
        assert!(lists("text.tokenize"));
    }

    /// Tokenizing a short sentence lowercases the first token and counts two words.
    #[test]
    fn tokenization_operation_returns_stable_tokens() {
        let request = SurfaceRequest {
            operation: OperationId::new("text.tokenize"),
            input: serde_json::json!({"text": "Hello Berlin.", "includePunctuation": true}),
        };

        let response = run_surface_operation(request).expect("tokenize");

        let first_token = &response.value["tokens"][0];
        assert_eq!(first_token["normalized"], "hello");
        let word_count = &response.value["stats"]["basic"]["words"];
        assert_eq!(*word_count, 2);
    }

    /// Input that lacks the required `text` field is rejected with an error string.
    #[test]
    fn malformed_input_returns_typed_error_string() {
        let request = SurfaceRequest {
            operation: OperationId::new("text.statistics"),
            input: serde_json::json!({"missing": true}),
        };

        let error = run_surface_operation(request).expect_err("invalid request");

        assert!(error.contains("invalid request"));
    }
}
271}