1use runtime_core::{
4 describe_surface_response, structured_surface_response, surface_operation, PackageSurface,
5 RuntimeCapabilities, SurfaceRequest, SurfaceResponse,
6};
7use serde::Deserialize;
8
9use crate::{
10 detailed_text_stats, detect_script_profile, normalize_text, normalize_whitespace,
11 operations::analyze_text_statistics, segment_graphemes, segment_words, split_paragraphs,
12 split_sentence_spans, tokenize, TextBoundaryOptions, TextProcessingOptions,
13};
14
15pub fn package_surface() -> PackageSurface {
17 PackageSurface {
18 library: env!("CARGO_PKG_NAME").to_string(),
19 version: env!("CARGO_PKG_VERSION").to_string(),
20 capabilities: RuntimeCapabilities::pure_rust(),
21 operations: vec![
22 surface_operation(
23 "describe",
24 "Inspect package metadata",
25 "Shared text documents, tokenization, spans, and statistics for video-analysis.",
26 serde_json::json!({"includeOperations": true}),
27 ),
28 surface_operation(
29 "text.statistics",
30 "Text statistics",
31 "Counts bytes, characters, words, lines, and sentences.",
32 serde_json::json!({"text": "Hello world. Again."}),
33 ),
34 surface_operation(
35 "text.normalize",
36 "Normalize text",
37 "Normalizes Unicode, casing, and whitespace with before/after statistics.",
38 serde_json::json!({"text": " Hello WORLD ", "lowercase": true, "normalizeWhitespace": true}),
39 ),
40 surface_operation(
41 "text.tokenize",
42 "Tokenize text",
43 "Returns span-aware tokens, script profile, and detailed text statistics.",
44 serde_json::json!({"text": "Hello, Berlin 2026.", "includePunctuation": true}),
45 ),
46 surface_operation(
47 "text.boundaries",
48 "Text boundaries",
49 "Returns Unicode-safe word, sentence, paragraph, and grapheme boundaries.",
50 serde_json::json!({"text": "Hello world. Second paragraph."}),
51 ),
52 ],
53 }
54}
55
56pub fn run_surface_operation(request: SurfaceRequest) -> Result<SurfaceResponse, String> {
58 let operation = request.operation.clone();
59 let value = match request.operation.as_str() {
60 "describe" => return Ok(describe_surface_response(&package_surface(), request)),
61 "text.statistics" => {
62 let result = analyze_text_statistics(parse_input(request.input)?);
63 serde_json::to_value(result).map_err(|error| error.to_string())?
64 }
65 "text.normalize" => normalize_value(parse_input(request.input)?)?,
66 "text.tokenize" => tokenize_value(parse_input(request.input)?)?,
67 "text.boundaries" => boundaries_value(parse_input(request.input)?)?,
68 operation => {
69 return Err(runtime_core::SurfaceError::unsupported_operation(
70 operation,
71 env!("CARGO_PKG_NAME"),
72 )
73 .to_error_string());
74 }
75 };
76 Ok(structured_surface_response(
77 operation.clone(),
78 workflow_title(operation.as_str()),
79 workflow_message(operation.as_str()),
80 workflow_summary(operation.as_str(), &value),
81 value,
82 ))
83}
84
/// Returns the human-readable title shown for an operation's result.
///
/// Operation ids without a dedicated title fall back to `"Text core result"`.
fn workflow_title(operation: &str) -> &'static str {
    const TITLES: [(&str, &str); 4] = [
        ("text.statistics", "Text statistics"),
        ("text.normalize", "Normalized text"),
        ("text.tokenize", "Tokenized text"),
        ("text.boundaries", "Text boundaries"),
    ];

    TITLES
        .iter()
        .find(|(id, _)| *id == operation)
        .map_or("Text core result", |&(_, title)| title)
}
94
/// Returns the one-sentence status message shown for an operation's result.
///
/// Operation ids without a dedicated message fall back to a generic sentence.
fn workflow_message(operation: &str) -> &'static str {
    const STATISTICS: &str =
        "Computed deterministic byte, character, word, line, and sentence statistics.";
    const NORMALIZE: &str = "Normalized text with explicit before and after statistics.";
    const TOKENIZE: &str =
        "Tokenized the supplied text with spans, script profile, and text statistics.";
    const BOUNDARIES: &str =
        "Extracted Unicode-safe word, sentence, paragraph, and grapheme boundaries.";
    const FALLBACK: &str = "Ran a text-core package operation.";

    match operation {
        "text.statistics" => STATISTICS,
        "text.normalize" => NORMALIZE,
        "text.tokenize" => TOKENIZE,
        "text.boundaries" => BOUNDARIES,
        _ => FALLBACK,
    }
}
110
111fn workflow_summary(operation: &str, value: &serde_json::Value) -> serde_json::Value {
112 match operation {
113 "text.statistics" => serde_json::json!({
114 "status": "ok",
115 "words": value["value"]["wordCount"],
116 "sentences": value["value"]["sentenceCount"]
117 }),
118 "text.normalize" => serde_json::json!({
119 "status": "ok",
120 "inputWords": value["before"]["basic"]["words"],
121 "outputWords": value["after"]["basic"]["words"]
122 }),
123 "text.tokenize" => serde_json::json!({
124 "status": "ok",
125 "tokenCount": value["tokens"].as_array().map(Vec::len).unwrap_or(0),
126 "dominantScript": value["scriptProfile"]["dominantScript"]
127 }),
128 "text.boundaries" => serde_json::json!({
129 "status": "ok",
130 "wordCount": value["words"].as_array().map(Vec::len).unwrap_or(0),
131 "sentenceCount": value["sentences"].as_array().map(Vec::len).unwrap_or(0),
132 "paragraphCount": value["paragraphs"].as_array().map(Vec::len).unwrap_or(0)
133 }),
134 _ => serde_json::json!({"status": "ok"}),
135 }
136}
137
138#[derive(Debug, Deserialize)]
139#[serde(rename_all = "camelCase")]
140struct NormalizeRequest {
141 text: String,
142 #[serde(default = "default_true")]
143 lowercase: bool,
144 #[serde(default)]
145 strip_diacritics: bool,
146 #[serde(default = "default_true")]
147 normalize_whitespace: bool,
148}
149
150#[derive(Debug, Deserialize)]
151#[serde(rename_all = "camelCase")]
152struct TokenizeRequest {
153 text: String,
154 #[serde(default)]
155 include_whitespace: bool,
156 #[serde(default)]
157 include_punctuation: bool,
158 #[serde(default = "default_true")]
159 lowercase: bool,
160}
161
162#[derive(Debug, Deserialize)]
163#[serde(rename_all = "camelCase")]
164struct BoundariesRequest {
165 text: String,
166 #[serde(default = "default_true")]
167 keep_apostrophes: bool,
168}
169
170fn normalize_value(request: NormalizeRequest) -> Result<serde_json::Value, String> {
171 let before = detailed_text_stats(&request.text, &TextProcessingOptions::default());
172 let mut normalized = normalize_text(
173 &request.text,
174 &TextProcessingOptions {
175 lowercase: request.lowercase,
176 ..TextProcessingOptions::default()
177 },
178 );
179 if request.strip_diacritics {
180 normalized = normalized.chars().filter(|ch| ch.is_ascii()).collect();
181 }
182 if request.normalize_whitespace {
183 normalized = normalize_whitespace(&normalized);
184 }
185 let after = detailed_text_stats(&normalized, &TextProcessingOptions::default());
186 Ok(serde_json::json!({
187 "text": normalized,
188 "before": before,
189 "after": after
190 }))
191}
192
193fn tokenize_value(request: TokenizeRequest) -> Result<serde_json::Value, String> {
194 let options = TextProcessingOptions {
195 lowercase: request.lowercase,
196 include_punctuation: request.include_punctuation || request.include_whitespace,
197 ..TextProcessingOptions::default()
198 };
199 Ok(serde_json::json!({
200 "tokens": tokenize(&request.text, &options),
201 "scriptProfile": detect_script_profile(&request.text),
202 "stats": detailed_text_stats(&request.text, &options)
203 }))
204}
205
206fn boundaries_value(request: BoundariesRequest) -> Result<serde_json::Value, String> {
207 let processing = TextProcessingOptions {
208 keep_apostrophes: request.keep_apostrophes,
209 include_punctuation: true,
210 ..TextProcessingOptions::default()
211 };
212 let boundary_options = TextBoundaryOptions {
213 include_punctuation: false,
214 ..TextBoundaryOptions::default()
215 };
216 Ok(serde_json::json!({
217 "words": segment_words(&request.text, &boundary_options),
218 "sentences": split_sentence_spans(&request.text, &processing),
219 "paragraphs": split_paragraphs(&request.text),
220 "graphemes": segment_graphemes(&request.text)
221 }))
222}
223
224fn parse_input<T: for<'de> Deserialize<'de>>(input: serde_json::Value) -> Result<T, String> {
225 runtime_core::parse_surface_input(None, input)
226}
227
/// Serde default provider for boolean request fields that are on unless the
/// caller turns them off.
fn default_true() -> bool {
    true
}
231
#[cfg(test)]
mod tests {
    use super::*;
    use runtime_core::OperationId;

    /// Sends `input` to `operation` through the public dispatcher.
    fn dispatch(
        operation: &'static str,
        input: serde_json::Value,
    ) -> Result<SurfaceResponse, String> {
        run_surface_operation(SurfaceRequest {
            operation: OperationId::new(operation),
            input,
        })
    }

    #[test]
    fn package_surface_lists_text_operations() {
        let ids: Vec<String> = package_surface()
            .operations
            .into_iter()
            .map(|operation| operation.id.0)
            .collect();

        for expected in ["text.statistics", "text.tokenize"] {
            assert!(ids.iter().any(|id| id == expected));
        }
    }

    #[test]
    fn tokenization_operation_returns_stable_tokens() {
        let input = serde_json::json!({"text": "Hello Berlin.", "includePunctuation": true});
        let response = dispatch("text.tokenize", input).expect("tokenize");

        let first_token = &response.value["tokens"][0];
        assert_eq!(first_token["normalized"], "hello");
        assert_eq!(response.value["stats"]["basic"]["words"], 2);
    }

    #[test]
    fn malformed_input_returns_typed_error_string() {
        // The required `text` field is absent, so parsing must fail.
        let input = serde_json::json!({"missing": true});
        let error = dispatch("text.statistics", input).expect_err("invalid request");

        assert!(error.contains("invalid request"));
    }
}