1use anyhow::{Context, Result};
4use clap::Args;
5use std::path::PathBuf;
6
// CLI arguments for the `process` subcommand.
//
// NOTE(review): plain `//` comments are used deliberately throughout this
// struct — `///` doc comments on clap-derive fields become `--help` text
// and would change the CLI's observable output.
#[derive(Debug, Args)]
pub struct ProcessArgs {
    // Input file paths or glob patterns; a single "-" reads from stdin.
    #[arg(short, long, value_name = "FILE/PATTERN", required = true)]
    pub input: Vec<String>,

    // Output file; defaults to stdout when absent.
    #[arg(short, long, value_name = "FILE")]
    pub output: Option<PathBuf>,

    // Output format (text/json/markdown); defaults to plain text.
    #[arg(short, long, value_enum, default_value = "text")]
    pub format: OutputFormat,

    // Built-in language rules; mutually exclusive with --language-config.
    #[arg(short, long, value_enum, conflicts_with = "language_config")]
    pub language: Option<Language>,

    // External language-rules file; mutually exclusive with --language.
    #[arg(short = 'c', long, value_name = "FILE", conflicts_with = "language")]
    pub language_config: Option<PathBuf>,

    // Language code selecting an entry inside the external config file;
    // only meaningful together with --language-config.
    #[arg(long, requires = "language_config")]
    pub language_code: Option<String>,

    // Enable parallel processing with a library-chosen thread count.
    #[arg(short, long)]
    pub parallel: bool,

    // Adaptive execution mode; cannot be combined with --parallel.
    // NOTE(review): not read anywhere in this file — confirm intended use.
    #[arg(long, conflicts_with = "parallel")]
    pub adaptive: bool,

    // Explicit worker-thread count (must be > 0; validated at run time).
    #[arg(short = 't', long, value_name = "COUNT")]
    pub threads: Option<usize>,

    // Processing chunk size in kilobytes (must be > 0; validated at run time).
    #[arg(long, value_name = "SIZE_KB")]
    pub chunk_kb: Option<usize>,

    // Suppress progress reporting and logger initialization.
    #[arg(short, long)]
    pub quiet: bool,

    // Repeatable verbosity flag: -v info, -vv debug, -vvv trace.
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbose: u8,

    // Force streaming mode regardless of file size.
    #[arg(long)]
    pub stream: bool,

    // Streaming chunk size in MB; requires --stream.
    // NOTE(review): not read anywhere in this file — confirm intended use.
    #[arg(long, default_value = "10", requires = "stream")]
    pub stream_chunk_mb: u64,
}
70
// Output format selected by --format.
// NOTE(review): `//` comments only — `///` would become clap help text.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum OutputFormat {
    // Plain text, one sentence per line ("txt" accepted as alias).
    #[value(alias = "txt")]
    Text,
    // JSON output.
    Json,
    // Markdown output ("md" accepted as alias).
    #[value(alias = "md")]
    Markdown,
}
83
// Built-in languages selectable with --language.
// NOTE(review): `//` comments only — `///` would become clap help text.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum Language {
    // English rules ("en"/"eng" accepted as aliases).
    #[value(alias = "en", alias = "eng")]
    English,
    // Japanese rules ("ja"/"jpn" accepted as aliases).
    #[value(alias = "ja", alias = "jpn")]
    Japanese,
}
94
95impl ProcessArgs {
96 pub fn execute(&self) -> Result<()> {
98 self.init_logging()?;
100
101 log::info!("Starting text processing");
102 log::debug!("Arguments: {self:?}");
103
104 let mut formatter: Box<dyn crate::output::OutputFormatter> = self.create_formatter()?;
106
107 let processor = self.create_processor()?;
109
110 if self.input.len() == 1 && self.input[0] == "-" {
112 log::info!("Reading from stdin");
113 self.process_stdin(&processor, &mut formatter)?;
114 } else {
115 let files = crate::input::resolve_patterns(&self.input)?;
117 log::info!("Found {} files to process", files.len());
118
119 let mut progress = crate::progress::ProgressReporter::new(self.quiet);
121 progress.init_files(files.len() as u64);
122
123 for file in &files {
124 log::info!("Processing file: {}", file.display());
125
126 let file_size_mb = crate::input::FileReader::file_size(file)? / (1024 * 1024);
128 let should_stream = self.stream || file_size_mb > 100; if should_stream {
131 log::info!(
132 "Using streaming mode for {} ({}MB)",
133 file.display(),
134 file_size_mb
135 );
136 self.process_file_streaming(file, &processor, &mut formatter)?;
137 } else {
138 let content = crate::input::FileReader::read_text(file)?;
140
141 let result = processor
143 .process(sakurs_core::Input::from_text(content.clone()))
144 .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;
145
146 let mut last_offset = 0;
148 for boundary in &result.boundaries {
149 let sentence = &content[last_offset..boundary.offset];
150 formatter.format_sentence(sentence.trim(), last_offset)?;
151 last_offset = boundary.offset;
152 }
153
154 if last_offset < content.len() {
156 let sentence = &content[last_offset..];
157 if !sentence.trim().is_empty() {
158 formatter.format_sentence(sentence.trim(), last_offset)?;
159 }
160 }
161 }
162
163 progress.file_completed(&file.file_name().unwrap_or_default().to_string_lossy());
164 }
165
166 progress.finish();
167 log::info!("Processing complete. Processed {} files", files.len());
168 }
169
170 formatter.finish()?;
172 Ok(())
173 }
174
175 fn init_logging(&self) -> Result<()> {
177 let log_level = match self.verbose {
178 0 => "warn",
179 1 => "info",
180 2 => "debug",
181 _ => "trace",
182 };
183
184 if !self.quiet {
185 env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(log_level))
186 .init();
187 }
188
189 Ok(())
190 }
191
192 fn create_formatter(&self) -> Result<Box<dyn crate::output::OutputFormatter>> {
194 use std::io;
195
196 match self.format {
197 OutputFormat::Text => {
198 if let Some(output_path) = &self.output {
199 let file = std::fs::File::create(output_path).with_context(|| {
200 format!("Failed to create output file: {}", output_path.display())
201 })?;
202 Ok(Box::new(crate::output::TextFormatter::new(file)))
203 } else {
204 Ok(Box::new(crate::output::TextFormatter::new(io::stdout())))
205 }
206 }
207 OutputFormat::Json => {
208 if let Some(output_path) = &self.output {
209 let file = std::fs::File::create(output_path).with_context(|| {
210 format!("Failed to create output file: {}", output_path.display())
211 })?;
212 Ok(Box::new(crate::output::JsonFormatter::new(file)))
213 } else {
214 Ok(Box::new(crate::output::JsonFormatter::new(io::stdout())))
215 }
216 }
217 OutputFormat::Markdown => {
218 if let Some(output_path) = &self.output {
219 let file = std::fs::File::create(output_path).with_context(|| {
220 format!("Failed to create output file: {}", output_path.display())
221 })?;
222 Ok(Box::new(crate::output::MarkdownFormatter::new(file)))
223 } else {
224 Ok(Box::new(
225 crate::output::MarkdownFormatter::new(io::stdout()),
226 ))
227 }
228 }
229 }
230 }
231
232 fn create_processor(&self) -> Result<sakurs_core::SentenceProcessor> {
234 use crate::language_source::LanguageSource;
235 use sakurs_core::{Config, SentenceProcessor};
236
237 let language_source = match (&self.language, &self.language_config) {
239 (Some(lang), None) => LanguageSource::BuiltIn(*lang),
240 (None, Some(path)) => LanguageSource::External {
241 path: path.clone(),
242 language_code: self.language_code.clone(),
243 },
244 (None, None) => LanguageSource::BuiltIn(Language::English), (Some(_), Some(_)) => unreachable!(), };
247
248 log::info!("Using language source: {}", language_source.display_name());
249
250 match language_source {
252 LanguageSource::BuiltIn(lang) => {
253 let language_code = lang.code();
254
255 let builder = Config::builder()
257 .language(language_code)
258 .map_err(|e| anyhow::anyhow!("Failed to set language: {}", e))?;
259
260 let builder = self.configure_builder(builder)?;
261
262 let config = builder
263 .build()
264 .map_err(|e| anyhow::anyhow!("Failed to build processor config: {}", e))?;
265
266 SentenceProcessor::with_config(config)
267 .map_err(|e| anyhow::anyhow!("Failed to create processor: {}", e))
268 }
269 LanguageSource::External {
270 path,
271 language_code,
272 } => {
273 use sakurs_core::domain::language::ConfigurableLanguageRules;
275 use std::sync::Arc;
276
277 let rules = ConfigurableLanguageRules::from_file(&path, language_code.as_deref())
278 .map_err(|e| {
279 anyhow::anyhow!("Failed to load external language config: {}", e)
280 })?;
281
282 let builder = Config::builder();
284 let builder = self.configure_builder(builder)?;
285
286 let config = builder
287 .build()
288 .map_err(|e| anyhow::anyhow!("Failed to build processor config: {}", e))?;
289
290 SentenceProcessor::with_custom_rules(config, Arc::new(rules))
292 .map_err(|e| anyhow::anyhow!("Failed to create processor: {}", e))
293 }
294 }
295 }
296
297 fn configure_builder(
299 &self,
300 builder: sakurs_core::ConfigBuilder,
301 ) -> Result<sakurs_core::ConfigBuilder> {
302 let mut builder = builder;
303
304 if let Some(thread_count) = self.threads {
309 if thread_count == 0 {
310 return Err(anyhow::anyhow!("Thread count must be greater than 0"));
311 }
312 builder = builder.threads(Some(thread_count));
313 } else if self.parallel {
314 builder = builder.threads(None); }
316
317 if let Some(chunk_kb) = self.chunk_kb {
319 if chunk_kb == 0 {
320 return Err(anyhow::anyhow!("Chunk size must be greater than 0"));
321 }
322 let chunk_size = chunk_kb * 1024;
324 builder = builder.chunk_size(chunk_size);
325 }
326
327 Ok(builder)
329 }
330
331 fn process_file_streaming(
333 &self,
334 file: &std::path::Path,
335 processor: &sakurs_core::SentenceProcessor,
336 formatter: &mut Box<dyn crate::output::OutputFormatter>,
337 ) -> Result<()> {
338 log::info!("Using streaming mode for large file: {}", file.display());
341
342 let content = crate::input::FileReader::read_text(file)?;
343 let result = processor
344 .process(sakurs_core::Input::from_text(content.clone()))
345 .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;
346
347 let mut last_offset = 0;
348 for boundary in &result.boundaries {
349 let sentence = &content[last_offset..boundary.offset];
350 formatter.format_sentence(sentence.trim(), last_offset)?;
351 last_offset = boundary.offset;
352 }
353
354 if last_offset < content.len() {
356 let sentence = &content[last_offset..];
357 if !sentence.trim().is_empty() {
358 formatter.format_sentence(sentence.trim(), last_offset)?;
359 }
360 }
361
362 Ok(())
363 }
364
365 fn process_stdin(
367 &self,
368 processor: &sakurs_core::SentenceProcessor,
369 formatter: &mut Box<dyn crate::output::OutputFormatter>,
370 ) -> Result<()> {
371 use std::io::Read;
372
373 let mut buffer = String::new();
374 std::io::stdin()
375 .read_to_string(&mut buffer)
376 .context("Failed to read from stdin")?;
377
378 let result = processor
379 .process(sakurs_core::Input::from_text(buffer.clone()))
380 .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;
381
382 let mut last_offset = 0;
383 for boundary in &result.boundaries {
384 let sentence = &buffer[last_offset..boundary.offset];
385 formatter.format_sentence(sentence.trim(), last_offset)?;
386 last_offset = boundary.offset;
387 }
388
389 if last_offset < buffer.len() {
391 let sentence = &buffer[last_offset..];
392 if !sentence.trim().is_empty() {
393 formatter.format_sentence(sentence.trim(), last_offset)?;
394 }
395 }
396
397 Ok(())
398 }
399}
400
/// Finds a byte offset at or near `target` where `text` can be split
/// without cutting a UTF-8 code point, preferring in order: a sentence
/// terminator near `target`, the last whitespace before `target`, and
/// finally the nearest char boundary at or below `target`.
///
/// Fixes: the original sliced `text` at raw byte offsets `target ± 200`,
/// which panics mid-code-point on multi-byte text, and split one byte
/// after a whitespace match, bisecting multi-byte whitespace (U+3000).
#[allow(dead_code)]
fn find_safe_split_point(text: &str, target: usize) -> usize {
    if text.len() <= target {
        return text.len();
    }

    // Snap both window edges down to char boundaries before slicing —
    // indexing a &str at a non-boundary byte offset panics.
    let mut search_start = target.saturating_sub(200);
    while search_start > 0 && !text.is_char_boundary(search_start) {
        search_start -= 1;
    }
    let mut window_end = (target + 200).min(text.len());
    while !text.is_char_boundary(window_end) {
        window_end -= 1;
    }

    if let Some(pos) = text[search_start..window_end].rfind(['.', '!', '?', '。', '!', '?']) {
        // `+ 1` lands just past a one-byte terminator. For multi-byte
        // terminators (。!?) it lands mid-char and is rejected by the
        // boundary check below, deliberately falling through to the later
        // heuristics — the existing unit tests codify that behavior.
        let boundary = search_start + pos + 1;
        if boundary <= text.len() && text.is_char_boundary(boundary) {
            return boundary;
        }
    }

    // Fall back to the last whitespace at or before the target.
    let mut search_end = target.min(text.len());
    while search_end > 0 && !text.is_char_boundary(search_end) {
        search_end -= 1;
    }
    if search_end > 0 {
        if let Some(pos) = text[..search_end].rfind(char::is_whitespace) {
            // Split just past the whitespace char, using its UTF-8 length
            // so multi-byte whitespace is not bisected.
            let ws_len = text[pos..].chars().next().map_or(1, char::len_utf8);
            return pos + ws_len;
        }
    }

    // Last resort: nearest char boundary at or below the target.
    let mut pos = target.min(text.len());
    while pos > 0 && !text.is_char_boundary(pos) {
        pos -= 1;
    }
    pos
}
439
440#[allow(dead_code)]
442fn output_sentences(
443 text: &str,
444 result: &sakurs_core::Output,
445 formatter: &mut Box<dyn crate::output::OutputFormatter>,
446 base_offset: usize,
447) -> Result<()> {
448 let mut last_offset = 0;
449 for boundary in &result.boundaries {
450 let sentence = &text[last_offset..boundary.offset];
451 formatter.format_sentence(sentence.trim(), base_offset + last_offset)?;
452 last_offset = boundary.offset;
453 }
454
455 Ok(())
456}
457
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_safe_split_point_sentence_boundary() {
        // A terminator inside the window: split lands right after the
        // last '.' found there.
        let text = "First. Second sentence here.";
        let split = find_safe_split_point(text, 10);
        assert_eq!(split, 28);
        assert_eq!(&text[..split], text);

        // Many sentences: the window around the target still reaches the
        // final period, so the whole text is kept.
        let long_text = concat!(
            "This is a sentence. ",
            "Another sentence. ",
            "Third sentence. ",
            "Fourth sentence. ",
            "Fifth sentence. ",
            "Sixth sentence. ",
            "Seventh sentence."
        );
        let split = find_safe_split_point(long_text, 60);
        println!("Long text len: {}, split: {}", long_text.len(), split);
        assert_eq!(split, long_text.len());

        // A single period far past the target is still found by the
        // forward half of the search window.
        let text3 = "This is a very long sentence without any periods until way at the end.";
        let target3 = 20;
        let split3 = find_safe_split_point(text3, target3);
        println!("Text3 period position: {}", text3.find('.').unwrap());
        println!("Target3: {}, Split3: {}", target3, split3);
        assert_eq!(split3, 70);
    }

    #[test]
    fn test_find_safe_split_point_japanese_sentence() {
        // Multi-byte terminators: split falls back to the char boundary
        // at the target itself.
        let text = "短い文。次の文。";
        let target = 12;
        let split = find_safe_split_point(text, target);
        println!("Japanese text bytes: {}", text.len());
        println!("Target: {}, Split: {}", target, split);
        assert_eq!(split, 12);

        // No punctuation at all: result is a valid boundary at or below
        // the target.
        let text2 = "これはとても長い日本語の文章で句読点がありません";
        let target2 = 30;
        let split2 = find_safe_split_point(text2, target2);
        assert!(text2.is_char_boundary(split2));
        assert!(split2 <= target2);

        // Target beyond the text: everything is returned.
        let text3 = "最初の文。二番目。三番目。";
        let target3 = 50;
        let split3 = find_safe_split_point(text3, target3);
        println!(
            "Text3 len: {}, target: {}, split: {}",
            text3.len(),
            target3,
            split3
        );
        assert_eq!(split3, 39);
        assert_eq!(&text3[..split3], text3);
    }

    #[test]
    fn test_find_safe_split_point_word_boundary() {
        // No punctuation: the split happens just after a whitespace char.
        let text = "This is a very long sentence without any punctuation marks that goes on and on";
        let split = find_safe_split_point(text, 40);
        assert!(split > 0);
        assert!(text.chars().nth(split - 1).unwrap().is_whitespace() || split == text.len());
    }

    #[test]
    fn test_find_safe_split_point_utf8_boundary() {
        // Mixed-width text: whatever is returned must be a char boundary.
        let text = "Hello 世界 World こんにちは Test";
        let split = find_safe_split_point(text, 15);
        assert!(text.is_char_boundary(split));
    }

    #[test]
    fn test_find_safe_split_point_small_text() {
        // Text shorter than the target is returned whole.
        let text = "Short.";
        let split = find_safe_split_point(text, 100);
        assert_eq!(split, text.len());
    }

    #[test]
    fn test_find_safe_split_point_exact_boundary() {
        // Target exactly at the end: no truncation.
        let text = "Exactly at boundary.";
        let split = find_safe_split_point(text, text.len());
        assert_eq!(split, text.len());
    }

    #[test]
    fn test_find_safe_split_point_no_boundaries() {
        // One long word: the split degrades to the target itself.
        let text = "NoSpacesOrPunctuationHereJustOneLongWord";
        let split = find_safe_split_point(text, 20);
        assert!(text.is_char_boundary(split));
        assert!(split <= 20);
    }
}
584}