use anyhow::{Context, Result};
use clap::Args;
use std::path::PathBuf;
// Command-line arguments for the text-processing command; `execute` is the entry point.
// NOTE: plain `//` comments are used here on purpose. clap's derive turns `///` doc
// comments into `--help` text, so switching to `///` would change the CLI output.
#[derive(Debug, Args)]
pub struct ProcessArgs {
// Input files or patterns (at least one is required). When the only value is `-`,
// `execute` reads from stdin; otherwise the values go to `crate::input::resolve_patterns`.
#[arg(short, long, value_name = "FILE/PATTERN", required = true)]
pub input: Vec<String>,
// Output file for the formatted sentences; stdout is used when this is absent.
#[arg(short, long, value_name = "FILE")]
pub output: Option<PathBuf>,
// Output format selecting the formatter implementation (defaults to `text`).
#[arg(short, long, value_enum, default_value = "text")]
pub format: OutputFormat,
// Built-in language to use. Mutually exclusive with `language_config`; when neither is
// given, `create_processor` falls back to English.
#[arg(short, long, value_enum, conflicts_with = "language_config")]
pub language: Option<Language>,
// Path to an external language configuration file, loaded through
// `ConfigurableLanguageRules::from_file`. Mutually exclusive with `language`.
#[arg(short = 'c', long, value_name = "FILE", conflicts_with = "language")]
pub language_config: Option<PathBuf>,
// Optional language code forwarded to `ConfigurableLanguageRules::from_file`;
// only accepted together with `language_config`.
#[arg(long, requires = "language_config")]
pub language_code: Option<String>,
// When set and `threads` is not given, `configure_builder` calls `threads(None)` on the
// config builder.
#[arg(short, long)]
pub parallel: bool,
// Mutually exclusive with `parallel`.
// NOTE(review): this flag is not read anywhere in the visible code; confirm it is still needed.
#[arg(long, conflicts_with = "parallel")]
pub adaptive: bool,
// Explicit thread count passed to the config builder; a value of 0 is rejected.
#[arg(short = 't', long, value_name = "COUNT")]
pub threads: Option<usize>,
// Chunk size in kilobytes; multiplied by 1024 before being passed to the config builder.
// A value of 0 is rejected.
#[arg(long, value_name = "SIZE_KB")]
pub chunk_kb: Option<usize>,
// Skips logger initialization and is passed to `ProgressReporter::new`.
#[arg(short, long)]
pub quiet: bool,
// Number of times the flag was given; selects the default log filter:
// 0 = warn, 1 = info, 2 = debug, 3 or more = trace.
#[arg(short, long, action = clap::ArgAction::Count)]
pub verbose: u8,
// Forces the streaming code path for every file. Files whose size exceeds 100 MB
// (integer division by 1024 * 1024) take that path regardless of this flag.
#[arg(long)]
pub stream: bool,
// Streaming chunk size in megabytes (default 10); only accepted together with `stream`.
// NOTE(review): this value is not read anywhere in the visible code.
#[arg(long, default_value = "10", requires = "stream")]
pub stream_chunk_mb: u64,
}
// Output formats selectable on the command line; each variant maps to one formatter type
// in `ProcessArgs::create_formatter`.
// Plain `//` comments are used because clap renders `///` on variants as help text.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum OutputFormat {
// Handled by `crate::output::TextFormatter`; also accepted as `txt`.
#[value(alias = "txt")]
Text,
// Handled by `crate::output::JsonFormatter`.
Json,
// Handled by `crate::output::MarkdownFormatter`; also accepted as `md`.
#[value(alias = "md")]
Markdown,
}
// Built-in languages selectable on the command line.
// Plain `//` comments are used because clap renders `///` on variants as help text.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum Language {
// Also accepted as `en` or `eng`. This is the fallback when no language option is given.
#[value(alias = "en", alias = "eng")]
English,
// Also accepted as `ja` or `jpn`.
#[value(alias = "ja", alias = "jpn")]
Japanese,
}
impl ProcessArgs {
/// Runs the command: sets up logging, builds the formatter and processor, processes the
/// input, and finalizes the formatter.
///
/// When the only input value is `-`, text is read from stdin. Otherwise the input values
/// are resolved to files, and each file is either handed to `process_file_streaming`
/// (when `stream` is set or the file is larger than 100 MB) or read and processed inline.
///
/// # Errors
/// Returns the first error raised by setup, file access, sentence processing, or the
/// formatter.
pub fn execute(&self) -> Result<()> {
    self.init_logging()?;

    log::info!("Starting text processing");
    log::debug!("Arguments: {self:?}");

    let mut formatter: Box<dyn crate::output::OutputFormatter> = self.create_formatter()?;
    let processor = self.create_processor()?;

    // A lone "-" selects stdin; nothing else needs to be resolved in that case.
    let stdin_only = matches!(self.input.as_slice(), [only] if only == "-");
    if stdin_only {
        log::info!("Reading from stdin");
        self.process_stdin(&processor, &mut formatter)?;
        formatter.finish()?;
        return Ok(());
    }

    let files = crate::input::resolve_patterns(&self.input)?;
    log::info!("Found {} files to process", files.len());

    let mut progress = crate::progress::ProgressReporter::new(self.quiet);
    progress.init_files(files.len() as u64);

    for path in &files {
        log::info!("Processing file: {}", path.display());

        let size_mb = crate::input::FileReader::file_size(path)? / (1024 * 1024);
        if self.stream || size_mb > 100 {
            log::info!(
                "Using streaming mode for {} ({}MB)",
                path.display(),
                size_mb
            );
            self.process_file_streaming(path, &processor, &mut formatter)?;
        } else {
            let text = crate::input::FileReader::read_text(path)?;
            let input = sakurs_core::Input::from_text(text.clone());
            let output = processor
                .process(input)
                .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;

            // Each boundary offset closes the sentence that began at the previous one.
            let mut start = 0;
            for boundary in &output.boundaries {
                let end = boundary.offset;
                formatter.format_sentence(text[start..end].trim(), start)?;
                start = end;
            }

            // Text after the last boundary is emitted only when it is not blank.
            if start < text.len() {
                let tail = text[start..].trim();
                if !tail.is_empty() {
                    formatter.format_sentence(tail, start)?;
                }
            }
        }

        let display_name = path.file_name().unwrap_or_default().to_string_lossy();
        progress.file_completed(&display_name);
    }

    progress.finish();
    log::info!("Processing complete. Processed {} files", files.len());

    formatter.finish()?;
    Ok(())
}
fn init_logging(&self) -> Result<()> {
let log_level = match self.verbose {
0 => "warn",
1 => "info",
2 => "debug",
_ => "trace",
};
if !self.quiet {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(log_level))
.init();
}
Ok(())
}
/// Builds the output formatter for the selected format.
///
/// The formatter writes to the file named by `output` when it is set, and to stdout
/// otherwise.
///
/// # Errors
/// Returns an error when the output file cannot be created.
fn create_formatter(&self) -> Result<Box<dyn crate::output::OutputFormatter>> {
    // Open the destination once; every format shares the same file-or-stdout choice.
    let file = match &self.output {
        Some(path) => Some(
            std::fs::File::create(path)
                .with_context(|| format!("Failed to create output file: {}", path.display()))?,
        ),
        None => None,
    };

    let formatter: Box<dyn crate::output::OutputFormatter> = match (self.format, file) {
        (OutputFormat::Text, Some(file)) => Box::new(crate::output::TextFormatter::new(file)),
        (OutputFormat::Text, None) => {
            Box::new(crate::output::TextFormatter::new(std::io::stdout()))
        }
        (OutputFormat::Json, Some(file)) => Box::new(crate::output::JsonFormatter::new(file)),
        (OutputFormat::Json, None) => {
            Box::new(crate::output::JsonFormatter::new(std::io::stdout()))
        }
        (OutputFormat::Markdown, Some(file)) => {
            Box::new(crate::output::MarkdownFormatter::new(file))
        }
        (OutputFormat::Markdown, None) => {
            Box::new(crate::output::MarkdownFormatter::new(std::io::stdout()))
        }
    };
    Ok(formatter)
}
/// Builds the sentence processor for the selected language source.
///
/// A built-in language (English when no language option is given) is configured through
/// `Config::builder().language(..)`. An external configuration file is loaded with
/// `ConfigurableLanguageRules::from_file` and passed to
/// `SentenceProcessor::with_custom_rules`. Thread and chunk settings are applied by
/// `configure_builder` in both cases.
///
/// # Errors
/// Returns an error when both `language` and `language_config` are set, when the language
/// or external rules cannot be loaded, when `configure_builder` rejects a setting, or when
/// the configuration or processor cannot be built.
fn create_processor(&self) -> Result<sakurs_core::SentenceProcessor> {
    use crate::language_source::LanguageSource;
    use sakurs_core::{Config, SentenceProcessor};

    let language_source = match (&self.language, &self.language_config) {
        // clap rejects this combination on the command line, but the fields are public,
        // so a directly constructed value can still reach here. Report it, don't panic.
        (Some(_), Some(_)) => {
            anyhow::bail!("language and language_config cannot both be set")
        }
        (Some(lang), None) => LanguageSource::BuiltIn(*lang),
        (None, Some(path)) => LanguageSource::External {
            path: path.clone(),
            language_code: self.language_code.clone(),
        },
        (None, None) => LanguageSource::BuiltIn(Language::English),
    };
    log::info!("Using language source: {}", language_source.display_name());

    match language_source {
        LanguageSource::BuiltIn(lang) => {
            let builder = Config::builder()
                .language(lang.code())
                .map_err(|e| anyhow::anyhow!("Failed to set language: {}", e))?;
            let config = self
                .configure_builder(builder)?
                .build()
                .map_err(|e| anyhow::anyhow!("Failed to build processor config: {}", e))?;
            SentenceProcessor::with_config(config)
                .map_err(|e| anyhow::anyhow!("Failed to create processor: {}", e))
        }
        LanguageSource::External {
            path,
            language_code,
        } => {
            use sakurs_core::domain::language::ConfigurableLanguageRules;
            use std::sync::Arc;

            // Load the rules first so a bad config file is reported before any
            // thread or chunk validation error.
            let rules = ConfigurableLanguageRules::from_file(&path, language_code.as_deref())
                .map_err(|e| {
                    anyhow::anyhow!("Failed to load external language config: {}", e)
                })?;
            let config = self
                .configure_builder(Config::builder())?
                .build()
                .map_err(|e| anyhow::anyhow!("Failed to build processor config: {}", e))?;
            SentenceProcessor::with_custom_rules(config, Arc::new(rules))
                .map_err(|e| anyhow::anyhow!("Failed to create processor: {}", e))
        }
    }
}
/// Applies the thread and chunk-size options to a config builder.
///
/// * `threads` set: passed on as `threads(Some(n))`; zero is rejected.
/// * `threads` unset and `parallel` set: `threads(None)` is passed
///   (presumably letting the library pick the count; confirm against `sakurs_core`).
/// * `chunk_kb` set: converted to bytes and passed to `chunk_size`; zero is rejected.
///
/// # Errors
/// Returns an error for a zero thread count, a zero chunk size, or a chunk size whose
/// byte count does not fit in `usize`.
fn configure_builder(
    &self,
    builder: sakurs_core::ConfigBuilder,
) -> Result<sakurs_core::ConfigBuilder> {
    let mut builder = builder;

    match self.threads {
        Some(0) => anyhow::bail!("Thread count must be greater than 0"),
        Some(count) => builder = builder.threads(Some(count)),
        None if self.parallel => builder = builder.threads(None),
        None => {}
    }

    if let Some(chunk_kb) = self.chunk_kb {
        if chunk_kb == 0 {
            anyhow::bail!("Chunk size must be greater than 0");
        }
        // A plain `chunk_kb * 1024` wraps silently in release builds for huge values.
        let chunk_size = chunk_kb
            .checked_mul(1024)
            .ok_or_else(|| anyhow::anyhow!("Chunk size of {} KB is too large", chunk_kb))?;
        builder = builder.chunk_size(chunk_size);
    }

    Ok(builder)
}
/// Processes one file on the "streaming" code path and writes its sentences to
/// `formatter`.
///
/// Despite the name, the file is read completely with `FileReader::read_text` and
/// processed in one call. Each boundary offset ends one sentence; sentences are trimmed
/// and reported with the offset at which they start. Text after the last boundary is
/// emitted only when it is not blank.
///
/// # Errors
/// Returns an error when the file cannot be read, processing fails, or the formatter
/// rejects a sentence.
fn process_file_streaming(
    &self,
    file: &std::path::Path,
    processor: &sakurs_core::SentenceProcessor,
    formatter: &mut Box<dyn crate::output::OutputFormatter>,
) -> Result<()> {
    log::info!("Using streaming mode for large file: {}", file.display());

    let text = crate::input::FileReader::read_text(file)?;
    let input = sakurs_core::Input::from_text(text.clone());
    let output = processor
        .process(input)
        .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;

    let mut start = 0;
    for boundary in &output.boundaries {
        let end = boundary.offset;
        formatter.format_sentence(text[start..end].trim(), start)?;
        start = end;
    }

    // Nothing is left once the last boundary reaches the end of the text.
    if start >= text.len() {
        return Ok(());
    }
    let tail = text[start..].trim();
    if !tail.is_empty() {
        formatter.format_sentence(tail, start)?;
    }
    Ok(())
}
/// Reads all of stdin as UTF-8 text, processes it, and writes the sentences to
/// `formatter`.
///
/// Each boundary offset ends one sentence; sentences are trimmed and reported with the
/// offset at which they start. Text after the last boundary is emitted only when it is
/// not blank.
///
/// # Errors
/// Returns an error when stdin cannot be read, processing fails, or the formatter
/// rejects a sentence.
fn process_stdin(
    &self,
    processor: &sakurs_core::SentenceProcessor,
    formatter: &mut Box<dyn crate::output::OutputFormatter>,
) -> Result<()> {
    use std::io::Read;

    let mut text = String::new();
    std::io::stdin()
        .read_to_string(&mut text)
        .context("Failed to read from stdin")?;

    let input = sakurs_core::Input::from_text(text.clone());
    let output = processor
        .process(input)
        .map_err(|e| anyhow::anyhow!("Processing failed: {}", e))?;

    let mut start = 0;
    for boundary in &output.boundaries {
        let end = boundary.offset;
        formatter.format_sentence(text[start..end].trim(), start)?;
        start = end;
    }

    // Nothing is left once the last boundary reaches the end of the text.
    if start >= text.len() {
        return Ok(());
    }
    let tail = text[start..].trim();
    if !tail.is_empty() {
        formatter.format_sentence(tail, start)?;
    }
    Ok(())
}
}
/// Picks a byte offset near `target` at which `text` can be split.
///
/// The result is always a char boundary of `text`. Candidates are tried in this order:
///
/// 1. If `text` is no longer than `target`, the whole text (`text.len()`).
/// 2. The position just after the last sentence terminator found within 200 bytes on
///    either side of `target`. This may lie beyond `target`.
/// 3. The position just after the last whitespace character before `target`.
/// 4. `target` itself, moved back to the nearest char boundary.
///
/// NOTE(review): step 2 advances one byte past the terminator, so a multi-byte
/// terminator such as `。` fails the boundary check and falls through to step 3. The
/// existing unit tests pin that outcome, so it is kept as is.
// Currently unused; kept for chunked processing.
#[allow(dead_code)]
fn find_safe_split_point(text: &str, target: usize) -> usize {
    /// Number of bytes searched on each side of `target` for a sentence terminator.
    const WINDOW: usize = 200;

    if text.len() <= target {
        return text.len();
    }

    // Largest char boundary that is <= idx. Slicing at any other index would panic.
    let floor_boundary = |mut idx: usize| {
        while idx > 0 && !text.is_char_boundary(idx) {
            idx -= 1;
        }
        idx
    };

    // The raw window edges can land inside a multi-byte character, so snap them first.
    let window_start = floor_boundary(target.saturating_sub(WINDOW));
    let window_end = floor_boundary(target.saturating_add(WINDOW).min(text.len()));

    if let Some(pos) = text[window_start..window_end].rfind(['.', '!', '?', '。']) {
        let boundary = window_start + pos + 1;
        if text.is_char_boundary(boundary) {
            return boundary;
        }
    }

    // `target < text.len()` holds here, so this is the last boundary at or before target.
    let fallback = floor_boundary(target);

    // Step past the whole whitespace character, which may be wider than one byte
    // (e.g. U+3000), so the returned offset stays on a char boundary.
    if let Some((pos, ws)) = text[..fallback]
        .char_indices()
        .rev()
        .find(|(_, c)| c.is_whitespace())
    {
        return pos + ws.len_utf8();
    }

    fallback
}
/// Writes every sentence of `text` that is closed by a boundary in `result` to
/// `formatter`.
///
/// Sentences are trimmed before being written. The reported offset is the sentence's
/// start within `text` plus `base_offset`. Text after the last boundary is not written.
///
/// # Errors
/// Returns the first error reported by the formatter.
// Currently unused; kept for chunked processing.
#[allow(dead_code)]
fn output_sentences(
    text: &str,
    result: &sakurs_core::Output,
    formatter: &mut Box<dyn crate::output::OutputFormatter>,
    base_offset: usize,
) -> Result<()> {
    let mut start = 0;
    for boundary in &result.boundaries {
        let end = boundary.offset;
        let sentence = text[start..end].trim();
        formatter.format_sentence(sentence, base_offset + start)?;
        start = end;
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    //! Unit tests for `find_safe_split_point`.

    use super::*;

    #[test]
    fn test_find_safe_split_point_sentence_boundary() {
        // The search window covers the whole text, so the last terminator wins even
        // though it lies beyond the target.
        let text = "First. Second sentence here.";
        let split = find_safe_split_point(text, 10);
        assert_eq!(split, 28);
        assert_eq!(&text[..split], text);

        let long_text = concat!(
            "This is a sentence. ",
            "Another sentence. ",
            "Third sentence. ",
            "Fourth sentence. ",
            "Fifth sentence. ",
            "Sixth sentence. ",
            "Seventh sentence."
        );
        let split = find_safe_split_point(long_text, 60);
        assert_eq!(split, long_text.len());

        // The only period is the final character (byte 69), so the split is at 70.
        let text3 = "This is a very long sentence without any periods until way at the end.";
        let split3 = find_safe_split_point(text3, 20);
        assert_eq!(split3, 70);
    }

    #[test]
    fn test_find_safe_split_point_japanese_sentence() {
        // Every character here is 3 bytes, so byte 12 is the end of the first sentence.
        let text = "短い文。次の文。";
        let split = find_safe_split_point(text, 12);
        assert_eq!(split, 12);

        // No terminators and no whitespace: the split must still be a valid boundary.
        let text2 = "これはとても長い日本語の文章で句読点がありません";
        let target2 = 30;
        let split2 = find_safe_split_point(text2, target2);
        assert!(text2.is_char_boundary(split2));
        assert!(split2 <= target2);

        // The text is 39 bytes, shorter than the target, so it is returned whole.
        let text3 = "最初の文。二番目。三番目。";
        let split3 = find_safe_split_point(text3, 50);
        assert_eq!(split3, 39);
        assert_eq!(&text3[..split3], text3);
    }

    #[test]
    fn test_find_safe_split_point_word_boundary() {
        let text = "This is a very long sentence without any punctuation marks that goes on and on";
        let split = find_safe_split_point(text, 40);
        assert!(split > 0);
        // `split` is a byte offset, so look at the character that ends just before it
        // rather than indexing `chars()` with it.
        let ends_after_whitespace =
            matches!(text[..split].chars().next_back(), Some(c) if c.is_whitespace());
        assert!(ends_after_whitespace || split == text.len());
    }

    #[test]
    fn test_find_safe_split_point_utf8_boundary() {
        let text = "Hello 世界 World こんにちは Test";
        let split = find_safe_split_point(text, 15);
        assert!(text.is_char_boundary(split));
    }

    #[test]
    fn test_find_safe_split_point_small_text() {
        let text = "Short.";
        let split = find_safe_split_point(text, 100);
        assert_eq!(split, text.len());
    }

    #[test]
    fn test_find_safe_split_point_exact_boundary() {
        let text = "Exactly at boundary.";
        let split = find_safe_split_point(text, text.len());
        assert_eq!(split, text.len());
    }

    #[test]
    fn test_find_safe_split_point_no_boundaries() {
        let text = "NoSpacesOrPunctuationHereJustOneLongWord";
        let split = find_safe_split_point(text, 20);
        assert!(text.is_char_boundary(split));
        assert!(split <= 20);
    }
}