oar_ocr_core/utils/dict.rs
1//! Dictionary and tokenizer loading utilities.
2//!
3//! This module provides helper functions for loading character dictionaries
4//! and tokenizer files used throughout the OCR pipeline.
5
6use crate::core::OCRError;
7use std::path::Path;
8
9/// Reads a character dictionary file and returns a vector of strings.
10///
11/// Each line in the file becomes one entry in the resulting vector.
12/// Empty lines are preserved.
13///
14/// # Arguments
15///
16/// * `path` - Path to the dictionary file
17///
18/// # Returns
19///
20/// A vector of strings, one per line in the file.
21///
22/// # Errors
23///
24/// Returns an `OCRError::InvalidInput` if the file cannot be read.
25///
26/// # Example
27///
28/// ```rust,no_run
29/// use oar_ocr_core::utils::read_character_dict;
30/// use std::path::Path;
31///
32/// let dict = read_character_dict(Path::new("path/to/dict.txt"))?;
33/// # Ok::<(), oar_ocr_core::core::OCRError>(())
34/// ```
35pub fn read_character_dict(path: &Path) -> Result<Vec<String>, OCRError> {
36 let content = std::fs::read_to_string(path).map_err(|e| OCRError::InvalidInput {
37 message: format!(
38 "Failed to read character dictionary from '{}': {}",
39 path.display(),
40 e
41 ),
42 })?;
43 Ok(content.lines().map(|s| s.to_string()).collect())
44}
45
46/// Reads a character dictionary file and returns the raw content string.
47///
48/// This is useful when you need the raw content before processing.
49///
50/// # Arguments
51///
52/// * `path` - Path to the dictionary file
53///
54/// # Returns
55///
56/// The raw file content as a string.
57///
58/// # Errors
59///
60/// Returns an `OCRError::InvalidInput` if the file cannot be read.
61pub fn read_dict_content(path: &Path) -> Result<String, OCRError> {
62 std::fs::read_to_string(path).map_err(|e| OCRError::InvalidInput {
63 message: format!("Failed to read dictionary from '{}': {}", path.display(), e),
64 })
65}
66
67/// Validates that a required path option is present and returns the path.
68///
69/// This is a helper for builder patterns where a path is required but stored
70/// as an `Option<PathBuf>`.
71///
72/// # Arguments
73///
74/// * `path` - Optional path to validate
75/// * `component` - Component name for error message (e.g., "text_recognition")
76/// * `description` - Human-readable description of what the path is for
77///
78/// # Returns
79///
80/// The path if present.
81///
82/// # Errors
83///
84/// Returns an `OCRError::ConfigError` if the path is None.
85///
86/// # Example
87///
88/// ```rust,no_run
89/// use oar_ocr_core::utils::require_path;
90/// use std::path::PathBuf;
91///
92/// let path: Option<PathBuf> = Some(PathBuf::from("/path/to/dict.txt"));
93/// let validated = require_path(path, "text_recognition", "character dictionary path")?;
94/// # Ok::<(), oar_ocr_core::core::OCRError>(())
95/// ```
96pub fn require_path<P: AsRef<Path> + Clone>(
97 path: Option<P>,
98 component: &str,
99 description: &str,
100) -> Result<P, OCRError> {
101 path.ok_or_else(|| {
102 OCRError::config_error_detailed(
103 component,
104 format!("{} is required for {}", description, component),
105 )
106 })
107}
108
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Three newline-terminated lines yield exactly three entries.
    #[test]
    fn test_read_character_dict() -> Result<(), Box<dyn std::error::Error>> {
        let mut tmp = NamedTempFile::new()?;
        tmp.write_all(b"a\nb\nc\n")?;

        let entries = read_character_dict(tmp.path())?;
        assert_eq!(entries, ["a", "b", "c"]);
        Ok(())
    }

    /// The raw reader returns the file content untouched.
    #[test]
    fn test_read_dict_content() -> Result<(), Box<dyn std::error::Error>> {
        let mut tmp = NamedTempFile::new()?;
        tmp.write_all(b"hello\nworld")?;

        assert_eq!(read_dict_content(tmp.path())?, "hello\nworld");
        Ok(())
    }

    /// A path that does not exist surfaces as an error, not a panic.
    #[test]
    fn test_read_nonexistent_file() {
        let missing = Path::new("/nonexistent/path/dict.txt");
        assert!(read_character_dict(missing).is_err());
    }

    /// A present path passes validation.
    #[test]
    fn test_require_path_some() {
        let outcome = require_path(
            Some(std::path::PathBuf::from("/some/path")),
            "test",
            "test path",
        );
        assert!(outcome.is_ok());
    }

    /// A missing path fails validation and the error names the component.
    #[test]
    fn test_require_path_none() {
        let outcome = require_path(None::<std::path::PathBuf>, "test_component", "test path");
        assert!(outcome.is_err());

        let rendered = outcome.unwrap_err().to_string();
        assert!(rendered.contains("test_component"));
    }
}
158}