langextract_rust/
io.rs

1//! I/O utilities for loading text from various sources.
2
3use crate::exceptions::{LangExtractError, LangExtractResult};
4use regex::Regex;
5
/// Check whether a string is an HTTP(S) URL.
///
/// Returns `true` when `text` begins with the `http://` or `https://` scheme
/// prefix. The prefix is compared ASCII case-insensitively because URI
/// schemes are case-insensitive (RFC 3986 §3.1), so `HTTPS://example.com` is
/// accepted too. Nothing after the prefix is validated.
pub fn is_url(text: &str) -> bool {
    const PREFIXES: [&str; 2] = ["http://", "https://"];

    // Compare raw bytes rather than slicing the `&str`: a slice taken at the
    // prefix length could fall inside a multi-byte character and panic.
    let bytes = text.as_bytes();
    PREFIXES.iter().any(|prefix| {
        bytes.len() >= prefix.len()
            && bytes[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    })
}
10
11/// Download text content from a URL
12pub async fn download_text_from_url(url: &str) -> LangExtractResult<String> {
13    if !is_url(url) {
14        return Err(LangExtractError::invalid_input(format!(
15            "Invalid URL: {}",
16            url
17        )));
18    }
19
20    let client = reqwest::Client::new();
21    let response = client.get(url).send().await?;
22
23    if !response.status().is_success() {
24        return Err(LangExtractError::invalid_input(
25            format!("HTTP error: {} for URL: {}", response.status(), url)
26        ));
27    }
28
29    let content = response.text().await?;
30    Ok(content)
31}
32
/// Clean and normalize text content.
///
/// Leading and trailing whitespace is removed, and every internal run of
/// whitespace (spaces, tabs, line breaks and other Unicode whitespace) is
/// collapsed into a single ASCII space. Empty or whitespace-only input yields
/// an empty string.
pub fn normalize_text(text: &str) -> String {
    // `split_whitespace` already skips leading/trailing whitespace and treats
    // each run as one separator, so no regex needs compiling on every call.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
40
41/// Extract plain text from HTML content (basic implementation)
42pub fn extract_text_from_html(html: &str) -> String {
43    // This is a very basic HTML tag removal
44    // In a production system, you'd want to use a proper HTML parser
45    let tag_regex = Regex::new(r"<[^>]*>").unwrap();
46    let text = tag_regex.replace_all(html, " ");
47    normalize_text(&text)
48}
49
50/// Load text from a file path
51pub async fn load_text_from_file(file_path: &str) -> LangExtractResult<String> {
52    let content = tokio::fs::read_to_string(file_path).await?;
53    Ok(content)
54}
55
56/// Save text to a file path
57pub async fn save_text_to_file(file_path: &str, content: &str) -> LangExtractResult<()> {
58    tokio::fs::write(file_path, content).await?;
59    Ok(())
60}
61
/// Kind of content a piece of text appears to hold, as reported by
/// [`detect_content_type`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ContentType {
    /// No structured format was recognised; the fallback classification.
    PlainText,
    /// Text containing at least one `<...>` tag.
    Html,
    /// Text that parses as a JSON object or array.
    Json,
    /// Text that parses as YAML.
    Yaml,
    /// Content of an undetermined kind. `detect_content_type` in this file
    /// does not return it; presumably reserved for callers (confirm).
    Unknown,
}
71
72pub fn detect_content_type(content: &str) -> ContentType {
73    let trimmed = content.trim();
74    
75    // Check for JSON
76    if (trimmed.starts_with('{') && trimmed.ends_with('}'))
77        || (trimmed.starts_with('[') && trimmed.ends_with(']'))
78    {
79        if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
80            return ContentType::Json;
81        }
82    }
83    
84    // Check for YAML (very basic check)
85    if trimmed.contains("---") || trimmed.contains(": ") {
86        if serde_yaml::from_str::<serde_yaml::Value>(trimmed).is_ok() {
87            return ContentType::Yaml;
88        }
89    }
90    
91    // Check for HTML
92    let html_regex = Regex::new(r"<[^>]+>").unwrap();
93    if html_regex.is_match(trimmed) {
94        return ContentType::Html;
95    }
96    
97    ContentType::PlainText
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// `http://` and `https://` inputs are URLs; other schemes and bare hosts are not.
    #[test]
    fn test_is_url() {
        for accepted in ["https://example.com", "http://example.com"] {
            assert!(is_url(accepted));
        }
        for rejected in ["example.com", "ftp://example.com", "file:///path/to/file"] {
            assert!(!is_url(rejected));
        }
    }

    /// Whitespace runs collapse to one space and the ends are trimmed.
    #[test]
    fn test_normalize_text() {
        let normalized = normalize_text("  Hello    world  \n\n  How are you?  ");
        assert_eq!(normalized, "Hello world How are you?");
    }

    /// Tags are stripped and text from adjacent elements stays separated.
    #[test]
    fn test_extract_text_from_html() {
        let extracted =
            extract_text_from_html("<html><body><h1>Hello</h1><p>World</p></body></html>");
        assert_eq!(extracted, "Hello World");
    }

    /// One representative input for each detected content type.
    #[test]
    fn test_detect_content_type() {
        let cases = [
            (r#"{"key": "value"}"#, ContentType::Json),
            ("key: value\nother: data", ContentType::Yaml),
            ("<html><body>Hello</body></html>", ContentType::Html),
            ("Just plain text", ContentType::PlainText),
        ];
        for (input, expected) in cases {
            assert_eq!(detect_content_type(input), expected);
        }
    }

    /// A string without an http(s) scheme is rejected with an error.
    #[tokio::test]
    async fn test_download_invalid_url() {
        assert!(download_text_from_url("not-a-url").await.is_err());
    }

    /// Malformed JSON, the empty string and whitespace-only input are plain text.
    #[test]
    fn test_content_type_edge_cases() {
        let plain_inputs = [
            // Invalid JSON should not be detected as JSON
            r#"{"invalid": json"#,
            // Empty string
            "",
            // Whitespace only
            "   \n\t  ",
        ];
        for input in plain_inputs {
            assert_eq!(detect_content_type(input), ContentType::PlainText);
        }
    }
}