1use crate::exceptions::{LangExtractError, LangExtractResult};
4use regex::Regex;
5
/// Returns `true` when `text` begins with an `http://` or `https://` scheme.
///
/// The scheme is compared ASCII case-insensitively (URI schemes are
/// case-insensitive per RFC 3986 §3.1), so `HTTPS://example.com` is accepted
/// as well. No other validation is performed: leading whitespace, other
/// schemes (`ftp://`, `file://`) and bare host names all yield `false`.
pub fn is_url(text: &str) -> bool {
    const ACCEPTED_SCHEMES: [&str; 2] = ["http://", "https://"];

    ACCEPTED_SCHEMES.iter().any(|scheme| {
        // `get` yields `None` when `text` is too short or the cut would land
        // inside a multi-byte character, so this can never panic.
        matches!(
            text.get(..scheme.len()),
            Some(prefix) if prefix.eq_ignore_ascii_case(scheme)
        )
    })
}
10
11pub async fn download_text_from_url(url: &str) -> LangExtractResult<String> {
13 if !is_url(url) {
14 return Err(LangExtractError::invalid_input(format!(
15 "Invalid URL: {}",
16 url
17 )));
18 }
19
20 let client = reqwest::Client::new();
21 let response = client.get(url).send().await?;
22
23 if !response.status().is_success() {
24 return Err(LangExtractError::invalid_input(
25 format!("HTTP error: {} for URL: {}", response.status(), url)
26 ));
27 }
28
29 let content = response.text().await?;
30 Ok(content)
31}
32
/// Collapses every run of whitespace in `text` into a single ASCII space and
/// strips leading and trailing whitespace.
///
/// Whitespace is defined by the Unicode `White_Space` property, so newlines,
/// tabs and characters such as the no-break space are all treated as
/// separators. An empty or whitespace-only input yields an empty string.
pub fn normalize_text(text: &str) -> String {
    // `split_whitespace` already skips leading/trailing whitespace and treats
    // consecutive separators as one, so no regex has to be compiled per call.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
40
41pub fn extract_text_from_html(html: &str) -> String {
43 let tag_regex = Regex::new(r"<[^>]*>").unwrap();
46 let text = tag_regex.replace_all(html, " ");
47 normalize_text(&text)
48}
49
50pub async fn load_text_from_file(file_path: &str) -> LangExtractResult<String> {
52 let content = tokio::fs::read_to_string(file_path).await?;
53 Ok(content)
54}
55
56pub async fn save_text_to_file(file_path: &str, content: &str) -> LangExtractResult<()> {
58 tokio::fs::write(file_path, content).await?;
59 Ok(())
60}
61
/// Coarse classification of a piece of textual content.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so values can be used as
/// keys in hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ContentType {
    /// Text that was not recognised as any of the structured formats.
    PlainText,
    /// Content containing HTML-style markup tags.
    Html,
    /// Content that parses as a JSON document.
    Json,
    /// Content that parses as a YAML document.
    Yaml,
    /// Presumably reserved for content that cannot be classified; verify
    /// against callers before relying on it.
    Unknown,
}
71
72pub fn detect_content_type(content: &str) -> ContentType {
73 let trimmed = content.trim();
74
75 if (trimmed.starts_with('{') && trimmed.ends_with('}'))
77 || (trimmed.starts_with('[') && trimmed.ends_with(']'))
78 {
79 if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
80 return ContentType::Json;
81 }
82 }
83
84 if trimmed.contains("---") || trimmed.contains(": ") {
86 if serde_yaml::from_str::<serde_yaml::Value>(trimmed).is_ok() {
87 return ContentType::Yaml;
88 }
89 }
90
91 let html_regex = Regex::new(r"<[^>]+>").unwrap();
93 if html_regex.is_match(trimmed) {
94 return ContentType::Html;
95 }
96
97 ContentType::PlainText
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// Only the `http://` and `https://` prefixes are recognised as URLs.
    #[test]
    fn test_is_url() {
        for accepted in ["https://example.com", "http://example.com"] {
            assert!(is_url(accepted));
        }

        for rejected in ["example.com", "ftp://example.com", "file:///path/to/file"] {
            assert!(!is_url(rejected));
        }
    }

    /// Whitespace runs collapse to one space and the ends are trimmed.
    #[test]
    fn test_normalize_text() {
        let normalized = normalize_text(" Hello world \n\n How are you? ");
        assert_eq!(normalized, "Hello world How are you?");
    }

    /// Tags are removed and the remaining text is whitespace-normalized.
    #[test]
    fn test_extract_text_from_html() {
        let extracted =
            extract_text_from_html("<html><body><h1>Hello</h1><p>World</p></body></html>");
        assert_eq!(extracted, "Hello World");
    }

    /// One representative input per detectable content type.
    #[test]
    fn test_detect_content_type() {
        let cases = [
            (r#"{"key": "value"}"#, ContentType::Json),
            ("key: value\nother: data", ContentType::Yaml),
            ("<html><body>Hello</body></html>", ContentType::Html),
            ("Just plain text", ContentType::PlainText),
        ];

        for (input, expected) in cases {
            assert_eq!(detect_content_type(input), expected);
        }
    }

    /// A string without an HTTP(S) scheme is rejected before any request.
    #[tokio::test]
    async fn test_download_invalid_url() {
        assert!(download_text_from_url("not-a-url").await.is_err());
    }

    /// Malformed JSON, empty input and blank input all fall back to plain text.
    #[test]
    fn test_content_type_edge_cases() {
        let plain_text_inputs = [r#"{"invalid": json"#, "", " \n\t "];

        for input in plain_text_inputs {
            assert_eq!(detect_content_type(input), ContentType::PlainText);
        }
    }
}