1use serde::{Deserialize, Serialize};
19
20#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
22pub struct ExtractedUrl {
23 pub url: String,
24 pub start: usize,
25 pub end: usize,
26}
27
28#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
31pub struct ExtractedEntity {
32 pub name: String,
33 pub entity_type: String,
34 pub start: usize,
35 pub end: usize,
36}
37
38#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
43pub struct ExtractionResult {
44 pub entities: Vec<ExtractedEntity>,
45 pub urls: Vec<ExtractedUrl>,
46 pub elapsed_ms: u64,
48}
49
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54pub enum GlinerVariant {
55 Fp32,
56 Int8,
57}
58
59impl GlinerVariant {
60 pub fn as_filename(self) -> &'static str {
61 match self {
62 Self::Fp32 => "model.onnx",
63 Self::Int8 => "model_int8.onnx",
64 }
65 }
66 pub fn display_size(self) -> &'static str {
67 match self {
68 Self::Fp32 => "1.1 GB",
69 Self::Int8 => "349 MB",
70 }
71 }
72}
73
74pub trait Extractor: Send + Sync {
77 fn name(&self) -> &'static str;
78 fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError>;
79}
80
81pub struct RegexExtractor;
84
85impl Extractor for RegexExtractor {
86 fn name(&self) -> &'static str {
87 "regex"
88 }
89 fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError> {
90 Ok(ExtractionResult {
91 entities: Vec::new(),
92 urls: extract_urls(body),
93 elapsed_ms: 0,
94 })
95 }
96}
97
98pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
101 let mut out = Vec::new();
102 let mut cursor = 0usize;
103 while cursor < body.len() {
104 let hay = &body[cursor..];
105 let http_at = hay.find("http://");
107 let https_at = hay.find("https://");
108 let (rel_start, scheme_len) = match (http_at, https_at) {
109 (Some(a), Some(b)) => {
110 if a <= b {
111 (a, 7)
112 } else {
113 (b, 8)
114 }
115 }
116 (Some(a), None) => (a, 7),
117 (None, Some(b)) => (b, 8),
118 (None, None) => break,
119 };
120 let abs_start = cursor + rel_start;
121 let after_scheme = abs_start + scheme_len;
122 let mut end = after_scheme;
123 for (i, c) in body[after_scheme..].char_indices() {
124 if c.is_whitespace() || matches!(c, ')' | ']' | '}' | '"' | '\'' | '<') {
125 end = after_scheme + i;
126 break;
127 }
128 end = after_scheme + i + c.len_utf8();
129 }
130 out.push(ExtractedUrl {
131 url: body[abs_start..end].to_string(),
132 start: abs_start,
133 end,
134 });
135 cursor = end;
136 }
137 out
138}
139
/// Runs the legacy GLiNER-backed extractor and converts its output into this
/// module's [`ExtractionResult`].
///
/// Only compiled with the `ner-legacy` feature. `gliner_variant` is mapped
/// onto the legacy module's own variant enum before the call.
///
/// `elapsed_ms` covers the legacy call plus the conversion, matching the
/// timing reported by the non-legacy build of this function.
///
/// # Errors
///
/// Any error from `crate::extraction_gliner::extract_graph_auto` is converted
/// into `AppError` and returned.
#[cfg(feature = "ner-legacy")]
pub fn extract_graph_auto(
    body: &str,
    paths: &crate::paths::AppPaths,
    gliner_variant: GlinerVariant,
) -> Result<ExtractionResult, crate::errors::AppError> {
    let started = std::time::Instant::now();

    let legacy_variant = match gliner_variant {
        GlinerVariant::Fp32 => crate::extraction_gliner::GlinerVariant::Fp32,
        GlinerVariant::Int8 => crate::extraction_gliner::GlinerVariant::Int8,
    };
    // `?` already performs the `From` conversion into `AppError`.
    let extracted = crate::extraction_gliner::extract_graph_auto(body, paths, legacy_variant)?;

    let entities = extracted
        .entities
        .into_iter()
        .map(|entity| ExtractedEntity {
            name: entity.name,
            entity_type: entity.entity_type.to_string(),
            // NOTE(review): no span is taken from the legacy entity, so both
            // offsets stay 0 — confirm whether the legacy type exposes one.
            start: 0,
            end: 0,
        })
        .collect();

    let urls = extracted
        .urls
        .into_iter()
        .map(|url| {
            let start = url.offset;
            // Computed before `url.url` is moved into the result.
            let end = start + url.url.len();
            ExtractedUrl {
                url: url.url,
                start,
                end,
            }
        })
        .collect();

    // Saturating u64 arithmetic avoids a lossy `u128 as u64` cast.
    let elapsed = started.elapsed();
    let elapsed_ms = elapsed
        .as_secs()
        .saturating_mul(1000)
        .saturating_add(u64::from(elapsed.subsec_millis()));

    Ok(ExtractionResult {
        entities,
        urls,
        elapsed_ms,
    })
}
179
180#[cfg(not(feature = "ner-legacy"))]
181pub fn extract_graph_auto(
182 body: &str,
183 _paths: &crate::paths::AppPaths,
184 _gliner_variant: GlinerVariant,
185) -> Result<ExtractionResult, crate::errors::AppError> {
186 let start = std::time::Instant::now();
187 let urls = extract_urls(body);
188 Ok(ExtractionResult {
189 entities: Vec::new(),
190 urls,
191 elapsed_ms: start.elapsed().as_millis() as u64,
192 })
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Both schemes are recognised and reported in document order.
    #[test]
    fn extract_urls_finds_http_and_https() {
        let body = "see https://example.com/foo and http://bar.baz/qux end";
        let urls = extract_urls(body);
        assert_eq!(urls.len(), 2, "got {urls:?} for body {body:?}");
        let found: Vec<&str> = urls.iter().map(|u| u.url.as_str()).collect();
        assert_eq!(found, ["https://example.com/foo", "http://bar.baz/qux"]);
    }

    /// A closing parenthesis right after the URL is not part of it.
    #[test]
    fn extract_urls_handles_trailing_punctuation() {
        let urls = extract_urls("see https://example.com/foo).");
        let found: Vec<&str> = urls.iter().map(|u| u.url.as_str()).collect();
        assert_eq!(found, ["https://example.com/foo"]);
    }

    /// An empty body yields no URLs.
    #[test]
    fn extract_urls_empty_body() {
        assert_eq!(extract_urls("").len(), 0);
    }

    /// The size labels shown to users are pinned.
    #[test]
    fn gliner_variant_size_strings() {
        let expected = [
            (GlinerVariant::Fp32, "1.1 GB"),
            (GlinerVariant::Int8, "349 MB"),
        ];
        for (variant, label) in expected {
            assert_eq!(variant.display_size(), label);
        }
    }

    /// The regex backend reports URLs and nothing else.
    #[test]
    fn regex_extractor_returns_only_urls() {
        let ExtractionResult { entities, urls, .. } =
            RegexExtractor.extract("see https://example.com").unwrap();
        assert!(entities.is_empty());
        assert_eq!(urls.len(), 1);
    }
}