use serde::{Deserialize, Serialize};
/// A URL located inside a piece of text, together with where it was found.
///
/// `extract_urls` fills `start`/`end` with byte offsets into the scanned text
/// such that `&text[start..end]` equals `url`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedUrl {
/// The URL text, including its scheme.
pub url: String,
/// Byte offset of the first byte of the URL.
pub start: usize,
/// Byte offset one past the last byte of the URL (exclusive).
pub end: usize,
}
/// A named entity reported by an extractor.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractedEntity {
/// The entity's name.
pub name: String,
/// The entity's category, stored as free-form text.
pub entity_type: String,
/// NOTE(review): presumably the offset where the mention starts, by analogy
/// with `ExtractedUrl::start`; the only producer visible in this file sets it
/// to 0 — confirm intended semantics.
pub start: usize,
/// NOTE(review): presumably the offset where the mention ends; the only
/// producer visible in this file sets it to 0 — confirm intended semantics.
pub end: usize,
}
/// Everything an extraction pass produced for one body of text.
///
/// The `Default` value has no entities, no URLs and an elapsed time of 0.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct ExtractionResult {
/// Named entities found in the text; empty for extractors that only find URLs.
pub entities: Vec<ExtractedEntity>,
/// URLs found in the text.
pub urls: Vec<ExtractedUrl>,
/// Time spent extracting, in milliseconds. Producers that do not measure
/// their run time report 0.
pub elapsed_ms: u64,
}
/// Selects which GLiNER ONNX model file is used.
///
/// See `as_filename` for the file each variant maps to and `display_size`
/// for the size label shown for it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GlinerVariant {
/// The model stored as `model.onnx` (presumably 32-bit float weights — confirm).
Fp32,
/// The model stored as `model_int8.onnx` (presumably 8-bit quantized — confirm).
Int8,
}
impl GlinerVariant {
pub fn as_filename(self) -> &'static str {
match self {
Self::Fp32 => "model.onnx",
Self::Int8 => "model_int8.onnx",
}
}
pub fn display_size(self) -> &'static str {
match self {
Self::Fp32 => "1.1 GB",
Self::Int8 => "349 MB",
}
}
}
/// Common interface for anything that can pull structured data out of text.
///
/// Implementors must be `Send + Sync`.
pub trait Extractor: Send + Sync {
/// Returns a short, static identifier for this extractor.
fn name(&self) -> &'static str;
/// Runs extraction over `body`.
///
/// # Errors
///
/// Returns an `AppError` when the implementation fails to process `body`.
fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError>;
}
/// Stateless extractor that reports URLs only, by delegating to `extract_urls`.
///
/// Despite the name, no regex engine is involved in the code visible here.
pub struct RegexExtractor;
impl Extractor for RegexExtractor {
fn name(&self) -> &'static str {
"regex"
}
fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError> {
Ok(ExtractionResult {
entities: Vec::new(),
urls: extract_urls(body),
elapsed_ms: 0,
})
}
}
/// Scans `body` for `http://` and `https://` URLs and returns them in order of
/// appearance.
///
/// A URL starts at its scheme and runs up to the first whitespace character or
/// one of `)`, `]`, `}`, `"`, `'`, `<`, `>`. Trailing sentence punctuation
/// (`.`, `,`, `;`, `:`, `!`, `?`) is then dropped, so `"see https://example.com."`
/// yields `https://example.com`. A scheme with nothing left after it (for
/// example `"http:// "`) is not reported.
///
/// `start` and `end` of every result are byte offsets into `body` (`end` is
/// exclusive), so `&body[start..end] == url`. Matching is case-sensitive: only
/// lowercase schemes are recognised. A `)` always ends the URL, even when the
/// URL itself contains a matching `(`.
pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
    const PROBE: &str = "http";
    const HTTP: &str = "http://";
    const HTTPS: &str = "https://";

    let is_terminator = |c: char| {
        c.is_whitespace() || matches!(c, ')' | ']' | '}' | '"' | '\'' | '<' | '>')
    };
    let is_trailing_punctuation = |c: char| matches!(c, '.' | ',' | ';' | ':' | '!' | '?');

    let mut out = Vec::new();
    let mut cursor = 0usize;
    // Both schemes begin with "http", so a single search per candidate suffices
    // and the text is walked only once overall.
    while let Some(rel) = body[cursor..].find(PROBE) {
        let start = cursor + rel;
        let rest = &body[start..];
        let scheme_len = if rest.starts_with(HTTPS) {
            HTTPS.len()
        } else if rest.starts_with(HTTP) {
            HTTP.len()
        } else {
            // Plain "http" that is not a scheme; no match can begin inside it.
            cursor = start + PROBE.len();
            continue;
        };

        let tail = &rest[scheme_len..];
        let scanned = tail.find(is_terminator).unwrap_or(tail.len());
        let kept = tail[..scanned]
            .trim_end_matches(is_trailing_punctuation)
            .len();

        // Resume after everything scanned; the trimmed punctuation cannot
        // contain the start of another URL.
        cursor = start + scheme_len + scanned;
        if kept == 0 {
            // Bare scheme with no host: not a usable URL.
            continue;
        }

        let end = start + scheme_len + kept;
        out.push(ExtractedUrl {
            url: body[start..end].to_string(),
            start,
            end,
        });
    }
    out
}
/// Extracts entities and URLs from `body` through the legacy GLiNER pipeline.
///
/// Delegates to `crate::extraction_gliner::extract_graph_auto` and converts its
/// output into this module's types:
///
/// * entity offsets are not carried over — `start` and `end` are always 0;
/// * each URL's `end` is computed as its offset plus the length of the URL text;
/// * `elapsed_ms` is the wall-clock time of the whole call, matching what the
///   non-legacy build of this function reports.
///
/// # Errors
///
/// Returns the legacy pipeline's error converted into `AppError`.
#[cfg(feature = "ner-legacy")]
pub fn extract_graph_auto(
    body: &str,
    paths: &crate::paths::AppPaths,
    gliner_variant: GlinerVariant,
) -> Result<ExtractionResult, crate::errors::AppError> {
    let started = std::time::Instant::now();

    // The legacy module declares its own variant enum; map onto it explicitly
    // so a newly added variant becomes a compile error here.
    let legacy_variant = match gliner_variant {
        GlinerVariant::Fp32 => crate::extraction_gliner::GlinerVariant::Fp32,
        GlinerVariant::Int8 => crate::extraction_gliner::GlinerVariant::Int8,
    };

    // `?` already performs the `From` conversion into `AppError`.
    let extracted = crate::extraction_gliner::extract_graph_auto(body, paths, legacy_variant)?;

    let entities = extracted
        .entities
        .into_iter()
        .map(|entity| ExtractedEntity {
            name: entity.name,
            entity_type: entity.entity_type.to_string(),
            start: 0,
            end: 0,
        })
        .collect();

    let urls = extracted
        .urls
        .into_iter()
        .map(|found| {
            let start = found.offset;
            let end = start + found.url.len();
            ExtractedUrl {
                url: found.url,
                start,
                end,
            }
        })
        .collect();

    Ok(ExtractionResult {
        entities,
        urls,
        // Saturate instead of silently truncating the u128 millisecond count.
        elapsed_ms: u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
    })
}
/// URL-only fallback used when the crate is built without the `ner-legacy`
/// feature.
///
/// `_paths` and `_gliner_variant` are accepted for signature compatibility and
/// ignored. The result carries no entities; `elapsed_ms` is the time spent in
/// `extract_urls`. This build of the function never returns an error.
#[cfg(not(feature = "ner-legacy"))]
pub fn extract_graph_auto(
    body: &str,
    _paths: &crate::paths::AppPaths,
    _gliner_variant: GlinerVariant,
) -> Result<ExtractionResult, crate::errors::AppError> {
    let timer = std::time::Instant::now();
    let found = extract_urls(body);
    let elapsed_ms = timer.elapsed().as_millis() as u64;

    Ok(ExtractionResult {
        urls: found,
        entities: Vec::new(),
        elapsed_ms,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Projects extraction results down to the URL text, in order of appearance.
    fn url_strings(found: &[ExtractedUrl]) -> Vec<&str> {
        found.iter().map(|hit| hit.url.as_str()).collect()
    }

    #[test]
    fn extract_urls_finds_http_and_https() {
        let body = "see https://example.com/foo and http://bar.baz/qux end";
        let urls = extract_urls(body);
        assert_eq!(urls.len(), 2, "got {urls:?} for body {body:?}");
        assert_eq!(
            url_strings(&urls),
            ["https://example.com/foo", "http://bar.baz/qux"]
        );
    }

    #[test]
    fn extract_urls_handles_trailing_punctuation() {
        let found = extract_urls("see https://example.com/foo).");
        assert_eq!(url_strings(&found), ["https://example.com/foo"]);
    }

    #[test]
    fn extract_urls_empty_body() {
        assert_eq!(extract_urls("").len(), 0);
    }

    #[test]
    fn gliner_variant_size_strings() {
        let expected = [
            (GlinerVariant::Fp32, "1.1 GB"),
            (GlinerVariant::Int8, "349 MB"),
        ];
        for (variant, label) in expected {
            assert_eq!(variant.display_size(), label);
        }
    }

    #[test]
    fn regex_extractor_returns_only_urls() {
        let ExtractionResult { entities, urls, .. } =
            RegexExtractor.extract("see https://example.com").unwrap();
        assert!(entities.is_empty());
        assert_eq!(urls.len(), 1);
    }
}