use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use chrono::{DateTime, Utc};
use super::types::*;
/// A pluggable source of content for a URL.
///
/// Implementors must be `Send + Sync`. `IntakePipeline` stores them as boxed
/// trait objects and uses the first one whose `can_handle` returns `true`.
pub trait ContentExtractor: Send + Sync {
/// Reports whether this extractor is willing to process `url`.
fn can_handle(&self, url: &str) -> bool;
/// Produces the extracted content for `url`, or a `KcError` on failure.
fn extract(&self, url: &str) -> Result<ExtractedContent, KcError>;
}
/// The result of running a `ContentExtractor` against a single URL.
#[derive(Debug, Clone)]
pub struct ExtractedContent {
/// Title chosen by the extractor (page title, video title, repository name, ...).
pub title: String,
/// Author, when the extractor can determine one; the extractors in this file
/// only fill it for YouTube (uploader) and GitHub (repository owner).
pub author: Option<String>,
/// Main textual body.
pub content: String,
/// Publication time, when known; in this file only the yt-dlp extractor sets it.
pub published: Option<DateTime<Utc>>,
/// The URL the content was extracted from.
pub url: String,
/// Source platform label: a host name derived from the URL, or a fixed
/// value such as `"youtube.com"` / `"github.com"`.
pub platform: String,
}
/// Summary returned by `IntakePipeline::ingest` for one ingested URL.
#[derive(Debug, Clone)]
pub struct IntakeReport {
/// The URL that was passed to `ingest`.
pub url: String,
/// Title reported by the extractor.
pub title: String,
/// Candidate built from the extracted content, ready for import.
pub memory_candidate: MemoryCandidate,
/// Length in bytes of the extracted content body (not of the rendered candidate).
pub content_length: usize,
/// Platform label reported by the extractor.
pub platform: String,
}
/// Derives a 32-character lowercase hexadecimal digest from `url`.
///
/// The digest is the concatenation of two 64-bit `DefaultHasher` results, each
/// rendered as 16 zero-padded hex digits: the first hashes the URL alone, the
/// second hashes the literal `"salt"` followed by the URL.
///
/// NOTE(review): the standard library does not promise that `DefaultHasher`
/// produces the same values across Rust releases, so digests persisted by one
/// build may not match those computed by another.
fn url_hash(url: &str) -> String {
    use std::collections::hash_map::DefaultHasher;

    let plain = {
        let mut hasher = DefaultHasher::new();
        url.hash(&mut hasher);
        hasher.finish()
    };
    let salted = {
        let mut hasher = DefaultHasher::new();
        "salt".hash(&mut hasher);
        url.hash(&mut hasher);
        hasher.finish()
    };

    format!("{:016x}{:016x}", plain, salted)
}
/// Returns the host portion of `url`, without scheme, userinfo, port, path,
/// query string or fragment.
///
/// Only the `http://` and `https://` prefixes are recognised as schemes; any
/// other input is treated as if it started directly with the authority.
/// Bracketed IPv6 literals are returned with their brackets (`[::1]`).
/// Input that contains none of the delimiters is returned unchanged.
fn extract_domain(url: &str) -> String {
    let without_scheme = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);

    // The authority ends at the first path, query or fragment delimiter, so
    // `example.com?x=1` and `example.com#top` must not leak into the host.
    let authority = without_scheme
        .split(|c| matches!(c, '/' | '?' | '#'))
        .next()
        .unwrap_or(without_scheme);

    // Drop `user:password@`; the host is whatever follows the last `@`.
    let host_port = authority.rsplit('@').next().unwrap_or(authority);

    // IPv6 literals contain colons themselves, so take everything up to and
    // including the closing bracket instead of splitting on `:`.
    if let Some(rest) = host_port.strip_prefix('[') {
        if let Some(close) = rest.find(']') {
            // `close` indexes into `rest`, which starts one byte into `host_port`.
            return host_port[..close + 2].to_owned();
        }
    }

    host_port
        .split(':')
        .next()
        .unwrap_or(host_port)
        .to_owned()
}
/// Extractor that fetches pages through the Jina Reader endpoint (`r.jina.ai`).
pub struct JinaExtractor {
    /// Optional API key; when present it is sent as a bearer token.
    api_key: Option<String>,
}

impl JinaExtractor {
    /// Builds an extractor, optionally authenticated with `api_key`.
    pub fn new(api_key: Option<String>) -> Self {
        JinaExtractor { api_key }
    }
}
impl ContentExtractor for JinaExtractor {
    /// Always returns `true`: the URL is simply forwarded to Jina.
    fn can_handle(&self, _url: &str) -> bool {
        true
    }

    /// Requests `https://r.jina.ai/<url>` as plain text and splits the
    /// response into a title and a body.
    ///
    /// # Errors
    ///
    /// Returns `KcError::ImportError` when the HTTP client cannot be built,
    /// the request fails, the status is not a success, or the body cannot be
    /// read.
    fn extract(&self, url: &str) -> Result<ExtractedContent, KcError> {
        let http = reqwest::blocking::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .map_err(|e| KcError::ImportError(format!("HTTP client error: {}", e)))?;

        let endpoint = format!("https://r.jina.ai/{}", url);
        let request = http.get(&endpoint);
        // Authenticate only when a key was configured.
        let request = match &self.api_key {
            Some(key) => request.header("Authorization", format!("Bearer {}", key)),
            None => request,
        };

        let response = request
            .header("Accept", "text/plain")
            .send()
            .map_err(|e| KcError::ImportError(format!("Jina request failed: {}", e)))?;

        let status = response.status();
        if !status.is_success() {
            return Err(KcError::ImportError(format!(
                "Jina returned status {}",
                status
            )));
        }

        let body = response
            .text()
            .map_err(|e| KcError::ImportError(format!("Failed to read Jina response: {}", e)))?;

        let (title, content) = parse_title_and_content(&body);
        Ok(ExtractedContent {
            title,
            author: None,
            content,
            published: None,
            url: url.to_owned(),
            platform: extract_domain(url),
        })
    }
}
/// Extractor that downloads a page directly and reduces its HTML to text.
pub struct GenericExtractor;

impl ContentExtractor for GenericExtractor {
    /// Always returns `true`; this extractor attempts any URL.
    fn can_handle(&self, _url: &str) -> bool {
        true
    }

    /// Downloads `url`, uses the `<title>` element (or the host name when
    /// there is none) as the title, and the tag-stripped page as the content.
    ///
    /// # Errors
    ///
    /// Returns `KcError::ImportError` when the HTTP client cannot be built,
    /// the request fails, the status is not a success, or the body cannot be
    /// read.
    fn extract(&self, url: &str) -> Result<ExtractedContent, KcError> {
        let http = reqwest::blocking::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .map_err(|e| KcError::ImportError(format!("HTTP client error: {}", e)))?;

        let response = http
            .get(url)
            .header("User-Agent", "engram-ai/1.0 (knowledge-compiler intake)")
            .send()
            .map_err(|e| KcError::ImportError(format!("HTTP request failed: {}", e)))?;

        let status = response.status();
        if !status.is_success() {
            return Err(KcError::ImportError(format!(
                "HTTP {} for {}",
                status, url
            )));
        }

        let html = response
            .text()
            .map_err(|e| KcError::ImportError(format!("Failed to read response: {}", e)))?;

        let platform = extract_domain(url);
        // Pages without a usable <title> are labelled with their host name.
        let title = match extract_html_title(&html) {
            Some(found) => found,
            None => platform.clone(),
        };

        Ok(ExtractedContent {
            title,
            author: None,
            content: strip_html_tags(&html),
            published: None,
            url: url.to_owned(),
            platform,
        })
    }
}
/// Returns the trimmed text of the first `<title>` element in `html`.
///
/// Tag matching is ASCII case-insensitive and tolerates attributes on the
/// opening tag (`<title lang="en">`). Returns `None` when there is no complete
/// `<title>…</title>` pair or when the title is empty after trimming.
fn extract_html_title(html: &str) -> Option<String> {
    // ASCII-only lowercasing keeps every byte offset identical to `html`.
    // Full Unicode lowercasing can change byte lengths, which would make the
    // offsets found below point at the wrong place (or inside a character)
    // when they are applied to the original string.
    let lower = html.to_ascii_lowercase();
    const OPEN: &str = "<title";

    let mut search_from = 0;
    let body_start = loop {
        let tag_start = search_from + lower[search_from..].find(OPEN)?;
        let after_name = tag_start + OPEN.len();
        match lower[after_name..].chars().next() {
            // Plain `<title>`.
            Some('>') => break after_name + 1,
            // `<title ...attributes>`: the body starts after the closing `>`.
            Some(c) if c.is_ascii_whitespace() => {
                break after_name + lower[after_name..].find('>')? + 1;
            }
            // Some other element that merely starts with "title"; keep looking.
            _ => search_from = after_name,
        }
    };

    let body_len = lower[body_start..].find("</title>")?;
    let title = html[body_start..body_start + body_len].trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_owned())
    }
}
/// Reduces `html` to plain text.
///
/// * Everything between `<` and `>` is removed; a stray `>` outside a tag is
///   dropped as well.
/// * The bodies of `<script>` and `<style>` elements are removed, since they
///   hold code rather than readable text. An unterminated element of either
///   kind swallows the rest of the input.
/// * Each remaining line is trimmed, runs of blank lines collapse to a single
///   blank line, and the overall result is trimmed.
///
/// Character entities such as `&amp;` are left untouched.
fn strip_html_tags(html: &str) -> String {
    /// Elements whose content is never human-readable text.
    const RAW_TEXT_ELEMENTS: [&str; 2] = ["script", "style"];
    /// Only the start of a tag is needed to recognise its name, so the tag
    /// buffer stays small even for very long (or unterminated) tags.
    const TAG_PREFIX_LIMIT: usize = 16;

    let mut text = String::with_capacity(html.len());
    let mut tag = String::new();
    let mut in_tag = false;
    // Name of the raw-text element currently being skipped, if any.
    let mut skipping: Option<&'static str> = None;

    for ch in html.chars() {
        match ch {
            '<' => {
                in_tag = true;
                tag.clear();
            }
            '>' if in_tag => {
                in_tag = false;
                let (is_closing, rest) = match tag.strip_prefix('/') {
                    Some(rest) => (true, rest),
                    None => (false, tag.as_str()),
                };
                let name_len = rest
                    .find(|c: char| !c.is_ascii_alphanumeric())
                    .unwrap_or(rest.len());
                let name = &rest[..name_len];
                skipping = match skipping {
                    // Only the matching close tag ends a skipped element, so
                    // `<`/`>` used as operators inside a script are harmless.
                    Some(elem) if is_closing && name.eq_ignore_ascii_case(elem) => None,
                    Some(elem) => Some(elem),
                    None if !is_closing => RAW_TEXT_ELEMENTS
                        .iter()
                        .copied()
                        .find(|elem| name.eq_ignore_ascii_case(elem)),
                    None => None,
                };
            }
            '>' => {}
            _ if in_tag => {
                if tag.len() < TAG_PREFIX_LIMIT {
                    tag.push(ch);
                }
            }
            _ if skipping.is_none() => text.push(ch),
            _ => {}
        }
    }

    let mut cleaned = String::with_capacity(text.len());
    let mut prev_blank = false;
    for line in text.lines().map(str::trim) {
        if line.is_empty() {
            if !prev_blank {
                cleaned.push('\n');
                prev_blank = true;
            }
        } else {
            cleaned.push_str(line);
            cleaned.push('\n');
            prev_blank = false;
        }
    }
    cleaned.trim().to_owned()
}
/// Splits extracted text into `(title, content)`.
///
/// The first line of the trimmed input supplies the title:
///
/// * a Markdown heading (`# Title`) has its leading `#` markers removed;
/// * a `Title: …` line has that label removed
///   (NOTE(review): Jina Reader's plain-text output is expected to start with
///   such a line — confirm against a live response);
/// * any other first line is used as is.
///
/// The content is everything after the first line, trimmed. When the input is
/// a single line, that line is also returned as the content. When no usable
/// title remains (empty input, or a bare `#` / `Title:`), the title is
/// `"Untitled"` and the whole trimmed input is returned as the content.
fn parse_title_and_content(text: &str) -> (String, String) {
    let trimmed = text.trim();
    let (first_line, body) = match trimmed.split_once('\n') {
        Some((first, rest)) => (first.trim(), rest.trim()),
        // Single-line input doubles as its own body.
        None => (trimmed, trimmed),
    };

    let title = if first_line.starts_with('#') {
        first_line.trim_start_matches('#').trim()
    } else if let Some(value) = first_line.strip_prefix("Title:") {
        value.trim()
    } else {
        first_line
    };

    if title.is_empty() {
        ("Untitled".to_owned(), trimmed.to_owned())
    } else {
        (title.to_owned(), body.to_owned())
    }
}
/// Extractor for YouTube videos, backed by the external `yt-dlp` program.
pub struct YtDlpExtractor;

impl YtDlpExtractor {
    /// Parses a `YYYYMMDD` date into midnight UTC of that day.
    ///
    /// Returns `None` unless the input is exactly eight ASCII digits that
    /// form a valid calendar date.
    fn parse_upload_date(date_str: &str) -> Option<DateTime<Utc>> {
        // Validate before slicing: the byte ranges below would panic on a
        // multi-byte character straddling one of the boundaries.
        if date_str.len() != 8 || !date_str.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        let year: i32 = date_str[0..4].parse().ok()?;
        let month: u32 = date_str[4..6].parse().ok()?;
        let day: u32 = date_str[6..8].parse().ok()?;
        chrono::NaiveDate::from_ymd_opt(year, month, day)
            .and_then(|d| d.and_hms_opt(0, 0, 0))
            .map(|dt| DateTime::<Utc>::from_naive_utc_and_offset(dt, Utc))
    }

    /// Flattens a WebVTT/SRT subtitle file into a single line of text.
    ///
    /// Blank lines, the `WEBVTT` / `Kind:` / `Language:` headers, cue timing
    /// lines (anything containing `-->`) and lines that are only a number are
    /// skipped. Inline `<...>` markup is removed, a line identical to the
    /// previously kept line is dropped, and the kept lines are joined with
    /// single spaces.
    fn clean_subtitle_text(text: &str) -> String {
        let mut lines: Vec<String> = Vec::new();
        for line in text.lines() {
            let trimmed = line.trim();
            let is_noise = trimmed.is_empty()
                || trimmed.starts_with("WEBVTT")
                || trimmed.starts_with("Kind:")
                || trimmed.starts_with("Language:")
                || trimmed.contains("-->")
                || trimmed.parse::<u32>().is_ok();
            if is_noise {
                continue;
            }

            // Strip inline markup; the tag state deliberately resets per line.
            let mut plain = String::with_capacity(trimmed.len());
            let mut in_tag = false;
            for ch in trimmed.chars() {
                match ch {
                    '<' => in_tag = true,
                    '>' => in_tag = false,
                    _ if !in_tag => plain.push(ch),
                    _ => {}
                }
            }

            let plain = plain.trim();
            // Consecutive cues often repeat the same text; keep one copy.
            let repeats_previous = lines.last().map(String::as_str) == Some(plain);
            if !plain.is_empty() && !repeats_previous {
                lines.push(plain.to_owned());
            }
        }
        lines.join(" ")
    }
}
impl ContentExtractor for YtDlpExtractor {
    /// Accepts YouTube watch, short-link (`youtu.be`) and Shorts URLs.
    fn can_handle(&self, url: &str) -> bool {
        url.contains("youtube.com/watch")
            || url.contains("youtu.be/")
            || url.contains("youtube.com/shorts/")
    }

    /// Collects video metadata and, when available, subtitles via `yt-dlp`.
    ///
    /// The content is the cleaned subtitle text of the first non-empty
    /// subtitle file found (English before Chinese, VTT before SRT); when no
    /// subtitles can be obtained the video description is used instead.
    /// Subtitle download is best effort and never fails the extraction.
    ///
    /// # Errors
    ///
    /// Returns `KcError::ImportError` when `yt-dlp` is missing or cannot be
    /// run, when the metadata command exits unsuccessfully, or when its output
    /// is not valid JSON.
    fn extract(&self, url: &str) -> Result<ExtractedContent, KcError> {
        use std::process::Command;

        /// Subtitle file suffixes, in order of preference.
        const SUBTITLE_SUFFIXES: [&str; 4] = ["en.vtt", "en.srt", "zh.vtt", "zh.srt"];

        // `--` ends option parsing so a URL that begins with `-` can never be
        // interpreted as a yt-dlp option.
        let meta_output = Command::new("yt-dlp")
            .args(["--dump-json", "--no-download", "--", url])
            .output()
            .map_err(|e| {
                if e.kind() == std::io::ErrorKind::NotFound {
                    KcError::ImportError("yt-dlp not installed".to_owned())
                } else {
                    KcError::ImportError(format!("yt-dlp execution error: {}", e))
                }
            })?;
        if !meta_output.status.success() {
            let stderr = String::from_utf8_lossy(&meta_output.stderr);
            return Err(KcError::ImportError(format!(
                "yt-dlp metadata fetch failed: {}",
                stderr.trim()
            )));
        }

        let meta_json: serde_json::Value =
            serde_json::from_slice(&meta_output.stdout).map_err(|e| {
                KcError::ImportError(format!("Failed to parse yt-dlp JSON: {}", e))
            })?;
        let text_field = |key: &str| meta_json.get(key).and_then(|v| v.as_str());

        let title = text_field("title").unwrap_or("Untitled Video").to_owned();
        let uploader = text_field("uploader").map(str::to_owned);
        let description = text_field("description").unwrap_or("").to_owned();
        let published = text_field("upload_date").and_then(Self::parse_upload_date);

        // The id becomes part of a file name, so keep only characters that
        // cannot alter the path or the `-o` output template.
        let safe_id: String = text_field("id")
            .unwrap_or("unknown")
            .chars()
            .filter(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '_'))
            .collect();
        // Use the platform temp directory and include the process id so that
        // separate processes do not overwrite each other's files.
        let tmp_prefix = std::env::temp_dir().join(format!(
            "engram-ytdlp-{}-{}",
            std::process::id(),
            safe_id
        ));

        let sub_result = Command::new("yt-dlp")
            .args([
                "--write-sub",
                "--write-auto-sub",
                "--sub-lang",
                "en,zh",
                "--skip-download",
                "-o",
            ])
            .arg(&tmp_prefix)
            .arg("--")
            .arg(url)
            .output();
        let subtitles_fetched = matches!(&sub_result, Ok(out) if out.status.success());

        // One pass both reads the preferred subtitle file and removes every
        // file yt-dlp may have written, whether or not it was used.
        let mut subtitle_content: Option<String> = None;
        for suffix in SUBTITLE_SUFFIXES {
            let mut sub_path = tmp_prefix.clone().into_os_string();
            sub_path.push(".");
            sub_path.push(suffix);

            if subtitles_fetched && subtitle_content.is_none() {
                if let Ok(sub_text) = std::fs::read_to_string(&sub_path) {
                    let cleaned = Self::clean_subtitle_text(&sub_text);
                    if !cleaned.is_empty() {
                        subtitle_content = Some(cleaned);
                    }
                }
            }
            // Best-effort cleanup; the file may simply not exist.
            let _ = std::fs::remove_file(&sub_path);
        }

        Ok(ExtractedContent {
            title,
            author: uploader,
            content: subtitle_content.unwrap_or(description),
            published,
            url: url.to_owned(),
            platform: "youtube.com".to_owned(),
        })
    }
}
/// Extractor for GitHub repositories, backed by the GitHub REST API.
pub struct GithubExtractor {
    /// Optional API token; when present it is sent as a bearer token.
    token: Option<String>,
}

impl GithubExtractor {
    /// Builds an extractor, optionally authenticated with `token`.
    pub fn new(token: Option<String>) -> Self {
        Self { token }
    }

    /// Extracts `(owner, repo)` from a `github.com` / `www.github.com` URL.
    ///
    /// Anything after the second path segment is ignored, as are a query
    /// string, a fragment, and a trailing `.git` on the repository name.
    /// Returns `None` for other hosts or when either segment is missing.
    fn parse_owner_repo(url: &str) -> Option<(String, String)> {
        let without_scheme = url
            .strip_prefix("https://")
            .or_else(|| url.strip_prefix("http://"))
            .unwrap_or(url);
        let without_host = without_scheme
            .strip_prefix("github.com/")
            .or_else(|| without_scheme.strip_prefix("www.github.com/"))?;

        // Only the path identifies the repository; `?tab=readme` or `#readme`
        // must not end up inside the repository name.
        let path = without_host
            .split(|c| matches!(c, '?' | '#'))
            .next()
            .unwrap_or(without_host);

        let mut segments = path.split('/');
        let owner = segments.next().filter(|s| !s.is_empty())?;
        let repo = segments.next().filter(|s| !s.is_empty())?;
        // Clone URLs end in `.git`, which is not part of the repository name.
        let repo = repo
            .strip_suffix(".git")
            .filter(|name| !name.is_empty())
            .unwrap_or(repo);

        Some((owner.to_owned(), repo.to_owned()))
    }

    /// Creates a blocking HTTP client with a 30-second timeout.
    fn build_client(&self) -> Result<reqwest::blocking::Client, KcError> {
        reqwest::blocking::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .map_err(|e| KcError::ImportError(format!("HTTP client error: {}", e)))
    }

    /// Adds the `Authorization` header when a token is configured.
    fn add_auth(
        &self,
        req: reqwest::blocking::RequestBuilder,
    ) -> reqwest::blocking::RequestBuilder {
        match &self.token {
            Some(token) => req.header("Authorization", format!("Bearer {}", token)),
            None => req,
        }
    }

    /// Decodes standard-alphabet base64 text into a UTF-8 string.
    ///
    /// Whitespace anywhere in the input is ignored. `=` padding is honoured in
    /// the third and fourth position of each four-character group, and a
    /// single dangling trailing character is ignored.
    ///
    /// # Errors
    ///
    /// Returns `KcError::ImportError` for characters outside the base64
    /// alphabet or when the decoded bytes are not valid UTF-8.
    fn decode_base64(encoded: &str) -> Result<String, KcError> {
        /// Maps one base64 symbol to its 6-bit value.
        fn sextet(ch: char) -> Result<u8, KcError> {
            match ch {
                'A'..='Z' => Ok(ch as u8 - b'A'),
                'a'..='z' => Ok(ch as u8 - b'a' + 26),
                '0'..='9' => Ok(ch as u8 - b'0' + 52),
                '+' => Ok(62),
                '/' => Ok(63),
                _ => Err(KcError::ImportError(format!(
                    "Invalid base64 character: {}",
                    ch
                ))),
            }
        }

        let symbols: Vec<char> = encoded.chars().filter(|c| !c.is_whitespace()).collect();
        let mut bytes = Vec::with_capacity(symbols.len() * 3 / 4);

        for group in symbols.chunks(4) {
            // Fewer than two symbols cannot encode a byte; only the final
            // group can be this short.
            if group.len() < 2 {
                break;
            }
            let a = sextet(group[0])?;
            let b = sextet(group[1])?;
            bytes.push((a << 2) | (b >> 4));

            let c = match group.get(2) {
                Some(&ch) if ch != '=' => sextet(ch)?,
                _ => continue,
            };
            // Bits shifted past the top of the u8 are discarded on purpose.
            bytes.push((b << 4) | (c >> 2));

            if let Some(&ch) = group.get(3) {
                if ch != '=' {
                    bytes.push((c << 6) | sextet(ch)?);
                }
            }
        }

        String::from_utf8(bytes)
            .map_err(|e| KcError::ImportError(format!("Invalid UTF-8 in decoded content: {}", e)))
    }
}
impl ContentExtractor for GithubExtractor {
    /// Accepts exactly the URLs from which an owner and repository can be
    /// parsed (`github.com` or `www.github.com` with at least two path
    /// segments).
    ///
    /// Deciding with the same parser that `extract` uses means a URL is never
    /// claimed here only to fail parsing later, so unsupported URLs (gists,
    /// bare user pages, other hosts that merely mention `github.com/`) can
    /// fall through to the next extractor.
    fn can_handle(&self, url: &str) -> bool {
        Self::parse_owner_repo(url).is_some()
    }

    /// Fetches repository metadata and the README through the GitHub API.
    ///
    /// The title is `full_name` (plus `: description` when there is one); the
    /// content lists description, language, star count and topics, followed
    /// by the decoded README when it could be fetched. README retrieval is
    /// best effort and never fails the extraction.
    ///
    /// # Errors
    ///
    /// Returns `KcError::ImportError` when the URL cannot be parsed, the HTTP
    /// client cannot be built, the repository request fails or returns a
    /// status of 400 or above, or the response is not valid JSON.
    fn extract(&self, url: &str) -> Result<ExtractedContent, KcError> {
        let (owner, repo) = Self::parse_owner_repo(url).ok_or_else(|| {
            KcError::ImportError(format!(
                "Could not parse owner/repo from GitHub URL: {}",
                url
            ))
        })?;
        let client = self.build_client()?;

        // Every API call shares the same headers and optional authentication.
        let api_get = |endpoint: &str| {
            self.add_auth(
                client
                    .get(endpoint)
                    .header("User-Agent", "engram-ai/1.0 (knowledge-compiler intake)")
                    .header("Accept", "application/vnd.github.v3+json"),
            )
        };

        let repo_resp = api_get(&format!(
            "https://api.github.com/repos/{}/{}",
            owner, repo
        ))
        .send()
        .map_err(|e| KcError::ImportError(format!("GitHub API request failed: {}", e)))?;

        match repo_resp.status().as_u16() {
            404 => {
                return Err(KcError::ImportError(format!(
                    "GitHub repository not found: {}/{}",
                    owner, repo
                )));
            }
            // GitHub reports exhausted rate limits with either status.
            403 | 429 => {
                return Err(KcError::ImportError(
                    "GitHub API rate limit exceeded. Provide a token for higher limits."
                        .to_owned(),
                ));
            }
            s if s >= 400 => {
                return Err(KcError::ImportError(format!(
                    "GitHub API returned status {}",
                    s
                )));
            }
            _ => {}
        }

        let repo_json: serde_json::Value = repo_resp.json().map_err(|e| {
            KcError::ImportError(format!("Failed to parse GitHub API response: {}", e))
        })?;
        let text_field = |key: &str| repo_json.get(key).and_then(|v| v.as_str());

        let full_name = text_field("full_name")
            .map(str::to_owned)
            .unwrap_or_else(|| format!("{}/{}", owner, repo));
        let description = text_field("description").unwrap_or("").to_owned();
        let language = text_field("language").unwrap_or("unknown").to_owned();
        let stars = repo_json
            .get("stargazers_count")
            .and_then(|v| v.as_u64())
            .unwrap_or(0);
        let topics: Vec<String> = repo_json
            .get("topics")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|v| v.as_str().map(str::to_owned))
                    .collect()
            })
            .unwrap_or_default();

        // Any failure along the way simply yields an empty README.
        let readme_content = api_get(&format!(
            "https://api.github.com/repos/{}/{}/readme",
            owner, repo
        ))
        .send()
        .ok()
        .filter(|resp| resp.status().is_success())
        .and_then(|resp| resp.json::<serde_json::Value>().ok())
        .and_then(|json| {
            json.get("content")
                .and_then(|v| v.as_str())
                .and_then(|encoded| Self::decode_base64(encoded).ok())
        })
        .unwrap_or_default();

        let title = if description.is_empty() {
            full_name.clone()
        } else {
            format!("{}: {}", full_name, description)
        };

        let mut content_parts = Vec::new();
        if !description.is_empty() {
            content_parts.push(format!("**Description:** {}", description));
        }
        content_parts.push(format!("**Language:** {} | **Stars:** {}", language, stars));
        if !topics.is_empty() {
            content_parts.push(format!("**Topics:** {}", topics.join(", ")));
        }
        if !readme_content.is_empty() {
            // The empty entry becomes a blank line before the README.
            content_parts.push(String::new());
            content_parts.push(readme_content);
        }

        Ok(ExtractedContent {
            title,
            author: Some(owner),
            content: content_parts.join("\n"),
            published: None,
            url: url.to_owned(),
            platform: "github.com".to_owned(),
        })
    }
}
pub struct IntakePipeline {
extractors: Vec<Box<dyn ContentExtractor>>,
}
impl Default for IntakePipeline {
fn default() -> Self {
Self::new()
}
}
impl IntakePipeline {
pub fn new() -> Self {
Self {
extractors: Vec::new(),
}
}
pub fn add_extractor(&mut self, extractor: Box<dyn ContentExtractor>) {
self.extractors.push(extractor);
}
pub fn extractor_count(&self) -> usize {
self.extractors.len()
}
pub fn ingest(&self, url: &str) -> Result<IntakeReport, KcError> {
let extractor = self
.extractors
.iter()
.find(|e| e.can_handle(url))
.ok_or_else(|| {
KcError::ImportError(format!(
"No extractor can handle URL: {}",
url
))
})?;
let content = extractor.extract(url)?;
let content_length = content.content.len();
let candidate = MemoryCandidate {
content: format!(
"# {}\n\nSource: {}\nAuthor: {}\n\n{}",
content.title,
content.url,
content.author.as_deref().unwrap_or("unknown"),
content.content,
),
source: url.to_owned(),
content_hash: url_hash(&content.url),
metadata: HashMap::from([
("source_url".to_owned(), content.url.clone()),
("platform".to_owned(), content.platform.clone()),
(
"intake_timestamp".to_owned(),
Utc::now().to_rfc3339(),
),
]),
};
Ok(IntakeReport {
url: url.to_owned(),
title: content.title,
memory_candidate: candidate,
content_length,
platform: content.platform,
})
}
pub fn ingest_and_import(
&self,
url: &str,
_import_pipeline: &super::import::ImportPipeline,
) -> Result<IntakeReport, KcError> {
self.ingest(url)
}
}
// Unit tests for the intake module. None of them perform network or process
// I/O: pipeline behaviour is exercised through `MockExtractor`, and the real
// extractors are only tested through their pure helper functions.
#[cfg(test)]
mod tests {
use super::*;
// Configurable stand-in extractor: `handles` is what `can_handle` returns,
// `fail` makes `extract` return an error, and the remaining fields are
// copied into the produced `ExtractedContent`.
struct MockExtractor {
handles: bool,
title: String,
content: String,
author: Option<String>,
platform: String,
fail: bool,
}
impl MockExtractor {
// Succeeding mock with no author and platform "mock".
fn new(handles: bool, title: &str, content: &str) -> Self {
Self {
handles,
title: title.to_owned(),
content: content.to_owned(),
author: None,
platform: "mock".to_owned(),
fail: false,
}
}
// Mock whose `extract` always returns an error.
fn failing(handles: bool) -> Self {
Self {
handles,
title: String::new(),
content: String::new(),
author: None,
platform: "mock".to_owned(),
fail: true,
}
}
// Builder-style setter for the author.
fn with_author(mut self, author: &str) -> Self {
self.author = Some(author.to_owned());
self
}
// Builder-style setter for the platform label.
fn with_platform(mut self, platform: &str) -> Self {
self.platform = platform.to_owned();
self
}
}
impl ContentExtractor for MockExtractor {
fn can_handle(&self, _url: &str) -> bool {
self.handles
}
fn extract(&self, url: &str) -> Result<ExtractedContent, KcError> {
if self.fail {
return Err(KcError::ImportError("mock extraction failed".to_owned()));
}
Ok(ExtractedContent {
title: self.title.clone(),
author: self.author.clone(),
content: self.content.clone(),
published: None,
url: url.to_owned(),
platform: self.platform.clone(),
})
}
}
// A new pipeline starts without extractors.
#[test]
fn test_intake_pipeline_new() {
let pipeline = IntakePipeline::new();
assert_eq!(pipeline.extractor_count(), 0);
assert!(pipeline.extractors.is_empty());
}
// Each `add_extractor` call increases the count by one.
#[test]
fn test_add_extractor() {
let mut pipeline = IntakePipeline::new();
assert_eq!(pipeline.extractor_count(), 0);
pipeline.add_extractor(Box::new(MockExtractor::new(true, "T1", "C1")));
assert_eq!(pipeline.extractor_count(), 1);
pipeline.add_extractor(Box::new(MockExtractor::new(false, "T2", "C2")));
assert_eq!(pipeline.extractor_count(), 2);
pipeline.add_extractor(Box::new(MockExtractor::new(true, "T3", "C3")));
assert_eq!(pipeline.extractor_count(), 3);
}
// The Jina extractor accepts any input string.
#[test]
fn test_jina_can_handle() {
let extractor = JinaExtractor::new(None);
assert!(extractor.can_handle("https://example.com"));
assert!(extractor.can_handle("https://github.com/user/repo"));
assert!(extractor.can_handle("https://www.youtube.com/watch?v=abc123"));
assert!(extractor.can_handle("http://anything.goes/here"));
assert!(extractor.can_handle("not-even-a-url"));
}
// The generic extractor accepts any input string.
#[test]
fn test_generic_can_handle() {
let extractor = GenericExtractor;
assert!(extractor.can_handle("https://example.com"));
assert!(extractor.can_handle("https://github.com/user/repo"));
assert!(extractor.can_handle("https://www.youtube.com/watch?v=abc123"));
assert!(extractor.can_handle("http://anything.goes/here"));
assert!(extractor.can_handle("not-even-a-url"));
}
// When several extractors accept a URL, the earliest registered one wins.
#[test]
fn test_ingest_selects_first_matching() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(MockExtractor::new(false, "Skip", "skip")));
pipeline.add_extractor(Box::new(
MockExtractor::new(true, "Second", "second content")
.with_platform("second-platform"),
));
pipeline.add_extractor(Box::new(
MockExtractor::new(true, "Third", "third content")
.with_platform("third-platform"),
));
let report = pipeline.ingest("https://example.com/article").unwrap();
assert_eq!(report.title, "Second");
assert_eq!(report.platform, "second-platform");
assert!(report.memory_candidate.content.contains("second content"));
}
// An empty pipeline reports that no extractor is available.
#[test]
fn test_ingest_no_extractor() {
let pipeline = IntakePipeline::new();
let result = pipeline.ingest("https://example.com");
assert!(result.is_err());
let err = result.unwrap_err();
let msg = format!("{}", err);
assert!(msg.contains("No extractor"), "Error was: {}", msg);
}
// Ingestion fails when every registered extractor declines the URL.
#[test]
fn test_ingest_no_matching_extractor() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(MockExtractor::new(false, "A", "a")));
pipeline.add_extractor(Box::new(MockExtractor::new(false, "B", "b")));
let result = pipeline.ingest("https://example.com");
assert!(result.is_err());
}
// Checks the rendered candidate text, its source, metadata keys and hash length.
#[test]
fn test_extracted_content_to_candidate() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(
MockExtractor::new(true, "Rust Guide", "Learn Rust programming.")
.with_author("Alice")
.with_platform("blog.example.com"),
));
let report = pipeline.ingest("https://blog.example.com/rust-guide").unwrap();
let candidate = &report.memory_candidate;
assert!(candidate.content.starts_with("# Rust Guide"));
assert!(candidate.content.contains("Source: https://blog.example.com/rust-guide"));
assert!(candidate.content.contains("Author: Alice"));
assert!(candidate.content.contains("Learn Rust programming."));
assert_eq!(candidate.source, "https://blog.example.com/rust-guide");
assert_eq!(
candidate.metadata.get("source_url").unwrap(),
"https://blog.example.com/rust-guide"
);
assert_eq!(
candidate.metadata.get("platform").unwrap(),
"blog.example.com"
);
assert!(candidate.metadata.contains_key("intake_timestamp"));
assert!(!candidate.content_hash.is_empty());
assert_eq!(candidate.content_hash.len(), 32); }
// A missing author is rendered as "unknown".
#[test]
fn test_extracted_content_unknown_author() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(
MockExtractor::new(true, "Title", "Body"),
));
let report = pipeline.ingest("https://example.com/page").unwrap();
assert!(report.memory_candidate.content.contains("Author: unknown"));
}
// Report fields mirror the input URL and the extractor output;
// `content_length` is the byte length of the extracted body.
#[test]
fn test_intake_report_fields() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(
MockExtractor::new(true, "My Article", "Some body text here")
.with_platform("example.com"),
));
let report = pipeline.ingest("https://example.com/my-article").unwrap();
assert_eq!(report.url, "https://example.com/my-article");
assert_eq!(report.title, "My Article");
assert_eq!(report.content_length, "Some body text here".len());
assert_eq!(report.platform, "example.com");
}
// Equal URLs hash equally; different URLs hash differently.
#[test]
fn test_content_hash_dedup() {
let hash1 = url_hash("https://example.com/article");
let hash2 = url_hash("https://example.com/article");
assert_eq!(hash1, hash2);
let hash3 = url_hash("https://example.com/other-article");
assert_ne!(hash1, hash3);
}
// The hash is 32 hexadecimal characters.
#[test]
fn test_url_hash_deterministic() {
let hash = url_hash("https://example.com/page");
assert_eq!(hash.len(), 32);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
}
// Scheme, port and path are stripped; scheme-less input is supported.
#[test]
fn test_extract_domain() {
assert_eq!(extract_domain("https://example.com/path"), "example.com");
assert_eq!(extract_domain("http://sub.example.com/a/b"), "sub.example.com");
assert_eq!(extract_domain("https://example.com:8080/path"), "example.com");
assert_eq!(extract_domain("https://example.com"), "example.com");
assert_eq!(extract_domain("no-scheme.com/path"), "no-scheme.com");
}
// Covers a Markdown heading, a plain first line, and single-line input.
#[test]
fn test_parse_title_and_content() {
let (title, content) = parse_title_and_content("# My Title\n\nBody text here.");
assert_eq!(title, "My Title");
assert_eq!(content, "Body text here.");
let (title, content) = parse_title_and_content("First Line\nSecond line.");
assert_eq!(title, "First Line");
assert_eq!(content, "Second line.");
let (title, content) = parse_title_and_content("Only line");
assert_eq!(title, "Only line");
assert_eq!(content, "Only line");
}
// Covers a present title, a missing title, and an empty title.
#[test]
fn test_extract_html_title() {
let html = "<html><head><title>Page Title</title></head><body>Hi</body></html>";
assert_eq!(extract_html_title(html), Some("Page Title".to_owned()));
let no_title = "<html><body>Hi</body></html>";
assert_eq!(extract_html_title(no_title), None);
let empty_title = "<html><title></title></html>";
assert_eq!(extract_html_title(empty_title), None);
}
// Text survives tag stripping and no angle brackets remain.
#[test]
fn test_strip_html_tags() {
let html = "<p>Hello <b>world</b></p><br/><p>Second paragraph</p>";
let text = strip_html_tags(html);
assert!(text.contains("Hello world"));
assert!(text.contains("Second paragraph"));
assert!(!text.contains('<'));
assert!(!text.contains('>'));
}
// Errors from the selected extractor are passed through by `ingest`.
#[test]
fn test_ingest_extractor_failure() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(MockExtractor::failing(true)));
let result = pipeline.ingest("https://example.com");
assert!(result.is_err());
let msg = format!("{}", result.unwrap_err());
assert!(msg.contains("mock extraction failed"));
}
// The candidate hash depends on the URL: same URL, same hash.
#[test]
fn test_candidate_hash_uses_url() {
let mut pipeline = IntakePipeline::new();
pipeline.add_extractor(Box::new(MockExtractor::new(true, "T", "C")));
let r1 = pipeline.ingest("https://example.com/same").unwrap();
let r2 = pipeline.ingest("https://example.com/same").unwrap();
assert_eq!(r1.memory_candidate.content_hash, r2.memory_candidate.content_hash);
let r3 = pipeline.ingest("https://example.com/different").unwrap();
assert_ne!(r1.memory_candidate.content_hash, r3.memory_candidate.content_hash);
}
// Watch, short-link and Shorts URLs are accepted.
#[test]
fn test_ytdlp_can_handle() {
let extractor = YtDlpExtractor;
assert!(extractor.can_handle("https://www.youtube.com/watch?v=dQw4w9WgXcQ"));
assert!(extractor.can_handle("https://youtube.com/watch?v=abc123"));
assert!(extractor.can_handle("https://youtu.be/dQw4w9WgXcQ"));
assert!(extractor.can_handle("https://m.youtube.com/watch?v=abc123"));
assert!(extractor.can_handle("https://www.youtube.com/shorts/abc123"));
assert!(extractor.can_handle("https://youtube.com/shorts/xyz"));
}
// Other sites and non-video YouTube pages are rejected.
#[test]
fn test_ytdlp_rejects_non_youtube() {
let extractor = YtDlpExtractor;
assert!(!extractor.can_handle("https://example.com"));
assert!(!extractor.can_handle("https://vimeo.com/12345"));
assert!(!extractor.can_handle("https://github.com/user/repo"));
assert!(!extractor.can_handle("https://youtube.com/channel/abc"));
assert!(!extractor.can_handle("https://www.youtube.com/"));
assert!(!extractor.can_handle("not-a-url"));
}
// A valid YYYYMMDD date parses; wrong length, non-digits and month 13 do not.
#[test]
fn test_ytdlp_parse_upload_date() {
let dt = YtDlpExtractor::parse_upload_date("20240115");
assert!(dt.is_some());
let dt = dt.unwrap();
assert_eq!(dt.format("%Y-%m-%d").to_string(), "2024-01-15");
assert!(YtDlpExtractor::parse_upload_date("2024011").is_none());
assert!(YtDlpExtractor::parse_upload_date("").is_none());
assert!(YtDlpExtractor::parse_upload_date("abcdefgh").is_none());
assert!(YtDlpExtractor::parse_upload_date("20241301").is_none()); }
// Headers and cue timings are removed and a repeated caption appears once.
#[test]
fn test_ytdlp_clean_subtitle_text() {
let vtt = "WEBVTT\nKind: captions\nLanguage: en\n\n\
00:00:01.000 --> 00:00:03.000\n\
Hello world\n\n\
00:00:03.000 --> 00:00:05.000\n\
Hello world\n\n\
00:00:05.000 --> 00:00:07.000\n\
This is a test\n";
let cleaned = YtDlpExtractor::clean_subtitle_text(vtt);
assert!(cleaned.contains("Hello world"));
assert!(cleaned.contains("This is a test"));
assert_eq!(
cleaned.matches("Hello world").count(),
1,
"cleaned: {}",
cleaned
);
assert!(!cleaned.contains("-->"));
assert!(!cleaned.contains("WEBVTT"));
}
// Repository URLs are accepted, including deep links, http and www.
#[test]
fn test_github_can_handle() {
let extractor = GithubExtractor::new(None);
assert!(extractor.can_handle("https://github.com/user/repo"));
assert!(extractor.can_handle("https://github.com/user/repo/tree/main/src"));
assert!(extractor.can_handle("https://github.com/user/repo/blob/main/README.md"));
assert!(extractor.can_handle("http://github.com/user/repo"));
assert!(extractor.can_handle("https://www.github.com/user/repo"));
}
// URLs on other hosts are rejected.
#[test]
fn test_github_rejects_non_github() {
let extractor = GithubExtractor::new(None);
assert!(!extractor.can_handle("https://example.com"));
assert!(!extractor.can_handle("https://gitlab.com/user/repo"));
assert!(!extractor.can_handle("https://youtube.com/watch?v=abc"));
assert!(!extractor.can_handle("not-a-url"));
}
// Gist URLs are not handled by the repository extractor.
#[test]
fn test_github_rejects_gist() {
let extractor = GithubExtractor::new(None);
assert!(!extractor.can_handle("https://gist.github.com/user/abc123"));
assert!(!extractor.can_handle("https://gist.github.com/user/abc123/raw"));
}
// Owner and repo come from the first two path segments; incomplete or
// foreign URLs yield `None`.
#[test]
fn test_github_parse_owner_repo() {
let result = GithubExtractor::parse_owner_repo("https://github.com/rust-lang/rust");
assert_eq!(result, Some(("rust-lang".to_owned(), "rust".to_owned())));
let result =
GithubExtractor::parse_owner_repo("https://github.com/user/repo/tree/main/src");
assert_eq!(result, Some(("user".to_owned(), "repo".to_owned())));
let result = GithubExtractor::parse_owner_repo("https://github.com/user/repo/blob/main/README.md");
assert_eq!(result, Some(("user".to_owned(), "repo".to_owned())));
assert!(GithubExtractor::parse_owner_repo("https://github.com/").is_none());
assert!(GithubExtractor::parse_owner_repo("https://github.com/user").is_none());
assert!(GithubExtractor::parse_owner_repo("https://example.com/user/repo").is_none());
}
// Padded input, input with embedded newlines, and empty input all decode.
#[test]
fn test_github_decode_base64() {
let encoded = "SGVsbG8sIFdvcmxkIQ==";
let decoded = GithubExtractor::decode_base64(encoded).unwrap();
assert_eq!(decoded, "Hello, World!");
let encoded_with_newlines = "SGVs\nbG8s\nIFdv\ncmxk\nIQ==";
let decoded = GithubExtractor::decode_base64(encoded_with_newlines).unwrap();
assert_eq!(decoded, "Hello, World!");
let decoded = GithubExtractor::decode_base64("").unwrap();
assert_eq!(decoded, "");
}
}