use std::path::{Path, PathBuf};
use graphify_security::validate_url;
use regex::Regex;
use reqwest::Client;
use thiserror::Error;
use tracing::info;
#[derive(Debug, Error)]
pub enum IngestError {
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
#[error("security error: {0}")]
Security(#[from] graphify_security::SecurityError),
#[error("ingest error: {0}")]
Other(String),
}
pub async fn ingest_url(url: &str, output_dir: &Path) -> Result<PathBuf, IngestError> {
let validated = validate_url(url)?;
let client = Client::new();
let url_str = validated.as_str();
if url_str.contains("arxiv.org") {
ingest_arxiv(&client, url_str, output_dir).await
} else if url_str.contains("twitter.com") || url_str.contains("x.com") {
ingest_tweet(&client, url_str, output_dir).await
} else if url_str.ends_with(".pdf") {
ingest_pdf(&client, url_str, output_dir).await
} else {
ingest_webpage(&client, url_str, output_dir).await
}
}
async fn ingest_arxiv(client: &Client, url: &str, out: &Path) -> Result<PathBuf, IngestError> {
let abs_url = url.replace("/pdf/", "/abs/");
let response = client.get(&abs_url).send().await?;
let html = response.text().await?;
let arxiv_id = abs_url
.split('/')
.next_back()
.unwrap_or("unknown")
.trim_end_matches(".pdf");
let title = extract_between(&html, "<title>", "</title>")
.unwrap_or_else(|| format!("arXiv:{arxiv_id}"));
let title = strip_html_tags(&title).trim().to_string();
let abstract_text = extract_between(
&html,
"<blockquote class=\"abstract mathjax\">",
"</blockquote>",
)
.or_else(|| extract_between(&html, "Abstract:</span>", "</blockquote>"))
.unwrap_or_default();
let abstract_text = strip_html_tags(&abstract_text).trim().to_string();
let filename = format!("arxiv_{}.md", sanitize_filename(arxiv_id));
let path = out.join(&filename);
std::fs::create_dir_all(out)?;
let content = format!(
"---\nsource: {url}\ntype: arxiv\narxiv_id: {arxiv_id}\ntitle: \"{title}\"\n---\n\n# {title}\n\n## Abstract\n\n{abstract_text}\n"
);
std::fs::write(&path, content)?;
info!("Ingested arXiv paper: {} -> {}", arxiv_id, path.display());
Ok(path)
}
async fn ingest_tweet(client: &Client, url: &str, out: &Path) -> Result<PathBuf, IngestError> {
let oembed_url = format!(
"https://publish.twitter.com/oembed?url={}&omit_script=true",
urlencoding::encode(url)
);
let response = client.get(&oembed_url).send().await?;
let (author, text) = if response.status().is_success() {
let json: serde_json::Value = response.json().await?;
let author = json
.get("author_name")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
let html_content = json
.get("html")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let text = strip_html_tags(&html_content);
(author, text)
} else {
("unknown".to_string(), format!("Tweet from: {url}"))
};
let tweet_id = url
.split('/')
.next_back()
.unwrap_or("unknown")
.split('?')
.next()
.unwrap_or("unknown");
let filename = format!("tweet_{}.md", sanitize_filename(tweet_id));
let path = out.join(&filename);
std::fs::create_dir_all(out)?;
let content = format!(
"---\nsource: {}\ntype: tweet\nauthor: \"{}\"\ntweet_id: {}\n---\n\n{}\n",
url,
author,
tweet_id,
text.trim()
);
std::fs::write(&path, content)?;
info!("Ingested tweet: {} -> {}", tweet_id, path.display());
Ok(path)
}
async fn ingest_pdf(client: &Client, url: &str, out: &Path) -> Result<PathBuf, IngestError> {
let response = client.get(url).send().await?;
let bytes = response.bytes().await?;
let filename = url.split('/').next_back().unwrap_or("document.pdf");
let filename = if filename.ends_with(".pdf") {
filename.to_string()
} else {
format!("{filename}.pdf")
};
let path = out.join(&filename);
std::fs::create_dir_all(out)?;
std::fs::write(&path, &bytes)?;
info!(
"Ingested PDF: {} ({} bytes) -> {}",
url,
bytes.len(),
path.display()
);
Ok(path)
}
async fn ingest_webpage(client: &Client, url: &str, out: &Path) -> Result<PathBuf, IngestError> {
let response = client.get(url).send().await?;
let html = response.text().await?;
let title = extract_between(&html, "<title>", "</title>")
.map(|t| strip_html_tags(&t))
.unwrap_or_default();
let text = strip_scripts_and_styles(&html);
let text = strip_html_tags(&text);
let text = collapse_whitespace(&text);
let filename = sanitize_filename(url);
let path = out.join(format!("{filename}.md"));
std::fs::create_dir_all(out)?;
let content = format!(
"---\nsource: {}\ntype: webpage\ntitle: \"{}\"\n---\n\n# {}\n\n{}\n",
url,
title.trim(),
title.trim(),
text.trim()
);
std::fs::write(&path, content)?;
info!("Ingested webpage: {} -> {}", url, path.display());
Ok(path)
}
pub fn save_query_result(
question: &str,
answer: &str,
memory_dir: &Path,
query_type: &str,
source_nodes: Option<&[String]>,
) -> Result<PathBuf, IngestError> {
std::fs::create_dir_all(memory_dir)?;
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let filename = format!("{query_type}_{timestamp}.md");
let path = memory_dir.join(&filename);
let nodes_str = source_nodes.map(|n| n.join(", ")).unwrap_or_default();
let content = format!(
"---\ntype: {query_type}\ntimestamp: {timestamp}\nnodes: [{nodes_str}]\n---\n\n## Question\n\n{question}\n\n## Answer\n\n{answer}\n"
);
std::fs::write(&path, content)?;
info!("Saved query result: {} -> {}", query_type, path.display());
Ok(path)
}
fn extract_between(haystack: &str, start: &str, end: &str) -> Option<String> {
let start_idx = haystack.find(start)? + start.len();
let end_idx = haystack[start_idx..].find(end)? + start_idx;
Some(haystack[start_idx..end_idx].to_string())
}
fn strip_scripts_and_styles(html: &str) -> String {
static RE_SCRIPT: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("valid regex")
});
static RE_STYLE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("valid regex")
});
let result = RE_SCRIPT.replace_all(html, "");
RE_STYLE.replace_all(&result, "").to_string()
}
fn strip_html_tags(html: &str) -> String {
static RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"<[^>]+>").expect("valid regex"));
RE.replace_all(html, "").to_string()
}
fn collapse_whitespace(text: &str) -> String {
static RE_WS: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"[ \t]+").expect("valid regex"));
static RE_NL: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"\n{3,}").expect("valid regex"));
let result = RE_WS.replace_all(text, " ");
RE_NL.replace_all(&result, "\n\n").to_string()
}
fn sanitize_filename(input: &str) -> String {
input
.replace("https://", "")
.replace("http://", "")
.replace(['/', '?', '&', '=', '#', ' '], "_")
.chars()
.filter(|c| c.is_alphanumeric() || *c == '_' || *c == '-' || *c == '.')
.take(80)
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_strip_html_tags() {
assert_eq!(strip_html_tags("<p>Hello <b>world</b></p>"), "Hello world");
assert_eq!(strip_html_tags("no tags"), "no tags");
assert_eq!(strip_html_tags("<br/>"), "");
}
#[test]
fn test_strip_scripts_and_styles() {
let html = "<p>Before</p><script>alert(1)</script><p>After</p>";
assert_eq!(strip_scripts_and_styles(html), "<p>Before</p><p>After</p>");
let html2 = "<style>.x{color:red}</style><p>Content</p>";
assert_eq!(strip_scripts_and_styles(html2), "<p>Content</p>");
}
#[test]
fn test_sanitize_filename() {
assert_eq!(
sanitize_filename("https://example.com/page?q=1"),
"example.com_page_q_1"
);
assert_eq!(sanitize_filename("simple"), "simple");
}
#[test]
fn test_sanitize_filename_max_length() {
let long_url = "a".repeat(200);
assert!(sanitize_filename(&long_url).len() <= 80);
}
#[test]
fn test_extract_between() {
assert_eq!(
extract_between("<title>Hello</title>", "<title>", "</title>"),
Some("Hello".to_string())
);
assert_eq!(extract_between("no markers", "<a>", "</a>"), None);
}
#[test]
fn test_collapse_whitespace() {
assert_eq!(collapse_whitespace("a b c"), "a b c");
assert_eq!(collapse_whitespace("a\n\n\n\nb"), "a\n\nb");
}
#[test]
fn test_save_query_result() {
let tmp = tempfile::tempdir().unwrap();
let path = save_query_result(
"What is Rust?",
"A systems programming language.",
tmp.path(),
"query",
Some(&["node1".to_string(), "node2".to_string()]),
)
.unwrap();
assert!(path.exists());
let content = std::fs::read_to_string(&path).unwrap();
assert!(content.contains("What is Rust?"));
assert!(content.contains("systems programming language"));
assert!(content.contains("node1, node2"));
assert!(content.contains("type: query"));
}
#[test]
fn test_save_query_result_no_nodes() {
let tmp = tempfile::tempdir().unwrap();
let path = save_query_result("question", "answer", tmp.path(), "chat", None).unwrap();
let content = std::fs::read_to_string(&path).unwrap();
assert!(content.contains("nodes: []"));
}
}