use std::path::Path;
use super::error::{LlmError, LlmResult};
/// Documentation file formats, as detected from a path's extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocFormat {
    /// Plain text (`.txt`).
    PlainText,
    /// Markdown (`.md` or `.markdown`).
    Markdown,
    /// Word document (`.docx`).
    Word,
    /// Missing, non-UTF-8, or unrecognised extension.
    Unknown,
}

impl DocFormat {
    /// Classifies `path` by its file extension.
    ///
    /// The comparison ignores ASCII case, so `README.MD` and `Report.DOCX`
    /// are recognised just like their lowercase spellings. Only the extension
    /// is inspected; the file is never opened.
    pub fn from_path(path: &Path) -> Self {
        let extension = path
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_ascii_lowercase);

        match extension.as_deref() {
            Some("txt") => DocFormat::PlainText,
            Some("md") | Some("markdown") => DocFormat::Markdown,
            Some("docx") => DocFormat::Word,
            _ => DocFormat::Unknown,
        }
    }

    /// Returns `true` for every variant except [`DocFormat::Unknown`].
    pub fn is_supported(&self) -> bool {
        !matches!(self, DocFormat::Unknown)
    }
}
/// Loads the documentation file at `path` and returns its text.
///
/// The loader is chosen from the format reported by [`DocFormat::from_path`]:
/// plain text and Markdown are read verbatim, Word documents go through the
/// docx extractor, and anything else is attempted as plain text.
///
/// # Errors
///
/// Returns `LlmError::DocumentationError` when the path does not exist, when
/// the chosen loader fails, or — for unrecognised formats — when the file
/// cannot be read as text (reported as an unsupported format).
pub fn load_documentation(path: &Path) -> LlmResult<String> {
    // Guard first so a missing file gets its own, clearer message.
    if !path.exists() {
        let message = format!("File not found: {}", path.display());
        return Err(LlmError::DocumentationError(message));
    }

    match DocFormat::from_path(path) {
        DocFormat::Word => load_docx_file(path),
        DocFormat::PlainText | DocFormat::Markdown => load_text_file(path),
        // Best effort: an unrecognised extension may still hold readable text.
        DocFormat::Unknown => match load_text_file(path) {
            Ok(text) => Ok(text),
            Err(_) => Err(LlmError::DocumentationError(format!(
                "Unsupported file format: {}",
                path.display()
            ))),
        },
    }
}
/// Reads the whole file at `path` into a `String`.
///
/// # Errors
///
/// Any failure from `std::fs::read_to_string` is wrapped in
/// `LlmError::DocumentationError` together with the offending path.
fn load_text_file(path: &Path) -> LlmResult<String> {
    match std::fs::read_to_string(path) {
        Ok(contents) => Ok(contents),
        Err(err) => Err(LlmError::DocumentationError(format!(
            "Failed to read {}: {}",
            path.display(),
            err
        ))),
    }
}
/// Extracts the text of the `.docx` file at `path`.
///
/// The file is opened as a zip archive, its `word/document.xml` entry is read
/// into memory, and that XML is handed to `extract_text_from_docx_xml`.
///
/// # Errors
///
/// Returns `LlmError::DocumentationError` if the file cannot be opened, is not
/// a readable zip archive, has no `word/document.xml` entry, or that entry
/// cannot be read as a UTF-8 string.
fn load_docx_file(path: &Path) -> LlmResult<String> {
    use std::io::Read;

    let handle = std::fs::File::open(path).map_err(|err| {
        let message = format!("Failed to open {}: {}", path.display(), err);
        LlmError::DocumentationError(message)
    })?;

    let mut docx = zip::ZipArchive::new(handle).map_err(|err| {
        let message = format!("Failed to read docx archive: {}", err);
        LlmError::DocumentationError(message)
    })?;

    let mut entry = docx.by_name("word/document.xml").map_err(|err| {
        let message = format!("Failed to find document.xml in docx: {}", err);
        LlmError::DocumentationError(message)
    })?;

    let mut body_xml = String::new();
    entry.read_to_string(&mut body_xml).map_err(|err| {
        let message = format!("Failed to read document.xml: {}", err);
        LlmError::DocumentationError(message)
    })?;

    Ok(extract_text_from_docx_xml(&body_xml))
}
/// Pulls the visible text out of a WordprocessingML `document.xml` string.
///
/// The text of every `<w:t ...>text</w:t>` run is concatenated, and a newline
/// is emitted at each `</w:p>` so paragraphs end up on separate lines. The
/// five predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`)
/// are decoded; numeric character references are left as written. Leading and
/// trailing whitespace of the overall result is trimmed.
///
/// This is a lightweight scanner, not an XML parser: it assumes well-formed
/// input and ignores everything that is not a text run or a paragraph end.
fn extract_text_from_docx_xml(xml: &str) -> String {
    const PARAGRAPH_END: &str = "</w:p>";

    let mut output = String::new();
    let mut rest = xml;

    // Walk paragraph by paragraph so each one ends with a line break.
    while let Some(idx) = rest.find(PARAGRAPH_END) {
        let split = idx + PARAGRAPH_END.len();
        append_docx_text_runs(&rest[..split], &mut output);
        output.push('\n');
        rest = &rest[split..];
    }
    // Text after the last closed paragraph (normally none) is still kept.
    append_docx_text_runs(rest, &mut output);

    output.trim().to_string()
}

/// Appends the decoded content of every `<w:t ...>text</w:t>` run found in
/// `fragment` to `out`, in document order.
fn append_docx_text_runs(fragment: &str, out: &mut String) {
    const RUN_OPEN: &str = "<w:t";
    const RUN_CLOSE: &str = "</w:t>";

    let mut rest = fragment;
    while let Some(start) = rest.find(RUN_OPEN) {
        let after_open = &rest[start + RUN_OPEN.len()..];
        let tag_end = match after_open.find('>') {
            Some(pos) => pos,
            // Unterminated tag: nothing further can match.
            None => break,
        };
        let body = &after_open[tag_end + 1..];
        let text_end = body.find('<').unwrap_or(body.len());

        if body[text_end..].starts_with(RUN_CLOSE) {
            push_xml_unescaped(&body[..text_end], out);
            rest = &body[text_end + RUN_CLOSE.len()..];
        } else {
            // Some other element sharing the prefix (e.g. `<w:tab/>`, `<w:tbl>`);
            // resume scanning just past the prefix.
            rest = after_open;
        }
    }
}

/// Appends `raw` to `out`, replacing the five predefined XML entities with the
/// characters they stand for. Done in a single pass so `&amp;lt;` decodes to
/// the literal text `&lt;` rather than `<`.
fn push_xml_unescaped(raw: &str, out: &mut String) {
    const ENTITIES: [(&str, char); 5] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&apos;", '\''),
    ];

    let mut rest = raw;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| tail.starts_with(name)) {
            Some((name, decoded)) => {
                out.push(*decoded);
                rest = &tail[name.len()..];
            }
            None => {
                // Not a predefined entity: keep the ampersand untouched.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
}
/// Loads every file in `paths` and concatenates the results.
///
/// Each file's text is preceded by a `--- <path> ---` header line, and the
/// per-file sections are separated by a blank line. Files are processed in the
/// order given.
///
/// # Errors
///
/// Stops at, and returns, the first error produced by [`load_documentation`].
pub fn load_documentation_files(paths: &[&Path]) -> LlmResult<String> {
    let sections = paths
        .iter()
        .map(|path| {
            load_documentation(path)
                .map(|content| format!("--- {} ---\n{}", path.display(), content))
        })
        .collect::<Result<Vec<String>, _>>()?;

    Ok(sections.join("\n\n"))
}
/// Shortens `text` to roughly `max_tokens` tokens, cutting at a natural break.
///
/// The budget is estimated as four bytes per token. Text that already fits is
/// returned unchanged. Otherwise the text is cut at the budget (moved back to
/// the nearest UTF-8 character boundary so multi-byte characters are never
/// split) and then trimmed further to, in order of preference:
///
/// 1. the last paragraph break (`"\n\n"`) in the second half of the budget,
/// 2. the last sentence end (`". "`) in the second half of the budget,
/// 3. the last space,
/// 4. the raw cut.
///
/// A `[Documentation truncated]` marker is appended whenever text was removed.
pub fn truncate_documentation(text: &str, max_tokens: usize) -> String {
    const TRUNCATION_NOTE: &str = "\n\n[Documentation truncated]";

    // Saturate so an enormous token budget means "no limit" instead of overflowing.
    let max_chars = max_tokens.saturating_mul(4);
    if text.len() <= max_chars {
        return text.to_string();
    }

    // `max_chars` is a byte offset; slicing inside a multi-byte character would
    // panic, so step back to the previous boundary (offset 0 always is one).
    let mut cut = max_chars;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    let truncated = &text[..cut];

    // Paragraph and sentence breaks are only used if they keep more than half
    // of the budget; otherwise too much content would be thrown away.
    let min_keep = max_chars / 2;

    if let Some(pos) = truncated.rfind("\n\n").filter(|&pos| pos > min_keep) {
        return format!("{}...{}", &truncated[..pos], TRUNCATION_NOTE);
    }
    if let Some(pos) = truncated.rfind(". ").filter(|&pos| pos > min_keep) {
        return format!("{}.{}", &truncated[..pos], TRUNCATION_NOTE);
    }
    if let Some(pos) = truncated.rfind(' ') {
        return format!("{}...{}", &truncated[..pos], TRUNCATION_NOTE);
    }
    format!("{}...{}", truncated, TRUNCATION_NOTE)
}
/// Collects the lines of `text` that mention any of `field_names`, plus one
/// line of context on either side of each hit.
///
/// Matching is a case-insensitive substring test. Lines are emitted in
/// document order and a line whose content has already been emitted is not
/// repeated. The selected lines are joined with `\n`; if the result is longer
/// than `max_chars` bytes it is passed to [`truncate_documentation`] with a
/// budget of `max_chars / 4` tokens.
///
/// Returns an empty string when nothing matches.
pub fn extract_relevant_sections(text: &str, field_names: &[&str], max_chars: usize) -> String {
    // Lowercase the needles once instead of once per (line, field) pair.
    let lowered_fields: Vec<String> = field_names
        .iter()
        .map(|field| field.to_lowercase())
        .collect();

    let lines: Vec<&str> = text.lines().collect();
    // Tracks line contents already emitted, giving O(1) duplicate checks.
    let mut seen = std::collections::HashSet::new();
    let mut relevant_lines: Vec<&str> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let line_lower = line.to_lowercase();
        let mentions_field = lowered_fields
            .iter()
            .any(|field| line_lower.contains(field.as_str()));
        if !mentions_field {
            continue;
        }

        // One line before and one line after the hit, clamped to the text.
        let start = i.saturating_sub(1);
        let end = (i + 2).min(lines.len());
        for &context_line in &lines[start..end] {
            if seen.insert(context_line) {
                relevant_lines.push(context_line);
            }
        }
    }

    let result = relevant_lines.join("\n");
    if result.len() > max_chars {
        truncate_documentation(&result, max_chars / 4)
    } else {
        result
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temp file named with `suffix` containing `lines`, each
    /// terminated by a newline.
    fn temp_doc(suffix: &str, lines: &[&str]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file
    }

    #[test]
    fn test_doc_format_from_path() {
        let cases = [
            ("doc.txt", DocFormat::PlainText),
            ("doc.md", DocFormat::Markdown),
            ("doc.markdown", DocFormat::Markdown),
            ("doc.docx", DocFormat::Word),
            ("doc.pdf", DocFormat::Unknown),
        ];
        for (name, expected) in cases.iter() {
            assert_eq!(DocFormat::from_path(Path::new(name)), *expected);
        }
    }

    #[test]
    fn test_doc_format_is_supported() {
        let supported = [DocFormat::PlainText, DocFormat::Markdown, DocFormat::Word];
        for format in supported.iter() {
            assert!(format.is_supported());
        }
        assert!(!DocFormat::Unknown.is_supported());
    }

    #[test]
    fn test_load_text_file() {
        let doc = temp_doc(
            ".txt",
            &["This is test documentation.", "It has multiple lines."],
        );
        let loaded = load_documentation(doc.path()).unwrap();
        assert!(loaded.contains("test documentation"));
        assert!(loaded.contains("multiple lines"));
    }

    #[test]
    fn test_load_markdown_file() {
        let doc = temp_doc(".md", &["# Header", "", "Some **bold** text."]);
        let loaded = load_documentation(doc.path()).unwrap();
        assert!(loaded.contains("# Header"));
        assert!(loaded.contains("**bold**"));
    }

    #[test]
    fn test_load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.txt");
        assert!(load_documentation(missing).is_err());
    }

    #[test]
    fn test_truncate_documentation_short() {
        let input = "Short text.";
        assert_eq!(truncate_documentation(input, 100), input);
    }

    #[test]
    fn test_truncate_documentation_long() {
        let input = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph with more content that makes this longer.";
        let shortened = truncate_documentation(input, 10);
        assert!(shortened.contains("[Documentation truncated]"));
        assert!(shortened.len() < input.len());
    }

    #[test]
    fn test_extract_relevant_sections() {
        let input = "Line 1: Introduction\nLine 2: The customer_id field is important.\nLine 3: Other info.\nLine 4: The order_date represents when the order was placed.\nLine 5: Conclusion.";
        let wanted = vec!["customer_id", "order_date"];
        let sections = extract_relevant_sections(input, &wanted, 1000);
        for field in wanted.iter() {
            assert!(sections.contains(field));
        }
    }

    #[test]
    fn test_extract_relevant_sections_no_match() {
        let input = "This documentation has no relevant field mentions.";
        let wanted = vec!["nonexistent_field"];
        assert!(extract_relevant_sections(input, &wanted, 1000).is_empty());
    }

    #[test]
    fn test_extract_text_from_docx_xml() {
        let xml = r#"<w:document><w:body><w:p><w:r><w:t>Hello</w:t></w:r><w:r><w:t> World</w:t></w:r></w:p><w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p></w:body></w:document>"#;
        let extracted = extract_text_from_docx_xml(xml);
        for expected in ["Hello", "World", "Second paragraph"].iter() {
            assert!(extracted.contains(expected));
        }
    }
}