use anyhow::{Context, Result};
use std::path::Path;
/// Extracts the text of the PDF at `path` and formats it as Markdown.
///
/// The raw text comes from `pdf_extract::extract_text`; the Markdown structure is
/// produced by `format_as_markdown`.
///
/// # Errors
///
/// Returns the extraction error wrapped with the context
/// "Failed to extract text from PDF" when the text cannot be extracted.
pub fn extract_pdf_to_markdown(path: &Path) -> Result<String> {
    pdf_extract::extract_text(path)
        .map(|raw_text| format_as_markdown(&raw_text))
        .context("Failed to extract text from PDF")
}
/// Converts plain extracted text into Markdown, one input line at a time.
///
/// Every line is trimmed and then classified:
/// - blank lines are emitted as a newline;
/// - lines accepted by `is_likely_table_row` become table rows, and the first row of a
///   run is followed by a separator row;
/// - lines accepted by `is_likely_heading` become `##` headings (`###` when the line is
///   30 bytes or longer), with trailing colons removed and a blank line after them;
/// - everything else is copied through followed by a newline.
///
/// A run of table rows is terminated with one extra newline as soon as a line that is
/// not a table row (blank or otherwise) is seen.
fn format_as_markdown(text: &str) -> String {
    let mut out = String::new();
    let mut table_open = false;

    for raw_line in text.lines() {
        let line = raw_line.trim();
        let is_row = !line.is_empty() && is_likely_table_row(line);

        // Any line that is not a table row closes the table currently being written.
        if table_open && !is_row {
            out.push('\n');
            table_open = false;
        }

        if line.is_empty() {
            out.push('\n');
        } else if is_row {
            out.push_str(&format_table_row(line));
            out.push('\n');
            if !table_open {
                // The first row of a table doubles as its header.
                out.push_str(&create_table_separator(line));
                out.push('\n');
                table_open = true;
            }
        } else if is_likely_heading(line) {
            let marker = if line.len() < 30 { "##" } else { "###" };
            out.push_str(marker);
            out.push(' ');
            out.push_str(line.trim_end_matches(':'));
            out.push_str("\n\n");
        } else {
            out.push_str(line);
            out.push('\n');
        }
    }

    out
}
/// Heuristically decides whether `line` looks like a row of a text-aligned table.
///
/// A line qualifies when it has at least three whitespace-separated tokens and its
/// columns are visibly separated, i.e. it contains a tab or a gap of two or more
/// consecutive spaces. Ordinary prose, whose words are separated by single spaces,
/// is therefore not reported as a table row.
fn is_likely_table_row(line: &str) -> bool {
    // Written with escapes so that whitespace normalization cannot silently shrink
    // the literal to a single space, which would match every multi-word line.
    const COLUMN_GAP: &str = "\x20\x20";

    let has_column_separator = line.contains('\t') || line.contains(COLUMN_GAP);
    has_column_separator && line.split_whitespace().count() >= 3
}
/// Splits a table line into its trimmed cells.
///
/// Column boundaries are chosen in order of preference:
/// 1. tab characters, when the line contains any (empty cells are kept);
/// 2. gaps of two or more spaces, so multi-word cells such as `New York` stay intact;
/// 3. single spaces, when the line contains no wider gap.
///
/// In the space-separated cases empty cells are dropped.
fn split_table_cells(line: &str) -> Vec<&str> {
    // Written with escapes so that whitespace normalization cannot silently shrink
    // the literal to a single space.
    const COLUMN_GAP: &str = "\x20\x20";

    if line.contains('\t') {
        return line.split('\t').map(str::trim).collect();
    }

    // Splitting on a two-space gap and trimming also handles longer runs: the
    // leftover spaces end up as padding or as empty cells, both of which are removed.
    let separator = if line.contains(COLUMN_GAP) { COLUMN_GAP } else { " " };
    line.split(separator)
        .map(str::trim)
        .filter(|cell| !cell.is_empty())
        .collect()
}

/// Renders `line` as a Markdown table row of the form `| a | b | c |`.
///
/// Literal `|` characters inside a cell are escaped as `\|` so they are not read as
/// column delimiters.
fn format_table_row(line: &str) -> String {
    let cells: Vec<String> = split_table_cells(line)
        .into_iter()
        .map(|cell| cell.replace('|', "\\|"))
        .collect();
    format!("| {} |", cells.join(" | "))
}

/// Builds the Markdown separator row (`| --- | --- |`) for the table whose header is `header`.
///
/// The number of columns is derived with the same splitting rules as
/// `format_table_row`, so header and separator always have the same width.
fn create_table_separator(header: &str) -> String {
    let column_count = split_table_cells(header).len();
    format!("| {} |", vec!["---"; column_count].join(" | "))
}
/// Heuristically decides whether `line` should be rendered as a heading.
///
/// A line is treated as a heading when either:
/// - more than 80% of its alphabetic characters are uppercase and it is shorter than
///   100 bytes, or
/// - it ends with `:`, is shorter than 80 bytes, does not contain `://`, and still has
///   text once its trailing colons are removed.
///
/// The last requirement rejects punctuation-only lines such as `:`, which would
/// otherwise yield a heading with no title.
fn is_likely_heading(line: &str) -> bool {
    let alpha_count = line.chars().filter(|c| c.is_alphabetic()).count();
    let uppercase_count = line.chars().filter(|c| c.is_uppercase()).count();

    let mostly_uppercase = alpha_count > 0
        && uppercase_count as f64 / alpha_count as f64 > 0.8
        && line.len() < 100;

    let colon_terminated = line.ends_with(':')
        && line.len() < 80
        // Excludes URLs and similar `scheme://` text.
        && !line.contains("://")
        && !line.trim_end_matches(':').trim().is_empty();

    mostly_uppercase || colon_terminated
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Gap placed between the cells of space-aligned fixtures. Written with escapes so
    /// that whitespace normalization cannot collapse it into a single space, which
    /// would make the fixtures indistinguishable from ordinary prose.
    const GAP: &str = "\x20\x20";

    /// Builds one space-aligned table line from `cells`.
    fn aligned(cells: &[&str]) -> String {
        cells.join(GAP)
    }

    #[test]
    fn test_is_likely_table_row() {
        assert!(is_likely_table_row(&aligned(&["Column1", "Column2", "Column3"])));
        assert!(is_likely_table_row("Name\tAge\tCity"));
        assert!(!is_likely_table_row("This is a normal sentence"));
        assert!(!is_likely_table_row("Only two"));
    }

    #[test]
    fn test_format_table_row() {
        let row = aligned(&["Name", "Age", "City"]);
        assert_eq!(format_table_row(&row), "| Name | Age | City |");
        let tab_row = "Name\tAge\tCity";
        assert_eq!(format_table_row(tab_row), "| Name | Age | City |");
    }

    #[test]
    fn test_create_table_separator() {
        let header = aligned(&["Name", "Age", "City"]);
        assert_eq!(create_table_separator(&header), "| --- | --- | --- |");
    }

    #[test]
    fn test_is_likely_heading() {
        assert!(is_likely_heading("INTRODUCTION"));
        assert!(is_likely_heading("Chapter 1:"));
        assert!(is_likely_heading("Section Title:"));
        assert!(!is_likely_heading("This is a normal sentence"));
        assert!(!is_likely_heading("https://example.com"));
    }

    #[test]
    fn test_format_as_markdown_simple() {
        let text = "INTRODUCTION\n\nThis is some text.\n\nSection 1:\nMore text here.";
        let markdown = format_as_markdown(text);
        assert!(markdown.contains("## INTRODUCTION"));
        assert!(markdown.contains("## Section 1"));
    }

    #[test]
    fn test_format_as_markdown_with_table() {
        let text = [
            aligned(&["Name", "Age", "City"]),
            aligned(&["John", "30", "NYC"]),
            aligned(&["Jane", "25", "LA"]),
        ]
        .join("\n");
        let markdown = format_as_markdown(&text);
        assert!(markdown.contains("| Name | Age | City |"));
        assert!(markdown.contains("| --- | --- | --- |"));
        assert!(markdown.contains("| John | 30 | NYC |"));
    }
}