use crate::markdown_parser::{MarkdownParser, MarkdownSection};
use std::fs;
use std::path::Path;
/// Filter settings used when turning parsed Markdown sections into
/// `ExtractedSection` values.
///
/// The fields are read by `SectionExtractor::should_include_section`: a
/// section is dropped when its level exceeds `max_level`, when it is empty and
/// `include_empty` is `false`, or when it has fewer than `min_words` words.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SectionExtractor {
    /// Minimum word count a section must reach to be kept.
    pub min_words: usize,
    /// Largest section level that is still kept.
    pub max_level: u32,
    /// When `false`, sections the parser reports as empty are dropped.
    pub include_empty: bool,
}
impl Default for SectionExtractor {
fn default() -> Self {
Self { min_words: 10, max_level: 6, include_empty: false }
}
}
impl SectionExtractor {
    /// Creates an extractor with explicit filter settings.
    ///
    /// * `min_words` - sections with fewer words than this are dropped.
    /// * `max_level` - sections whose level is greater than this are dropped.
    /// * `include_empty` - when `false`, sections reporting `is_empty()` are
    ///   dropped.
    pub fn new(min_words: usize, max_level: u32, include_empty: bool) -> Self {
        Self { min_words, max_level, include_empty }
    }

    /// Parses `content` with `MarkdownParser::parse` and returns the sections
    /// that pass this extractor's filters, in the order the parser produced
    /// them.
    ///
    /// `file_path` is not opened here; it is only copied into the `file_path`
    /// field of every returned section.
    pub fn extract_from_content(&self, content: &str, file_path: &str) -> Vec<ExtractedSection> {
        MarkdownParser::parse(content)
            .into_iter()
            .filter(|section| self.should_include_section(section))
            .map(|section| {
                // Compute the derived values first: the struct literal below
                // moves the owned fields out of `section`.
                let plain_content = section.get_plain_content();
                let word_count = section.word_count();
                ExtractedSection {
                    title: section.title,
                    level: section.level,
                    content: section.content,
                    plain_content,
                    word_count,
                    line_start: section.line_start,
                    line_end: section.line_end,
                    path: section.path,
                    file_path: file_path.to_string(),
                }
            })
            .collect()
    }

    /// Reads `file_path` as UTF-8 text and extracts its sections.
    ///
    /// The path is recorded on each section via a lossy string conversion.
    ///
    /// # Errors
    ///
    /// Returns the `std::io::Error` from `fs::read_to_string` when the file
    /// cannot be read.
    pub fn extract_from_file<P: AsRef<Path>>(
        &self,
        file_path: P,
    ) -> Result<Vec<ExtractedSection>, std::io::Error> {
        let path = file_path.as_ref();
        let content = fs::read_to_string(path)?;
        Ok(self.extract_from_content(&content, &path.to_string_lossy()))
    }

    /// Extracts sections from every file in `file_paths`, concatenated in
    /// input order.
    ///
    /// This is best-effort: a file that cannot be read is reported on stderr
    /// and skipped, and the remaining files are still processed.
    pub fn extract_from_files<P: AsRef<Path>>(&self, file_paths: &[P]) -> Vec<ExtractedSection> {
        let mut all_sections = Vec::new();
        for file_path in file_paths {
            match self.extract_from_file(file_path) {
                Ok(sections) => all_sections.extend(sections),
                Err(e) => eprintln!("Error reading {}: {}", file_path.as_ref().display(), e),
            }
        }
        all_sections
    }

    /// Decides whether a parsed section passes this extractor's filters.
    ///
    /// A section is kept only when all of the following hold:
    /// * its level is at most `max_level`;
    /// * it is not (empty while `include_empty` is `false`);
    /// * its word count is at least `min_words`.
    ///
    /// NOTE(review): the word-count check also applies to empty sections, so
    /// an empty section that reports fewer than `min_words` words is dropped
    /// even when `include_empty` is `true` — confirm this is intended.
    fn should_include_section(&self, section: &MarkdownSection) -> bool {
        section.level <= self.max_level
            && !(section.is_empty() && !self.include_empty)
            && section.word_count() >= self.min_words
    }

    /// Groups `sections` by their `level`.
    ///
    /// The returned map borrows from `sections`; within each level the
    /// references keep the order of the input slice.
    pub fn group_by_level<'a>(
        &self,
        sections: &'a [ExtractedSection],
    ) -> std::collections::HashMap<u32, Vec<&'a ExtractedSection>> {
        let mut groups: std::collections::HashMap<u32, Vec<&'a ExtractedSection>> =
            std::collections::HashMap::new();
        for section in sections {
            groups.entry(section.level).or_default().push(section);
        }
        groups
    }

    /// Finds pairs of sections whose titles have a Levenshtein similarity of
    /// at least `threshold`.
    ///
    /// Every unordered pair of entries in `sections` is compared once. Two
    /// entries with the same `file_path` and the same `line_start` are
    /// skipped. Matching pairs are returned as clones, sorted by descending
    /// similarity.
    pub fn find_similar_titles(
        &self,
        sections: &[ExtractedSection],
        threshold: f64,
    ) -> Vec<SimilarTitlePair> {
        use crate::levenshtein::levenshtein_similarity;

        let mut similar_pairs = Vec::new();
        for (i, section1) in sections.iter().enumerate() {
            // Only look at entries after `i` so each pair is visited once.
            for section2 in &sections[i + 1..] {
                if section1.file_path == section2.file_path
                    && section1.line_start == section2.line_start
                {
                    continue;
                }
                let similarity = levenshtein_similarity(&section1.title, &section2.title);
                if similarity >= threshold {
                    similar_pairs.push(SimilarTitlePair {
                        section1: section1.clone(),
                        section2: section2.clone(),
                        similarity,
                    });
                }
            }
        }
        // A NaN similarity can never satisfy `>= threshold`, so every stored
        // value is comparable; the fallback only removes the panic path.
        similar_pairs.sort_by(|a, b| {
            b.similarity
                .partial_cmp(&a.similarity)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        similar_pairs
    }
}
/// A parsed Markdown section that passed the extractor's filters, tagged with
/// the file path it was extracted for.
///
/// Values are built by `SectionExtractor::extract_from_content`, which copies
/// most fields straight from the parser's section.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ExtractedSection {
/// Section title as reported by the parser.
pub title: String,
/// Section level as reported by the parser; the tests in this file expect
/// 1 for a `#` heading and 2 for a `##` heading.
pub level: u32,
/// Section content as reported by the parser.
pub content: String,
/// Result of the parser section's `get_plain_content()`; presumably the
/// content with Markdown formatting removed — confirm against the parser.
pub plain_content: String,
/// Result of the parser section's `word_count()` at extraction time.
pub word_count: usize,
/// Start line reported by the parser (indexing base not visible here).
pub line_start: usize,
/// End line reported by the parser (inclusiveness not visible here).
pub line_end: usize,
/// Path components reported by the parser, joined with " > " by
/// `get_path_string`; presumably the chain of enclosing headings — confirm.
pub path: Vec<String>,
/// The file path string that was passed to the extractor.
pub file_path: String,
}
impl ExtractedSection {
pub fn get_path_string(&self) -> String {
self.path.join(" > ")
}
pub fn get_summary(&self, max_words: usize) -> String {
let words: Vec<&str> = self.plain_content.split_whitespace().collect();
if words.len() <= max_words {
self.plain_content.clone()
} else {
words[..max_words].join(" ") + "..."
}
}
pub fn get_relative_path(&self) -> String {
if let Ok(current_dir) = std::env::current_dir() {
std::path::Path::new(&self.file_path)
.strip_prefix(¤t_dir)
.unwrap_or(std::path::Path::new(&self.file_path))
.to_string_lossy()
.to_string()
} else {
self.file_path.clone()
}
}
}
/// Two sections whose titles were found to be similar, together with the
/// similarity score that `SectionExtractor::find_similar_titles` computed.
#[derive(Debug, Clone)]
pub struct SimilarTitlePair {
/// Clone of the section that appears earlier in the input slice.
pub section1: ExtractedSection,
/// Clone of the section that appears later in the input slice.
pub section2: ExtractedSection,
/// Levenshtein similarity of the two titles; always at least the threshold
/// the pair was collected with.
pub similarity: f64,
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Default filters keep the three sections with ten or more words and
    /// drop the two-word "Quick Start" section.
    #[test]
    fn test_extract_from_content() {
        let content = r#"# Introduction
This is the introduction with more than ten words to meet the minimum requirement.
## Getting Started
This section explains how to get started with the project and has enough content.
### Quick Start
Short content.
# Advanced Topics
This section covers advanced topics and contains sufficient content for analysis.
"#;
        let extractor = SectionExtractor::default();
        let sections = extractor.extract_from_content(content, "test.md");
        assert_eq!(sections.len(), 3);
        assert_eq!(sections[0].title, "Introduction");
        assert_eq!(sections[0].level, 1);
        assert!(sections[0].word_count >= 10);
        assert_eq!(sections[1].title, "Getting Started");
        assert_eq!(sections[1].level, 2);
        assert_eq!(sections[2].title, "Advanced Topics");
        assert_eq!(sections[2].level, 1);
    }

    /// Extraction from a file on disk yields the same sections as extraction
    /// from the equivalent in-memory content.
    #[test]
    fn test_extract_from_file() {
        let content = r#"# Test Document
This is a test document with sufficient content for testing purposes.
## Section One
This section has enough content to pass the minimum word count filter.
"#;
        let mut temp_file = NamedTempFile::new().unwrap();
        temp_file.write_all(content.as_bytes()).unwrap();
        let extractor = SectionExtractor::default();
        let sections = extractor.extract_from_file(temp_file.path()).unwrap();
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].title, "Test Document");
        assert_eq!(sections[1].title, "Section One");
    }

    /// Identical titles in two different files form exactly one pair with
    /// similarity 1.0; the unrelated "Getting Started" title matches nothing
    /// at a 0.9 threshold.
    #[test]
    fn test_find_similar_titles() {
        let sections = vec![
            ExtractedSection {
                title: "Introduction".to_string(),
                level: 1,
                content: "Content".to_string(),
                plain_content: "Content".to_string(),
                word_count: 10,
                line_start: 1,
                line_end: 5,
                path: vec!["Introduction".to_string()],
                file_path: "file1.md".to_string(),
            },
            ExtractedSection {
                title: "Introduction".to_string(),
                level: 1,
                content: "Different content".to_string(),
                plain_content: "Different content".to_string(),
                word_count: 10,
                line_start: 1,
                line_end: 5,
                path: vec!["Introduction".to_string()],
                file_path: "file2.md".to_string(),
            },
            ExtractedSection {
                title: "Getting Started".to_string(),
                level: 1,
                content: "Content".to_string(),
                plain_content: "Content".to_string(),
                word_count: 10,
                line_start: 10,
                line_end: 15,
                path: vec!["Getting Started".to_string()],
                file_path: "file1.md".to_string(),
            },
        ];
        let extractor = SectionExtractor::default();
        let similar_pairs = extractor.find_similar_titles(&sections, 0.9);
        assert_eq!(similar_pairs.len(), 1);
        assert_eq!(similar_pairs[0].similarity, 1.0);
        assert_eq!(similar_pairs[0].section1.title, "Introduction");
        assert_eq!(similar_pairs[0].section2.title, "Introduction");
    }
}