use tree_sitter::{Parser, Tree};
use crate::parser::helpers::{node_text, strip_string_quotes};
use crate::parser::languages::get_language;
/// Title and description extracted from the leading documentation of a file.
///
/// Either field may be absent; [`FileMetadata::default`] has both set to `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FileMetadata {
    /// Title text, if one was found.
    pub title: Option<String>,
    /// Description text, if one was found.
    pub description: Option<String>,
}

impl FileMetadata {
    /// Builds a metadata value from an optional title and an optional description.
    pub fn new(title: Option<String>, description: Option<String>) -> Self {
        Self { title, description }
    }

    /// Returns `true` when neither a title nor a description is present.
    pub fn is_empty(&self) -> bool {
        matches!((&self.title, &self.description), (None, None))
    }
}
/// Extracts a title and description from `source`, interpreted as `language`.
///
/// The language name selects one of the per-language extractors; each arm is
/// compiled only when its `lang-*` feature is enabled. Any other name yields
/// empty metadata. Whatever the extractor returns is then shortened: the title
/// to its first line and the description to its first sentence.
pub fn extract_file_metadata(source: &[u8], language: &str) -> FileMetadata {
    let FileMetadata { title, description } = match language {
        #[cfg(feature = "lang-markdown")]
        "markdown" => extract_markdown_metadata(source),
        #[cfg(feature = "lang-python")]
        "python" => extract_with_tree(source, language, extract_python_metadata),
        #[cfg(feature = "lang-rust")]
        "rust" => extract_with_tree(source, language, extract_rust_metadata),
        #[cfg(feature = "lang-javascript")]
        "javascript" => extract_with_tree(source, language, extract_js_metadata),
        #[cfg(feature = "lang-typescript")]
        "typescript" | "tsx" => extract_with_tree(source, language, extract_js_metadata),
        #[cfg(feature = "lang-go")]
        "go" => extract_with_tree(source, language, extract_go_metadata),
        #[cfg(feature = "lang-java")]
        "java" => extract_with_tree(source, language, extract_java_metadata),
        #[cfg(feature = "lang-c")]
        "c" => extract_with_tree(source, language, extract_c_metadata),
        #[cfg(feature = "lang-cpp")]
        "cpp" => extract_with_tree(source, language, extract_c_metadata),
        #[cfg(feature = "lang-ruby")]
        "ruby" => extract_with_tree(source, language, extract_ruby_metadata),
        #[cfg(feature = "lang-csharp")]
        "csharp" => extract_with_tree(source, language, extract_csharp_metadata),
        _ => FileMetadata::default(),
    };

    FileMetadata::new(
        title.map(|raw| truncate_to_line(&raw)),
        description.map(|raw| truncate_to_sentence(&raw)),
    )
}
/// Parses `source` with the grammar registered for `language` and passes the
/// resulting tree to `extractor`.
///
/// Returns empty metadata when the grammar lookup fails, when the parser
/// rejects the grammar, or when parsing produces no tree.
fn extract_with_tree<F>(source: &[u8], language: &str, extractor: F) -> FileMetadata
where
    F: FnOnce(&Tree, &[u8]) -> FileMetadata,
{
    let Ok(grammar) = get_language(language) else {
        return FileMetadata::default();
    };

    let mut parser = Parser::new();
    if parser.set_language(&grammar).is_err() {
        return FileMetadata::default();
    }

    parser
        .parse(source, None)
        .map(|tree| extractor(&tree, source))
        .unwrap_or_default()
}
/// Returns the first line of `text` with surrounding whitespace removed.
///
/// Leading blank lines are discarded before the first line is picked, because
/// the whole input is trimmed first.
fn truncate_to_line(text: &str) -> String {
    let body = text.trim();
    let first_line = body.split_once('\n').map_or(body, |(head, _)| head);
    first_line.trim().to_owned()
}
/// Shortens `text` to its first sentence.
///
/// A sentence ends at the first `.` that is either the last character or is
/// followed by whitespace, so dots inside tokens such as `v1.2.3` do not end
/// it. When no such dot exists the first line of the text is returned instead.
fn truncate_to_sentence(text: &str) -> String {
    let body = text.trim();
    let mut chars = body.char_indices().peekable();

    while let Some((idx, ch)) = chars.next() {
        if ch != '.' {
            continue;
        }
        // Peek at the character right after the dot to decide whether it terminates a sentence.
        let ends_sentence = chars.peek().is_none_or(|&(_, next)| next.is_whitespace());
        if ends_sentence {
            return body.get(..=idx).unwrap_or(body).trim().to_string();
        }
    }

    truncate_to_line(body)
}
/// Extracts metadata from a Markdown document.
///
/// Non-empty YAML front matter wins. Otherwise the title is the first heading
/// line (a line starting with `#`) with non-empty text, and the description is
/// the first non-blank, non-heading line after it. Lines inside fenced code
/// blocks (``` or ~~~) are ignored. Input that is not valid UTF-8 yields empty
/// metadata.
#[cfg(feature = "lang-markdown")]
fn extract_markdown_metadata(source: &[u8]) -> FileMetadata {
    let Ok(text) = std::str::from_utf8(source) else {
        return FileMetadata::default();
    };

    if let Some(meta) = extract_yaml_frontmatter(text).filter(|meta| !meta.is_empty()) {
        return meta;
    }

    let mut title: Option<String> = None;
    let mut description: Option<String> = None;
    let mut in_code_block = false;

    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            in_code_block = !in_code_block;
            continue;
        }
        if in_code_block || trimmed.is_empty() {
            continue;
        }

        let is_heading = trimmed.starts_with('#');
        if title.is_none() {
            if is_heading {
                let body = trimmed.trim_start_matches('#').trim();
                // A trailing run of `#` is a closing sequence only when whitespace
                // precedes it (or nothing else is left). Otherwise the hashes are
                // part of the text, so `# C#` keeps the title `C#`.
                let without_closer = body.trim_end_matches('#');
                let heading = if without_closer.is_empty()
                    || without_closer.ends_with(char::is_whitespace)
                {
                    without_closer.trim_end()
                } else {
                    body
                };
                if !heading.is_empty() {
                    title = Some(heading.to_string());
                }
            }
            // Nothing before the title can be a description.
            continue;
        }

        // Later headings are skipped; the first ordinary line is the description.
        if !is_heading {
            description = Some(trimmed.to_string());
            break;
        }
    }

    FileMetadata::new(title, description)
}
/// Reads `title:` and `description:` entries from a leading `---` front matter block.
///
/// Returns `None` when the text does not start with `---` or when no closing
/// `---` line follows. Values have surrounding whitespace and surrounding
/// double/single quotes removed; empty values are ignored, and a later
/// non-empty entry replaces an earlier one.
#[cfg(feature = "lang-markdown")]
fn extract_yaml_frontmatter(text: &str) -> Option<FileMetadata> {
    /// Cleans one raw value; `None` when nothing remains.
    fn scalar(raw: &str) -> Option<String> {
        let value = raw.trim().trim_matches('"').trim_matches('\'');
        if value.is_empty() {
            None
        } else {
            Some(value.to_string())
        }
    }

    let after_open = text.strip_prefix("---")?;
    let (yaml, _) = after_open.split_once("\n---")?;

    let mut title: Option<String> = None;
    let mut description: Option<String> = None;
    for entry in yaml.lines().map(str::trim) {
        if let Some(raw) = entry.strip_prefix("title:") {
            title = scalar(raw).or(title);
        } else if let Some(raw) = entry.strip_prefix("description:") {
            description = scalar(raw).or(description);
        }
    }

    Some(FileMetadata::new(title, description))
}
/// Extracts metadata from a Python module docstring.
///
/// Walks the root's children in order. `comment` nodes and expression
/// statements that are not string literals are skipped; the first expression
/// statement whose first child is a `string` or `concatenated_string` with
/// non-empty text is split into title and description. Any other node kind
/// stops the search.
#[cfg(feature = "lang-python")]
fn extract_python_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();

    for child in root.children(&mut cursor) {
        match child.kind() {
            "comment" => {}
            "expression_statement" => {
                let literal = child
                    .child(0)
                    .filter(|node| matches!(node.kind(), "string" | "concatenated_string"));
                if let Some(node) = literal {
                    let raw = node_text(node, source);
                    let unquoted = strip_string_quotes(&raw);
                    let body = unquoted.trim();
                    // An empty string literal is ignored and the scan goes on.
                    if !body.is_empty() {
                        return split_docstring(body);
                    }
                }
            }
            _ => break,
        }
    }

    FileMetadata::default()
}
/// Extracts metadata from Rust inner doc comments at the top of a file.
///
/// Consecutive leading `//!` line comments are collected. A `/*! ... */` block
/// comment with non-empty content is used immediately instead. The scan stops
/// at the first line comment without the `//!` prefix, after any block
/// comment, and at any non-comment node.
#[cfg(feature = "lang-rust")]
fn extract_rust_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut inner_docs: Vec<String> = Vec::new();

    for child in root.children(&mut cursor) {
        match child.kind() {
            "line_comment" => {
                let text = node_text(child, source);
                let Some(doc) = text.strip_prefix("//!") else {
                    break;
                };
                inner_docs.push(doc.trim().to_string());
            }
            "block_comment" => {
                let text = node_text(child, source);
                if let Some(after_open) = text.strip_prefix("/*!") {
                    // Without a closing `*/` the whole comment text is cleaned instead.
                    let content = after_open.strip_suffix("*/").unwrap_or(&text);
                    let cleaned = clean_block_comment(content);
                    if !cleaned.is_empty() {
                        return split_docstring(&cleaned);
                    }
                }
                break;
            }
            _ => break,
        }
    }

    if inner_docs.is_empty() {
        FileMetadata::default()
    } else {
        split_docstring(&inner_docs.join("\n"))
    }
}
/// Extracts metadata from the first JSDoc block (`/** ... */`) at the top of a
/// JavaScript or TypeScript file.
///
/// Other comments and a `hash_bang_line` node are skipped; any other node ends
/// the search. Comments opening with `/***` are not treated as JSDoc.
///
/// The function is referenced by the `javascript`, `typescript` and `tsx`
/// dispatch arms, so it must be compiled when either language feature is on.
#[cfg(any(feature = "lang-javascript", feature = "lang-typescript"))]
fn extract_js_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();

    for child in root.children(&mut cursor) {
        match child.kind() {
            "comment" => {
                let text = node_text(child, source);
                if !text.starts_with("/**") || text.starts_with("/***") {
                    continue;
                }
                // Without a closing `*/` the whole comment text is cleaned instead.
                let content = text
                    .strip_prefix("/**")
                    .and_then(|rest| rest.strip_suffix("*/"))
                    .unwrap_or(&text);
                let cleaned = clean_jsdoc_comment(content);
                if !cleaned.is_empty() {
                    return split_docstring(&cleaned);
                }
            }
            "hash_bang_line" => {}
            _ => break,
        }
    }

    FileMetadata::default()
}
/// Extracts metadata from the comments that precede a Go file's first
/// non-comment node (normally the package clause).
///
/// `//` comments are collected line by line, except tool directives
/// (`//go:...` and `+build` constraints), which are not documentation. A
/// `/* ... */` comment with non-empty content is used immediately instead.
#[cfg(feature = "lang-go")]
fn extract_go_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    /// Reports whether the text following `//` is a tool directive.
    fn is_directive(after_slashes: &str) -> bool {
        // `//go:build` style directives have no space after the slashes;
        // legacy build constraints are written `// +build`.
        after_slashes.starts_with("go:") || after_slashes.trim_start().starts_with("+build")
    }

    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut doc_lines: Vec<String> = Vec::new();

    for child in root.children(&mut cursor) {
        // The package clause, like any other non-comment node, ends the scan.
        if child.kind() != "comment" {
            break;
        }
        let text = node_text(child, source);
        if let Some(rest) = text.strip_prefix("//") {
            if !is_directive(rest) {
                doc_lines.push(rest.trim().to_string());
            }
        } else if text.starts_with("/*") {
            // Without a closing `*/` the whole comment text is cleaned instead.
            let content = text
                .strip_prefix("/*")
                .and_then(|rest| rest.strip_suffix("*/"))
                .unwrap_or(&text);
            let cleaned = clean_block_comment(content);
            if !cleaned.is_empty() {
                return split_docstring(&cleaned);
            }
        }
    }

    if doc_lines.is_empty() {
        return FileMetadata::default();
    }
    split_docstring(&doc_lines.join("\n"))
}
/// Extracts metadata from the first Javadoc block (`/** ... */`) at the top of
/// a Java file.
///
/// Only the leading run of `block_comment` and `line_comment` nodes is
/// examined. Line comments, block comments that are not Javadoc (including
/// ones opening with `/***`), and Javadoc blocks that clean to nothing are
/// skipped.
#[cfg(feature = "lang-java")]
fn extract_java_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();

    let found = root
        .children(&mut cursor)
        .take_while(|node| matches!(node.kind(), "block_comment" | "line_comment"))
        .filter(|node| node.kind() == "block_comment")
        .find_map(|node| {
            let text = node_text(node, source);
            if !text.starts_with("/**") || text.starts_with("/***") {
                return None;
            }
            // Without a closing `*/` the whole comment text is cleaned instead.
            let content = text
                .strip_prefix("/**")
                .and_then(|rest| rest.strip_suffix("*/"))
                .unwrap_or(&text);
            let cleaned = clean_jsdoc_comment(content);
            if cleaned.is_empty() {
                None
            } else {
                Some(split_docstring(&cleaned))
            }
        });

    found.unwrap_or_default()
}
/// Extracts metadata from the comments leading a C or C++ file.
///
/// A `/* ... */` comment with non-empty content that is not judged to be a
/// bare copyright notice is used immediately. `//` comments are collected line
/// by line, skipping lines that mention "copyright". The first non-comment
/// node ends the scan.
///
/// The function is referenced by both the `c` and the `cpp` dispatch arms, so
/// it must be compiled when either language feature is on.
#[cfg(any(feature = "lang-c", feature = "lang-cpp"))]
fn extract_c_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut doc_lines: Vec<String> = Vec::new();

    for child in root.children(&mut cursor) {
        if child.kind() != "comment" {
            break;
        }
        let text = node_text(child, source);
        if text.starts_with("/*") {
            // Without a closing `*/` the whole comment text is cleaned instead.
            let content = text
                .strip_prefix("/*")
                .and_then(|rest| rest.strip_suffix("*/"))
                .unwrap_or(&text);
            let cleaned = clean_block_comment(content);
            if !cleaned.is_empty() && !is_copyright_only(&cleaned) {
                return split_docstring(&cleaned);
            }
        } else if let Some(rest) = text.strip_prefix("//") {
            let line = rest.trim();
            if !line.to_lowercase().contains("copyright") {
                doc_lines.push(line.to_string());
            }
        }
    }

    if doc_lines.is_empty() {
        return FileMetadata::default();
    }
    split_docstring(&doc_lines.join("\n"))
}
/// Extracts metadata from the `#` comments leading a Ruby file.
///
/// Comment lines are collected until the first non-comment node. The shebang
/// line and interpreter magic comments (`frozen_string_literal:`, `encoding:`
/// / `coding:`, `warn_indent:`, `shareable_constant_value:`) are skipped so
/// they never end up as the title.
#[cfg(feature = "lang-ruby")]
fn extract_ruby_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    /// Magic-comment keys, besides `encoding`, that configure the interpreter.
    const MAGIC_KEYS: [&str; 4] = [
        "frozen_string_literal",
        "coding",
        "warn_indent",
        "shareable_constant_value",
    ];

    /// Reports whether a comment body (text after `#`) is a magic comment.
    fn is_magic_comment(line: &str) -> bool {
        let lower = line.to_lowercase();
        // Tolerate the Emacs-style wrapper, e.g. `-*- coding: utf-8 -*-`.
        let directive = lower.trim_start_matches("-*-").trim_start();
        lower.contains("encoding:")
            || MAGIC_KEYS.iter().any(|key| {
                directive
                    .strip_prefix(*key)
                    .is_some_and(|rest| rest.trim_start().starts_with(':'))
            })
    }

    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut doc_lines: Vec<String> = Vec::new();

    for child in root.children(&mut cursor) {
        if child.kind() != "comment" {
            break;
        }
        let text = node_text(child, source);
        let line = text.strip_prefix('#').unwrap_or(&text).trim();
        // After stripping `#`, a shebang line starts with `!`.
        let is_shebang = line.starts_with('!');
        if !is_shebang && !is_magic_comment(line) {
            doc_lines.push(line.to_string());
        }
    }

    if doc_lines.is_empty() {
        return FileMetadata::default();
    }
    split_docstring(&doc_lines.join("\n"))
}
/// Extracts metadata from `///` documentation comments at the top of a C# file.
///
/// `comment` nodes starting with `///` are collected; other comments and
/// `using_directive` nodes are skipped, and any other node ends the scan. When
/// the collected text contains a `<summary>` element, only its content is
/// used; otherwise the collected text is used as is.
#[cfg(feature = "lang-csharp")]
fn extract_csharp_metadata(tree: &Tree, source: &[u8]) -> FileMetadata {
    let root = tree.root_node();
    let mut cursor = root.walk();
    let mut xml_doc: Vec<String> = Vec::new();

    for child in root.children(&mut cursor) {
        match child.kind() {
            "comment" => {
                let text = node_text(child, source);
                if let Some(doc) = text.strip_prefix("///") {
                    xml_doc.push(doc.trim().to_string());
                }
            }
            "using_directive" => {}
            _ => break,
        }
    }

    if xml_doc.is_empty() {
        return FileMetadata::default();
    }

    let combined = xml_doc.join("\n");
    let docstring = extract_xml_summary(&combined).unwrap_or(combined);
    split_docstring(&docstring)
}
#[cfg(feature = "lang-csharp")]
/// Returns the text between the first `<summary>` tag and the first
/// `</summary>` tag that follows it, flattened onto a single line.
///
/// Each line of the content is trimmed, blank lines are dropped, and the rest
/// is joined with single spaces. Returns `None` when either tag is missing or
/// when nothing but whitespace lies between them.
fn extract_xml_summary(text: &str) -> Option<String> {
    const OPEN_TAG: &str = "<summary>";
    const CLOSE_TAG: &str = "</summary>";

    // Look for the closing tag only after the opening tag, so a stray
    // `</summary>` earlier in the text cannot hide a valid summary.
    let (_, after_open) = text.split_once(OPEN_TAG)?;
    let (inner, _) = after_open.split_once(CLOSE_TAG)?;

    let cleaned = inner
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join(" ");

    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
/// Splits documentation text into a title and a description.
///
/// The title is the first line that is neither blank nor a decoration line
/// (for example a `=====` rule). The description is the first paragraph after
/// the title: blank and decoration lines are skipped, then lines are taken
/// until the next blank or decoration line and joined with single spaces.
/// Every line is trimmed first, so indentation never leaks into the result.
/// Text with no usable line yields empty metadata.
fn split_docstring(text: &str) -> FileMetadata {
    /// A trimmed line that carries no content: blank, or a decoration rule.
    fn is_filler(line: &&str) -> bool {
        line.is_empty() || is_decoration_line(line)
    }

    let mut lines = text.lines().map(str::trim).skip_while(is_filler);
    let Some(title) = lines.next() else {
        return FileMetadata::default();
    };

    let paragraph: Vec<&str> = lines
        .skip_while(is_filler)
        .take_while(|line| !is_filler(line))
        .collect();
    let description = if paragraph.is_empty() {
        None
    } else {
        Some(paragraph.join(" "))
    };

    FileMetadata::new(Some(title.to_string()), description)
}
/// Reports whether `line` is a rule made of one repeated punctuation character.
///
/// The line must be at least three bytes long and consist solely of a single
/// character drawn from `~ = - * # ^ +`.
fn is_decoration_line(line: &str) -> bool {
    const RULE_CHARS: [char; 7] = ['~', '=', '-', '*', '#', '^', '+'];

    match line.chars().next() {
        Some(first) if line.len() >= 3 && RULE_CHARS.contains(&first) => {
            line.chars().all(|ch| ch == first)
        }
        _ => false,
    }
}
/// Strips block-comment framing from every line of `text`.
///
/// Each line is trimmed, loses one leading `*` if present, and is trimmed
/// again. Lines that end up empty are dropped; the rest are joined with `\n`.
fn clean_block_comment(text: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for raw in text.lines() {
        let line = raw.trim();
        let line = line.strip_prefix('*').unwrap_or(line).trim();
        if !line.is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}
/// Strips JSDoc/Javadoc framing and tag lines from `text`.
///
/// Each line is trimmed, loses one leading `*` if present, and is trimmed
/// again. Empty lines are dropped. Lines starting with `@` are dropped too,
/// except that the text following an `@description` or `@fileoverview` prefix
/// is kept when it is non-empty. Surviving lines are joined with `\n`.
fn clean_jsdoc_comment(text: &str) -> String {
    text.lines()
        .filter_map(|raw| {
            let line = raw.trim();
            let line = line.strip_prefix('*').unwrap_or(line).trim();

            if !line.starts_with('@') {
                return if line.is_empty() { None } else { Some(line) };
            }

            // Tag line: keep only the payload of the two descriptive tags.
            let payload = line
                .strip_prefix("@description")
                .or_else(|| line.strip_prefix("@fileoverview"))?
                .trim();
            if payload.is_empty() {
                None
            } else {
                Some(payload)
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
/// Heuristically reports whether `text` is just a copyright notice.
///
/// True when the text has at most five lines, mentions "copyright" (case
/// insensitive), and mentions none of "description", "purpose" or "overview".
fn is_copyright_only(text: &str) -> bool {
    const DOC_HINTS: [&str; 3] = ["description", "purpose", "overview"];

    if text.lines().count() > 5 {
        return false;
    }
    let lower = text.to_lowercase();
    lower.contains("copyright") && !DOC_HINTS.iter().any(|hint| lower.contains(hint))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A sentence ends at a dot followed by whitespace; dots inside tokens do not count.
    #[test]
    fn test_truncate_to_sentence() {
        let cases = [
            ("Hello world. This is more.", "Hello world."),
            ("No period here", "No period here"),
            (
                "File v1.2.3 description. More info.",
                "File v1.2.3 description.",
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(truncate_to_sentence(input), expected);
        }
    }

    /// Only the first line survives.
    #[test]
    fn test_truncate_to_line() {
        let cases = [
            ("First line\nSecond line", "First line"),
            ("Single line", "Single line"),
        ];
        for (input, expected) in cases {
            assert_eq!(truncate_to_line(input), expected);
        }
    }

    /// First line becomes the title, the following paragraph the description.
    #[test]
    fn test_split_docstring() {
        let with_body = split_docstring("Title here\n\nDescription follows.");
        assert_eq!(with_body.title.as_deref(), Some("Title here"));
        assert_eq!(with_body.description.as_deref(), Some("Description follows."));

        let title_only = split_docstring("Just a title");
        assert_eq!(title_only.title.as_deref(), Some("Just a title"));
        assert!(title_only.description.is_none());
    }

    /// Leading `*` markers and indentation are removed from every line.
    #[test]
    fn test_clean_block_comment() {
        let cleaned = clean_block_comment("* First line\n * Second line\n * Third");
        assert_eq!(cleaned, "First line\nSecond line\nThird");
    }

    /// Front matter keys are read from between the `---` fences.
    #[cfg(feature = "lang-markdown")]
    #[test]
    fn test_yaml_frontmatter() {
        let doc = "---\ntitle: My Title\ndescription: My description\n---\n# Heading";
        let meta = extract_yaml_frontmatter(doc).unwrap();
        assert_eq!(meta.title.as_deref(), Some("My Title"));
        assert_eq!(meta.description.as_deref(), Some("My description"));
    }

    /// Without front matter the first heading and first paragraph are used.
    #[cfg(feature = "lang-markdown")]
    #[test]
    fn test_markdown_heading_fallback() {
        let meta = extract_markdown_metadata(b"# Main Title\n\nFirst paragraph here.");
        assert_eq!(meta.title.as_deref(), Some("Main Title"));
        assert_eq!(meta.description.as_deref(), Some("First paragraph here."));
    }
}