use std::env;
use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result};
use futures::stream::{self, StreamExt};
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use tokio::sync::mpsc;
use tokio::time::sleep;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{debug, info, warn};
use crate::llm::OllamaClient;
use crate::setup::config::load_config;
/// Text extracted from one manpage, before an embedding vector has been attached.
#[derive(Debug, Clone)]
pub struct ManpageContent {
/// Tool name, as parsed from the manpage filename.
pub tool_name: String,
/// Manual section, as parsed from the manpage filename.
pub section: String,
/// Condensed summary text built from the page's NAME and DESCRIPTION sections.
pub description: String,
}
/// A manpage summary together with the embedding vector generated for it.
#[derive(Debug, Clone)]
pub struct ManpageEntry {
/// Tool name copied from the originating `ManpageContent`.
pub tool_name: String,
/// Manual section copied from the originating `ManpageContent`.
pub section: String,
/// Summary text that was sent to the embedding model.
pub description: String,
/// Embedding returned by the client for `description`.
pub vector: Vec<f32>,
}
/// Generates embedding vectors for manpage summaries through an `OllamaClient`.
#[derive(Debug)]
pub struct EmbeddingGenerator {
// Client used for every embedding request.
client: OllamaClient,
// Model name passed to the client with each embedding request.
model: String,
}
impl EmbeddingGenerator {
/// Builds a generator from the application configuration.
///
/// The client is created from the configured URL and timeouts, and the embedding
/// model name is taken from the configuration's `models.embedding_model`.
///
/// # Errors
/// Returns an error if the configuration cannot be loaded or if
/// `OllamaClient::with_config` fails.
pub fn new() -> Result<Self> {
    let cfg = load_config().context("Failed to load config")?;
    let model = cfg.models.embedding_model.clone();
    let client = OllamaClient::with_config(
        cfg.ollama_url(),
        cfg.generate_timeout_secs(),
        cfg.embedding_timeout_secs(),
    )?;
    Ok(Self { client, model })
}
/// Builds a generator around an already-constructed client and an explicit model name.
#[must_use]
pub fn with_client(client: OllamaClient, model: &str) -> Self {
    let model = model.to_owned();
    Self { client, model }
}
/// Generates embeddings for every item in `contents`, running up to 10 requests at once.
///
/// Each item is first attempted through `generate_single`. Items that fail are collected
/// and then retried one at a time through `generate_with_retry`. Items whose final retry
/// also fails are logged and left out, so the returned vector may be shorter than
/// `contents`. Results are gathered with `buffer_unordered`, so the output order is not
/// tied to the input order.
///
/// # Errors
/// Returns an error only if the progress bar template is rejected.
pub async fn generate_embeddings(
&self,
contents: Vec<ManpageContent>,
) -> Result<Vec<ManpageEntry>> {
let total = contents.len();
// Maximum number of embedding requests in flight during the first pass.
let concurrency = 10;
info!(
total = total,
concurrency = concurrency,
"Starting parallel embedding generation"
);
let pb = ProgressBar::new(total as u64);
let style = ProgressStyle::default_bar()
.template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
.context("Invalid progress bar template")?
.progress_chars("#>-");
pb.set_style(style);
// Shared counter of finished items (successful or not); drives the progress bar position.
let completed = Arc::new(AtomicUsize::new(0));
// Items whose first pass failed, kept with their original index for the retry pass.
let failed_items = Arc::new(tokio::sync::Mutex::new(Vec::new()));
let results: Vec<Option<ManpageEntry>> = stream::iter(contents.into_iter().enumerate())
.map(|(idx, content)| {
// Each future gets its own handles because it is an `async move` block.
let client = self.client.clone();
let model = self.model.clone();
let pb = pb.clone();
let completed = Arc::clone(&completed);
let failed_items = Arc::clone(&failed_items);
async move {
let result = Self::generate_single(&client, &model, &content).await;
// `fetch_add` returns the previous value, hence the `+ 1`.
let count = completed.fetch_add(1, Ordering::SeqCst) + 1;
pb.set_position(count as u64);
match result {
Ok(entry) => Some(entry),
Err(e) => {
warn!(idx = idx, error = %e, "Failed to generate embedding, will retry");
failed_items.lock().await.push((idx, content));
None
}
}
}
})
.buffer_unordered(concurrency)
.collect()
.await;
pb.finish_with_message("Initial pass complete");
// Drop the `None` placeholders left by failed items.
let mut entries: Vec<ManpageEntry> = results.into_iter().flatten().collect();
let failed = failed_items.lock().await;
if !failed.is_empty() {
info!(
count = failed.len(),
"Retrying failed embeddings sequentially"
);
println!("\nRetrying {} failed embeddings...", failed.len());
// Second pass: one request at a time, using the slower backoff-based retry.
for (idx, content) in failed.iter() {
match self.generate_with_retry(&content.description).await {
Ok(vector) => {
entries.push(ManpageEntry {
tool_name: content.tool_name.clone(),
section: content.section.clone(),
description: content.description.clone(),
vector,
});
}
Err(e) => {
// The item is given up on; it will be missing from the result.
warn!(idx = idx, tool = %content.tool_name, error = %e, "Final retry failed");
}
}
}
}
info!(count = entries.len(), "Embedding generation complete");
Ok(entries)
}
async fn generate_single(
client: &OllamaClient,
model: &str,
content: &ManpageContent,
) -> Result<ManpageEntry> {
let max_attempts = 2;
for attempt in 1..=max_attempts {
match client.generate_embedding(model, &content.description).await {
Ok(vector) => {
return Ok(ManpageEntry {
tool_name: content.tool_name.clone(),
section: content.section.clone(),
description: content.description.clone(),
vector,
});
}
Err(e) if attempt < max_attempts => {
sleep(Duration::from_millis(500)).await;
debug!(attempt = attempt, error = %e, "Quick retry");
}
Err(e) => return Err(e),
}
}
unreachable!()
}
/// Requests an embedding for `text`, retrying with exponential backoff.
///
/// Up to three attempts are made. After a failed attempt `n` (1-based) that is not
/// the last one, the function waits `2^n` seconds (2 s, then 4 s) before trying again.
///
/// # Errors
/// Returns the last client error, wrapped with context, once all attempts have failed.
async fn generate_with_retry(&self, text: &str) -> Result<Vec<f32>> {
    const MAX_ATTEMPTS: u32 = 3;
    let mut attempt: u32 = 1;
    loop {
        let failure = match self.client.generate_embedding(&self.model, text).await {
            Ok(vector) => return Ok(vector),
            Err(err) => err,
        };
        if attempt >= MAX_ATTEMPTS {
            return Err(failure).context("Embedding generation failed after 3 attempts");
        }
        let delay = Duration::from_secs(2_u64.pow(attempt));
        warn!(
            attempt = attempt,
            delay_secs = delay.as_secs(),
            error = %failure,
            "Embedding generation failed, retrying"
        );
        sleep(delay).await;
        attempt += 1;
    }
}
#[allow(clippy::too_many_lines)]
pub async fn generate_embeddings_pipelined(
&self,
paths: Vec<PathBuf>,
) -> Result<Vec<ManpageEntry>> {
let total = paths.len();
let concurrency = 10;
let channel_size = 100;
info!(
total = total,
concurrency = concurrency,
"Starting pipelined extraction and embedding"
);
let mp = MultiProgress::new();
let bar_style = ProgressStyle::default_bar()
.template("{prefix:.bold.cyan} [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
.context("Invalid progress bar template")?
.progress_chars("█▓░");
let extract_pb = mp.add(ProgressBar::new(total as u64));
extract_pb.set_style(bar_style.clone());
extract_pb.set_prefix("Extract");
let embed_pb = mp.add(ProgressBar::new(total as u64));
embed_pb.set_style(bar_style);
embed_pb.set_prefix("Embed ");
let (tx, rx) = mpsc::channel::<ManpageContent>(channel_size);
let extract_handle = tokio::spawn(async move {
let mut extracted = 0;
let mut errors = 0;
for path in paths {
match ManpageScanner::extract_content(&path) {
Ok(content) => {
if tx.send(content).await.is_err() {
break; }
extracted += 1;
}
Err(e) => {
errors += 1;
debug!(path = ?path, error = %e, "Failed to extract content");
}
}
extract_pb.inc(1);
}
extract_pb.finish();
(extracted, errors)
});
let client = self.client.clone();
let model = self.model.clone();
let failed_items = Arc::new(tokio::sync::Mutex::new(Vec::new()));
let rx_stream = ReceiverStream::new(rx);
let results: Vec<Option<ManpageEntry>> = rx_stream
.map(|content| {
let client = client.clone();
let model = model.clone();
let embed_pb = embed_pb.clone();
let failed_items = Arc::clone(&failed_items);
async move {
let result = Self::generate_single(&client, &model, &content).await;
embed_pb.inc(1);
match result {
Ok(entry) => Some(entry),
Err(e) => {
warn!(tool = %content.tool_name, error = %e, "Failed to embed");
failed_items.lock().await.push(content);
None
}
}
}
})
.buffer_unordered(concurrency)
.collect()
.await;
embed_pb.finish();
let (extracted, errors) = extract_handle.await.context("Extraction task failed")?;
if errors > 0 {
println!(" (Skipped {errors} malformed manpages)");
}
info!(
extracted = extracted,
errors = errors,
"Extraction complete"
);
let mut entries: Vec<ManpageEntry> = results.into_iter().flatten().collect();
let failed = failed_items.lock().await;
if !failed.is_empty() {
info!(count = failed.len(), "Retrying failed embeddings");
println!("\nRetrying {} failed embeddings...", failed.len());
for content in failed.iter() {
if let Ok(vector) = self.generate_with_retry(&content.description).await {
entries.push(ManpageEntry {
tool_name: content.tool_name.clone(),
section: content.section.clone(),
description: content.description.clone(),
vector,
});
}
}
}
info!(count = entries.len(), "Pipelined processing complete");
Ok(entries)
}
}
/// Base manpage directories that a default `ManpageScanner` always starts with.
const DEFAULT_PATHS: &[&str] = &[
    "/usr/share/man",
    "/usr/local/share/man",
    "/opt/homebrew/share/man",
];

/// Section subdirectories looked up under each base directory.
const SECTIONS: &[&str] = &["man1", "man8"];
/// Finds manpage files under a list of base directories and extracts their content.
#[derive(Debug)]
pub struct ManpageScanner {
// Base directories whose section subdirectories are scanned.
paths: Vec<PathBuf>,
}
impl ManpageScanner {
/// Creates a scanner over the default directories plus any listed in `MANPATH`.
///
/// `MANPATH` is read as an OS string and split with [`env::split_paths`], so the
/// platform's separator is used and entries that are not valid UTF-8 are kept.
/// Empty entries and entries already in the list are skipped; the rest are appended
/// after the defaults in the order they appear.
#[must_use]
pub fn new() -> Self {
    let mut paths: Vec<PathBuf> = DEFAULT_PATHS.iter().map(PathBuf::from).collect();
    if let Some(manpath) = env::var_os("MANPATH") {
        for candidate in env::split_paths(&manpath) {
            // Leading, trailing or doubled separators produce empty entries.
            if candidate.as_os_str().is_empty() || paths.contains(&candidate) {
                continue;
            }
            paths.push(candidate);
        }
    }
    debug!(?paths, "Initialized manpage scanner");
    Self { paths }
}
/// Creates a scanner that uses exactly the given base directories.
///
/// The default directories and `MANPATH` are not consulted.
#[must_use]
pub const fn with_paths(paths: Vec<PathBuf>) -> Self {
Self { paths }
}
/// Collects manpage files from every configured base directory.
///
/// For each base directory that exists, each subdirectory named in `SECTIONS` is
/// scanned with `scan_section`. Missing directories are skipped with a debug log.
/// A section that cannot be read is logged as a warning and skipped, so one bad
/// directory does not abort the scan.
///
/// # Errors
/// The current implementation never returns `Err`; the `Result` is kept for
/// interface stability.
pub fn scan_directories(&self) -> Result<Vec<PathBuf>> {
    let mut manpages = Vec::new();
    for base_path in &self.paths {
        if !base_path.exists() {
            debug!(?base_path, "Manpage directory does not exist, skipping");
            continue;
        }
        for section in SECTIONS {
            let section_path = base_path.join(section);
            if !section_path.exists() {
                debug!(?section_path, "Section directory does not exist, skipping");
                continue;
            }
            match Self::scan_section(&section_path) {
                Ok(pages) => {
                    debug!(
                        path = ?section_path,
                        count = pages.len(),
                        "Scanned section"
                    );
                    manpages.extend(pages);
                }
                Err(e) => {
                    warn!(path = ?section_path, error = %e, "Failed to scan section");
                }
            }
        }
    }
    info!(count = manpages.len(), "Total manpages found");
    Ok(manpages)
}
/// Lists the entries of one section directory that `is_manpage_file` accepts.
///
/// # Errors
/// Returns an error if the directory cannot be opened or if any entry cannot be read.
fn scan_section(section_path: &Path) -> Result<Vec<PathBuf>> {
    let dir_iter = fs::read_dir(section_path)
        .with_context(|| format!("Failed to read directory: {}", section_path.display()))?;
    let mut found = Vec::new();
    for dir_entry in dir_iter {
        let candidate = dir_entry
            .with_context(|| format!("Failed to read entry in: {}", section_path.display()))?
            .path();
        if Self::is_manpage_file(&candidate) {
            found.push(candidate);
        }
    }
    Ok(found)
}
/// Reports whether `path` names a section 1 or section 8 manpage, plain or gzip-compressed.
///
/// Only the final path component is inspected. A path with no file name, or with a
/// file name that is not valid UTF-8, yields `false`.
fn is_manpage_file(path: &Path) -> bool {
    const SUFFIXES: [&str; 4] = [".1", ".8", ".1.gz", ".8.gz"];
    path.file_name()
        .and_then(|os_name| os_name.to_str())
        .map_or(false, |file_name| {
            SUFFIXES.iter().any(|&suffix| file_name.ends_with(suffix))
        })
}
/// Returns the base directories this scanner is configured with.
#[must_use]
pub fn paths(&self) -> &[PathBuf] {
&self.paths
}
/// Renders the manpage identified by `path` with `man` and condenses it into a summary.
///
/// The tool name and section are parsed from the file name, then `man -P cat <section>
/// <name>` is run so the page is printed without a pager. Passing the section makes sure
/// the rendered page comes from the same section as the file, even when the name exists
/// in several sections. The output is reduced to a short description by
/// `parse_manpage_content`.
///
/// This call blocks until the `man` child process exits.
///
/// # Errors
/// Returns an error if the file name cannot be parsed, if `man` cannot be started or
/// exits unsuccessfully, or if its output is not valid UTF-8.
pub fn extract_content(path: &Path) -> Result<ManpageContent> {
    let (tool_name, section) = Self::parse_filename(path)?;
    debug!(tool = %tool_name, section = %section, "Extracting manpage content");
    let output = Command::new("man")
        .args(["-P", "cat", section.as_str(), tool_name.as_str()])
        .output()
        .with_context(|| format!("Failed to execute man command for '{tool_name}'"))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("man command failed for '{}': {}", tool_name, stderr.trim());
    }
    let content = String::from_utf8(output.stdout)
        .with_context(|| format!("Manpage '{tool_name}' contains invalid UTF-8"))?;
    let description = Self::parse_manpage_content(&content, &tool_name);
    Ok(ManpageContent {
        tool_name,
        section,
        description,
    })
}
/// Splits a manpage file name into `(tool_name, section)`.
///
/// A trailing `.gz` is ignored. The section is everything after the last remaining
/// `.`, so `ls.1` gives `("ls", "1")` and `mount.8.gz` gives `("mount", "8")`.
/// Multi-character sections such as `1p` are returned whole. When the name has no
/// extension, or the extension is empty, the section defaults to `"1"`.
///
/// # Errors
/// Returns an error if `path` has no final component or it is not valid UTF-8.
fn parse_filename(path: &Path) -> Result<(String, String)> {
    let filename = path
        .file_name()
        .and_then(|n| n.to_str())
        .context("Invalid manpage filename")?;
    let stem = filename.strip_suffix(".gz").unwrap_or(filename);
    let (tool_name, section) = match stem.rsplit_once('.') {
        Some((name, ext)) if !ext.is_empty() => (name, ext),
        // Trailing dot: keep the name, fall back to the default section.
        Some((name, _)) => (name, "1"),
        // No extension at all: the whole stem is the tool name.
        None => (stem, "1"),
    };
    Ok((tool_name.to_string(), section.to_string()))
}
/// Builds a short summary of a rendered manpage.
///
/// The summary starts with the first line of the NAME section, or with `tool_name` when
/// that section is missing or blank. If the DESCRIPTION section yields a first paragraph,
/// it is appended after a `" - "` separator. A summary longer than 500 bytes is cut at
/// the nearest character boundary at or below 500 and suffixed with `"..."`.
fn parse_manpage_content(content: &str, tool_name: &str) -> String {
    const MAX_LEN: usize = 500;

    let mut summary = Self::extract_section(content, "NAME")
        .and_then(|name_text| name_text.lines().next().map(|line| line.trim().to_string()))
        .filter(|headline| !headline.is_empty())
        .unwrap_or_else(|| tool_name.to_string());

    if let Some(desc_text) = Self::extract_section(content, "DESCRIPTION") {
        let paragraph = Self::extract_first_paragraph(&desc_text);
        if !paragraph.is_empty() {
            // The summary can only be empty here when `tool_name` itself is empty.
            if !summary.is_empty() {
                summary.push_str(" - ");
            }
            summary.push_str(&paragraph);
        }
    }

    if summary.len() > MAX_LEN {
        // Walk back from the limit to a boundary so a multi-byte character is never split.
        let cut = (0..=MAX_LEN)
            .rev()
            .find(|&idx| summary.is_char_boundary(idx))
            .unwrap_or(0);
        summary.truncate(cut);
        summary.push_str("...");
    }
    summary
}
/// Returns the text of the named section, joined into a single space-separated line.
///
/// A section starts at a line whose trimmed text equals `section_name` or its uppercase
/// form. It ends at the next non-empty line that is longer than two bytes and consists
/// only of uppercase letters and whitespace. Blank lines inside the section are dropped.
/// Returns `None` when the section is absent or has no text.
fn extract_section(content: &str, section_name: &str) -> Option<String> {
    let upper_name = section_name.to_uppercase();
    let is_target = |line: &str| line == section_name || line == upper_name;
    let looks_like_heading = |line: &str| {
        line.len() > 2 && line.chars().all(|c| c.is_uppercase() || c.is_whitespace())
    };

    let mut pieces: Vec<&str> = Vec::new();
    let mut inside = false;
    for raw_line in content.lines() {
        let line = raw_line.trim();
        // The target heading is checked first so a repeated heading never ends the section.
        if is_target(line) {
            inside = true;
            continue;
        }
        if !inside || line.is_empty() {
            continue;
        }
        if looks_like_heading(line) {
            break;
        }
        pieces.push(line);
    }

    if pieces.is_empty() {
        None
    } else {
        Some(pieces.join(" "))
    }
}
/// Returns the first paragraph of `text` as a single space-separated line.
///
/// Leading blank lines are skipped and the paragraph ends at the first blank line after
/// it. Each line is trimmed before being joined. Collection also stops as soon as the
/// accumulated text is longer than 400 bytes, so the result may slightly exceed that.
fn extract_first_paragraph(text: &str) -> String {
    let paragraph_lines = text
        .lines()
        .map(str::trim)
        .skip_while(|line| line.is_empty())
        .take_while(|line| !line.is_empty());

    let mut paragraph = String::new();
    for line in paragraph_lines {
        if !paragraph.is_empty() {
            paragraph.push(' ');
        }
        paragraph.push_str(line);
        if paragraph.len() > 400 {
            break;
        }
    }
    paragraph
}
}
/// The default scanner is the one produced by `ManpageScanner::new`.
impl Default for ManpageScanner {
fn default() -> Self {
Self::new()
}
}
// Unit tests for `ManpageScanner`: directory scanning, file-name filtering and text parsing.
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use tempfile::TempDir;
// Creates `man1` and `man8` under the temp dir with four manpage-like files
// plus one unrelated `readme.txt`, and returns the base directory.
fn create_test_structure(temp_dir: &TempDir) -> PathBuf {
let base = temp_dir.path().to_path_buf();
let man1 = base.join("man1");
fs::create_dir_all(&man1).unwrap();
File::create(man1.join("ls.1")).unwrap();
File::create(man1.join("cat.1.gz")).unwrap();
File::create(man1.join("readme.txt")).unwrap();
let man8 = base.join("man8");
fs::create_dir_all(&man8).unwrap();
File::create(man8.join("mount.8")).unwrap();
File::create(man8.join("fsck.8.gz")).unwrap();
base
}
// A default scanner always has at least the built-in base directories.
#[test]
fn test_scanner_creation() {
let scanner = ManpageScanner::new();
assert!(!scanner.paths().is_empty());
}
// `with_paths` stores exactly the directories it was given.
#[test]
fn test_scanner_with_custom_paths() {
let paths = vec![PathBuf::from("/custom/path")];
let scanner = ManpageScanner::with_paths(paths.clone());
assert_eq!(scanner.paths(), &paths);
}
// All four manpage files in the fixture are found.
#[test]
fn test_scan_test_directory() {
let temp_dir = TempDir::new().unwrap();
let base = create_test_structure(&temp_dir);
let scanner = ManpageScanner::with_paths(vec![base]);
let pages = scanner.scan_directories().unwrap();
assert_eq!(pages.len(), 4);
}
// Files without a recognised manpage suffix are not returned.
#[test]
fn test_ignores_non_manpage_files() {
let temp_dir = TempDir::new().unwrap();
let base = create_test_structure(&temp_dir);
let scanner = ManpageScanner::with_paths(vec![base]);
let pages = scanner.scan_directories().unwrap();
assert!(!pages.iter().any(|p| p.to_string_lossy().contains("readme")));
}
// A base directory that does not exist yields an empty result rather than an error.
#[test]
fn test_handles_missing_directories() {
let scanner = ManpageScanner::with_paths(vec![PathBuf::from("/nonexistent/path")]);
let pages = scanner.scan_directories().unwrap();
assert!(pages.is_empty());
}
// Only `.1`, `.8` and their `.gz` variants are accepted; other sections are rejected.
#[test]
fn test_is_manpage_file() {
assert!(ManpageScanner::is_manpage_file(Path::new("ls.1")));
assert!(ManpageScanner::is_manpage_file(Path::new("cat.1.gz")));
assert!(ManpageScanner::is_manpage_file(Path::new("mount.8")));
assert!(ManpageScanner::is_manpage_file(Path::new("fsck.8.gz")));
assert!(!ManpageScanner::is_manpage_file(Path::new("readme.txt")));
assert!(!ManpageScanner::is_manpage_file(Path::new("lib.3"))); assert!(!ManpageScanner::is_manpage_file(Path::new("config.5"))); }
// Tool name and section are recovered from plain, gzip-suffixed and hyphenated names.
#[test]
fn test_parse_filename() {
let (name, section) = ManpageScanner::parse_filename(Path::new("ls.1")).unwrap();
assert_eq!(name, "ls");
assert_eq!(section, "1");
let (name, section) = ManpageScanner::parse_filename(Path::new("mount.8.gz")).unwrap();
assert_eq!(name, "mount");
assert_eq!(section, "8");
let (name, section) = ManpageScanner::parse_filename(Path::new("git-commit.1")).unwrap();
assert_eq!(name, "git-commit");
assert_eq!(section, "1");
}
// NAME and DESCRIPTION sections are each extracted from a minimal rendered page.
#[test]
fn test_extract_section() {
let content = "NAME\n ls - list directory contents\n\nDESCRIPTION\n List information about the FILEs.";
let name = ManpageScanner::extract_section(content, "NAME");
assert!(name.is_some());
assert!(name.unwrap().contains("list directory"));
let desc = ManpageScanner::extract_section(content, "DESCRIPTION");
assert!(desc.is_some());
assert!(desc.unwrap().contains("FILEs"));
}
// The combined summary contains text from the NAME line.
#[test]
fn test_parse_manpage_content() {
let content = "NAME\n ls - list directory contents\n\nDESCRIPTION\n List information about the FILEs.";
let result = ManpageScanner::parse_manpage_content(content, "ls");
assert!(result.contains("ls"));
assert!(result.contains("list"));
}
// Text after the first blank line is not part of the first paragraph.
#[test]
fn test_extract_first_paragraph() {
let text = "First paragraph line one. Line two.\n\nSecond paragraph.";
let para = ManpageScanner::extract_first_paragraph(text);
assert!(para.contains("First paragraph"));
assert!(!para.contains("Second"));
}
}