use glob_match::glob_match;
use regex::Regex;
use std::sync::LazyLock;
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ClaimType {
FilePath,
DirectoryPath,
CodeExample,
ClassName,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ClaimConfidence {
Real,
Example,
Unclear,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct Claim {
pub claim_type: ClaimType,
pub value: String,
pub doc_file: String,
pub line: usize,
pub confidence: ClaimConfidence,
#[serde(skip_serializing_if = "Option::is_none")]
pub context: Option<String>,
}
static FILE_PATH_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"`(/?(?:[\w.-]+/)+[\w.-]+\.[a-zA-Z0-9]+)`").unwrap()
});
static DIR_PATH_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"`(/?(?:[\w.-]+/)+)`").unwrap()
});
static CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?s)```(\w+)\n(.*?)```").unwrap()
});
static CLASS_NAME_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?:`)?([A-Z][a-zA-Z0-9]*(?:\\{1,2}[A-Z][a-zA-Z0-9]*)+)(?:`)?").unwrap()
});
const DOMAIN_EXTENSIONS: &[&str] = &[
".com", ".org", ".io", ".net", ".dev", ".co", ".app", ".ai", ".xyz",
];
fn is_mime_type(path: &str) -> bool {
let lower = path.to_lowercase();
lower.starts_with("application/")
|| lower.starts_with("text/")
|| lower.starts_with("image/")
|| lower.starts_with("audio/")
|| lower.starts_with("video/")
|| lower.starts_with("font/")
|| lower.starts_with("multipart/")
}
fn matches_ignore_pattern(value: &str, patterns: &[String]) -> bool {
patterns.iter().any(|pattern| glob_match(pattern, value))
}
const PLACEHOLDER_PREFIXES: &[&str] = &[
"My", "Your", "Example", "Sample", "Foo", "Bar", "Test", "Demo", "Dummy", "Fake",
];
fn is_placeholder_class(value: &str) -> bool {
value.split('\\').any(|segment| {
PLACEHOLDER_PREFIXES
.iter()
.any(|prefix| segment.starts_with(prefix))
})
}
fn line_suggests_example(line: &str) -> bool {
let lower = line.to_lowercase();
lower.contains("example")
|| lower.contains("e.g.")
|| lower.contains("e.g.,")
|| lower.contains("for instance")
|| lower.contains("sample")
|| lower.contains("such as")
|| lower.contains("this creates")
|| lower.contains("would create")
|| lower.contains("would generate")
|| lower.contains("would produce")
|| lower.contains("typically:")
}
fn is_os_path_context(line: &str, match_start: usize) -> bool {
if match_start >= 2 {
let prefix = &line[..match_start];
if prefix.contains(":\\") || prefix.contains(":/") {
return true;
}
}
let lower = line.to_lowercase();
(lower.contains("c:\\") || lower.contains("c:/"))
|| (lower.contains("users\\") || lower.contains("users/"))
|| lower.contains("program files")
|| lower.contains("%appdata%")
|| lower.contains("$home")
}
fn line_suggests_real(line: &str) -> bool {
line.contains("@see")
|| line.contains("@uses")
|| line.contains("@link")
|| line.contains("@param")
|| line.contains("@return")
|| line.contains("@throws")
}
fn classify_path_confidence(value: &str, line: &str, in_code_block: bool) -> ClaimConfidence {
if in_code_block {
return ClaimConfidence::Example;
}
if line_suggests_real(line) {
return ClaimConfidence::Real;
}
if line_suggests_example(line) {
return ClaimConfidence::Unclear;
}
let lower = value.to_lowercase();
if lower.contains("example") || lower.contains("sample") || lower.contains("your-") {
return ClaimConfidence::Unclear;
}
ClaimConfidence::Real
}
fn classify_class_confidence(value: &str, line: &str, in_code_block: bool) -> ClaimConfidence {
if is_placeholder_class(value) {
return ClaimConfidence::Example;
}
if in_code_block {
if line_suggests_real(line) {
return ClaimConfidence::Real;
}
return ClaimConfidence::Unclear;
}
if line_suggests_example(line) {
return ClaimConfidence::Unclear;
}
ClaimConfidence::Real
}
pub fn extract_claims(content: &str, doc_file: &str, ignore_patterns: &[String]) -> Vec<Claim> {
let mut claims = Vec::new();
let mut claimed_positions: Vec<(usize, usize)> = Vec::new();
let mut in_code_block = false;
for (line_idx, line) in content.lines().enumerate() {
let line_num = line_idx + 1;
if line.starts_with("```") {
in_code_block = !in_code_block;
continue;
}
if in_code_block {
continue;
}
for cap in FILE_PATH_PATTERN.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let pos = (line_idx, full_match.start());
if !claimed_positions.contains(&pos) {
let path = cap.get(1).map(|m| m.as_str()).unwrap_or("");
if path.contains("://") || path.starts_with("http") {
continue;
}
if is_domain_like(path) {
continue;
}
if is_mime_type(path) {
continue;
}
if matches_ignore_pattern(path, ignore_patterns) {
continue;
}
if path.len() < 5 {
continue;
}
let confidence = classify_path_confidence(path, line, false);
claimed_positions.push(pos);
claims.push(Claim {
claim_type: ClaimType::FilePath,
value: path.to_string(),
doc_file: doc_file.to_string(),
line: line_num,
confidence,
context: Some(line.trim().to_string()),
});
}
}
for cap in CLASS_NAME_PATTERN.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let pos = (line_idx, full_match.start());
if !claimed_positions.contains(&pos) {
let class_ref = cap.get(1).map(|m| m.as_str()).unwrap_or("");
if is_os_path_context(line, full_match.start()) {
continue;
}
let normalized = class_ref.replace("\\\\", "\\");
if matches_ignore_pattern(&normalized, ignore_patterns) {
continue;
}
let confidence = classify_class_confidence(&normalized, line, false);
claimed_positions.push(pos);
claims.push(Claim {
claim_type: ClaimType::ClassName,
value: normalized,
doc_file: doc_file.to_string(),
line: line_num,
confidence,
context: Some(line.trim().to_string()),
});
}
}
for cap in DIR_PATH_PATTERN.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let pos = (line_idx, full_match.start());
if !claimed_positions.contains(&pos) {
let path = cap.get(1).map(|m| m.as_str()).unwrap_or("");
if path == "./" || path == "../" || path.len() < 4 {
continue;
}
if matches_ignore_pattern(path, ignore_patterns) {
continue;
}
let confidence = classify_path_confidence(path, line, false);
claimed_positions.push(pos);
claims.push(Claim {
claim_type: ClaimType::DirectoryPath,
value: path.to_string(),
doc_file: doc_file.to_string(),
line: line_num,
confidence,
context: Some(line.trim().to_string()),
});
}
}
}
for cap in CODE_BLOCK_PATTERN.captures_iter(content) {
let language = cap.get(1).map(|m| m.as_str()).unwrap_or("");
let code = cap.get(2).map(|m| m.as_str()).unwrap_or("");
let block_start = cap.get(0).unwrap().start();
let line_num = content[..block_start].lines().count() + 1;
if matches!(
language,
"php" | "rust" | "js" | "javascript" | "ts" | "typescript" | "python" | "go"
) {
claims.push(Claim {
claim_type: ClaimType::CodeExample,
value: code.trim().to_string(),
doc_file: doc_file.to_string(),
line: line_num,
confidence: ClaimConfidence::Unclear,
context: Some(format!("```{} block", language)),
});
}
}
claims
}
fn is_domain_like(path: &str) -> bool {
let lower = path.to_lowercase();
DOMAIN_EXTENSIONS
.iter()
.any(|ext| lower.contains(&format!("{ext}/")) || lower.ends_with(ext))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_file_paths() {
let content = "Check the file at `/inc/Engine/AI/Tools/BaseTool.php` for details.";
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.iter().any(|c| c.claim_type == ClaimType::FilePath
&& c.value == "/inc/Engine/AI/Tools/BaseTool.php"));
}
#[test]
fn test_extract_file_paths_requires_separator() {
let content = "The `main.py` file is the entry point.";
let claims = extract_claims(content, "test.md", &[]);
assert!(!claims.iter().any(|c| c.value == "main.py"));
}
#[test]
fn test_extract_directory_paths() {
let content = "The tools are in `src/core/docs_audit/` directory.";
let claims = extract_claims(content, "test.md", &[]);
assert!(
claims
.iter()
.any(|c| c.claim_type == ClaimType::DirectoryPath
&& c.value == "src/core/docs_audit/")
);
}
#[test]
fn test_skip_domain_patterns() {
let content = "Visit `mysite.com/path/to/page.html` for documentation.";
let claims = extract_claims(content, "test.md", &[]);
assert!(!claims.iter().any(|c| c.value.contains("mysite.com")));
}
#[test]
fn test_extract_code_blocks() {
let content = r#"
Example:
```php
function test() {
return true;
}
```
"#;
let claims = extract_claims(content, "test.md", &[]);
assert!(claims
.iter()
.any(|c| c.claim_type == ClaimType::CodeExample));
}
#[test]
fn test_skip_urls() {
let content = "See https://example.com/path/to/file.html for details.";
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.is_empty() || !claims.iter().any(|c| c.value.contains("https://")));
}
#[test]
fn test_no_identifiers_or_method_signatures() {
let content = r#"
The `BaseTool` class provides base functionality.
Call `register_tool(name, handler)` to register a tool.
"#;
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.is_empty());
}
#[test]
fn test_skip_mime_types() {
let content =
"The file type is `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`.";
let claims = extract_claims(content, "test.md", &[]);
assert!(!claims.iter().any(|c| c.value.starts_with("application/")));
}
#[test]
fn test_skip_various_mime_types() {
let content = r#"
Supported types: `text/plain`, `image/png`, `audio/mpeg`, `video/mp4`.
"#;
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.is_empty());
}
#[test]
fn test_ignore_patterns_filter_rest_api() {
let content = "The endpoint is at `/wp-json/datamachine/v1/events`.";
let patterns = vec!["/wp-json/**".to_string()];
let claims = extract_claims(content, "test.md", &patterns);
assert!(!claims.iter().any(|c| c.value.contains("wp-json")));
}
#[test]
fn test_ignore_patterns_filter_api_versioned() {
let content = "Call `/api/v1/users/list.json` for the user list.";
let patterns = vec!["**/v1/**".to_string()];
let claims = extract_claims(content, "test.md", &patterns);
assert!(!claims.iter().any(|c| c.value.contains("/v1/")));
}
#[test]
fn test_ignore_patterns_filter_oauth_callback() {
let content = "OAuth redirects to `/datamachine-auth/twitter/` callback.";
let patterns = vec!["/*-auth/**".to_string()];
let claims = extract_claims(content, "test.md", &patterns);
assert!(!claims.iter().any(|c| c.value.contains("-auth/")));
}
#[test]
fn test_extract_class_names() {
let content = "**Service**: DataMachine\\Services\\ProcessedItemsManager";
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.iter().any(|c| c.claim_type == ClaimType::ClassName
&& c.value == "DataMachine\\Services\\ProcessedItemsManager"));
}
#[test]
fn test_extract_class_names_escaped_backslashes() {
let content = "The class `DataMachine\\\\Services\\\\CacheManager` handles caching.";
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.iter().any(|c| c.claim_type == ClaimType::ClassName
&& c.value == "DataMachine\\Services\\CacheManager"));
}
#[test]
fn test_skip_non_namespaced_identifiers() {
let content = "The `CacheManager` class handles caching.";
let claims = extract_claims(content, "test.md", &[]);
assert!(!claims.iter().any(|c| c.claim_type == ClaimType::ClassName));
}
#[test]
fn test_no_ignore_patterns_extracts_api_paths() {
let content = "The endpoint is at `/wp-json/datamachine/v1/events.json`.";
let claims = extract_claims(content, "test.md", &[]);
assert!(claims.iter().any(|c| c.value.contains("wp-json")));
}
#[test]
fn test_prose_file_path_is_real_confidence() {
let content = "See `src/core/config.rs` for the configuration module.";
let claims = extract_claims(content, "test.md", &[]);
let claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::FilePath)
.expect("should extract file path");
assert_eq!(claim.confidence, ClaimConfidence::Real);
}
#[test]
fn test_example_context_path_is_unclear() {
let content = "For example, `your-project/src/main.rs` would be the entry point.";
let claims = extract_claims(content, "test.md", &[]);
let claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::FilePath)
.expect("should extract file path");
assert_ne!(claim.confidence, ClaimConfidence::Real);
}
#[test]
fn test_placeholder_class_is_example_confidence() {
let content = "Create a handler like MyNamespace\\MyHandler to process events.";
let claims = extract_claims(content, "test.md", &[]);
let claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::ClassName)
.expect("should extract class name");
assert_eq!(claim.confidence, ClaimConfidence::Example);
}
#[test]
fn test_real_class_in_prose_is_real_confidence() {
let content = "The DataMachine\\Services\\CacheManager handles caching.";
let claims = extract_claims(content, "test.md", &[]);
let claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::ClassName)
.expect("should extract class name");
assert_eq!(claim.confidence, ClaimConfidence::Real);
}
#[test]
fn test_code_block_claims_are_unclear() {
let content = "Example:\n```php\nfunction test() { return true; }\n```\n";
let claims = extract_claims(content, "test.md", &[]);
let code_claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::CodeExample)
.expect("should extract code example");
assert_eq!(code_claim.confidence, ClaimConfidence::Unclear);
}
#[test]
fn test_code_block_interior_paths_not_extracted() {
let content = "```rust\nuse crate::core::config;\nlet path = \"src/main.rs\";\n```\n";
let claims = extract_claims(content, "test.md", &[]);
assert!(
!claims.iter().any(|c| c.claim_type == ClaimType::FilePath),
"file paths inside code blocks should not be extracted separately"
);
}
#[test]
fn test_annotation_context_is_real() {
let content = "@see DataMachine\\Core\\Engine for the main engine class.";
let claims = extract_claims(content, "test.md", &[]);
let claim = claims
.iter()
.find(|c| c.claim_type == ClaimType::ClassName)
.expect("should extract class name");
assert_eq!(claim.confidence, ClaimConfidence::Real);
}
#[test]
fn test_is_placeholder_class_detection() {
assert!(is_placeholder_class("MyNamespace\\MyHandler"));
assert!(is_placeholder_class("Your\\Extension\\Plugin"));
assert!(is_placeholder_class("Example\\Namespace\\Class"));
assert!(is_placeholder_class("Foo\\Bar\\Baz"));
assert!(is_placeholder_class("Test\\Mock\\Handler"));
assert!(!is_placeholder_class("DataMachine\\Services\\Cache"));
assert!(!is_placeholder_class("WordPress\\Plugin\\Activator"));
}
#[test]
fn test_windows_path_not_extracted_as_class() {
let content = "Typically: `C:\\Users\\<username>\\AppData\\Roaming\\homeboy\\`";
let claims = extract_claims(content, "test.md", &[]);
assert!(
!claims.iter().any(|c| c.claim_type == ClaimType::ClassName),
"Windows path segments should not be extracted as class names"
);
}
#[test]
fn test_os_path_context_detection() {
assert!(is_os_path_context(
"Typically: C:\\Users\\admin\\AppData\\Roaming",
25
));
assert!(is_os_path_context("Path is C:/Users/admin/AppData", 20));
assert!(!is_os_path_context(
"The DataMachine\\Services\\Cache class",
4
));
}
#[test]
fn test_this_creates_context_is_example() {
let content2 = "This creates `docs/api/endpoints.md` with heading";
let claims2 = extract_claims(content2, "test.md", &[]);
if let Some(claim) = claims2.iter().find(|c| c.claim_type == ClaimType::FilePath) {
assert_ne!(
claim.confidence,
ClaimConfidence::Real,
"paths in 'this creates' context should not be real"
);
}
let content3 = "**Example:** `projects/extrachill.json`";
let claims3 = extract_claims(content3, "test.md", &[]);
if let Some(claim) = claims3.iter().find(|c| c.claim_type == ClaimType::FilePath) {
assert_ne!(
claim.confidence,
ClaimConfidence::Real,
"paths in 'Example:' context should not be real"
);
}
}
}