use std::collections::HashSet;
use std::path::Path;
use std::sync::LazyLock;
use regex::Regex;
use crate::error::ValidationError;
use crate::normalize::normalize_candidate;
use crate::validator::{is_bad_example_context, is_wildcard_context, validate_candidate};
/// Line-by-line scanner state: tracks whether the current line is ordinary prose or
/// sits inside a fenced code block.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MarkdownState {
/// Not inside any fenced code block.
Prose,
/// Inside a fenced code block that has been opened and not yet closed.
FencedBlock {
/// When `true`, lines inside this block are not scanned at all. The scanner sets this
/// for blocks whose opening fence names `ebnf`, `regex`, `bnf`, `abnf` or `grammar`.
skip: bool,
/// Fence character of the opening fence (`` ` `` or `~`); a closing fence must use
/// the same character.
fence_char: char,
/// Number of fence characters in the opening fence; a closing fence must have at
/// least this many.
opening_fence_len: usize,
},
}
/// Recognizes a Markdown code-fence marker at the start of `trimmed_line`.
///
/// `trimmed_line` is expected to begin at the line's first non-whitespace character
/// (the caller strips leading whitespace before calling).
///
/// Returns `Some((fence_char, fence_len))` when the line starts with a run of three or
/// more identical `` ` `` or `~` characters, where `fence_len` is the length of that run.
/// Returns `None` when:
///
/// * the line is empty or starts with any other character,
/// * the run is shorter than three characters, or
/// * the run is made of backticks and the remainder of the line contains another
///   backtick. CommonMark forbids backticks in the info string of a backtick fence,
///   so a line such as ```` ```code``` ```` is inline code, not a fence.
fn parse_fence(trimmed_line: &str) -> Option<(char, usize)> {
    let fence_char = match trimmed_line.chars().next()? {
        marker @ ('`' | '~') => marker,
        _ => return None,
    };

    let fence_len = trimmed_line
        .chars()
        .take_while(|&c| c == fence_char)
        .count();
    if fence_len < 3 {
        return None;
    }

    // Both fence characters are single-byte ASCII, so the character count is also a
    // valid byte offset into the line.
    let after_fence = &trimmed_line[fence_len..];
    if fence_char == '`' && after_fence.contains('`') {
        return None;
    }

    Some((fence_char, fence_len))
}
/// Loose discovery pattern, selected by the scanner when heuristic mode is on.
///
/// Compiled on first access. It matches an optional `gts://` prefix, the literal `gts.`
/// at a word boundary, at least three dot-terminated segments plus one more segment
/// (segment characters may include `*`, `.` and `-`), a `.v<digits>` version with an
/// optional `.<digits>` suffix, any number of `~`-prefixed chained segments, and an
/// optional trailing `~`.
///
/// # Panics
///
/// Panics on first access if the pattern fails to compile.
static GTS_DISCOVERY_PATTERN_RELAXED: LazyLock<Regex> = LazyLock::new(|| {
    let source: &str = concat!(
        r"(?:gts://)?",
        r"\bgts\.",
        r"(?:[a-z_*][a-z0-9_*.-]*\.){3,}",
        r"[a-z_*][a-z0-9_*.-]*",
        r"\.v[0-9]+",
        r"(?:\.[0-9]+)?",
        r"(?:~[a-z_][a-z0-9_.-]*)*",
        r"~?",
    );
    Regex::new(source).unwrap_or_else(|err| panic!("Invalid discovery regex: {err}"))
});
/// Strict discovery pattern, selected by the scanner when heuristic mode is off.
///
/// Compiled on first access. It matches an optional `gts://` prefix, the literal `gts.`
/// at a word boundary, exactly four dot-terminated segments (letters, digits, `_` and
/// `*` only), a `v<digits>` version with an optional `.<digits>` suffix, any number of
/// `~`-prefixed chained identifiers of the same four-segment-plus-version shape
/// (without `*`), and an optional trailing `~`.
///
/// # Panics
///
/// Panics on first access if the pattern fails to compile.
static GTS_DISCOVERY_PATTERN_WELL_FORMED: LazyLock<Regex> = LazyLock::new(|| {
    let source: &str = concat!(
        r"(?:gts://)?",
        r"\bgts\.",
        r"[a-z_*][a-z0-9_*]*\.",
        r"[a-z_*][a-z0-9_*]*\.",
        r"[a-z_*][a-z0-9_*]*\.",
        r"[a-z_*][a-z0-9_*]*\.",
        r"v[0-9]+",
        r"(?:\.[0-9]+)?",
        r"(?:~[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.v[0-9]+(?:\.[0-9]+)?)*",
        r"~?",
    );
    Regex::new(source).unwrap_or_else(|err| panic!("Invalid discovery regex: {err}"))
});
/// Scans Markdown text line by line for GTS identifier candidates and validates them.
///
/// # Arguments
///
/// * `content` - Markdown text to scan.
/// * `path` - Path recorded in the `file` field of every reported error; it is not
///   opened or read here.
/// * `vendor` - Passed through unchanged to `validate_candidate`.
/// * `heuristic` - Selects the relaxed discovery pattern when `true`, the well-formed
///   pattern when `false`.
/// * `skip_tokens` - A match is ignored when the text preceding it on the same line
///   contains any of these tokens, compared case-insensitively. Note that an empty
///   token is contained in every string and therefore suppresses every match.
///
/// # Returns
///
/// One `ValidationError` per normalization failure or validation error. `line` is
/// 1-based; `column` is the 1-based byte offset of the match within its line.
///
/// # Fenced code blocks
///
/// Fence lines themselves are never scanned. Blocks whose opening fence names `ebnf`,
/// `regex`, `bnf`, `abnf` or `grammar` are skipped entirely; all other blocks are
/// scanned like prose. A block is closed only by a fence that uses the same character,
/// is at least as long as the opening fence, and is followed by nothing but
/// whitespace. A fence-like line carrying an info string inside an open block is
/// treated as block content, as CommonMark specifies.
pub fn scan_markdown_content(
    content: &str,
    path: &Path,
    vendor: Option<&str>,
    heuristic: bool,
    skip_tokens: &[String],
) -> Vec<ValidationError> {
    let pattern = if heuristic {
        &*GTS_DISCOVERY_PATTERN_RELAXED
    } else {
        &*GTS_DISCOVERY_PATTERN_WELL_FORMED
    };

    // Lowercase the skip tokens once instead of once per match and token.
    let skip_tokens_lower: Vec<String> = skip_tokens
        .iter()
        .map(|token| token.to_lowercase())
        .collect();

    let mut errors = Vec::new();
    let mut state = MarkdownState::Prose;

    for (line_idx, line) in content.lines().enumerate() {
        let line_number = line_idx + 1;
        let trimmed_line = line.trim_start();

        if let Some((fence_char, fence_len)) = parse_fence(trimmed_line) {
            // Fence characters are single-byte ASCII, so `fence_len` is a valid byte
            // offset. Whatever follows the fence run is the info string.
            let info = trimmed_line[fence_len..].trim();
            match &state {
                MarkdownState::Prose => {
                    let language = info.to_lowercase();
                    let skip = matches!(
                        language.as_str(),
                        "ebnf" | "regex" | "bnf" | "abnf" | "grammar"
                    );
                    state = MarkdownState::FencedBlock {
                        skip,
                        fence_char,
                        opening_fence_len: fence_len,
                    };
                    continue;
                }
                MarkdownState::FencedBlock {
                    fence_char: open_fence_char,
                    opening_fence_len,
                    ..
                } => {
                    // A closing fence may not carry an info string; a line such as
                    // "```ebnf" inside an open block is content, not a closer.
                    let closes_block = fence_char == *open_fence_char
                        && fence_len >= *opening_fence_len
                        && info.is_empty();
                    if closes_block {
                        state = MarkdownState::Prose;
                        continue;
                    }
                }
            }
        }

        if matches!(state, MarkdownState::FencedBlock { skip: true, .. }) {
            continue;
        }

        // Identical candidate text is reported at most once per line. The set borrows
        // from `line`, so no per-match allocation is needed.
        let mut seen_on_line: HashSet<&str> = HashSet::new();

        for mat in pattern.find_iter(line) {
            let candidate_str = mat.as_str();
            let match_start = mat.start();

            if !seen_on_line.insert(candidate_str) {
                continue;
            }
            if is_bad_example_context(line, match_start) {
                continue;
            }
            if !skip_tokens_lower.is_empty()
                && let Some(before) = line.get(..match_start)
            {
                let before_lower = before.to_lowercase();
                if skip_tokens_lower
                    .iter()
                    .any(|token| before_lower.contains(token.as_str()))
                {
                    continue;
                }
            }

            let candidate = match normalize_candidate(candidate_str) {
                Ok(c) => c,
                Err(e) => {
                    // Normalization failed: report the raw text with no normalized id.
                    errors.push(ValidationError {
                        file: path.to_owned(),
                        line: line_number,
                        column: match_start + 1,
                        json_path: String::new(),
                        raw_value: candidate_str.to_owned(),
                        normalized_id: String::new(),
                        error: e,
                        context: line.to_owned(),
                    });
                    continue;
                }
            };

            let allow_wildcards = is_wildcard_context(line, match_start);
            let validation_errors = validate_candidate(&candidate, vendor, allow_wildcards);
            for err in validation_errors {
                errors.push(ValidationError {
                    file: path.to_owned(),
                    line: line_number,
                    column: match_start + 1,
                    json_path: String::new(),
                    raw_value: candidate.original.clone(),
                    normalized_id: candidate.gts_id.clone(),
                    error: err,
                    context: line.to_owned(),
                });
            }
        }
    }

    errors
}
/// Test-only convenience wrapper: reads the file at `path` and scans its contents with
/// `scan_markdown_content`, using no skip tokens.
///
/// Returns an empty list when the file's metadata reports a size greater than
/// `max_file_size`, or when the file cannot be read as UTF-8 text. If the metadata
/// lookup itself fails, the size check is skipped and the read is still attempted.
#[cfg(test)]
pub fn scan_markdown_file(
    path: &Path,
    vendor: Option<&str>,
    max_file_size: u64,
    heuristic: bool,
) -> Vec<ValidationError> {
    let exceeds_limit = std::fs::metadata(path).is_ok_and(|meta| meta.len() > max_file_size);
    if exceeds_limit {
        return Vec::new();
    }

    match std::fs::read_to_string(path) {
        Ok(text) => scan_markdown_content(&text, path, vendor, heuristic, &[]),
        Err(_) => Vec::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// File-size cap handed to `scan_markdown_file` by every file-based test.
    const MAX_FILE_SIZE: u64 = 10_485_760;

    /// Writes `content` into a fresh temporary file and returns its handle.
    fn create_temp_md(content: &str) -> NamedTempFile {
        let mut tmp = NamedTempFile::new().unwrap();
        tmp.write_all(content.as_bytes()).unwrap();
        tmp
    }

    /// Round-trips `content` through a temporary file and scans it with
    /// `scan_markdown_file`.
    fn scan_text(content: &str, vendor: Option<&str>, heuristic: bool) -> Vec<ValidationError> {
        let tmp = create_temp_md(content);
        scan_markdown_file(tmp.path(), vendor, MAX_FILE_SIZE, heuristic)
    }

    #[test]
    fn test_scan_markdown_valid_id() {
        let errors = scan_text("The type is gts.x.core.events.type.v1~", None, false);
        assert!(errors.is_empty(), "Unexpected errors: {errors:?}");
    }

    #[test]
    fn test_scan_markdown_invalid_id() {
        let errors = scan_text("The type is gts.x.core.events.type.v1", None, false);
        assert!(
            !errors.is_empty(),
            "Single-segment instance ID should be rejected"
        );
    }

    #[test]
    fn test_scan_markdown_skip_ebnf_block() {
        let markdown = r"
```ebnf
gts.invalid.pattern.here.v1~
```
";
        let errors = scan_text(markdown, None, false);
        assert!(errors.is_empty(), "EBNF blocks should be skipped");
    }

    #[test]
    fn test_scan_markdown_validate_json_block() {
        let markdown = r#"
```json
{"$id": "gts://gts.x.core.events.type.v1~"}
```
"#;
        let errors = scan_text(markdown, None, false);
        assert!(errors.is_empty(), "JSON blocks should be validated");
    }

    #[test]
    fn test_scan_markdown_skip_invalid_context() {
        let errors = scan_text("\u{274c} gts.invalid.id.here.v1", None, false);
        assert!(errors.is_empty(), "Invalid examples should be skipped");
    }

    // NOTE(review): the input below contains no `*` wildcard despite the test name.
    #[test]
    fn test_scan_markdown_wildcard_in_pattern_context() {
        let errors = scan_text("pattern: gts.x.core.events.type.v1~", None, false);
        assert!(
            errors.is_empty(),
            "Valid IDs in pattern context should be allowed"
        );
    }

    // NOTE(review): the input below contains no `*` wildcard despite the test name.
    #[test]
    fn test_scan_markdown_wildcard_not_in_pattern_context() {
        let errors = scan_text("The type is gts.x.core.events.type.v1~", None, false);
        assert!(errors.is_empty(), "Valid IDs should pass");
    }

    #[test]
    fn test_scan_markdown_gts_uri() {
        let errors = scan_text(
            r#"Use "$id": "gts://gts.x.core.events.type.v1~""#,
            None,
            false,
        );
        assert!(
            errors.is_empty(),
            "gts:// URIs should be normalized and validated"
        );
    }

    #[test]
    fn test_scan_markdown_vendor_mismatch() {
        let errors = scan_text("The type is gts.hx.core.events.type.v1~", Some("x"), false);
        assert!(!errors.is_empty());
        assert!(errors[0].error.contains("Vendor mismatch"));
    }

    #[test]
    fn test_scan_markdown_example_vendor_tolerated() {
        let errors = scan_text("Example: gts.acme.core.events.type.v1~", Some("x"), false);
        assert!(errors.is_empty(), "Example vendors should be tolerated");
    }

    #[test]
    fn test_scan_markdown_deduplication() {
        let errors = scan_text(
            "gts.wrongvendor.core.events.type.v1~ and gts.wrongvendor.core.events.type.v1~ again",
            Some("x"),
            false,
        );
        assert_eq!(
            errors.len(),
            1,
            "Duplicate invalid ID on same line should produce exactly 1 error, got: {errors:?}"
        );
    }

    #[test]
    fn test_scan_markdown_error_after_gts_id() {
        let errors = scan_text("gts.x.core.events.type.v1~ handles error cases", None, false);
        assert!(
            errors.is_empty(),
            "Valid ID should not be suppressed by 'error' appearing after it"
        );
    }

    #[test]
    fn test_scan_markdown_invalid_before_gts_id() {
        let errors = scan_text("invalid: gts.bad.format.here.v1", None, false);
        assert!(errors.is_empty(), "Invalid examples should be skipped");
    }

    #[test]
    fn test_scan_markdown_heuristic_mode_catches_malformed() {
        // The same file is scanned twice, once per discovery mode.
        let tmp = create_temp_md("The type is gts.my-vendor.core.events.type.v1~");
        let errors_heuristic = scan_markdown_file(tmp.path(), None, MAX_FILE_SIZE, true);
        let errors_normal = scan_markdown_file(tmp.path(), None, MAX_FILE_SIZE, false);
        assert!(
            !errors_heuristic.is_empty(),
            "Heuristic mode should catch malformed ID with hyphens"
        );
        assert!(
            errors_normal.is_empty(),
            "Normal mode won't match malformed pattern"
        );
    }

    #[test]
    fn test_scan_markdown_heuristic_mode_catches_extra_dots() {
        let errors_heuristic = scan_text("The type is gts.x.core.events.type.name.v1~", None, true);
        assert!(
            !errors_heuristic.is_empty(),
            "Heuristic mode should catch ID with extra segments"
        );
    }

    #[test]
    fn test_scan_markdown_normal_mode_well_formed_only() {
        let errors = scan_text(
            "Valid: gts.x.core.events.type.v1~ and malformed: gts.bad-id.v1",
            None,
            false,
        );
        assert!(
            errors.is_empty(),
            "Normal mode should only validate well-formed patterns"
        );
    }

    #[test]
    fn test_scan_markdown_skip_tokens() {
        let md_path = Path::new("test.md");
        let given_token = ["**given**".to_owned()];

        // Heuristic mode: the token before the candidate suppresses it.
        let content = "**given** gts.bad.format.here.v1~";
        let errors = scan_markdown_content(content, md_path, None, true, &given_token);
        assert!(
            errors.is_empty(),
            "skip_tokens should suppress validation: {errors:?}"
        );

        // Same line scanned without and with the skip token.
        let content_mismatch = "**given** gts.y.core.pkg.mytype.v1~ is registered";
        let errors_no_skip = scan_markdown_content(content_mismatch, md_path, Some("x"), false, &[]);
        assert!(
            !errors_no_skip.is_empty(),
            "Without skip_tokens, vendor mismatch should be reported"
        );

        let errors_with_skip =
            scan_markdown_content(content_mismatch, md_path, Some("x"), false, &given_token);
        assert!(
            errors_with_skip.is_empty(),
            "With skip_tokens, vendor mismatch should be suppressed: {errors_with_skip:?}"
        );
    }

    #[test]
    fn test_scan_markdown_tilde_fence() {
        let errors = scan_text("~~~ebnf\ngts.invalid.pattern.here.v1~\n~~~\n", None, false);
        assert!(
            errors.is_empty(),
            "~~~ EBNF blocks should be skipped: {errors:?}"
        );
    }

    #[test]
    fn test_scan_markdown_tilde_fence_json_validated() {
        let errors = scan_text(
            "~~~json\n{\"$id\": \"gts://gts.x.core.events.type.v1~\"}\n~~~\n",
            None,
            false,
        );
        assert!(
            errors.is_empty(),
            "~~~json blocks should be validated and pass: {errors:?}"
        );
    }

    #[test]
    fn test_scan_markdown_mismatched_fence_does_not_close_block() {
        let errors = scan_text("```ebnf\n~~~\ngts.bad.format.here.v1~\n```\n", None, true);
        assert!(
            errors.is_empty(),
            "Mismatched fence should not close block; content inside ebnf block must be skipped: {errors:?}"
        );
    }

    #[test]
    fn test_scan_markdown_word_boundary() {
        let content = "The identifier xgts.x.core.events.type.v1~ is wrong";
        let errors = scan_markdown_content(content, Path::new("test.md"), None, false, &[]);
        assert!(
            errors.is_empty(),
            "Word boundary should prevent matching xgts.*: {errors:?}"
        );
    }
}