Skip to main content

straymark_core/
document.rs

1use anyhow::{Context, Result};
2use serde::Deserialize;
3use std::fmt;
4use std::path::{Path, PathBuf};
5
/// Every document type StrayMark can name, locate and parse.
///
/// The per-variant descriptions mirror [`DocType::display_name`]; the
/// upper-case filename prefix of each variant is given by [`DocType::prefix`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    /// AI Action Log (`AILOG`).
    Ailog,
    /// AI Decision (`AIDEC`).
    Aidec,
    /// Architecture Decision Record (`ADR`).
    Adr,
    /// Ethical Review (`ETH`).
    Eth,
    /// Requirement (`REQ`).
    Req,
    /// Test Plan (`TES`).
    Tes,
    /// Incident Post-mortem (`INC`).
    Inc,
    /// Technical Debt (`TDE`).
    Tde,
    /// Security Assessment (`SEC`).
    Sec,
    /// Model/System Card (`MCARD`).
    Mcard,
    /// Software Bill of Materials (`SBOM`).
    Sbom,
    /// Data Protection Impact Assessment (`DPIA`).
    Dpia,
    // China regulatory artifacts (regional_scope: china)
    /// Personal Information Protection Impact Assessment (`PIPIA`).
    Pipia,
    /// CAC Algorithm Filing (`CACFILE`).
    Cacfile,
    /// TC260 Risk Assessment (`TC260RA`).
    Tc260ra,
    /// GB 45438 Content Labeling Plan (`AILABEL`).
    Ailabel,
}

impl DocType {
    /// Prefix used in filenames (e.g., "AILOG", "SEC").
    ///
    /// This exhaustive `match` is the canonical variant-to-prefix mapping:
    /// [`DocType::from_prefix`] and [`DocType::from_str_loose`] are derived
    /// from it, and the compiler rejects a new variant until it is listed here.
    pub fn prefix(&self) -> &'static str {
        match self {
            DocType::Ailog => "AILOG",
            DocType::Aidec => "AIDEC",
            DocType::Adr => "ADR",
            DocType::Eth => "ETH",
            DocType::Req => "REQ",
            DocType::Tes => "TES",
            DocType::Inc => "INC",
            DocType::Tde => "TDE",
            DocType::Sec => "SEC",
            DocType::Mcard => "MCARD",
            DocType::Sbom => "SBOM",
            DocType::Dpia => "DPIA",
            DocType::Pipia => "PIPIA",
            DocType::Cacfile => "CACFILE",
            DocType::Tc260ra => "TC260RA",
            DocType::Ailabel => "AILABEL",
        }
    }

    /// Parse a DocType from a filename prefix (exact, case-sensitive match).
    ///
    /// Returns `None` when `prefix` is not the prefix of any variant in
    /// [`DocType::ALL`].
    pub fn from_prefix(prefix: &str) -> Option<DocType> {
        // Looked up through `ALL` + `prefix()` instead of a second hand-written
        // table, so the reverse mapping cannot drift from the forward one.
        Self::ALL
            .iter()
            .copied()
            .find(|doc_type| doc_type.prefix() == prefix)
    }

    /// All valid prefixes, in the same order as [`DocType::ALL`].
    ///
    /// Adding a DocType? Update BOTH this array and the
    /// `DOC_TYPE_PREFIXES` env var at the top of
    /// `dist/.github/workflows/docs-validation.yml`. The CI workflow uses
    /// that env var as its single source of truth for valid type prefixes,
    /// but it cannot import from Rust — the two must be kept in manual sync.
    pub const ALL_PREFIXES: &'static [&'static str] = &[
        "AILOG", "AIDEC", "ADR", "ETH", "REQ", "TES", "INC", "TDE",
        "SEC", "MCARD", "SBOM", "DPIA",
        "PIPIA", "CACFILE", "TC260RA", "AILABEL",
    ];

    /// All DocType variants in display order.
    ///
    /// [`DocType::from_prefix`] and [`DocType::from_str_loose`] search this
    /// slice, so a variant missing from it cannot be parsed from text.
    pub const ALL: &'static [DocType] = &[
        DocType::Ailog, DocType::Aidec, DocType::Adr, DocType::Eth,
        DocType::Req, DocType::Tes, DocType::Inc, DocType::Tde,
        DocType::Sec, DocType::Mcard, DocType::Sbom, DocType::Dpia,
        DocType::Pipia, DocType::Cacfile, DocType::Tc260ra, DocType::Ailabel,
    ];

    /// DocType variants that are only enabled when `regional_scope` includes
    /// "china". They are filtered out of `straymark new` and other UX surfaces
    /// for projects that have not opted into Chinese regulatory coverage.
    pub const CHINA_ONLY: &'static [DocType] = &[
        DocType::Pipia, DocType::Cacfile, DocType::Tc260ra, DocType::Ailabel,
    ];

    /// True if this DocType requires `regional_scope` to include "china",
    /// i.e. it is listed in [`DocType::CHINA_ONLY`].
    pub fn is_china_only(&self) -> bool {
        Self::CHINA_ONLY.contains(self)
    }

    /// Human-readable display name.
    pub fn display_name(&self) -> &'static str {
        match self {
            DocType::Ailog => "AI Action Log",
            DocType::Aidec => "AI Decision",
            DocType::Adr => "Architecture Decision Record",
            DocType::Eth => "Ethical Review",
            DocType::Req => "Requirement",
            DocType::Tes => "Test Plan",
            DocType::Inc => "Incident Post-mortem",
            DocType::Tde => "Technical Debt",
            DocType::Sec => "Security Assessment",
            DocType::Mcard => "Model/System Card",
            DocType::Sbom => "Software Bill of Materials",
            DocType::Dpia => "Data Protection Impact Assessment",
            DocType::Pipia => "Personal Information Protection Impact Assessment",
            DocType::Cacfile => "CAC Algorithm Filing",
            DocType::Tc260ra => "TC260 Risk Assessment",
            DocType::Ailabel => "GB 45438 Content Labeling Plan",
        }
    }

    /// Subdirectory under .straymark/ where this document type lives.
    ///
    /// The returned path is relative and uses `/` separators. Several types
    /// share a directory (ETH, DPIA and PIPIA all map to
    /// `07-ai-audit/ethical-reviews`).
    pub fn directory(&self) -> &'static str {
        match self {
            DocType::Ailog => "07-ai-audit/agent-logs",
            DocType::Aidec => "07-ai-audit/decisions",
            DocType::Eth => "07-ai-audit/ethical-reviews",
            DocType::Adr => "02-design/decisions",
            DocType::Req => "01-requirements",
            DocType::Tes => "04-testing",
            DocType::Inc => "05-operations/incidents",
            DocType::Tde => "06-evolution/technical-debt",
            DocType::Sec => "08-security",
            DocType::Mcard => "09-ai-models",
            DocType::Sbom => "07-ai-audit",
            DocType::Dpia => "07-ai-audit/ethical-reviews",
            DocType::Pipia => "07-ai-audit/ethical-reviews",
            DocType::Cacfile => "07-ai-audit/regulatory-filings",
            DocType::Tc260ra => "07-ai-audit/risk-assessments",
            DocType::Ailabel => "09-ai-models/labeling",
        }
    }

    /// Parse a DocType from a user-provided string (case-insensitive).
    ///
    /// The input must equal a prefix apart from ASCII letter case; surrounding
    /// whitespace is not trimmed. Returns `None` for anything else.
    pub fn from_str_loose(s: &str) -> Option<DocType> {
        // Every prefix is pure ASCII, so an ASCII case-insensitive comparison
        // is sufficient and avoids allocating a lowercased copy of the input.
        Self::ALL
            .iter()
            .copied()
            .find(|doc_type| doc_type.prefix().eq_ignore_ascii_case(s))
    }
}
174
175impl fmt::Display for DocType {
176    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
177        write!(f, "{}", self.prefix())
178    }
179}
180
181/// Frontmatter fields extracted from a StrayMark document.
182/// All fields are optional so the validator can report which are missing.
183#[derive(Debug, Clone, Deserialize, Default)]
184#[allow(dead_code)]
185pub struct Frontmatter {
186    pub id: Option<String>,
187    pub title: Option<String>,
188    pub status: Option<String>,
189    pub created: Option<String>,
190    pub agent: Option<String>,
191    pub confidence: Option<String>,
192    pub review_required: Option<bool>,
193    /// Reviewer identity (email | github-handle | DID). Set by `straymark approve`
194    /// and by `## 3.5 Recording Approval` of the framework's documentation policy.
195    pub reviewed_by: Option<String>,
196    /// Date of formal approval (must be >= `created`).
197    pub reviewed_at: Option<String>,
198    /// Closure signal — one of: `approved | revisions_requested | rejected`.
199    /// Presence of this field is the canonical "human has reviewed" signal.
200    pub review_outcome: Option<String>,
201    pub risk_level: Option<String>,
202    pub eu_ai_act_risk: Option<String>,
203    pub nist_genai_risks: Option<Vec<String>>,
204    pub iso_42001_clause: Option<Vec<u8>>,
205    pub tags: Option<Vec<String>>,
206    pub related: Option<Vec<String>>,
207    /// Documents this one supersedes (graph edge `SUPERSEDES`).
208    pub supersedes: Option<Vec<String>>,
209    /// Alternatives documented elsewhere (graph edge `DOCUMENTS_ALTERNATIVE`).
210    pub alternatives_documented: Option<Vec<String>>,
211    /// AILOGs a document originates from (graph edge `ORIGINATES_FROM`).
212    pub originating_ailogs: Option<Vec<String>>,
213    // INC-specific
214    pub severity: Option<String>,
215    // ETH-specific
216    pub gdpr_legal_basis: Option<String>,
217    // SEC-specific
218    pub threat_model_methodology: Option<String>,
219    pub owasp_asvs_level: Option<serde_yaml::Value>,
220    // MCARD-specific
221    pub model_name: Option<String>,
222    pub model_type: Option<String>,
223    pub model_version: Option<String>,
224    pub provider: Option<String>,
225    pub license: Option<String>,
226    // SBOM-specific
227    pub sbom_format_reference: Option<String>,
228    pub system_name: Option<String>,
229    // DPIA-specific
230    pub gdpr_article_35: Option<bool>,
231    pub dpo_consulted: Option<bool>,
232    pub supervisory_authority_consulted: Option<bool>,
233    // ADR-specific
234    pub api_changes: Option<Vec<String>>,
235    // REQ-specific (OpenAPI/AsyncAPI)
236    pub api_spec_path: Option<String>,
237
238    // ----- China regulatory profile (regional_scope: china) -----
239
240    // TC260 v2.0 (AI Safety Governance Framework)
241    /// One of: "low" | "medium" | "high" | "very_high" | "extremely_severe" | "not_applicable"
242    pub tc260_risk_level: Option<String>,
243    pub tc260_application_scenario: Option<String>,
244    /// One of: "narrow" | "foundation" | "agentic" | "general"
245    pub tc260_intelligence_level: Option<String>,
246    /// One of: "individual" | "organization" | "societal" | "cross_border"
247    pub tc260_application_scale: Option<String>,
248    pub tc260_endogenous_risks: Option<Vec<String>>,
249    pub tc260_application_risks: Option<Vec<String>>,
250    pub tc260_derivative_risks: Option<Vec<String>>,
251
252    // PIPL / PIPIA (Personal Information Protection Law, Art. 55-56)
253    pub pipl_applicable: Option<bool>,
254    /// One of: "sensitive_data" | "automated_decision" | "third_party_disclosure"
255    /// | "cross_border" | "public_disclosure" | "other"
256    pub pipl_article_55_trigger: Option<String>,
257    pub pipl_sensitive_data: Option<bool>,
258    pub pipl_cross_border_transfer: Option<bool>,
259    /// YYYY-MM-DD — minimum 3 years from `created` per PIPL.
260    pub pipl_retention_until: Option<String>,
261
262    // GB 45438-2025 — Cybersecurity Technology — Labeling Method for AI-Generated Content
263    pub gb45438_applicable: Option<bool>,
264    /// Subset of: "text" | "image" | "audio" | "video" | "virtual_scene"
265    pub gb45438_content_types: Option<Vec<String>>,
266    /// One of: "disclaimer" | "watermark" | "caption" | "audio_cue" | "banner"
267    pub gb45438_explicit_label_strategy: Option<String>,
268    /// One of: "C2PA" | "XMP" | "EXIF" | "custom" | "none"
269    pub gb45438_implicit_metadata_format: Option<String>,
270    pub gb45438_distributor_obligations_documented: Option<bool>,
271
272    // CAC Algorithm Filing (Cyberspace Administration of China)
273    pub cac_filing_required: Option<bool>,
274    pub cac_filing_number: Option<String>,
275    /// One of: "pending" | "provincial_submitted" | "provincial_approved"
276    /// | "national_submitted" | "national_approved" | "rejected" | "not_required"
277    pub cac_filing_status: Option<String>,
278    /// One of: "algorithm" | "generative_ai" | "dual"
279    pub cac_filing_type: Option<String>,
280    pub cac_provincial_authority: Option<String>,
281    pub cac_national_decision_date: Option<String>,
282
283    // GB/T 45652-2025 — Pre-training & fine-tuning data security
284    pub gb45652_training_data_compliance: Option<bool>,
285
286    // CSL 2026 — Cybersecurity Law amendments + incident reporting administrative measures
287    /// One of: "particularly_serious" | "relatively_major" | "major" | "general" | "not_applicable"
288    pub csl_severity_level: Option<String>,
289    /// 1 (particularly serious) | 4 (relatively major) | 24 (general)
290    pub csl_report_deadline_hours: Option<u32>,
291}
292
293/// A parsed StrayMark document
294#[derive(Debug)]
295pub struct StrayMarkDocument {
296    pub path: PathBuf,
297    pub filename: String,
298    pub doc_type: DocType,
299    pub frontmatter: Frontmatter,
300    pub body: String,
301}
302
303/// Parse a StrayMark document from a file path
304pub fn parse_document(path: &Path) -> Result<StrayMarkDocument> {
305    let content = std::fs::read_to_string(path)
306        .with_context(|| format!("Failed to read {}", path.display()))?;
307
308    let filename = path
309        .file_name()
310        .and_then(|n| n.to_str())
311        .unwrap_or("")
312        .to_string();
313
314    // Determine doc type from filename prefix
315    let doc_type = detect_doc_type(&filename)
316        .with_context(|| format!("Cannot determine document type for {}", filename))?;
317
318    // Extract frontmatter
319    let (frontmatter, body) = extract_frontmatter(&content)
320        .with_context(|| format!("Failed to parse frontmatter in {}", path.display()))?;
321
322    Ok(StrayMarkDocument {
323        path: path.to_path_buf(),
324        filename,
325        doc_type,
326        frontmatter,
327        body,
328    })
329}
330
331/// Detect document type from filename prefix
332pub fn detect_doc_type(filename: &str) -> Option<DocType> {
333    for prefix in DocType::ALL_PREFIXES {
334        if filename.starts_with(&format!("{}-", prefix)) {
335            return DocType::from_prefix(prefix);
336        }
337    }
338    None
339}
340
341/// Extract YAML frontmatter (between --- delimiters) and body
342fn extract_frontmatter(content: &str) -> Result<(Frontmatter, String)> {
343    let trimmed = content.trim_start();
344    if !trimmed.starts_with("---") {
345        anyhow::bail!("No frontmatter found (missing opening ---)");
346    }
347
348    let after_first = &trimmed[3..];
349    let end_pos = after_first
350        .find("\n---")
351        .ok_or_else(|| anyhow::anyhow!("No closing --- found for frontmatter"))?;
352
353    let yaml_str = &after_first[..end_pos];
354    let body_start = end_pos + 4; // skip "\n---"
355    let body = if body_start < after_first.len() {
356        after_first[body_start..].to_string()
357    } else {
358        String::new()
359    };
360
361    let frontmatter: Frontmatter = serde_yaml::from_str(yaml_str)
362        .with_context(|| "Failed to deserialize frontmatter YAML")?;
363
364    Ok((frontmatter, body))
365}
366
367/// Discover all user-generated StrayMark documents under a .straymark/ directory
368pub fn discover_documents(straymark_dir: &Path) -> Vec<PathBuf> {
369    let mut results = Vec::new();
370    walk_for_documents(straymark_dir, &mut results);
371    results.sort();
372    results
373}
374
375fn walk_for_documents(dir: &Path, results: &mut Vec<PathBuf>) {
376    let entries = match std::fs::read_dir(dir) {
377        Ok(e) => e,
378        Err(_) => return,
379    };
380
381    for entry in entries.flatten() {
382        let path = entry.path();
383        if path.is_dir() {
384            // Skip templates directory
385            if path.ends_with("templates") {
386                continue;
387            }
388            walk_for_documents(&path, results);
389        } else if path.extension().and_then(|e| e.to_str()) == Some("md") {
390            let filename = path
391                .file_name()
392                .and_then(|n| n.to_str())
393                .unwrap_or("");
394            // Match pattern: TYPE-YYYY-MM-DD-NNN-*.md
395            if detect_doc_type(filename).is_some() && is_dated_document(filename) {
396                results.push(path);
397            }
398        }
399    }
400}
401
/// Check if filename follows the dated pattern TYPE-YYYY-MM-DD-NNN-*.md.
///
/// Only the date portion is verified: the ten bytes after the first `-` must
/// have the shape `NNNN-NN-NN` (ASCII digits and dashes). Neither the type
/// prefix, the calendar validity of the date, nor the rest of the name is
/// checked.
///
/// Never panics: names that are too short or contain non-ASCII characters in
/// the date position simply return `false`.
fn is_dated_document(filename: &str) -> bool {
    // Find the first '-' (after the type prefix). '-' is one byte, so
    // `pos + 1` is always a valid char boundary.
    let after_prefix = match filename.find('-') {
        Some(pos) => &filename[pos + 1..],
        None => return false,
    };

    // Work on bytes: slicing the `str` at fixed offsets would panic when a
    // multi-byte character straddles the offset.
    let date = match after_prefix.as_bytes().get(..10) {
        Some(bytes) => bytes,
        None => return false,
    };

    // Basic date pattern check: NNNN-NN-NN
    date.iter().enumerate().all(|(idx, byte)| match idx {
        4 | 7 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    })
}
422
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_doc_type() {
        assert_eq!(detect_doc_type("AILOG-2025-01-01-001-test.md"), Some(DocType::Ailog));
        assert_eq!(detect_doc_type("SEC-2025-01-01-001-auth.md"), Some(DocType::Sec));
        assert_eq!(detect_doc_type("MCARD-2025-01-01-001-gpt.md"), Some(DocType::Mcard));
        assert_eq!(detect_doc_type("SBOM-2025-01-01-001-deps.md"), Some(DocType::Sbom));
        assert_eq!(detect_doc_type("DPIA-2025-01-01-001-gdpr.md"), Some(DocType::Dpia));
        assert_eq!(detect_doc_type("README.md"), None);
        assert_eq!(detect_doc_type("TEMPLATE-SEC.md"), None);
    }

    #[test]
    fn test_is_dated_document() {
        assert!(is_dated_document("AILOG-2025-01-27-001-implement-auth.md"));
        assert!(is_dated_document("SEC-2026-03-24-001-api-review.md"));
        assert!(!is_dated_document("TEMPLATE-SEC.md"));
        assert!(!is_dated_document("README.md"));
    }

    #[test]
    fn test_extract_frontmatter() {
        let content = "---\nid: AILOG-2025-01-01-001\ntitle: Test\nstatus: draft\n---\n\n# Body";
        let (fm, body) = extract_frontmatter(content).unwrap();
        assert_eq!(fm.id.as_deref(), Some("AILOG-2025-01-01-001"));
        assert_eq!(fm.title.as_deref(), Some("Test"));
        assert!(body.contains("# Body"));
    }

    #[test]
    fn test_doc_type_all_has_16_entries() {
        // 12 base types + 4 China-specific (PIPIA, CACFILE, TC260RA, AILABEL)
        assert_eq!(DocType::ALL.len(), 16);
        assert_eq!(DocType::ALL_PREFIXES.len(), 16);
    }

    /// The variant list, the prefix list and the parsing/formatting helpers
    /// are maintained by hand; this guards against them drifting apart.
    #[test]
    fn test_doc_type_tables_stay_in_sync() {
        for (dt, prefix) in DocType::ALL.iter().zip(DocType::ALL_PREFIXES) {
            assert_eq!(dt.prefix(), *prefix, "ALL and ALL_PREFIXES differ in order or content");
        }
        for dt in DocType::ALL {
            let prefix = dt.prefix();
            assert_eq!(DocType::from_prefix(prefix), Some(*dt), "from_prefix({})", prefix);
            assert_eq!(
                DocType::from_str_loose(&prefix.to_lowercase()),
                Some(*dt),
                "from_str_loose({})",
                prefix
            );
            assert_eq!(dt.to_string(), prefix, "Display for {}", prefix);
        }
    }

    #[test]
    fn test_china_only_doc_types() {
        assert_eq!(DocType::CHINA_ONLY.len(), 4);
        assert!(DocType::Pipia.is_china_only());
        assert!(DocType::Cacfile.is_china_only());
        assert!(DocType::Tc260ra.is_china_only());
        assert!(DocType::Ailabel.is_china_only());
        assert!(!DocType::Ailog.is_china_only());
        assert!(!DocType::Dpia.is_china_only());
    }

    #[test]
    fn test_china_doc_type_detection() {
        assert_eq!(detect_doc_type("PIPIA-2026-04-25-001-chatbot.md"), Some(DocType::Pipia));
        assert_eq!(detect_doc_type("CACFILE-2026-04-25-001-chatbot.md"), Some(DocType::Cacfile));
        assert_eq!(detect_doc_type("TC260RA-2026-04-25-001-chatbot.md"), Some(DocType::Tc260ra));
        assert_eq!(detect_doc_type("AILABEL-2026-04-25-001-chatbot.md"), Some(DocType::Ailabel));
    }

    #[test]
    fn test_china_doc_type_directories() {
        assert_eq!(DocType::Pipia.directory(), "07-ai-audit/ethical-reviews");
        assert_eq!(DocType::Cacfile.directory(), "07-ai-audit/regulatory-filings");
        assert_eq!(DocType::Tc260ra.directory(), "07-ai-audit/risk-assessments");
        assert_eq!(DocType::Ailabel.directory(), "09-ai-models/labeling");
    }

    #[test]
    fn test_china_frontmatter_parsing() {
        let content = "---\n\
            id: PIPIA-2026-04-25-001\n\
            title: Test PIPIA\n\
            pipl_applicable: true\n\
            pipl_sensitive_data: true\n\
            pipl_cross_border_transfer: false\n\
            pipl_retention_until: 2029-04-25\n\
            tc260_risk_level: high\n\
            cac_filing_number: CAC-2026-00123\n\
            cac_filing_status: national_approved\n\
            gb45438_content_types: [text, image]\n\
            csl_severity_level: relatively_major\n\
            csl_report_deadline_hours: 4\n\
            ---\n\nbody";
        let (fm, _) = extract_frontmatter(content).unwrap();
        assert_eq!(fm.pipl_applicable, Some(true));
        assert_eq!(fm.pipl_sensitive_data, Some(true));
        assert_eq!(fm.pipl_retention_until.as_deref(), Some("2029-04-25"));
        assert_eq!(fm.tc260_risk_level.as_deref(), Some("high"));
        assert_eq!(fm.cac_filing_number.as_deref(), Some("CAC-2026-00123"));
        assert_eq!(fm.cac_filing_status.as_deref(), Some("national_approved"));
        assert_eq!(fm.gb45438_content_types.as_ref().unwrap().len(), 2);
        assert_eq!(fm.csl_severity_level.as_deref(), Some("relatively_major"));
        assert_eq!(fm.csl_report_deadline_hours, Some(4));
    }

    #[test]
    fn test_doc_type_directory_mapping() {
        for dt in DocType::ALL {
            let dir = dt.directory();
            assert!(!dir.is_empty(), "{} has empty directory", dt.prefix());
            assert!(!dir.starts_with('/'), "{} directory should be relative", dt.prefix());
        }
    }

    #[test]
    fn test_doc_type_display_names() {
        for dt in DocType::ALL {
            let name = dt.display_name();
            assert!(!name.is_empty(), "{} has empty display_name", dt.prefix());
        }
    }

    #[test]
    fn test_doc_type_from_str_loose() {
        assert_eq!(DocType::from_str_loose("ailog"), Some(DocType::Ailog));
        assert_eq!(DocType::from_str_loose("AILOG"), Some(DocType::Ailog));
        assert_eq!(DocType::from_str_loose("AiLog"), Some(DocType::Ailog));
        assert_eq!(DocType::from_str_loose("sec"), Some(DocType::Sec));
        assert_eq!(DocType::from_str_loose("mcard"), Some(DocType::Mcard));
        assert_eq!(DocType::from_str_loose("invalid"), None);
    }
}