entrenar/finetune/
popperian.rs

1//! Popperian Falsification QA System
2//!
3//! Implements a 100-point scientific validation checklist based on
4//! Karl Popper's philosophy of falsifiability.
5//!
6//! # References
7//!
8//! - Popper, K. (1959) "The Logic of Scientific Discovery"
9//! - Popper, K. (1963) "Conjectures and Refutations"
10
11use std::fmt;
12
/// Letter grade derived from a Popperian QA score.
///
/// Variants are declared from worst to best, so the derived ordering is
/// `F < C < B < BPlus < A < APlus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum QAGrade {
    /// Failing: score below 70
    F,
    /// Needs Improvement: score 70-79
    C,
    /// Satisfactory: score 80-84
    B,
    /// Good: score 85-89
    BPlus,
    /// Very Good: score 90-94
    A,
    /// Excellent: score 95-100
    APlus,
}

impl QAGrade {
    /// Map a numeric score onto its grade band.
    ///
    /// Scores above 100 lie outside the checklist's range and are graded `F`.
    #[must_use]
    pub const fn from_score(score: u8) -> Self {
        // Out-of-range input is handled first so the ladder below only has
        // to deal with 0..=100.
        if score > 100 {
            return Self::F;
        }

        if score >= 95 {
            Self::APlus
        } else if score >= 90 {
            Self::A
        } else if score >= 85 {
            Self::BPlus
        } else if score >= 80 {
            Self::B
        } else if score >= 70 {
            Self::C
        } else {
            Self::F
        }
    }

    /// Whether this grade counts as a pass; every grade except `F` does.
    #[must_use]
    pub const fn is_passing(&self) -> bool {
        match *self {
            Self::F => false,
            Self::C | Self::B | Self::BPlus | Self::A | Self::APlus => true,
        }
    }
}

impl fmt::Display for QAGrade {
    /// Write the letter grade followed by its description in parentheses.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match *self {
            Self::F => "F  (Failing)",
            Self::C => "C  (Needs Improvement)",
            Self::B => "B  (Satisfactory)",
            Self::BPlus => "B+ (Good)",
            Self::A => "A  (Very Good)",
            Self::APlus => "A+ (Excellent)",
        };
        f.write_str(text)
    }
}
65
66/// 100-Point Popperian Falsification Checklist
67///
68/// Each field represents a falsifiable hypothesis. If the test passes,
69/// the hypothesis is corroborated (not proven). If it fails, the hypothesis
70/// is falsified.
71#[allow(clippy::struct_excessive_bools)]
72#[derive(Debug, Clone, Default)]
73pub struct PopperianQA {
74    // === REPRODUCIBILITY (20 points) ===
75    // H1: Training is deterministic with fixed seed
76    /// Same seed produces identical loss curves (±1e-6)
77    pub r1_same_loss_curve: bool,
78    /// Adapter weights match exactly across runs
79    pub r2_same_final_weights: bool,
80    /// Evaluation metrics are identical
81    pub r3_same_eval_metrics: bool,
82    /// All dependencies are version-locked
83    pub r4_environment_locked: bool,
84
85    // === COMPILATION (20 points) ===
86    // H2: Generated tests are syntactically valid Rust
87    /// rustfmt succeeds on generated code
88    pub c1_parses_as_rust: bool,
89    /// cargo check succeeds
90    pub c2_type_checks: bool,
91    /// No clippy warnings
92    pub c3_no_unused_warnings: bool,
93    /// Links correctly against target crate
94    pub c4_links_correctly: bool,
95
96    // === CORRECTNESS (20 points) ===
97    // H3: Generated tests are semantically meaningful
98    /// Tests pass on original implementation
99    pub x1_tests_pass_on_correct: bool,
100    /// Tests fail on mutated implementation
101    pub x2_tests_fail_on_mutant: bool,
102    /// Assertions are not trivial (not just `assert!(true)`)
103    pub x3_assertions_meaningful: bool,
104    /// No tautologies (not `assert_eq!(x, x)`)
105    pub x4_no_tautologies: bool,
106
107    // === COVERAGE (15 points) ===
108    // H4: Tests exercise meaningful code paths
109    /// Branch coverage delta ≥+5%
110    pub v1_branch_coverage_delta: bool,
111    /// Line coverage delta ≥+10%
112    pub v2_line_coverage_delta: bool,
113    /// Tests include edge cases (empty, null, max)
114    pub v3_edge_cases_present: bool,
115
116    // === EFFICIENCY (10 points) ===
117    // H5: Training completes within resource bounds
118    /// Peak VRAM < 8GB
119    pub e1_vram_under_8gb: bool,
120    /// Training completes in <4 hours
121    pub e2_training_under_4hrs: bool,
122    /// Inference < 1s per function
123    pub e3_inference_under_1s: bool,
124
125    // === EDGE CASES (10 points) ===
126    // H6: Handles difficult inputs gracefully
127    /// Handles generic functions
128    pub g1_handles_generics: bool,
129    /// Handles lifetime annotations
130    pub g2_handles_lifetimes: bool,
131    /// Handles async functions
132    pub g3_handles_async: bool,
133    /// Handles unsafe blocks
134    pub g4_handles_unsafe: bool,
135    /// Handles macro-heavy code
136    pub g5_handles_macros: bool,
137
138    // === DOCUMENTATION (5 points) ===
139    // H7: Output is self-explanatory
140    /// Test names describe intent (test_*)
141    pub d1_test_names_descriptive: bool,
142    /// Comments explain edge cases
143    pub d2_comments_present: bool,
144    /// Proptest strategies have clear names
145    pub d3_proptest_strategies_clear: bool,
146}
147
148impl PopperianQA {
149    /// Create new QA checklist with all items unchecked
150    #[must_use]
151    pub const fn new() -> Self {
152        Self {
153            r1_same_loss_curve: false,
154            r2_same_final_weights: false,
155            r3_same_eval_metrics: false,
156            r4_environment_locked: false,
157            c1_parses_as_rust: false,
158            c2_type_checks: false,
159            c3_no_unused_warnings: false,
160            c4_links_correctly: false,
161            x1_tests_pass_on_correct: false,
162            x2_tests_fail_on_mutant: false,
163            x3_assertions_meaningful: false,
164            x4_no_tautologies: false,
165            v1_branch_coverage_delta: false,
166            v2_line_coverage_delta: false,
167            v3_edge_cases_present: false,
168            e1_vram_under_8gb: false,
169            e2_training_under_4hrs: false,
170            e3_inference_under_1s: false,
171            g1_handles_generics: false,
172            g2_handles_lifetimes: false,
173            g3_handles_async: false,
174            g4_handles_unsafe: false,
175            g5_handles_macros: false,
176            d1_test_names_descriptive: false,
177            d2_comments_present: false,
178            d3_proptest_strategies_clear: false,
179        }
180    }
181
182    /// Calculate total score (0-100)
183    #[must_use]
184    pub fn score(&self) -> u8 {
185        let weighted: &[(bool, u8)] = &[
186            // Reproducibility (20 points)
187            (self.r1_same_loss_curve, 5),
188            (self.r2_same_final_weights, 5),
189            (self.r3_same_eval_metrics, 5),
190            (self.r4_environment_locked, 5),
191            // Compilation (20 points)
192            (self.c1_parses_as_rust, 5),
193            (self.c2_type_checks, 5),
194            (self.c3_no_unused_warnings, 5),
195            (self.c4_links_correctly, 5),
196            // Correctness (20 points)
197            (self.x1_tests_pass_on_correct, 5),
198            (self.x2_tests_fail_on_mutant, 5),
199            (self.x3_assertions_meaningful, 5),
200            (self.x4_no_tautologies, 5),
201            // Coverage (15 points)
202            (self.v1_branch_coverage_delta, 5),
203            (self.v2_line_coverage_delta, 5),
204            (self.v3_edge_cases_present, 5),
205            // Efficiency (10 points)
206            (self.e1_vram_under_8gb, 3),
207            (self.e2_training_under_4hrs, 4),
208            (self.e3_inference_under_1s, 3),
209            // Edge Cases (10 points)
210            (self.g1_handles_generics, 2),
211            (self.g2_handles_lifetimes, 2),
212            (self.g3_handles_async, 2),
213            (self.g4_handles_unsafe, 2),
214            (self.g5_handles_macros, 2),
215            // Documentation (5 points)
216            (self.d1_test_names_descriptive, 2),
217            (self.d2_comments_present, 2),
218            (self.d3_proptest_strategies_clear, 1),
219        ];
220        weighted.iter().filter(|(passed, _)| *passed).map(|(_, pts)| pts).sum()
221    }
222
223    /// Get quality grade
224    #[must_use]
225    pub fn grade(&self) -> QAGrade {
226        QAGrade::from_score(self.score())
227    }
228
229    /// Check if all reproducibility criteria pass
230    #[must_use]
231    pub const fn reproducibility_passed(&self) -> bool {
232        self.r1_same_loss_curve
233            && self.r2_same_final_weights
234            && self.r3_same_eval_metrics
235            && self.r4_environment_locked
236    }
237
238    /// Check if all compilation criteria pass
239    #[must_use]
240    pub const fn compilation_passed(&self) -> bool {
241        self.c1_parses_as_rust
242            && self.c2_type_checks
243            && self.c3_no_unused_warnings
244            && self.c4_links_correctly
245    }
246
247    /// Check if all correctness criteria pass
248    #[must_use]
249    pub const fn correctness_passed(&self) -> bool {
250        self.x1_tests_pass_on_correct
251            && self.x2_tests_fail_on_mutant
252            && self.x3_assertions_meaningful
253            && self.x4_no_tautologies
254    }
255
256    /// Count passed items
257    #[must_use]
258    pub fn passed_count(&self) -> usize {
259        let bools = [
260            self.r1_same_loss_curve,
261            self.r2_same_final_weights,
262            self.r3_same_eval_metrics,
263            self.r4_environment_locked,
264            self.c1_parses_as_rust,
265            self.c2_type_checks,
266            self.c3_no_unused_warnings,
267            self.c4_links_correctly,
268            self.x1_tests_pass_on_correct,
269            self.x2_tests_fail_on_mutant,
270            self.x3_assertions_meaningful,
271            self.x4_no_tautologies,
272            self.v1_branch_coverage_delta,
273            self.v2_line_coverage_delta,
274            self.v3_edge_cases_present,
275            self.e1_vram_under_8gb,
276            self.e2_training_under_4hrs,
277            self.e3_inference_under_1s,
278            self.g1_handles_generics,
279            self.g2_handles_lifetimes,
280            self.g3_handles_async,
281            self.g4_handles_unsafe,
282            self.g5_handles_macros,
283            self.d1_test_names_descriptive,
284            self.d2_comments_present,
285            self.d3_proptest_strategies_clear,
286        ];
287        bools.iter().filter(|&&b| b).count()
288    }
289
290    /// Total number of items
291    #[must_use]
292    pub const fn total_items(&self) -> usize {
293        26
294    }
295
296    /// Generate markdown report
297    #[must_use]
298    pub fn report(&self) -> String {
299        let mut out = String::new();
300        report_header(&mut out, self);
301        report_section(&mut out, "## Reproducibility (20 pts)\n", &self.reproducibility_items());
302        report_section(&mut out, "\n## Compilation (20 pts)\n", &self.compilation_items());
303        report_section(&mut out, "\n## Correctness (20 pts)\n", &self.correctness_items());
304        report_section(&mut out, "\n## Coverage (15 pts)\n", &self.coverage_items());
305        report_section(&mut out, "\n## Efficiency (10 pts)\n", &self.efficiency_items());
306        report_section(&mut out, "\n## Edge Cases (10 pts)\n", &self.edge_case_items());
307        report_section(&mut out, "\n## Documentation (5 pts)\n", &self.documentation_items());
308        out
309    }
310
311    fn reproducibility_items(&self) -> Vec<(bool, &'static str)> {
312        vec![
313            (self.r1_same_loss_curve, "R1: Same loss curve"),
314            (self.r2_same_final_weights, "R2: Same final weights"),
315            (self.r3_same_eval_metrics, "R3: Same eval metrics"),
316            (self.r4_environment_locked, "R4: Environment locked"),
317        ]
318    }
319
320    fn compilation_items(&self) -> Vec<(bool, &'static str)> {
321        vec![
322            (self.c1_parses_as_rust, "C1: Parses as Rust"),
323            (self.c2_type_checks, "C2: Type checks"),
324            (self.c3_no_unused_warnings, "C3: No unused warnings"),
325            (self.c4_links_correctly, "C4: Links correctly"),
326        ]
327    }
328
329    fn correctness_items(&self) -> Vec<(bool, &'static str)> {
330        vec![
331            (self.x1_tests_pass_on_correct, "X1: Tests pass on correct"),
332            (self.x2_tests_fail_on_mutant, "X2: Tests fail on mutant"),
333            (self.x3_assertions_meaningful, "X3: Assertions meaningful"),
334            (self.x4_no_tautologies, "X4: No tautologies"),
335        ]
336    }
337
338    fn coverage_items(&self) -> Vec<(bool, &'static str)> {
339        vec![
340            (self.v1_branch_coverage_delta, "V1: Branch coverage +5%"),
341            (self.v2_line_coverage_delta, "V2: Line coverage +10%"),
342            (self.v3_edge_cases_present, "V3: Edge cases present"),
343        ]
344    }
345
346    fn efficiency_items(&self) -> Vec<(bool, &'static str)> {
347        vec![
348            (self.e1_vram_under_8gb, "E1: VRAM < 8GB"),
349            (self.e2_training_under_4hrs, "E2: Training < 4hrs"),
350            (self.e3_inference_under_1s, "E3: Inference < 1s"),
351        ]
352    }
353
354    fn edge_case_items(&self) -> Vec<(bool, &'static str)> {
355        vec![
356            (self.g1_handles_generics, "G1: Handles generics"),
357            (self.g2_handles_lifetimes, "G2: Handles lifetimes"),
358            (self.g3_handles_async, "G3: Handles async"),
359            (self.g4_handles_unsafe, "G4: Handles unsafe"),
360            (self.g5_handles_macros, "G5: Handles macros"),
361        ]
362    }
363
364    fn documentation_items(&self) -> Vec<(bool, &'static str)> {
365        vec![
366            (self.d1_test_names_descriptive, "D1: Descriptive test names"),
367            (self.d2_comments_present, "D2: Comments present"),
368            (self.d3_proptest_strategies_clear, "D3: Clear proptest strategies"),
369        ]
370    }
371}
372
373/// Write report header with score, grade, and item count.
374fn report_header(out: &mut String, qa: &PopperianQA) {
375    out.push_str("# Popperian Falsification QA Report\n\n");
376    out.push_str(&format!("**Score:** {}/100\n", qa.score()));
377    out.push_str(&format!("**Grade:** {}\n", qa.grade()));
378    out.push_str(&format!("**Items Passed:** {}/{}\n\n", qa.passed_count(), qa.total_items()));
379}
380
/// Append one report section to `out`.
///
/// `heading` is written verbatim (the caller supplies any surrounding
/// newlines). Each entry of `items` then becomes one markdown task-list
/// line: `- [x] label` when its flag is `true`, `- [ ] label` otherwise.
/// An empty `items` slice writes only the heading.
fn report_section(out: &mut String, heading: &str, items: &[(bool, &str)]) {
    out.push_str(heading);
    for &(passed, label) in items {
        let mark = if passed { "x" } else { " " };
        // Append the pieces straight into `out` rather than formatting a
        // temporary `String` per line and copying it over.
        out.extend(["- [", mark, "] ", label, "\n"]);
    }
}
389
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every score in `scores` is graded as `expected`.
    fn assert_grades(scores: &[u8], expected: QAGrade) {
        for &score in scores {
            assert_eq!(QAGrade::from_score(score), expected, "score {score}");
        }
    }

    /// Build a checklist on which every item has passed.
    fn fully_passing() -> PopperianQA {
        PopperianQA {
            r1_same_loss_curve: true,
            r2_same_final_weights: true,
            r3_same_eval_metrics: true,
            r4_environment_locked: true,
            c1_parses_as_rust: true,
            c2_type_checks: true,
            c3_no_unused_warnings: true,
            c4_links_correctly: true,
            x1_tests_pass_on_correct: true,
            x2_tests_fail_on_mutant: true,
            x3_assertions_meaningful: true,
            x4_no_tautologies: true,
            v1_branch_coverage_delta: true,
            v2_line_coverage_delta: true,
            v3_edge_cases_present: true,
            e1_vram_under_8gb: true,
            e2_training_under_4hrs: true,
            e3_inference_under_1s: true,
            g1_handles_generics: true,
            g2_handles_lifetimes: true,
            g3_handles_async: true,
            g4_handles_unsafe: true,
            g5_handles_macros: true,
            d1_test_names_descriptive: true,
            d2_comments_present: true,
            d3_proptest_strategies_clear: true,
        }
    }

    // --- QAGrade::from_score: low end, middle and high end of each band ---

    #[test]
    fn test_from_score_95_to_100_arm() {
        assert_grades(&[95, 97, 100], QAGrade::APlus);
    }

    #[test]
    fn test_from_score_90_to_94_arm() {
        assert_grades(&[90, 92, 94], QAGrade::A);
    }

    #[test]
    fn test_from_score_85_to_89_arm() {
        assert_grades(&[85, 87, 89], QAGrade::BPlus);
    }

    #[test]
    fn test_from_score_80_to_84_arm() {
        assert_grades(&[80, 82, 84], QAGrade::B);
    }

    #[test]
    fn test_from_score_70_to_79_arm() {
        assert_grades(&[70, 75, 79], QAGrade::C);
    }

    #[test]
    fn test_from_score_0_to_69_arm() {
        assert_grades(&[0, 35, 69], QAGrade::F);
    }

    #[test]
    fn test_qa_grade_is_passing() {
        let passing = [
            QAGrade::APlus,
            QAGrade::A,
            QAGrade::BPlus,
            QAGrade::B,
            QAGrade::C,
        ];
        for grade in &passing {
            assert!(grade.is_passing());
        }
        assert!(!QAGrade::F.is_passing());
    }

    // --- PopperianQA scoring ---

    #[test]
    fn test_popperian_qa_new() {
        let checklist = PopperianQA::new();
        assert_eq!(checklist.score(), 0);
        assert_eq!(checklist.grade(), QAGrade::F);
        assert_eq!(checklist.passed_count(), 0);
    }

    #[test]
    fn test_popperian_qa_full_score() {
        let checklist = fully_passing();
        assert_eq!(checklist.score(), 100);
        assert_eq!(checklist.grade(), QAGrade::APlus);
        assert_eq!(checklist.passed_count(), 26);
    }

    #[test]
    fn test_popperian_qa_partial_score() {
        // Only the reproducibility category passes, worth 20 points.
        let checklist = PopperianQA {
            r1_same_loss_curve: true,
            r2_same_final_weights: true,
            r3_same_eval_metrics: true,
            r4_environment_locked: true,
            ..PopperianQA::new()
        };

        assert_eq!(checklist.score(), 20);
        assert!(checklist.reproducibility_passed());
        assert!(!checklist.compilation_passed());
    }

    #[test]
    fn test_popperian_qa_category_checks() {
        // Only the compilation category passes.
        let checklist = PopperianQA {
            c1_parses_as_rust: true,
            c2_type_checks: true,
            c3_no_unused_warnings: true,
            c4_links_correctly: true,
            ..PopperianQA::new()
        };

        assert!(checklist.compilation_passed());
        assert!(!checklist.reproducibility_passed());
        assert!(!checklist.correctness_passed());
    }

    #[test]
    fn test_popperian_qa_report_contains_sections() {
        let rendered = PopperianQA::new().report();

        let expected_headings = [
            "# Popperian Falsification QA Report",
            "## Reproducibility",
            "## Compilation",
            "## Correctness",
            "## Coverage",
            "## Efficiency",
            "## Edge Cases",
            "## Documentation",
        ];
        for heading in &expected_headings {
            assert!(rendered.contains(heading), "missing heading: {heading}");
        }
    }

    // --- QAGrade Display: one test per variant ---

    #[test]
    fn test_qa_grade_display_aplus_arm() {
        assert_eq!(QAGrade::APlus.to_string(), "A+ (Excellent)");
    }

    #[test]
    fn test_qa_grade_display_a_arm() {
        assert_eq!(QAGrade::A.to_string(), "A  (Very Good)");
    }

    #[test]
    fn test_qa_grade_display_bplus_arm() {
        assert_eq!(QAGrade::BPlus.to_string(), "B+ (Good)");
    }

    #[test]
    fn test_qa_grade_display_b_arm() {
        assert_eq!(QAGrade::B.to_string(), "B  (Satisfactory)");
    }

    #[test]
    fn test_qa_grade_display_c_arm() {
        assert_eq!(QAGrade::C.to_string(), "C  (Needs Improvement)");
    }

    #[test]
    fn test_qa_grade_display_f_arm() {
        assert_eq!(QAGrade::F.to_string(), "F  (Failing)");
    }

    #[test]
    fn test_qa_grade_from_score_overflow() {
        // Scores above 100 are out of range and must grade as F.
        assert_grades(&[101, 255], QAGrade::F);
    }

    #[test]
    fn test_qa_grade_ordering() {
        // Listed worst to best; each grade must rank strictly above the
        // one before it.
        let ascending = [
            QAGrade::F,
            QAGrade::C,
            QAGrade::B,
            QAGrade::BPlus,
            QAGrade::A,
            QAGrade::APlus,
        ];
        for pair in ascending.windows(2) {
            assert!(pair[1] > pair[0]);
        }
    }
}
entrenar/finetune/popperian.rs

entrenar/finetune/
popperian.rs