1use std::collections::HashMap;
7use std::time::{SystemTime, UNIX_EPOCH};
8
/// Returns the current wall-clock time as nanoseconds since the Unix epoch.
///
/// When the system clock reports a moment before the epoch the result is `0`.
/// The nanosecond count is a `u128` that is narrowed with `as`, so only its
/// low 64 bits are kept.
fn nano_id() -> u64 {
    let since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed,
        // A clock set before 1970 yields an error; treat it as zero elapsed time.
        Err(_) => std::time::Duration::default(),
    };
    since_epoch.as_nanos() as u64
}
17
/// Maps `seed` to an id with one multiply-then-add step in wrapping 64-bit
/// arithmetic (`seed * MULTIPLIER + INCREMENT`).
///
/// The mapping is deterministic: equal seeds always give equal ids.
fn nano_id_seeded(seed: u64) -> u64 {
    const MULTIPLIER: u64 = 6364136223846793005;
    const INCREMENT: u64 = 1442695040888963407;

    let scaled = seed.wrapping_mul(MULTIPLIER);
    scaled.wrapping_add(INCREMENT)
}
22
/// Deployment tier assigned to a tile.
///
/// The enum carries no data, so besides `Debug`/`Clone`/`PartialEq` it also
/// derives `Copy`, `Eq` and `Hash`; tiers can be passed by value and used as
/// map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DeployTier {
    Live,
    Monitored,
    HumanGated,
}

impl std::fmt::Display for DeployTier {
    /// Writes the variant name (`"Live"`, `"Monitored"` or `"HumanGated"`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            DeployTier::Live => "Live",
            DeployTier::Monitored => "Monitored",
            DeployTier::HumanGated => "HumanGated",
        };
        // `pad` honours width/alignment flags such as `{:>10}`; a bare
        // `write!(f, "...")` would silently ignore them.
        f.pad(label)
    }
}
41
42#[derive(Debug, Clone)]
43pub struct Tile {
44 pub id: u64,
45 pub content: String,
46 pub source: String,
47 pub confidence: f32,
48 pub trust: f32,
49 pub relevance: f32,
50 pub tier: Option<DeployTier>,
51 pub tags: Vec<String>,
52 pub byte_size: usize,
53}
54
55impl Tile {
56 pub fn new(content: impl Into<String>, source: impl Into<String>) -> Self {
57 let content = content.into();
58 let byte_size = content.len();
59 Self {
60 id: nano_id(),
61 content,
62 source: source.into(),
63 confidence: 0.0,
64 trust: 0.0,
65 relevance: 0.0,
66 tier: None,
67 tags: Vec::new(),
68 byte_size,
69 }
70 }
71
72 pub fn with_id(mut self, id: u64) -> Self {
73 self.id = id;
74 self
75 }
76
77 pub fn belief_score(&self) -> f32 {
78 (self.confidence + self.trust + self.relevance) / 3.0
79 }
80}
81
/// Raw input handed to the pipeline before any tiles exist.
#[derive(Debug, Clone)]
pub struct SeedInput {
    /// Unprocessed seed text.
    pub raw: String,
    /// Label of where the seed came from.
    pub source: String,
    /// Multiplier applied to weight-dependent scores; `1.0` unless overridden.
    pub weight: f32,
}

impl SeedInput {
    /// Creates a seed with the given text and source and a weight of `1.0`.
    pub fn new(raw: impl Into<String>, source: impl Into<String>) -> Self {
        let raw_text: String = raw.into();
        let origin: String = source.into();
        SeedInput {
            raw: raw_text,
            source: origin,
            weight: 1.0,
        }
    }

    /// Returns the seed with its weight replaced by `weight`.
    pub fn with_weight(self, weight: f32) -> Self {
        SeedInput { weight, ..self }
    }
}
103
/// Describes why a particular tile failed validation.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// printed, boxed as `Box<dyn Error>` and propagated with `?`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationError {
    /// Id of the tile the error refers to.
    pub tile_id: u64,
    /// Human-readable explanation of the failure.
    pub reason: String,
}

impl std::fmt::Display for ValidationError {
    /// Formats the error as `tile <id>: <reason>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "tile {}: {}", self.tile_id, self.reason)
    }
}

impl std::error::Error for ValidationError {}
109
110#[derive(Debug)]
111pub struct PipelineResult {
112 pub tiles_produced: Vec<Tile>,
113 pub tiles_rejected: Vec<(Tile, String)>,
114 pub compression_ratio: f64,
115 pub tier_distribution: HashMap<String, usize>,
116 pub stage_timings_ns: Vec<(String, u64)>,
117}
118
119impl PipelineResult {
120 pub fn live_count(&self) -> usize {
121 self.tier_distribution.get("Live").copied().unwrap_or(0)
122 }
123 pub fn monitored_count(&self) -> usize {
124 self.tier_distribution.get("Monitored").copied().unwrap_or(0)
125 }
126 pub fn human_gated_count(&self) -> usize {
127 self.tier_distribution.get("HumanGated").copied().unwrap_or(0)
128 }
129}
130
/// One step of the pipeline: consumes an `Input` and produces an `Output`.
pub trait Stage {
    /// Value the stage consumes.
    type Input;
    /// Value the stage produces.
    type Output;

    /// Fixed, human-readable label of the stage.
    fn name(&self) -> &'static str;

    /// Runs the stage on `input` and returns its result.
    fn process(&self, input: Self::Input) -> Self::Output;
}
139
/// Configuration of the extraction stage, which expands seeds into tiles.
#[derive(Debug, Clone)]
pub struct ExtractStage {
    /// Tiles emitted per seed: the seed tile itself plus
    /// `expansion_factor - 1` derived variants (values of 0 or 1 yield the
    /// seed tile only).
    pub expansion_factor: usize,
}

impl Default for ExtractStage {
    /// Uses an expansion factor of 43.
    fn default() -> Self {
        Self { expansion_factor: 43 }
    }
}
152
153impl Stage for ExtractStage {
154 type Input = Vec<SeedInput>;
155 type Output = Vec<Tile>;
156
157 fn name(&self) -> &'static str { "EXTRACT" }
158
159 fn process(&self, seeds: Vec<SeedInput>) -> Vec<Tile> {
160 let mut tiles = Vec::new();
161 let mut counter: u64 = 1;
162
163 for seed in &seeds {
164 let base_id = nano_id_seeded(counter);
166 counter = counter.wrapping_add(1);
167
168 let seed_tile = Tile {
169 id: base_id,
170 content: seed.raw.clone(),
171 source: seed.source.clone(),
172 confidence: 0.5 * seed.weight,
173 trust: 0.5,
174 relevance: 0.5,
175 tier: None,
176 tags: vec!["seed".to_string()],
177 byte_size: seed.raw.len(),
178 };
179 tiles.push(seed_tile);
180
181 for i in 1..self.expansion_factor {
183 let derived_id = nano_id_seeded(counter);
184 counter = counter.wrapping_add(1);
185
186 let variant_content = format!(
187 "{} [variant:{} src:{}]",
188 &seed.raw,
189 i,
190 &seed.source
191 );
192 let byte_size = variant_content.len();
193
194 let confidence = clamp(0.3 + (i as f32 * 0.015) * seed.weight, 0.0, 1.0);
195 let trust = clamp(0.4 + (i as f32 * 0.01), 0.0, 1.0);
196 let relevance = clamp(0.35 + (i as f32 * 0.012) * seed.weight, 0.0, 1.0);
197
198 tiles.push(Tile {
199 id: derived_id,
200 content: variant_content,
201 source: seed.source.clone(),
202 confidence,
203 trust,
204 relevance,
205 tier: None,
206 tags: vec![format!("derived:{}", i)],
207 byte_size,
208 });
209 }
210 }
211
212 tiles
213 }
214}
215
/// Configuration of the validation stage.
#[derive(Debug, Clone)]
pub struct ValidateStage {
    /// Smallest accepted content length, compared against `content.len()` (bytes).
    pub min_content_len: usize,
    /// Largest accepted content length, compared against `content.len()` (bytes).
    pub max_content_len: usize,
}

impl Default for ValidateStage {
    /// Accepts content between 3 and 4096 bytes long.
    fn default() -> Self {
        Self {
            min_content_len: 3,
            max_content_len: 4096,
        }
    }
}
231
232impl Stage for ValidateStage {
233 type Input = Vec<Tile>;
234 type Output = (Vec<Tile>, Vec<(Tile, String)>);
235
236 fn name(&self) -> &'static str { "VALIDATE" }
237
238 fn process(&self, tiles: Vec<Tile>) -> (Vec<Tile>, Vec<(Tile, String)>) {
239 let mut valid = Vec::new();
240 let mut rejected = Vec::new();
241
242 for tile in tiles {
243 if let Some(reason) = self.check(&tile) {
244 rejected.push((tile, reason));
245 } else {
246 valid.push(tile);
247 }
248 }
249
250 (valid, rejected)
251 }
252}
253
254impl ValidateStage {
255 fn check(&self, tile: &Tile) -> Option<String> {
256 if tile.content.trim().is_empty() {
257 return Some("empty content".to_string());
258 }
259 if tile.content.len() < self.min_content_len {
260 return Some(format!("content too short ({})", tile.content.len()));
261 }
262 if tile.content.len() > self.max_content_len {
263 return Some(format!("content too long ({})", tile.content.len()));
264 }
265 if tile.source.trim().is_empty() {
266 return Some("missing source".to_string());
267 }
268 if tile.content.contains('\0') {
270 return Some("null byte in content".to_string());
271 }
272 None
273 }
274}
275
/// Scoring stage; it has no configuration.
///
/// `Default` is derived rather than hand-written, and the unit struct is also
/// `Debug`, `Clone` and `Copy`.
#[derive(Debug, Clone, Copy, Default)]
pub struct ScoreStage;
283
284impl Stage for ScoreStage {
285 type Input = Vec<Tile>;
286 type Output = Vec<Tile>;
287
288 fn name(&self) -> &'static str { "SCORE" }
289
290 fn process(&self, mut tiles: Vec<Tile>) -> Vec<Tile> {
291 for tile in &mut tiles {
292 let content_density = content_density_score(&tile.content);
294 let tag_boost = if tile.tags.contains(&"seed".to_string()) { 0.1 } else { 0.0 };
295
296 tile.confidence = clamp(tile.confidence + content_density * 0.2 + tag_boost, 0.0, 1.0);
297 tile.trust = clamp(tile.trust + tag_boost, 0.0, 1.0);
298 tile.relevance = clamp(tile.relevance + content_density * 0.15, 0.0, 1.0);
299 }
300 tiles
301 }
302}
303
/// Scores how word-dense `content` is, in `[0.0, 1.0]`.
///
/// The score is `5 * words / characters`, capped at `1.0`, where words are
/// whitespace-separated runs. Empty content scores `0.0`.
///
/// Characters are counted as Unicode scalar values rather than UTF-8 bytes, so
/// text in multi-byte scripts is not scored lower than equivalent ASCII text.
fn content_density_score(content: &str) -> f32 {
    let chars = content.chars().count();
    if chars == 0 {
        return 0.0;
    }
    let words = content.split_whitespace().count();
    let ratio = words as f32 / chars as f32;
    // `ratio` is never negative or NaN here, so only the upper bound needs enforcing.
    (ratio * 5.0).min(1.0)
}
312
/// Configuration of the tiering stage.
#[derive(Debug, Clone)]
pub struct TierStage {
    /// Belief scores at or above this value are tiered `Live`.
    pub live_threshold: f32,
    /// Belief scores at or above this value (but below `live_threshold`) are
    /// tiered `Monitored`; anything lower is `HumanGated`.
    pub monitored_threshold: f32,
}

impl Default for TierStage {
    /// Uses `0.63` for the live threshold and `0.42` for the monitored threshold.
    fn default() -> Self {
        Self {
            live_threshold: 0.63,
            monitored_threshold: 0.42,
        }
    }
}
328
329impl Stage for TierStage {
330 type Input = Vec<Tile>;
331 type Output = Vec<Tile>;
332
333 fn name(&self) -> &'static str { "TIER" }
334
335 fn process(&self, mut tiles: Vec<Tile>) -> Vec<Tile> {
336 for tile in &mut tiles {
337 let score = tile.belief_score();
338 tile.tier = Some(if score >= self.live_threshold {
339 DeployTier::Live
340 } else if score >= self.monitored_threshold {
341 DeployTier::Monitored
342 } else {
343 DeployTier::HumanGated
344 });
345 }
346 tiles
347 }
348}
349
/// Configuration of the commit stage.
#[derive(Debug, Clone)]
pub struct CommitStage {
    /// Reference size in bytes; the commit stage divides it by the total tile
    /// bytes to obtain the compression ratio.
    pub model_bytes: u64,
}

impl Default for CommitStage {
    /// Uses a reference size of 4.4 billion bytes.
    fn default() -> Self {
        Self { model_bytes: 4_400_000_000 }
    }
}
364
365pub struct CommitOutput {
366 pub committed: Vec<Tile>,
367 pub rejected: Vec<(Tile, String)>,
368 pub compression_ratio: f64,
369 pub tier_distribution: HashMap<String, usize>,
370}
371
372impl Stage for CommitStage {
373 type Input = (Vec<Tile>, Vec<(Tile, String)>);
374 type Output = CommitOutput;
375
376 fn name(&self) -> &'static str { "COMMIT" }
377
378 fn process(&self, (tiles, already_rejected): (Vec<Tile>, Vec<(Tile, String)>)) -> CommitOutput {
379 let mut tier_distribution: HashMap<String, usize> = HashMap::new();
380 let mut committed = Vec::new();
381
382 for tile in tiles {
383 let tier_key = tile.tier.as_ref()
384 .map(|t| t.to_string())
385 .unwrap_or_else(|| "Unknown".to_string());
386 *tier_distribution.entry(tier_key).or_insert(0) += 1;
387 committed.push(tile);
388 }
389
390 let total_tile_bytes: u64 = committed.iter().map(|t| t.byte_size as u64).sum();
392 let compression_ratio = if total_tile_bytes > 0 {
393 self.model_bytes as f64 / total_tile_bytes as f64
394 } else {
395 0.0
396 };
397
398 CommitOutput {
399 committed,
400 rejected: already_rejected,
401 compression_ratio,
402 tier_distribution,
403 }
404 }
405}
406
407pub fn forge_pipeline(seeds: Vec<SeedInput>) -> PipelineResult {
410 forge_pipeline_with_stages(
411 seeds,
412 ExtractStage::default(),
413 ValidateStage::default(),
414 ScoreStage,
415 TierStage::default(),
416 CommitStage::default(),
417 )
418}
419
420pub fn forge_pipeline_with_stages(
421 seeds: Vec<SeedInput>,
422 extract: ExtractStage,
423 validate: ValidateStage,
424 score: ScoreStage,
425 tier: TierStage,
426 commit: CommitStage,
427) -> PipelineResult {
428 let mut timings = Vec::new();
429
430 let t0 = nano_id();
432 let raw_tiles = extract.process(seeds);
433 timings.push(("EXTRACT".to_string(), nano_id().saturating_sub(t0)));
434
435 let t0 = nano_id();
437 let (valid_tiles, rejected_tiles) = validate.process(raw_tiles);
438 timings.push(("VALIDATE".to_string(), nano_id().saturating_sub(t0)));
439
440 let t0 = nano_id();
442 let scored_tiles = score.process(valid_tiles);
443 timings.push(("SCORE".to_string(), nano_id().saturating_sub(t0)));
444
445 let t0 = nano_id();
447 let tiered_tiles = tier.process(scored_tiles);
448 timings.push(("TIER".to_string(), nano_id().saturating_sub(t0)));
449
450 let t0 = nano_id();
452 let commit_out = commit.process((tiered_tiles, rejected_tiles));
453 timings.push(("COMMIT".to_string(), nano_id().saturating_sub(t0)));
454
455 PipelineResult {
456 compression_ratio: commit_out.compression_ratio,
457 tier_distribution: commit_out.tier_distribution,
458 tiles_produced: commit_out.committed,
459 tiles_rejected: commit_out.rejected,
460 stage_timings_ns: timings,
461 }
462}
463
/// Restricts `v` to the range `[lo, hi]`.
///
/// Values below `lo` become `lo`, values above `hi` become `hi`, and anything
/// else (including NaN, which fails both comparisons) is returned unchanged.
fn clamp(v: f32, lo: f32, hi: f32) -> f32 {
    match v {
        below if below < lo => lo,
        above if above > hi => hi,
        within => within,
    }
}
469
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `n` seeds with multi-word ASCII content and distinct sources.
    fn make_seeds(n: usize) -> Vec<SeedInput> {
        (0..n)
            .map(|i| {
                SeedInput::new(
                    format!("seed content number {} with enough words for density", i),
                    format!("source-{}", i),
                )
            })
            .collect()
    }

    #[test]
    fn test_extract_stage_expansion() {
        let stage = ExtractStage { expansion_factor: 43 };
        let seeds = make_seeds(59);
        let tiles = stage.process(seeds);
        assert!(tiles.len() >= 2501, "expected ≥2501 tiles, got {}", tiles.len());
        for t in &tiles {
            assert!(t.id > 0);
        }
    }

    #[test]
    fn test_validate_stage_filters() {
        let stage = ValidateStage::default();
        let tiles = vec![
            Tile::new("valid content here", "src").with_id(1),
            // Empty content.
            Tile::new("", "src").with_id(2),
            // Shorter than the default minimum of 3 bytes.
            Tile::new("ab", "src").with_id(3),
            // Long enough to pass the length checks, so the missing source is
            // what actually gets it rejected.
            Tile::new("ok content", "").with_id(4),
            // Embedded NUL byte.
            Tile::new("null\0byte", "src").with_id(5),
        ];
        let (valid, rejected) = stage.process(tiles);
        assert_eq!(valid.len(), 1);
        assert_eq!(valid[0].id, 1);
        assert_eq!(rejected.len(), 4);

        // Every rejection must come from the branch it is meant to exercise.
        let reasons: Vec<&str> = rejected.iter().map(|(_, why)| why.as_str()).collect();
        assert_eq!(
            reasons,
            vec![
                "empty content",
                "content too short (2)",
                "missing source",
                "null byte in content",
            ]
        );
    }

    #[test]
    fn test_score_stage_adjusts_beliefs() {
        let stage = ScoreStage;
        let mut tile = Tile::new("hello world this is a test sentence", "src");
        tile.id = 42;
        tile.confidence = 0.5;
        tile.trust = 0.5;
        tile.relevance = 0.5;
        tile.tags = vec!["seed".to_string()];

        let out = stage.process(vec![tile]);
        assert_eq!(out.len(), 1);
        assert!(out[0].confidence >= 0.5);
        assert!(out[0].trust >= 0.5);
    }

    #[test]
    fn test_tier_stage_classification() {
        let stage = TierStage::default();
        let mut high = Tile::new("x", "src").with_id(1);
        high.confidence = 0.9;
        high.trust = 0.9;
        high.relevance = 0.9;

        let mut mid = Tile::new("x", "src").with_id(2);
        mid.confidence = 0.45;
        mid.trust = 0.45;
        mid.relevance = 0.45;

        let mut low = Tile::new("x", "src").with_id(3);
        low.confidence = 0.1;
        low.trust = 0.1;
        low.relevance = 0.1;

        let out = stage.process(vec![high, mid, low]);
        assert_eq!(out[0].tier, Some(DeployTier::Live));
        assert_eq!(out[1].tier, Some(DeployTier::Monitored));
        assert_eq!(out[2].tier, Some(DeployTier::HumanGated));
    }

    #[test]
    fn test_commit_stage_compression_and_distribution() {
        let stage = CommitStage { model_bytes: 4_400_000_000 };

        let mut t1 = Tile::new("a".repeat(1000), "src").with_id(1);
        t1.tier = Some(DeployTier::Live);
        t1.byte_size = 1000;

        let mut t2 = Tile::new("b".repeat(1000), "src").with_id(2);
        t2.tier = Some(DeployTier::Monitored);
        t2.byte_size = 1000;

        let out = stage.process((vec![t1, t2], vec![]));
        assert_eq!(out.committed.len(), 2);
        assert_eq!(*out.tier_distribution.get("Live").unwrap(), 1);
        assert_eq!(*out.tier_distribution.get("Monitored").unwrap(), 1);
        assert!(out.compression_ratio > 1.0);
    }

    #[test]
    fn test_pipeline_happy_path() {
        let seeds = make_seeds(10);
        let result = forge_pipeline(seeds);

        assert!(!result.tiles_produced.is_empty(), "should produce tiles");
        assert!(result.compression_ratio > 0.0, "compression ratio must be positive");
        assert!(!result.tier_distribution.is_empty(), "tier distribution must be non-empty");
    }

    #[test]
    fn test_pipeline_all_rejected() {
        let seeds = vec![
            SeedInput { raw: String::new(), source: "src".to_string(), weight: 1.0 },
        ];

        // Factor 1 emits only the seed tile, whose empty content must be rejected.
        let extract = ExtractStage { expansion_factor: 1 };
        let validate = ValidateStage::default();
        let score = ScoreStage;
        let tier = TierStage::default();
        let commit = CommitStage::default();

        let result = forge_pipeline_with_stages(seeds, extract, validate, score, tier, commit);
        assert!(result.tiles_produced.is_empty(), "no tile should survive validation");
        assert_eq!(result.tiles_rejected.len(), 1, "the single seed tile must be rejected");
        assert_eq!(result.tiles_rejected[0].1, "empty content");
        assert!(result.tier_distribution.is_empty());
        assert_eq!(result.compression_ratio, 0.0);
    }

    #[test]
    fn test_pipeline_mixed_tiers() {
        let seeds: Vec<SeedInput> = vec![
            SeedInput::new("high quality seed with many informative words", "high-src")
                .with_weight(2.0),
            SeedInput::new("medium quality content here", "mid-src").with_weight(1.0),
            SeedInput::new("low", "low-src").with_weight(0.1),
        ];

        let result = forge_pipeline(seeds);
        assert!(!result.tiles_produced.is_empty());

        assert!(
            result.tier_distribution.len() >= 2,
            "expected mixed tiers, got: {:?}",
            result.tier_distribution
        );
    }

    #[test]
    fn test_compression_ratio_880_to_1() {
        let commit = CommitStage { model_bytes: 4_400_000_000 };

        // 5000 tiles of 1000 bytes each: 4.4e9 / 5e6 = 880.
        let tile_count = 5000;
        let content = "x".repeat(1000);
        let tiles: Vec<Tile> = (0..tile_count)
            .map(|i| {
                let mut t = Tile::new(content.clone(), "src").with_id(i as u64 + 1);
                t.tier = Some(DeployTier::Live);
                t.byte_size = 1000;
                t
            })
            .collect();

        let out = commit.process((tiles, vec![]));
        let ratio = out.compression_ratio;

        assert!(
            (ratio - 880.0).abs() < 0.01,
            "expected compression ratio ~880:1, got {:.2}",
            ratio
        );
    }

    #[test]
    fn test_tier_distribution_live_dominates_good_input() {
        let seeds = make_seeds(59);
        let result = forge_pipeline(seeds);

        let live = result.live_count();
        let monitored = result.monitored_count();
        let human_gated = result.human_gated_count();

        assert!(
            live > monitored,
            "Live ({}) should exceed Monitored ({}) for good input",
            live,
            monitored
        );
        assert!(
            live > human_gated,
            "Live ({}) should exceed HumanGated ({}) for good input",
            live,
            human_gated
        );
    }

    #[test]
    fn test_empty_input() {
        let result = forge_pipeline(vec![]);
        assert!(result.tiles_produced.is_empty());
        assert!(result.tiles_rejected.is_empty());
        assert_eq!(result.compression_ratio, 0.0);
    }

    #[test]
    fn test_single_tile() {
        let seeds = vec![SeedInput::new("a single seed input tile", "single-src")];
        let extract = ExtractStage { expansion_factor: 1 };
        let result = forge_pipeline_with_stages(
            seeds,
            extract,
            ValidateStage::default(),
            ScoreStage,
            TierStage::default(),
            CommitStage::default(),
        );
        assert_eq!(result.tiles_produced.len(), 1);
        assert!(result.tiles_rejected.is_empty());
    }

    #[test]
    fn test_all_tiles_fail_validation() {
        let stage = ValidateStage { min_content_len: 10_000, max_content_len: 10_001 };
        let tiles = vec![
            Tile::new("short", "src").with_id(1),
            Tile::new("also short", "src").with_id(2),
        ];
        let (valid, rejected) = stage.process(tiles);
        assert!(valid.is_empty());
        assert_eq!(rejected.len(), 2);
    }

    #[test]
    fn test_59_seeds_produce_2501_tiles() {
        let stage = ExtractStage { expansion_factor: 43 };
        let seeds = make_seeds(59);
        let tiles = stage.process(seeds);
        assert!(
            tiles.len() >= 2501,
            "JC1's 59 seeds should expand to ≥2501 tiles, got {}",
            tiles.len()
        );
    }

    #[test]
    fn test_stage_names() {
        assert_eq!(ExtractStage::default().name(), "EXTRACT");
        assert_eq!(ValidateStage::default().name(), "VALIDATE");
        assert_eq!(ScoreStage.name(), "SCORE");
        assert_eq!(TierStage::default().name(), "TIER");
        assert_eq!(CommitStage::default().name(), "COMMIT");
    }
}