1#![doc = "Text integrity detection pipeline"]
2#![deny(clippy::unwrap_used)]
7#![deny(clippy::expect_used)]
8#![deny(clippy::panic)]
9#![warn(missing_docs)]
10pub mod aggregation;
11pub mod bloom;
12pub mod burstiness;
13pub mod chemistry;
14pub mod classify;
15pub mod entropy;
16pub mod perplexity;
17pub mod tokenize;
18pub mod zipf;
19
20use crate::aggregation::RawFeatures;
21use crate::bloom::BloomThresholds;
22use crate::classify::Verdict;
23use serde::{Deserialize, Serialize};
24
/// Minimum token count [`run_pipeline`] requires; shorter inputs produce an error result.
pub const MIN_TOKENS: usize = 50;
/// Window size that [`run_pipeline`] passes to `entropy::entropy_profile`.
pub const DEFAULT_WINDOW_SIZE: usize = 50;
/// Window step that [`run_pipeline`] passes to `entropy::entropy_profile`.
pub const DEFAULT_WINDOW_STEP: usize = 25;
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct PipelineResult {
35 pub verdict: Verdict,
37 pub probability: f64,
38 pub confidence: f64,
39 pub threshold: f64,
40
41 pub bloom_level: u8,
43 pub bloom_name: String,
44 pub preset_name: String,
45
46 pub total_tokens: usize,
48 pub unique_tokens: usize,
49 pub ttr: f64,
50
51 pub zipf_deviation: f64,
53 pub zipf_alpha: f64,
54 pub zipf_r_squared: f64,
55 pub entropy_mean: f64,
56 pub entropy_std: f64,
57 pub entropy_window_count: usize,
58 pub burstiness_coeff: f64,
59 pub burstiness_tokens_analyzed: usize,
60 pub perplexity_mean: f64,
61 pub perplexity_var: f64,
62 pub perplexity_sentence_count: usize,
63 pub ttr_deviation: f64,
64
65 pub normalized: [f64; 5],
67 pub weights: [f64; 5],
68
69 pub beer_lambert_score: f64,
71 pub composite: f64,
72 pub hill_score: f64,
73
74 pub error: String,
76}
77
78impl PipelineResult {
79 fn error(msg: String) -> Self {
81 Self {
82 verdict: Verdict::Human,
83 probability: 0.0,
84 confidence: 0.0,
85 threshold: 0.0,
86 bloom_level: 0,
87 bloom_name: String::new(),
88 preset_name: String::new(),
89 total_tokens: 0,
90 unique_tokens: 0,
91 ttr: 0.0,
92 zipf_deviation: 0.0,
93 zipf_alpha: 0.0,
94 zipf_r_squared: 0.0,
95 entropy_mean: 0.0,
96 entropy_std: 0.0,
97 entropy_window_count: 0,
98 burstiness_coeff: 0.0,
99 burstiness_tokens_analyzed: 0,
100 perplexity_mean: 0.0,
101 perplexity_var: 0.0,
102 perplexity_sentence_count: 0,
103 ttr_deviation: 0.0,
104 normalized: [0.0; 5],
105 weights: aggregation::WEIGHTS,
106 beer_lambert_score: 0.0,
107 composite: 0.0,
108 hill_score: 0.0,
109 error: msg,
110 }
111 }
112}
113
114pub fn run_pipeline(text: &str, bloom_level: u8, preset: &str) -> PipelineResult {
116 let stats = tokenize::tokenize(text);
118
119 if stats.total_tokens < MIN_TOKENS {
120 return PipelineResult::error(format!(
121 "Need at least {} tokens, got {}",
122 MIN_TOKENS, stats.total_tokens
123 ));
124 }
125
126 let bloom_thresholds = BloomThresholds::from_name(preset);
128 let threshold = bloom_thresholds
129 .threshold_for_level(bloom_level)
130 .unwrap_or(0.64);
131 let bloom_name = BloomThresholds::level_name(bloom_level)
132 .unwrap_or("Unknown")
133 .to_string();
134
135 let zipf_result = zipf::zipf_analysis(&stats.frequencies);
137 let entropy_profile =
138 entropy::entropy_profile(&stats.tokens, DEFAULT_WINDOW_SIZE, DEFAULT_WINDOW_STEP);
139 let burst_result = burstiness::burstiness_analysis(&stats.tokens, &stats.frequencies);
140 let perp_result = perplexity::perplexity_variance(text);
141 let ttr_dev = tokenize::ttr_deviation(stats.ttr);
142
143 let raw = RawFeatures {
145 zipf_deviation: zipf_result.deviation,
146 entropy_std: entropy_profile.std_dev,
147 burstiness: burst_result.coefficient,
148 perplexity_var: perp_result.variance,
149 ttr_deviation: ttr_dev,
150 };
151 let agg = aggregation::aggregate(&raw);
152
153 let classification = classify::classify_with_threshold(agg.hill_score, threshold);
155
156 PipelineResult {
157 verdict: classification.verdict,
158 probability: classification.probability,
159 confidence: classification.confidence,
160 threshold,
161 bloom_level,
162 bloom_name,
163 preset_name: bloom_thresholds.name,
164 total_tokens: stats.total_tokens,
165 unique_tokens: stats.unique_tokens,
166 ttr: stats.ttr,
167 zipf_deviation: zipf_result.deviation,
168 zipf_alpha: zipf_result.alpha,
169 zipf_r_squared: zipf_result.r_squared,
170 entropy_mean: entropy_profile.mean,
171 entropy_std: entropy_profile.std_dev,
172 entropy_window_count: entropy_profile.window_count,
173 burstiness_coeff: burst_result.coefficient,
174 burstiness_tokens_analyzed: burst_result.tokens_analyzed,
175 perplexity_mean: perp_result.mean_entropy,
176 perplexity_var: perp_result.variance,
177 perplexity_sentence_count: perp_result.sentence_count,
178 ttr_deviation: ttr_dev,
179 normalized: agg.normalized,
180 weights: aggregation::WEIGHTS,
181 beer_lambert_score: agg.beer_lambert_score,
182 composite: agg.composite,
183 hill_score: agg.hill_score,
184 error: String::new(),
185 }
186}