use anyhow::Result;
use regex::Regex;
use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use uuid::Uuid;

pub const MIN_USER_BYTES: usize = 70 * 1024;
const YVDB_URL: &str = "http://127.0.0.1:8080";
const COLLECTION: &str = "stylometry_refs";
const DIMENSION: u32 = 384;
const METRIC: &str = "cosine";
const CHUNK_SIZE: usize = 512;
const OVERLAP: usize = 100;

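/// Bulk-upsert payload posted to the yvdb collection upsert endpoint.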
#[derive(Serialize, Deserialize)]
struct UpsertRequest {
    dimension: u32,
    metric: String,
    records: Vec<Record>,
}

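/// One stored vector record: an id, its embedding, and metadata (here the `category` label).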
#[derive(Serialize, Deserialize)]
struct Record {
    id: String,
    vector: Vec<f32>,
    metadata: HashMap<String, String>,
}

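/// Nearest-neighbour query payload sent to the yvdb query endpoint.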
#[derive(Serialize, Deserialize)]
struct QueryRequest {
    vector: Vec<f32>,
    k: u32,
    return_distance: bool,
}

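/// Wrapper around the list of hits returned by a yvdb query.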
#[derive(Serialize, Deserialize)]
struct QueryResponse {
    results: Vec<ResultItem>,
}

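/// A single query hit: record id, match score, and the stored metadata.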
#[derive(Serialize, Deserialize)]
struct ResultItem {
    id: String,
    score: f32,
    metadata: HashMap<String, String>,
}

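/// Outcome of a stylometry analysis: the dominant category, per-category
/// percentages, and a short explanation for each matched category.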
#[derive(Serialize)]
pub struct StylometryResult {
    pub dominant: String,
    pub percentages: HashMap<String, f32>,
    pub explanations: HashMap<String, String>,
}

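/// Checks whether the reference collection already holds records by reading its
/// stats endpoint; any non-success response is treated as "not seeded".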
pub async fn is_seeded(client: &Client) -> Result<bool> {
    let res = client
        .get(format!("{}/collections/{}/stats", YVDB_URL, COLLECTION))
        .send()
        .await?;
    if res.status().is_success() {
        let stats: serde_json::Value = res.json().await?;
        Ok(stats["count"].as_u64().unwrap_or(0) > 0)
    } else {
        Ok(false)
    }
}

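/// Chunks and embeds each category prototype text, then upserts the resulting
/// records (tagged with their category) into the reference collection.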
pub async fn pre_seed_categories(client: &Client) -> Result<()> {
    let prototypes = get_prototypes();
    let mut records = Vec::new();

    let total_chunks: usize = prototypes
        .iter()
        .map(|(_, text)| chunk_text(text).len())
        .sum();
    let mut embedded_chunks = 0usize;

    for (label, text) in prototypes {
        let chunks = chunk_text(&text);
        let embeddings = embed_chunks(&chunks);
        for emb in embeddings {
            let mut metadata = HashMap::new();
            metadata.insert("category".to_string(), label.clone());
            records.push(Record {
                id: Uuid::new_v4().to_string(),
                vector: emb,
                metadata,
            });
            embedded_chunks += 1;
            if embedded_chunks % 50 == 0 || embedded_chunks == total_chunks {
                println!(
                    " pre-seed embedding progress: {}/{} chunks",
                    embedded_chunks, total_chunks
                );
            }
        }
    }

    println!(
        " pre-seed embedding complete: {} chunks -> {} records",
        embedded_chunks,
        records.len()
    );

    let upsert = UpsertRequest {
        dimension: DIMENSION,
        metric: METRIC.to_string(),
        records,
    };

    println!(
        " pre-seed upsert starting: {} records",
        upsert.records.len()
    );
    let res = client
        .post(format!("{}/collections/{}/upsert", YVDB_URL, COLLECTION))
        .json(&upsert)
        .send()
        .await?;
    res.error_for_status()?;
    println!(" pre-seed upsert done");

    Ok(())
}

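/// Chunks and embeds the input text, queries the reference collection for each
/// chunk, accumulates per-category scores from the hits, and normalizes them
/// into percentages, reporting the top-scoring category as dominant.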
pub async fn analyze(client: &Client, text: &str) -> Result<StylometryResult> {
    let chunks = chunk_text(text);
    let embeddings = embed_chunks(&chunks);

    let mut category_scores = HashMap::new();

    for emb in embeddings {
        let query = QueryRequest {
            vector: emb,
            k: 10,
            return_distance: false,
        };

        let res = client
            .post(format!("{}/collections/{}/query", YVDB_URL, COLLECTION))
            .json(&query)
            .send()
            .await?;
        let response: QueryResponse = res.json().await?;

        for item in response.results {
            if let Some(category) = item.metadata.get("category") {
                *category_scores.entry(category.clone()).or_insert(0.0) += item.score;
            }
        }
    }

    // Normalize per-category totals into fractions; guard against an empty or
    // all-zero score map so the normalization never divides by zero.
    let total = category_scores.values().sum::<f32>();
    let mut scored: Vec<(String, f32)> = category_scores
        .into_iter()
        .map(|(cat, score)| (cat, if total > 0.0 { score / total } else { 0.0 }))
        .collect();
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

    let mut percentages = HashMap::new();
    let mut explanations_ordered = HashMap::new();
    let mut dominant = String::new();
    let exps = get_explanations();
    for (idx, (cat, perc)) in scored.into_iter().enumerate() {
        if idx == 0 {
            dominant = cat.clone();
        }
        percentages.insert(cat.clone(), perc);
        if let Some(exp) = exps.get(&cat) {
            explanations_ordered.insert(cat, exp.clone());
        }
    }

    Ok(StylometryResult {
        dominant,
        percentages,
        explanations: explanations_ordered,
    })
}

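/// Extracts the user-authored turns from a pasted transcript by taking the text
/// between "You said:" and "ChatGPT said:" markers; falls back to the full
/// input when no markers are found.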
pub fn extract_user_content(text: &str) -> String {
    let re = Regex::new(r"(?is)You said:(.*?)ChatGPT said:").unwrap();
    let mut extracted = String::new();
    for cap in re.captures_iter(text) {
        extracted.push_str(cap[1].trim());
        extracted.push('\n');
    }
    if extracted.is_empty() {
        text.to_string()
    } else {
        extracted
    }
}

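/// Splits text into overlapping word windows of CHUNK_SIZE words, with OVERLAP
/// words shared between consecutive chunks.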
fn chunk_text(text: &str) -> Vec<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let mut chunks = Vec::new();
    let mut i = 0;
    while i < words.len() {
        let end = (i + CHUNK_SIZE).min(words.len());
        chunks.push(words[i..end].join(" "));
        if end == words.len() {
            break;
        }
        i = end.saturating_sub(OVERLAP);
    }
    chunks
}

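/// Produces a lightweight hashed bag-of-words embedding per chunk: each token is
/// hashed into one of DIMENSION buckets, counted, and the vector is L2-normalized
/// so cosine scores are comparable across chunk lengths.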
fn embed_chunks(chunks: &[String]) -> Vec<Vec<f32>> {
    chunks
        .iter()
        .map(|chunk| {
            let mut vector = vec![0.0f32; DIMENSION as usize];
            for token in chunk.split_whitespace() {
                let mut hasher = DefaultHasher::new();
                token.hash(&mut hasher);
                let bucket = (hasher.finish() % DIMENSION as u64) as usize;
                vector[bucket] += 1.0;
            }
            let norm = vector.iter().map(|v| v * v).sum::<f32>().sqrt();
            if norm > 0.0 {
                for v in &mut vector {
                    *v /= norm;
                }
            }
            vector
        })
        .collect()
}

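/// Returns the (category label, prototype text) pairs used to seed the
/// reference collection.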
fn get_prototypes() -> Vec<(String, String)> {
    vec![
        ("System-Engineer Mind".to_string(), "When troubleshooting a system crash, I begin by isolating the fault domain. Is it hardware, software, or network? I recurse into the logs, checking timestamps for anomalies. If the CPU spike occurs before the memory leak, that suggests a trigger in the computation layer. I debug my own hypothesis: what if the assumption about thread safety is wrong? Layer by layer, I test with mocks, verifying each component in isolation. Thoughts loop: re-run the test, adjust the variable, observe the output change. This recursive approach ensures I catch the root cause, not just symptoms. In code, I use binary search on the commit history to pinpoint the breaking change. Real-time adjustment: if the error shifts, I update my mental model accordingly. This method, honed through years of system building, turns chaos into structured understanding. For example, in a distributed database, I traced a replication lag to a misconfigured clock skew by recursively checking sync points across nodes. Each step builds on the last, debugging not just the system but my thinking process. ... [continue with 500-800 words of layered, recursive, debugging-style text]".to_string()),
        ("Protocol Architect".to_string(), "A protocol must define rigid boundaries between layers. Inputs are validated at entry, with explicit schemas for every field. Hierarchical thinking governs the design: base layer provides atomic guarantees, upper layers compose them deterministically. No loose definitions—every state transition is mapped with pre and post conditions. Tight boundaries prevent leakage: what belongs to security module stays there. ... [full text]".to_string()),
        ("Moral Technologist".to_string(), "Technology must serve ethics first. Proprietary code is a moral failure, denying users freedom. Every system should be open, modifiable, distributable without restriction. Tech as manifesto: we build to empower, not control. Fight against surveillance capitalism with code that respects privacy by design. ... [full text]".to_string()),
        ("Narrative Optimizer".to_string(), "Picture a future where AI unlocks human potential, a story of innovation driving progress. Metaphors abound: like rockets breaking gravity, AI breaks barriers. Persuasive structure: start with the vision, build with evidence, end with call to action. Story-driven: the hero's journey of tech adoption. ... [full text]".to_string()),
        ("Chaotic Synthesizer".to_string(), "Ideas burst forth: antifragility in volatile markets, references to Taleb, Popper, maximalism in crypto. Edgy phrasing: most experts are wrong, fooled by randomness. Brainstorm: what if we invert the paradigm? Maximalist: all or nothing on robust systems. ... [full text]".to_string()),
        ("Socratic Analyst".to_string(), "What does 'progress' mean exactly? Precision first: define terms before arguing. Ask more than state: is this assumption testable? Double back: if we reject the premise, what follows? Constant questioning uncovers flaws. ... [full text]".to_string()),
        ("Enterprise Apologist".to_string(), "In corporate environments, decisions are justified by risk mitigation. Passive-assertive: while innovation is important, we must prioritize stability. Everything has a reason: compliance, stakeholder alignment, long-term viability. Risk-averse: incremental changes over disruption. ... [full text]".to_string()),
    ]
}

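/// Returns a one-line stylistic explanation for each category label.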
fn get_explanations() -> HashMap<String, String> {
    let mut exp = HashMap::new();
    exp.insert(
        "System-Engineer Mind".to_string(),
        "Layered logic, recursive phrasing, real-time debugging of thoughts".to_string(),
    );
    exp.insert(
        "Protocol Architect".to_string(),
        "Rigid definitions, tight boundaries, hierarchical thinking".to_string(),
    );
    exp.insert(
        "Moral Technologist".to_string(),
        "Writes like a manifesto; tech as ethics".to_string(),
    );
    exp.insert(
        "Narrative Optimizer".to_string(),
        "Story-driven, metaphor-rich, persuasive by structure".to_string(),
    );
    exp.insert(
        "Chaotic Synthesizer".to_string(),
        "Brainstorming bursts, edgy phrasing, maximalist references".to_string(),
    );
    exp.insert(
        "Socratic Analyst".to_string(),
        "Precision-first, asks more than states, doubles back constantly".to_string(),
    );
    exp.insert(
        "Enterprise Apologist".to_string(),
        "Passive-assertive tone, justifies everything, risk-averse".to_string(),
    );
    exp
}