// stylometry_analyzer/lib.rs

1use anyhow::Result;
2use regex::Regex;
3use reqwest::Client;
4use serde::{Deserialize, Serialize};
5use std::collections::hash_map::DefaultHasher;
6use std::collections::HashMap;
7use std::hash::{Hash, Hasher};
8use uuid::Uuid;
9
// Minimum user text size accepted for analysis.
// Goal is 750 KB, but current data is smaller — TODO revisit the threshold.
pub const MIN_USER_BYTES: usize = 70 * 1024;
// Base URL of the local yvdb vector-store HTTP service.
const YVDB_URL: &str = "http://127.0.0.1:8080";
// Name of the collection holding the pre-seeded stylometry reference vectors.
const COLLECTION: &str = "stylometry_refs";
// Embedding vector size; also the bucket count of the hashing embedder below.
const DIMENSION: u32 = 384;
// Distance metric name sent with upserts.
const METRIC: &str = "cosine";
// Chunk length in whitespace-separated words (approx. tokens).
const CHUNK_SIZE: usize = 512;
// Words of overlap between consecutive chunks (approx. tokens).
const OVERLAP: usize = 100;
17
/// Request body for the yvdb `/collections/{name}/upsert` endpoint.
#[derive(Serialize, Deserialize)]
struct UpsertRequest {
	/// Dimensionality of every vector in `records` (set from `DIMENSION`).
	dimension: u32,
	/// Distance metric name (set from `METRIC`, i.e. "cosine").
	metric: String,
	/// Vectors plus metadata to insert.
	records: Vec<Record>,
}
24
/// One stored vector together with its string metadata.
#[derive(Serialize, Deserialize)]
struct Record {
	/// Unique record id (a UUID v4 string — see `pre_seed_categories`).
	id: String,
	/// Embedding vector; produced with length `DIMENSION` by `embed_chunks`.
	vector: Vec<f32>,
	/// Free-form string metadata; this crate stores a "category" key.
	metadata: HashMap<String, String>,
}
31
/// Request body for the yvdb `/collections/{name}/query` endpoint.
#[derive(Serialize, Deserialize)]
struct QueryRequest {
	/// Query embedding to search with.
	vector: Vec<f32>,
	/// Number of nearest neighbors to return.
	k: u32,
	/// Whether the server should include distances in the response.
	return_distance: bool,
}
38
/// Response body of the query endpoint: the matched records.
#[derive(Serialize, Deserialize)]
struct QueryResponse {
	results: Vec<ResultItem>,
}
43
/// A single nearest-neighbor match returned by the query endpoint.
#[derive(Serialize, Deserialize)]
struct ResultItem {
	/// Id of the matched record.
	id: String,
	/// Similarity score; `analyze` sums these per category and treats
	/// larger as stronger (collection metric is cosine).
	score: f32,
	/// Metadata stored with the record; `analyze` reads the "category" key.
	metadata: HashMap<String, String>,
}
50
/// Final analysis output, serializable for callers.
#[derive(Serialize)]
pub struct StylometryResult {
	/// Category with the highest share of the total score.
	pub dominant: String,
	/// Per-category share of the total score. NOTE: despite the name,
	/// values are fractions in 0..=1, not multiplied by 100 (see `analyze`).
	pub percentages: HashMap<String, f32>,
	/// Human-readable description for each category present.
	pub explanations: HashMap<String, String>,
}
57
58pub async fn is_seeded(client: &Client) -> Result<bool> {
59	let res = client
60		.get(format!("{}/collections/{}/stats", YVDB_URL, COLLECTION))
61		.send()
62		.await?;
63	if res.status().is_success() {
64		let stats: serde_json::Value = res.json().await?;
65		Ok(stats["count"].as_u64().unwrap_or(0) > 0)
66	} else {
67		Ok(false)
68	}
69}
70
71pub async fn pre_seed_categories(client: &Client) -> Result<()> {
72	let prototypes = get_prototypes();
73	let mut records = Vec::new();
74
75	let total_chunks: usize = prototypes
76		.iter()
77		.map(|(_, text)| chunk_text(text).len())
78		.sum();
79	let mut embedded_chunks = 0usize;
80
81	for (label, text) in prototypes {
82		let chunks = chunk_text(&text);
83		let embeddings = embed_chunks(&chunks);
84		for emb in embeddings {
85			let mut metadata = HashMap::new();
86			metadata.insert("category".to_string(), label.clone());
87			records.push(Record {
88				id: Uuid::new_v4().to_string(),
89				vector: emb,
90				metadata,
91			});
92			embedded_chunks += 1;
93			if embedded_chunks % 50 == 0 || embedded_chunks == total_chunks {
94				println!(
95					"  pre-seed embedding progress: {}/{} chunks",
96					embedded_chunks, total_chunks
97				);
98			}
99		}
100	}
101
102	println!(
103		"  pre-seed embedding complete: {} chunks -> {} records",
104		embedded_chunks,
105		records.len()
106	);
107
108	let upsert = UpsertRequest {
109		dimension: DIMENSION,
110		metric: METRIC.to_string(),
111		records,
112	};
113
114	println!(
115		"  pre-seed upsert starting: {} records",
116		upsert.records.len()
117	);
118	let res = client
119		.post(format!("{}/collections/{}/upsert", YVDB_URL, COLLECTION))
120		.json(&upsert)
121		.send()
122		.await?;
123	res.error_for_status()?;
124	println!("  pre-seed upsert done");
125
126	Ok(())
127}
128
129pub async fn analyze(client: &Client, text: &str) -> Result<StylometryResult> {
130	let chunks = chunk_text(text);
131	let embeddings = embed_chunks(&chunks);
132
133	let mut category_scores = HashMap::new();
134
135	for emb in embeddings {
136		let query = QueryRequest {
137			vector: emb,
138			k: 10,
139			return_distance: false,
140		};
141
142		let res = client
143			.post(format!("{}/collections/{}/query", YVDB_URL, COLLECTION))
144			.json(&query)
145			.send()
146			.await?;
147		let response: QueryResponse = res.json().await?;
148
149		for item in response.results {
150			if let Some(category) = item.metadata.get("category") {
151				*category_scores.entry(category.clone()).or_insert(0.0) += item.score;
152			}
153		}
154	}
155
156	let total = category_scores.values().sum::<f32>();
157	let mut scored: Vec<(String, f32)> = category_scores
158		.into_iter()
159		.map(|(cat, score)| (cat, score / total))
160		.collect();
161	scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
162
163	let mut percentages = HashMap::new();
164	let mut explanations_ordered = HashMap::new();
165	let mut dominant = String::new();
166	let exps = get_explanations();
167	for (idx, (cat, perc)) in scored.into_iter().enumerate() {
168		if idx == 0 {
169			dominant = cat.clone();
170		}
171		percentages.insert(cat.clone(), perc);
172		if let Some(exp) = exps.get(&cat) {
173			explanations_ordered.insert(cat, exp.clone());
174		}
175	}
176
177	Ok(StylometryResult {
178		dominant,
179		percentages,
180		explanations: explanations_ordered,
181	})
182}
183
184pub fn extract_user_content(text: &str) -> String {
185	let re = Regex::new(r"(?is)You said:(.*?)ChatGPT said:").unwrap();
186	let mut extracted = String::new();
187	for cap in re.captures_iter(text) {
188		extracted.push_str(&cap[1].trim());
189		extracted.push('\n');
190	}
191	if extracted.is_empty() {
192		text.to_string()
193	} else {
194		extracted
195	}
196}
197
198fn chunk_text(text: &str) -> Vec<String> {
199	let words: Vec<&str> = text.split_whitespace().collect();
200	let mut chunks = Vec::new();
201	let mut i = 0;
202	while i < words.len() {
203		let end = (i + CHUNK_SIZE).min(words.len());
204		chunks.push(words[i..end].join(" "));
205		if end == words.len() {
206			break;
207		}
208		i = end.saturating_sub(OVERLAP);
209	}
210	chunks
211}
212
213fn embed_chunks(chunks: &[String]) -> Vec<Vec<f32>> {
214	chunks
215		.iter()
216		.map(|chunk| {
217			let mut vector = vec![0.0f32; DIMENSION as usize];
218			for token in chunk.split_whitespace() {
219				let mut hasher = DefaultHasher::new();
220				token.hash(&mut hasher);
221				let bucket = (hasher.finish() % DIMENSION as u64) as usize;
222				vector[bucket] += 1.0;
223			}
224			let norm = vector.iter().map(|v| v * v).sum::<f32>().sqrt();
225			if norm > 0.0 {
226				for v in &mut vector {
227					*v /= norm;
228				}
229			}
230			vector
231		})
232		.collect()
233}
234
/// Reference texts for each stylometric category, used by
/// `pre_seed_categories` to seed the vector collection.
///
/// NOTE(review): several prototype strings end in "... [full text]" or
/// "... [continue with ...]" placeholders — they appear to be truncated
/// stand-ins for longer reference corpora; confirm and replace with the
/// real texts before relying on classification quality.
fn get_prototypes() -> Vec<(String, String)> {
	vec![
        ("System-Engineer Mind".to_string(), "When troubleshooting a system crash, I begin by isolating the fault domain. Is it hardware, software, or network? I recurse into the logs, checking timestamps for anomalies. If the CPU spike occurs before the memory leak, that suggests a trigger in the computation layer. I debug my own hypothesis: what if the assumption about thread safety is wrong? Layer by layer, I test with mocks, verifying each component in isolation. Thoughts loop: re-run the test, adjust the variable, observe the output change. This recursive approach ensures I catch the root cause, not just symptoms. In code, I use binary search on the commit history to pinpoint the breaking change. Real-time adjustment: if the error shifts, I update my mental model accordingly. This method, honed through years of system building, turns chaos into structured understanding. For example, in a distributed database, I traced a replication lag to a misconfigured clock skew by recursively checking sync points across nodes. Each step builds on the last, debugging not just the system but my thinking process. ... [continue with 500-800 words of layered, recursive, debugging-style text]".to_string()),
        ("Protocol Architect".to_string(), "A protocol must define rigid boundaries between layers. Inputs are validated at entry, with explicit schemas for every field. Hierarchical thinking governs the design: base layer provides atomic guarantees, upper layers compose them deterministically. No loose definitions—every state transition is mapped with pre and post conditions. Tight boundaries prevent leakage: what belongs to security module stays there. ... [full text]".to_string()),
        ("Moral Technologist".to_string(), "Technology must serve ethics first. Proprietary code is a moral failure, denying users freedom. Every system should be open, modifiable, distributable without restriction. Tech as manifesto: we build to empower, not control. Fight against surveillance capitalism with code that respects privacy by design. ... [full text]".to_string()),
        ("Narrative Optimizer".to_string(), "Picture a future where AI unlocks human potential, a story of innovation driving progress. Metaphors abound: like rockets breaking gravity, AI breaks barriers. Persuasive structure: start with the vision, build with evidence, end with call to action. Story-driven: the hero's journey of tech adoption. ... [full text]".to_string()),
        ("Chaotic Synthesizer".to_string(), "Ideas burst forth: antifragility in volatile markets, references to Taleb, Popper, maximalism in crypto. Edgy phrasing: most experts are wrong, fooled by randomness. Brainstorm: what if we invert the paradigm? Maximalist: all or nothing on robust systems. ... [full text]".to_string()),
        ("Socratic Analyst".to_string(), "What does 'progress' mean exactly? Precision first: define terms before arguing. Ask more than state: is this assumption testable? Double back: if we reject the premise, what follows? Constant questioning uncovers flaws. ... [full text]".to_string()),
        ("Enterprise Apologist".to_string(), "In corporate environments, decisions are justified by risk mitigation. Passive-assertive: while innovation is important, we must prioritize stability. Everything has a reason: compliance, stakeholder alignment, long-term viability. Risk-averse: incremental changes over disruption. ... [full text]".to_string()),
    ]
}
246
/// One-line, human-readable description of each stylometric category,
/// keyed by the same labels used in `get_prototypes`.
fn get_explanations() -> HashMap<String, String> {
	let entries: [(&str, &str); 7] = [
		(
			"System-Engineer Mind",
			"Layered logic, recursive phrasing, real-time debugging of thoughts",
		),
		(
			"Protocol Architect",
			"Rigid definitions, tight boundaries, hierarchical thinking",
		),
		(
			"Moral Technologist",
			"Writes like a manifesto; tech as ethics",
		),
		(
			"Narrative Optimizer",
			"Story-driven, metaphor-rich, persuasive by structure",
		),
		(
			"Chaotic Synthesizer",
			"Brainstorming bursts, edgy phrasing, maximalist references",
		),
		(
			"Socratic Analyst",
			"Precision-first, asks more than states, doubles back constantly",
		),
		(
			"Enterprise Apologist",
			"Passive-assertive tone, justifies everything, risk-averse",
		),
	];
	entries
		.into_iter()
		.map(|(label, desc)| (label.to_string(), desc.to_string()))
		.collect()
}