1pub async fn sample_data(
10 file_path: &str,
11 n: usize,
12 fraction: f64,
13 seed: u64,
14 split: f64,
15 output: &str,
16) -> Result<String, String> {
17 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
18 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
19
20 let script = format!(
21 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, random
22
23_path = bytes.fromhex("{hex_path}").decode().strip()
24_outdir = bytes.fromhex("{hex_output}").decode().strip()
25_n = {n}
26_frac = {fraction}
27_seed = {seed}
28_split = {split}
29
30random.seed(_seed)
31
32def _load(path):
33 ext = os.path.splitext(path)[1].lower().lstrip('.')
34 if ext in ('csv','tsv'):
35 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
36 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
37 return list(r), ext
38 elif ext == 'json':
39 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
40 rows = d if isinstance(d, list) else next(iter(d.values()), [])
41 return rows, 'json'
42 elif ext in ('db','sqlite','sqlite3'):
43 con = _sq.connect(path)
44 cur = con.cursor()
45 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
46 t = cur.fetchone()
47 if not t: return [], 'csv'
48 cur.execute("SELECT * FROM [%s]" % t[0])
49 cols2 = [d[0] for d in cur.description]
50 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
51 con.close()
52 return rows2, 'csv'
53 print("Unsupported format: "+ext, file=sys.stderr); sys.exit(1)
54
55rows, ext = _load(_path)
56total = len(rows)
57if total == 0:
58 print("No rows found."); sys.exit(0)
59
60if _frac > 0 and _frac <= 1:
61 k = max(1, int(total * _frac))
62elif _n > 0:
63 k = min(_n, total)
64else:
65 k = min(100, total)
66
67sample = random.sample(rows, k)
68
69def _write_csv(data, path):
70 if not data: return
71 fieldnames = list(data[0].keys())
72 with open(path, 'w', newline='', encoding='utf-8') as fh:
73 w = _csv.DictWriter(fh, fieldnames=fieldnames)
74 w.writeheader(); w.writerows(data)
75
76if _split > 0 and _split < 1 and _outdir:
77 split_n = int(k * _split)
78 train = sample[:split_n]
79 test = sample[split_n:]
80 base = os.path.splitext(os.path.basename(_path))[0]
81 train_path = os.path.join(_outdir, base + '_train.csv')
82 test_path = os.path.join(_outdir, base + '_test.csv')
83 os.makedirs(_outdir, exist_ok=True)
84 _write_csv(train, train_path)
85 _write_csv(test, test_path)
86 print("Sampled %d rows (seed=%d) → %d%% split" % (k, _seed, int(_split*100)))
87 print("Train: %d rows → %s" % (len(train), train_path))
88 print("Test: %d rows → %s" % (len(test), test_path))
89elif _outdir:
90 base = os.path.splitext(os.path.basename(_path))[0]
91 out_path = os.path.join(_outdir, base + '_sample%d.csv' % k)
92 os.makedirs(_outdir, exist_ok=True)
93 _write_csv(sample, out_path)
94 print("Sampled %d / %d rows (seed=%d) → %s" % (k, total, _seed, out_path))
95else:
96 # Print sample to stdout as CSV
97 fieldnames = list(sample[0].keys())
98 print(','.join(fieldnames))
99 for row in sample:
100 print(','.join(str(row.get(f,'')) for f in fieldnames))
101 print()
102 print("# Sampled %d / %d rows (seed=%d)" % (k, total, _seed))
103 print("# Use --sample-output DIR to save to file, or --split 0.8 for train/test split")
104"####,
105 hex_path = hex_path,
106 hex_output = hex_output,
107 n = n,
108 fraction = fraction,
109 seed = seed,
110 split = split,
111 );
112
113 let sandbox_args = serde_json::json!({
114 "language": "python",
115 "code": script,
116 "timeout_seconds": 30
117 });
118 crate::tools::code_sandbox::execute(&sandbox_args).await
119}
120
121pub async fn correlation_matrix(file_path: &str, method: &str) -> Result<String, String> {
124 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
125 let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
126
127 let script = format!(
128 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
129
130_path = bytes.fromhex("{hex_path}").decode().strip()
131_method = bytes.fromhex("{hex_method}").decode().strip().lower() or "pearson"
132
133def _load(path):
134 ext = os.path.splitext(path)[1].lower().lstrip('.')
135 if ext in ('csv','tsv'):
136 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
137 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
138 return list(r)
139 elif ext == 'json':
140 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
141 return d if isinstance(d, list) else next(iter(d.values()), [])
142 elif ext in ('db','sqlite','sqlite3'):
143 con = _sq.connect(path)
144 cur = con.cursor()
145 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
146 t = cur.fetchone()
147 if not t: return []
148 cur.execute("SELECT * FROM [%s]" % t[0])
149 cols2 = [d[0] for d in cur.description]
150 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
151 con.close()
152 return rows2
153 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
154
155def _tf(v):
156 try: return float(str(v).replace(',','').strip())
157 except: return None
158
159rows = _load(_path)
160if not rows:
161 print("No rows found."); sys.exit(0)
162
163all_cols = list(rows[0].keys())
164# Keep columns that are at least 50% numeric
165num_cols = [c for c in all_cols
166 if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
167
168if len(num_cols) < 2:
169 print("Need at least 2 numeric columns. Found: %s" % ', '.join(num_cols or ['(none)']))
170 sys.exit(0)
171
172# Build column vectors (paired — both must be non-null for each row)
173def _col_vec(c): return [_tf(r.get(c,'')) for r in rows]
174
175vecs = {{c: _col_vec(c) for c in num_cols}}
176
177def _pearson(a, b):
178 pairs = [(x,y) for x,y in zip(a,b) if x is not None and y is not None]
179 n = len(pairs)
180 if n < 3: return float('nan')
181 mx = sum(x for x,_ in pairs)/n
182 my = sum(y for _,y in pairs)/n
183 num = sum((x-mx)*(y-my) for x,y in pairs)
184 dx = math.sqrt(sum((x-mx)**2 for x,_ in pairs))
185 dy = math.sqrt(sum((y-my)**2 for _,y in pairs))
186 return num/(dx*dy) if dx*dy else float('nan')
187
188def _spearman(a, b):
189 pairs = [(x,y) for x,y in zip(a,b) if x is not None and y is not None]
190 n = len(pairs)
191 if n < 3: return float('nan')
192 def _rank(vs):
193 sorted_vs = sorted(enumerate(vs), key=lambda x: x[1])
194 ranks = [0.0]*n
195 i = 0
196 while i < n:
197 j = i
198 while j < n-1 and sorted_vs[j+1][1] == sorted_vs[i][1]: j+=1
199 avg_rank = (i + j)/2 + 1
200 for k in range(i,j+1): ranks[sorted_vs[k][0]] = avg_rank
201 i = j+1
202 return ranks
203 ra = _rank([p[0] for p in pairs])
204 rb = _rank([p[1] for p in pairs])
205 return _pearson(ra, rb)
206
207corr_fn = _spearman if _method.startswith('sp') else _pearson
208
209nc = len(num_cols)
210matrix = [[corr_fn(vecs[a], vecs[b]) for b in num_cols] for a in num_cols]
211
212W = 64
213print("="*W)
214print(" Correlation Matrix (%s) — %s" % (_method.capitalize(), os.path.basename(_path)))
215print("-"*W)
216# Print header
217col_w = 8
218print("%*s" % (20, ""), end="")
219for c in num_cols:
220 print(" %*s" % (col_w, c[:col_w]), end="")
221print()
222print("-"*W)
223for i, ra in enumerate(num_cols):
224 print("%-20s" % ra[:20], end="")
225 for j in range(nc):
226 v = matrix[i][j]
227 if math.isnan(v): s = " nan "
228 else: s = " %7.4f" % v
229 # Highlight strong correlations
230 if i != j and not math.isnan(v) and abs(v) >= 0.7:
231 s = s + "*"
232 else:
233 s = s + " "
234 print(" %s" % s[:col_w+1], end="")
235 print()
236print("="*W)
237print(" * |r| >= 0.7 (strong correlation)")
238print()
239# Report top correlations
240pairs_flat = []
241for i in range(nc):
242 for j in range(i+1, nc):
243 v = matrix[i][j]
244 if not math.isnan(v):
245 pairs_flat.append((abs(v), v, num_cols[i], num_cols[j]))
246pairs_flat.sort(reverse=True)
247if pairs_flat:
248 print("Top correlations:")
249 for _abs, v, a, b in pairs_flat[:min(5, len(pairs_flat))]:
250 direction = "positive" if v > 0 else "negative"
251 strength = "strong" if abs(v)>=0.7 else "moderate" if abs(v)>=0.4 else "weak"
252 print(" %s %-15s × %-15s" % (("r=%+.4f"%v), a[:15], b[:15]))
253 print(" (%s %s)" % (strength, direction))
254"####,
255 hex_path = hex_path,
256 hex_method = hex_method,
257 );
258
259 let sandbox_args = serde_json::json!({
260 "language": "python",
261 "code": script,
262 "timeout_seconds": 30
263 });
264 crate::tools::code_sandbox::execute(&sandbox_args).await
265}
266
267pub async fn timeseries_analyze(
270 file_path: &str,
271 date_col: &str,
272 value_col: &str,
273 window: usize,
274) -> Result<String, String> {
275 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
276 let hex_date_col: String = date_col.bytes().map(|b| format!("{:02x}", b)).collect();
277 let hex_val_col: String = value_col.bytes().map(|b| format!("{:02x}", b)).collect();
278
279 let script = format!(
280 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math, re
281
282_path = bytes.fromhex("{hex_path}").decode().strip()
283_date_col = bytes.fromhex("{hex_date_col}").decode().strip()
284_val_col = bytes.fromhex("{hex_val_col}").decode().strip()
285_window = {window}
286if _window < 2: _window = 7
287
288def _load(path):
289 ext = os.path.splitext(path)[1].lower().lstrip('.')
290 if ext in ('csv','tsv'):
291 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
292 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
293 return list(r)
294 elif ext == 'json':
295 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
296 return d if isinstance(d, list) else next(iter(d.values()), [])
297 elif ext in ('db','sqlite','sqlite3'):
298 con = _sq.connect(path)
299 cur = con.cursor()
300 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
301 t = cur.fetchone()
302 if not t: return []
303 cur.execute("SELECT * FROM [%s]" % t[0])
304 cols2 = [d[0] for d in cur.description]
305 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
306 con.close()
307 return rows2
308 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
309
310def _tf(v):
311 try: return float(str(v).replace(',','').strip())
312 except: return None
313
314# Detect date columns if not specified
315def _looks_like_date(v):
316 return bool(re.match(r'\d{{4}}[-/]\d{{1,2}}[-/]\d{{1,2}}', str(v)))
317
318rows = _load(_path)
319if not rows:
320 print("No rows found."); sys.exit(0)
321
322all_cols = list(rows[0].keys())
323
324if not _date_col:
325 date_candidates = [c for c in all_cols if sum(1 for r in rows[:50] if _looks_like_date(r.get(c,''))) > 20]
326 _date_col = date_candidates[0] if date_candidates else all_cols[0]
327
328num_cols = [c for c in all_cols
329 if c != _date_col and sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
330
331if not _val_col and num_cols:
332 _val_col = num_cols[0]
333
334if not _val_col:
335 print("No numeric value column found. Use --ts-value COL to specify one."); sys.exit(0)
336
337# Extract and sort by date string (lexicographic — works for ISO dates)
338pairs = []
339for r in rows:
340 d = str(r.get(_date_col,'')).strip()
341 v = _tf(r.get(_val_col,''))
342 if d and v is not None:
343 pairs.append((d, v))
344pairs.sort(key=lambda p: p[0])
345
346if len(pairs) < 3:
347 print("Need at least 3 data points. Found: %d" % len(pairs)); sys.exit(0)
348
349dates = [p[0] for p in pairs]
350vals = [p[1] for p in pairs]
351n = len(vals)
352
353# Rolling mean
354def _roll(vs, w):
355 return [sum(vs[max(0,i-w+1):i+1])/len(vs[max(0,i-w+1):i+1]) for i in range(len(vs))]
356
357roll_mean = _roll(vals, _window)
358
359# Linear trend (least squares)
360xs = list(range(n))
361xm = sum(xs)/n; ym = sum(vals)/n
362b = sum((x-xm)*(y-ym) for x,y in zip(xs,vals)) / sum((x-xm)**2 for x in xs)
363a = ym - b*xm
364trend_line = [a + b*x for x in xs]
365
366# Peak/valley detection
367peaks = [i for i in range(1,n-1) if vals[i]>vals[i-1] and vals[i]>vals[i+1]]
368valleys = [i for i in range(1,n-1) if vals[i]<vals[i-1] and vals[i]<vals[i+1]]
369
370W = 64
371print("="*W)
372print(" Time-Series Analysis: %s" % os.path.basename(_path))
373print(" Date column: %s Value column: %s" % (_date_col, _val_col))
374print("-"*W)
375print(" Points: %d Range: %s → %s" % (n, dates[0][:16], dates[-1][:16]))
376print(" Min: %g (at %s)" % (min(vals), dates[vals.index(min(vals))][:16]))
377print(" Max: %g (at %s)" % (max(vals), dates[vals.index(max(vals))][:16]))
378print(" Mean: %.4f Std: %.4f" % (ym, math.sqrt(sum((v-ym)**2 for v in vals)/n)))
379print(" Trend: %.4f per step (%s)" % (b, "↑ upward" if b>0 else "↓ downward" if b<0 else "→ flat"))
380print(" Peaks: %d local maxima Valleys: %d local minima" % (len(peaks), len(valleys)))
381print("-"*W)
382print(" Rolling mean (window=%d):" % _window)
383# Compact sparkline using ASCII
384W2 = 50
385rng = max(vals) - min(vals) if max(vals) != min(vals) else 1
386bar_chars = " â–▂▃▄▅▆▇█"
387spark = ''.join(bar_chars[min(8,int((v-min(vals))/rng*8))] for v in vals)
388# Wrap
389for i in range(0, len(spark), W2):
390 chunk = spark[i:i+W2]
391 print(" [%s] %s–%s" % (chunk, dates[i][:10], dates[min(i+W2-1,n-1)][:10]))
392print("-"*W)
393# Last few rolling values
394print(" Recent rolling mean (%d-period):" % _window)
395for i in range(max(0,n-5), n):
396 flag = " ↠latest" if i==n-1 else ""
397 print(" %-16s value=%g roll_mean=%.4f%s" % (dates[i][:16], vals[i], roll_mean[i], flag))
398print("="*W)
399"####,
400 hex_path = hex_path,
401 hex_date_col = hex_date_col,
402 hex_val_col = hex_val_col,
403 window = window,
404 );
405
406 let sandbox_args = serde_json::json!({
407 "language": "python",
408 "code": script,
409 "timeout_seconds": 30
410 });
411 crate::tools::code_sandbox::execute(&sandbox_args).await
412}
413
414pub async fn percentile_report(file_path: &str, col: &str) -> Result<String, String> {
419 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
420 let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
421
422 let script = format!(
423 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
424
425_path = bytes.fromhex("{hex_path}").decode().strip()
426_col = bytes.fromhex("{hex_col}").decode().strip()
427
428def _load(path):
429 ext = os.path.splitext(path)[1].lower().lstrip('.')
430 if ext in ('csv','tsv'):
431 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
432 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
433 return list(r)
434 elif ext == 'json':
435 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
436 return d if isinstance(d, list) else next(iter(d.values()), [])
437 elif ext in ('db','sqlite','sqlite3'):
438 con = _sq.connect(path)
439 cur = con.cursor()
440 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
441 t = cur.fetchone()
442 if not t: return []
443 cur.execute("SELECT * FROM [%s]" % t[0])
444 cols2 = [d[0] for d in cur.description]
445 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
446 con.close()
447 return rows2
448 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
449
450def _tf(v):
451 try: return float(str(v).replace(',','').strip())
452 except: return None
453
454def _percentile(data, p):
455 if not data: return float('nan')
456 s = sorted(data)
457 n = len(s)
458 idx = (p/100.0) * (n-1)
459 lo = int(idx); hi = lo + 1
460 frac = idx - lo
461 if hi >= n: return s[-1]
462 return s[lo] + frac*(s[hi]-s[lo])
463
464rows = _load(_path)
465if not rows:
466 print("No rows found."); sys.exit(0)
467
468all_cols = list(rows[0].keys())
469if _col:
470 target_cols = [c for c in all_cols if c.lower() == _col.lower()]
471 if not target_cols:
472 print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols)))
473 sys.exit(1)
474else:
475 target_cols = [c for c in all_cols
476 if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
477 if not target_cols:
478 print("No numeric columns found."); sys.exit(0)
479
480W = 72
481print("="*W)
482print(" Percentile Report — %s (%d rows)" % (os.path.basename(_path), len(rows)))
483print("-"*W)
484hdr = "%-20s %8s %8s %8s %8s %8s %8s %8s" % ("Column", "P25", "P50", "P75", "P90", "P99", "Min", "Max")
485print(hdr)
486print("-"*W)
487for c in target_cols:
488 vals = [_tf(r.get(c,'')) for r in rows]
489 vals = [v for v in vals if v is not None]
490 if not vals: continue
491 p25=_percentile(vals,25); p50=_percentile(vals,50); p75=_percentile(vals,75)
492 p90=_percentile(vals,90); p99=_percentile(vals,99)
493 mn=min(vals); mx=max(vals)
494 def _f(v): return "%8g" % v
495 print("%-20s %s %s %s %s %s %s %s" % (c[:20], _f(p25), _f(p50), _f(p75), _f(p90), _f(p99), _f(mn), _f(mx)))
496print("="*W)
497print()
498if len(target_cols) == 1:
499 c = target_cols[0]
500 vals = [_tf(r.get(c,'')) for r in rows if _tf(r.get(c,'')) is not None]
501 print("Detailed percentile table for '%s':" % c)
502 for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
503 v = _percentile(vals, p)
504 print(" P%-3d %g" % (p, v))
505 mean = sum(vals)/len(vals)
506 std = math.sqrt(sum((x-mean)**2 for x in vals)/len(vals))
507 iqr = _percentile(vals,75) - _percentile(vals,25)
508 print()
509 print(" Mean: %g Std: %g IQR: %g N: %d" % (mean, std, iqr, len(vals)))
510"####,
511 hex_path = hex_path,
512 hex_col = hex_col,
513 );
514
515 let sandbox_args = serde_json::json!({
516 "language": "python",
517 "code": script,
518 "timeout_seconds": 30
519 });
520 crate::tools::code_sandbox::execute(&sandbox_args).await
521}
522
523pub async fn pivot_table(
528 file_path: &str,
529 row_col: &str,
530 col_col: &str,
531 value_col: &str,
532 agg: &str,
533) -> Result<String, String> {
534 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
535 let hex_row_col: String = row_col.bytes().map(|b| format!("{:02x}", b)).collect();
536 let hex_col_col: String = col_col.bytes().map(|b| format!("{:02x}", b)).collect();
537 let hex_val_col: String = value_col.bytes().map(|b| format!("{:02x}", b)).collect();
538 let hex_agg: String = agg.bytes().map(|b| format!("{:02x}", b)).collect();
539
540 let script = format!(
541 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys
542
543_path = bytes.fromhex("{hex_path}").decode().strip()
544_row_col = bytes.fromhex("{hex_row_col}").decode().strip()
545_col_col = bytes.fromhex("{hex_col_col}").decode().strip()
546_val_col = bytes.fromhex("{hex_val_col}").decode().strip()
547_agg = bytes.fromhex("{hex_agg}").decode().strip().lower() or "count"
548
549def _load(path):
550 ext = os.path.splitext(path)[1].lower().lstrip('.')
551 if ext in ('csv','tsv'):
552 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
553 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
554 return list(r)
555 elif ext == 'json':
556 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
557 return d if isinstance(d, list) else next(iter(d.values()), [])
558 elif ext in ('db','sqlite','sqlite3'):
559 con = _sq.connect(path)
560 cur = con.cursor()
561 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
562 t = cur.fetchone()
563 if not t: return []
564 cur.execute("SELECT * FROM [%s]" % t[0])
565 cols2 = [d[0] for d in cur.description]
566 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
567 con.close()
568 return rows2
569 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
570
571def _tf(v):
572 try: return float(str(v).replace(',','').strip())
573 except: return None
574
575rows = _load(_path)
576if not rows:
577 print("No rows found."); sys.exit(0)
578
579all_cols = list(rows[0].keys())
580
581if not _row_col:
582 cat_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is None) > len(rows)*0.3]
583 _row_col = cat_cols[0] if cat_cols else all_cols[0]
584if not _col_col:
585 cat_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is None) > len(rows)*0.3]
586 _col_col = cat_cols[1] if len(cat_cols) > 1 else (all_cols[1] if len(all_cols) > 1 else _row_col)
587if not _val_col and _agg != 'count':
588 num_cols = [c for c in all_cols if c not in (_row_col, _col_col) and
589 sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
590 _val_col = num_cols[0] if num_cols else ''
591
592data = {{}}
593for r in rows:
594 rk = str(r.get(_row_col, '')).strip()
595 ck = str(r.get(_col_col, '')).strip()
596 v = _tf(r.get(_val_col, '')) if _val_col else 1.0
597 if rk not in data: data[rk] = {{}}
598 if ck not in data[rk]: data[rk][ck] = []
599 if v is not None: data[rk][ck].append(v)
600
601row_keys = sorted(data.keys())
602col_keys = sorted({{ck for rv in data.values() for ck in rv}})
603
604def _agg_fn(vals):
605 if not vals: return ''
606 if _agg == 'count': return str(len(vals))
607 if _agg == 'sum': return "%.4g" % sum(vals)
608 if _agg == 'mean': return "%.4g" % (sum(vals)/len(vals))
609 if _agg == 'min': return "%.4g" % min(vals)
610 if _agg == 'max': return "%.4g" % max(vals)
611 return str(len(vals))
612
613CW = 10
614RW = 16
615print("Pivot: %s x %s (%s of %s) | rows=%d cols=%d" % (
616 _row_col, _col_col, _agg, _val_col or 'rows', len(row_keys), len(col_keys)))
617print()
618print("%-*s" % (RW, _row_col[:RW]), end="")
619for ck in col_keys: print(" %-*s" % (CW, ck[:CW]), end="")
620print()
621print("-" * (RW + len(col_keys)*(CW+2)))
622for rk in row_keys:
623 print("%-*s" % (RW, rk[:RW]), end="")
624 for ck in col_keys:
625 vals = data.get(rk, {{}}).get(ck, [])
626 cell = _agg_fn(vals) if vals else '-'
627 print(" %-*s" % (CW, cell[:CW]), end="")
628 print()
629"####,
630 hex_path = hex_path,
631 hex_row_col = hex_row_col,
632 hex_col_col = hex_col_col,
633 hex_val_col = hex_val_col,
634 hex_agg = hex_agg,
635 );
636
637 let sandbox_args = serde_json::json!({
638 "language": "python",
639 "code": script,
640 "timeout_seconds": 30
641 });
642 crate::tools::code_sandbox::execute(&sandbox_args).await
643}
644
645pub async fn linear_regression(
651 file_path: &str,
652 predictors: &[&str],
653 target: &str,
654) -> Result<String, String> {
655 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
656 let hex_target: String = target.bytes().map(|b| format!("{:02x}", b)).collect();
657 let preds_joined = predictors.join("\n");
658 let hex_preds: String = preds_joined.bytes().map(|b| format!("{:02x}", b)).collect();
659
660 let script = format!(
661 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
662
663_path = bytes.fromhex("{hex_path}").decode().strip()
664_target = bytes.fromhex("{hex_target}").decode().strip()
665_preds_raw = bytes.fromhex("{hex_preds}").decode().strip()
666_preds = [p.strip() for p in _preds_raw.split('\n') if p.strip()] if _preds_raw else []
667
668def _load(path):
669 ext = os.path.splitext(path)[1].lower().lstrip('.')
670 if ext in ('csv','tsv'):
671 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
672 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
673 return list(r)
674 elif ext == 'json':
675 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
676 return d if isinstance(d, list) else next(iter(d.values()), [])
677 elif ext in ('db','sqlite','sqlite3'):
678 con = _sq.connect(path)
679 cur = con.cursor()
680 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
681 t = cur.fetchone()
682 if not t: return []
683 cur.execute("SELECT * FROM [%s]" % t[0])
684 cols2 = [d[0] for d in cur.description]
685 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
686 con.close()
687 return rows2
688 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
689
690def _tf(v):
691 try: return float(str(v).replace(',','').strip())
692 except: return None
693
694rows = _load(_path)
695if not rows:
696 print("No rows found."); sys.exit(0)
697
698all_cols = list(rows[0].keys())
699num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
700
701if not _target:
702 _target = num_cols[-1] if num_cols else ''
703if not _preds:
704 _preds = [c for c in num_cols if c != _target]
705
706if not _target:
707 print("No target column. Use --regression-target COL"); sys.exit(1)
708if not _preds:
709 print("No predictor columns. Use --regression-predictors COL1,COL2,..."); sys.exit(1)
710
711valid = []
712for r in rows:
713 y = _tf(r.get(_target,''))
714 xs = [_tf(r.get(pp,'')) for pp in _preds]
715 if y is not None and all(x is not None for x in xs):
716 valid.append((xs, y))
717
718n = len(valid)
719pp = len(_preds)
720if n < pp + 2:
721 print("Not enough valid rows (%d) for %d predictors." % (n, pp)); sys.exit(1)
722
723X = [[1.0] + row[0] for row in valid]
724y = [row[1] for row in valid]
725
726def _mat_mul_sq(A, B):
727 ra,ca = len(A),len(A[0]); cb = len(B[0])
728 return [[sum(A[i][k]*B[k][j] for k in range(ca)) for j in range(cb)] for i in range(ra)]
729
730def _mat_T(A):
731 return [[A[i][j] for i in range(len(A))] for j in range(len(A[0]))]
732
733def _lu_solve(A, b):
734 n2 = len(A)
735 M = [row[:] + [b[i]] for i,row in enumerate(A)]
736 for col in range(n2):
737 pivot = max(range(col,n2), key=lambda r2: abs(M[r2][col]))
738 M[col],M[pivot] = M[pivot],M[col]
739 if abs(M[col][col]) < 1e-12: return None
740 for row in range(col+1,n2):
741 f = M[row][col]/M[col][col]
742 for j in range(col,n2+1): M[row][j] -= f*M[col][j]
743 x2 = [0.0]*n2
744 for i in range(n2-1,-1,-1):
745 x2[i] = (M[i][n2] - sum(M[i][j]*x2[j] for j in range(i+1,n2))) / M[i][i]
746 return x2
747
748Xt = _mat_T(X)
749XtX_sq = _mat_mul_sq(Xt, X)
750Xty = [sum(Xt[i][k]*y[k] for k in range(n)) for i in range(pp+1)]
751beta = _lu_solve(XtX_sq, Xty)
752if beta is None:
753 print("Matrix is singular — check for collinear predictors."); sys.exit(1)
754
755preds_vals = [sum(beta[j]*X[i][j] for j in range(pp+1)) for i in range(n)]
756residuals = [y[i]-preds_vals[i] for i in range(n)]
757ss_res = sum(r**2 for r in residuals)
758ym = sum(y)/n
759ss_tot = sum((v-ym)**2 for v in y)
760r2 = 1 - ss_res/ss_tot if ss_tot else 0
761rmse = math.sqrt(ss_res/n)
762adj_r2 = 1 - (1-r2)*(n-1)/(n-pp-1) if n > pp+1 else float('nan')
763
764W = 64
765print("="*W)
766print(" Linear Regression — %s" % os.path.basename(_path))
767print(" Target: %-20s N=%d Predictors=%d" % (_target, n, pp))
768print("-"*W)
769print(" Coefficients:")
770print(" %-20s %12.6f" % ("(intercept)", beta[0]))
771for i2,c2 in enumerate(_preds):
772 print(" %-20s %12.6f" % (c2[:20], beta[i2+1]))
773print("-"*W)
774print(" R² = %.6f" % r2)
775print(" Adj. R² = %.6f" % adj_r2)
776print(" RMSE = %.6f" % rmse)
777print(" Residuals min=%.4g max=%.4g mean=%.4g" % (min(residuals), max(residuals), sum(residuals)/n))
778print("-"*W)
779terms = ["%.4g" % beta[0]]
780for i2,c2 in enumerate(_preds):
781 sign = "+" if beta[i2+1] >= 0 else "-"
782 terms.append("%s %.4g*%s" % (sign, abs(beta[i2+1]), c2))
783print(" Equation: %s = %s" % (_target, " ".join(terms)))
784print("="*W)
785print()
786print(" First 10 predictions vs actual:")
787print(" %-10s %-10s %-10s" % ("Actual", "Predicted", "Residual"))
788for i3 in range(min(10,n)):
789 print(" %-10.4g %-10.4g %-10.4g" % (y[i3], preds_vals[i3], residuals[i3]))
790"####,
791 hex_path = hex_path,
792 hex_target = hex_target,
793 hex_preds = hex_preds,
794 );
795
796 let sandbox_args = serde_json::json!({
797 "language": "python",
798 "code": script,
799 "timeout_seconds": 30
800 });
801 crate::tools::code_sandbox::execute(&sandbox_args).await
802}
803
804pub async fn detect_outliers(file_path: &str, col: &str, output: &str) -> Result<String, String> {
809 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
810 let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
811 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
812
813 let script = format!(
814 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
815
816_path = bytes.fromhex("{hex_path}").decode().strip()
817_col = bytes.fromhex("{hex_col}").decode().strip()
818_output = bytes.fromhex("{hex_output}").decode().strip()
819
820def _load(path):
821 ext = os.path.splitext(path)[1].lower().lstrip('.')
822 if ext in ('csv','tsv'):
823 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
824 rd = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
825 data = list(rd)
826 fns = list(rd.fieldnames or [])
827 return data, fns
828 elif ext == 'json':
829 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
830 rows2 = d if isinstance(d, list) else next(iter(d.values()), [])
831 fns2 = list(rows2[0].keys()) if rows2 else []
832 return rows2, fns2
833 elif ext in ('db','sqlite','sqlite3'):
834 con = _sq.connect(path)
835 cur = con.cursor()
836 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
837 t = cur.fetchone()
838 if not t: return [], []
839 cur.execute("SELECT * FROM [%s]" % t[0])
840 cols2 = [d2[0] for d2 in cur.description]
841 rows3 = [dict(zip(cols2, r)) for r in cur.fetchall()]
842 con.close()
843 return rows3, cols2
844 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
845
846def _tf(v):
847 try: return float(str(v).replace(',','').strip())
848 except: return None
849
850def _pct(data, p):
851 s = sorted(data); n = len(s)
852 idx = (p/100.0)*(n-1); lo = int(idx); hi = lo+1; frac = idx-lo
853 return s[-1] if hi >= n else s[lo]+frac*(s[hi]-s[lo])
854
855rows, fieldnames = _load(_path)
856if not rows:
857 print("No rows found."); sys.exit(0)
858
859all_cols = list(rows[0].keys())
860if _col:
861 target_cols = [c for c in all_cols if c.lower() == _col.lower()]
862 if not target_cols:
863 print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols))); sys.exit(1)
864else:
865 target_cols = [c for c in all_cols
866 if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
867
868W = 68
869print("="*W)
870print(" Outlier Detection — %s (%d rows)" % (os.path.basename(_path), len(rows)))
871print("-"*W)
872
873outlier_row_indices = set()
874for c in target_cols:
875 valid = [(i, _tf(r.get(c,''))) for i,r in enumerate(rows)]
876 valid = [(i,v) for i,v in valid if v is not None]
877 if len(valid) < 4: continue
878 vs = [v for _,v in valid]
879 mean = sum(vs)/len(vs)
880 std = math.sqrt(sum((x-mean)**2 for x in vs)/len(vs))
881 q1 = _pct(vs,25); q3 = _pct(vs,75); iqr = q3-q1
882 lo_fence = q1 - 1.5*iqr; hi_fence = q3 + 1.5*iqr
883 iqr_out = [(i,v) for i,v in valid if v < lo_fence or v > hi_fence]
884 z_out = [(i,v) for i,v in valid if std > 0 and abs((v-mean)/std) > 3]
885 print()
886 print(" Column: %s (n=%d mean=%.4g std=%.4g)" % (c, len(vs), mean, std))
887 print(" IQR fence: [%.4g, %.4g] IQR outliers: %d" % (lo_fence, hi_fence, len(iqr_out)))
888 print(" Z-score |z|>3: Z outliers: %d" % len(z_out))
889 if iqr_out:
890 print(" IQR outliers (row, value):")
891 for i,v in iqr_out[:10]:
892 z = (v-mean)/std if std > 0 else float('nan')
893 print(" row %-5d value=%-12g z=%.3f" % (i+1, v, z))
894 outlier_row_indices.add(i)
895 if len(iqr_out) > 10:
896 print(" ... and %d more" % (len(iqr_out)-10))
897 else:
898 print(" No IQR outliers found.")
899
900print()
901print("="*W)
902print(" Total outlier rows (IQR): %d / %d (%.1f%%)" % (
903 len(outlier_row_indices), len(rows), 100*len(outlier_row_indices)/max(1,len(rows))))
904
905if _output and outlier_row_indices:
906 clean = [r for i,r in enumerate(rows) if i not in outlier_row_indices]
907 fns2 = fieldnames if fieldnames else (list(clean[0].keys()) if clean else [])
908 with open(_output, 'w', newline='', encoding='utf-8') as fh:
909 w = _csv.DictWriter(fh, fieldnames=fns2)
910 w.writeheader(); w.writerows(clean)
911 print(" Clean data (%d rows) saved to: %s" % (len(clean), _output))
912elif _output:
913 print(" No outliers to remove — output file not written.")
914"####,
915 hex_path = hex_path,
916 hex_col = hex_col,
917 hex_output = hex_output,
918 );
919
920 let sandbox_args = serde_json::json!({
921 "language": "python",
922 "code": script,
923 "timeout_seconds": 30
924 });
925 crate::tools::code_sandbox::execute(&sandbox_args).await
926}
927
928pub async fn plot_chart(
934 file_path: &str,
935 x_col: &str,
936 y_col: &str,
937 chart_type: &str,
938 title: &str,
939 output: &str,
940) -> Result<String, String> {
941 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
942 let hex_x: String = x_col.bytes().map(|b| format!("{:02x}", b)).collect();
943 let hex_y: String = y_col.bytes().map(|b| format!("{:02x}", b)).collect();
944 let hex_type: String = chart_type.bytes().map(|b| format!("{:02x}", b)).collect();
945 let hex_title: String = title.bytes().map(|b| format!("{:02x}", b)).collect();
946 let hex_out: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
947
948 let script = format!(
949 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
950
951_path = bytes.fromhex("{hex_path}").decode().strip()
952_xcol = bytes.fromhex("{hex_x}").decode().strip()
953_ycol = bytes.fromhex("{hex_y}").decode().strip()
954_ctype = bytes.fromhex("{hex_type}").decode().strip().lower() or "line"
955_title = bytes.fromhex("{hex_title}").decode().strip()
956_out = bytes.fromhex("{hex_out}").decode().strip()
957
958def _load(path):
959 ext = os.path.splitext(path)[1].lower().lstrip('.')
960 if ext in ('csv','tsv'):
961 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
962 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
963 return list(r)
964 elif ext == 'json':
965 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
966 return d if isinstance(d, list) else next(iter(d.values()), [])
967 elif ext in ('db','sqlite','sqlite3'):
968 con = _sq.connect(path)
969 cur = con.cursor()
970 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
971 t = cur.fetchone()
972 if not t: return []
973 cur.execute("SELECT * FROM [%s]" % t[0])
974 cols2 = [d2[0] for d2 in cur.description]
975 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
976 con.close()
977 return rows2
978 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
979
980def _tf(v):
981 try: return float(str(v).replace(',','').strip())
982 except: return None
983
984rows = _load(_path)
985if not rows:
986 print("No rows found."); sys.exit(0)
987
988all_cols = list(rows[0].keys())
989num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
990
991if not _xcol:
992 _xcol = all_cols[0]
993if not _ycol:
994 _ycol = num_cols[0] if num_cols else (all_cols[1] if len(all_cols)>1 else all_cols[0])
995
996if not _title:
997 _title = "%s — %s vs %s" % (os.path.basename(_path), _xcol, _ycol)
998
999if not _out:
1000 base = os.path.splitext(_path)[0]
1001 _out = base + "_plot.svg"
1002
1003# Extract data points
1004def _to_num_or_str(v): return _tf(v) if _tf(v) is not None else str(v).strip()
1005
1006raw_pairs = [(_to_num_or_str(r.get(_xcol,'')), _tf(r.get(_ycol,''))) for r in rows]
1007pairs = [(x,y) for x,y in raw_pairs if y is not None]
1008
1009if not pairs:
1010 print("No plottable data in columns '%s' vs '%s'." % (_xcol, _ycol)); sys.exit(0)
1011
1012# For bar/histogram: bucket string x values
1013xs_raw = [p[0] for p in pairs]
1014ys = [p[1] for p in pairs]
1015
1016# SVG dimensions
1017W = 800; H = 500; PAD = 70; TW = W-2*PAD; TH = H-2*PAD
1018
1019def _esc(s): return str(s).replace('&','&').replace('<','<').replace('>','>').replace('"','"')
1020
1021def _scale(vals, lo, hi, out_lo, out_hi):
1022 if hi == lo: return [out_lo + (out_hi-out_lo)/2 for _ in vals]
1023 return [out_lo + (v-lo)/(hi-lo)*(out_hi-out_lo) for v in vals]
1024
1025svg_parts = []
1026svg_parts.append('<?xml version="1.0" encoding="UTF-8"?>')
1027svg_parts.append('<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" style="background:#1e1e2e">' % (W, H))
1028svg_parts.append('<style>text{{font-family:monospace;fill:#cdd6f4}}line{{stroke:#45475a}}circle{{opacity:0.8}}</style>')
1029# Title
1030svg_parts.append('<text x="%d" y="28" font-size="15" text-anchor="middle" font-weight="bold">%s</text>' % (W//2, _esc(_title)))
1031# Axes
1032svg_parts.append('<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#89b4fa" stroke-width="1.5"/>' % (PAD, PAD, PAD, H-PAD))
1033svg_parts.append('<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#89b4fa" stroke-width="1.5"/>' % (PAD, H-PAD, W-PAD, H-PAD))
1034# Axis labels
1035svg_parts.append('<text x="%d" y="%d" font-size="12" text-anchor="middle">%s</text>' % (W//2, H-10, _esc(_xcol)))
1036svg_parts.append('<text x="15" y="%d" font-size="12" text-anchor="middle" transform="rotate(-90,15,%d)">%s</text>' % (H//2, H//2, _esc(_ycol)))
1037
1038if _ctype == 'bar' or (not all(isinstance(x, (int,float)) for x in xs_raw)):
1039 # Bar chart: group by string x
1040 from collections import OrderedDict
1041 groups = OrderedDict()
1042 for x,y in pairs:
1043 k = str(x)
1044 groups.setdefault(k, []).append(y)
1045 labels = list(groups.keys())[:30]
1046 bar_vals = [sum(groups[k])/len(groups[k]) for k in labels]
1047 bw = TW / max(len(labels),1) * 0.8
1048 x_positions = [PAD + (i+0.5) * TW / max(len(labels),1) for i in range(len(labels))]
1049 ymin = min(0, min(bar_vals)); ymax = max(bar_vals) if bar_vals else 1
1050 if ymin == ymax: ymax = ymin + 1
1051 def _sy(v): return H-PAD - (v-ymin)/(ymax-ymin)*TH
1052 for i,(lbl,v) in enumerate(zip(labels,bar_vals)):
1053 x0 = x_positions[i] - bw/2
1054 y0 = _sy(max(v,0)); y1 = _sy(min(v,0))
1055 bar_h = abs(y0-y1)
1056 svg_parts.append('<rect x="%.1f" y="%.1f" width="%.1f" height="%.1f" fill="#89b4fa" rx="2"/>' % (x0, min(y0,y1), bw, max(bar_h,1)))
1057 if len(labels) <= 15:
1058 svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle" transform="rotate(-45,%.1f,%d)">%s</text>' % (x_positions[i], H-PAD+14, x_positions[i], H-PAD+14, _esc(lbl[:12])))
1059 # y-axis ticks
1060 for tick in [ymin, (ymin+ymax)/2, ymax]:
1061 sy = _sy(tick)
1062 svg_parts.append('<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#45475a"/>' % (PAD, sy, W-PAD, sy))
1063 svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%.3g</text>' % (PAD-4, sy+4, tick))
1064
1065elif _ctype == 'histogram':
1066 n_bins = min(30, max(5, int(math.sqrt(len(ys)))))
1067 ymin_h = min(ys); ymax_h = max(ys)
1068 if ymin_h == ymax_h: ymax_h = ymin_h + 1
1069 bin_w = (ymax_h-ymin_h)/n_bins
1070 counts = [0]*n_bins
1071 for v in ys:
1072 idx = min(int((v-ymin_h)/bin_w), n_bins-1)
1073 counts[idx] += 1
1074 bar_w = TW/n_bins
1075 cmax = max(counts) if counts else 1
1076 for i,c in enumerate(counts):
1077 x0 = PAD + i*bar_w
1078 bar_h2 = c/cmax * TH
1079 y0 = H-PAD-bar_h2
1080 svg_parts.append('<rect x="%.1f" y="%.1f" width="%.1f" height="%.1f" fill="#a6e3a1" rx="1"/>' % (x0, y0, bar_w-1, bar_h2))
1081 for i in range(5):
1082 tick_v = ymin_h + i*(ymax_h-ymin_h)/4
1083 sx = PAD + (tick_v-ymin_h)/(ymax_h-ymin_h)*TW
1084 svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle">%.3g</text>' % (sx, H-PAD+14, tick_v))
1085 for i in range(5):
1086 tick_c = i*cmax/4
1087 sy = H-PAD - tick_c/cmax*TH
1088 svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%d</text>' % (PAD-4, sy+4, int(tick_c)))
1089
1090else:
1091 # Line or scatter: numeric x required
1092 xs_num = [p[0] if isinstance(p[0],(int,float)) else i for i,p in enumerate(pairs)]
1093 xmin = min(xs_num); xmax = max(xs_num)
1094 ymin2 = min(ys); ymax2 = max(ys)
1095 if xmin == xmax: xmax = xmin+1
1096 if ymin2 == ymax2: ymax2 = ymin2+1
1097 def _sx2(v): return PAD + (v-xmin)/(xmax-xmin)*TW
1098 def _sy2(v): return H-PAD - (v-ymin2)/(ymax2-ymin2)*TH
1099 # Grid
1100 for i in range(5):
1101 gx = PAD + i*TW/4; gy = H-PAD - i*TH/4
1102 svg_parts.append('<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#313244" stroke-dasharray="4"/>' % (gx,PAD,gx,H-PAD))
1103 svg_parts.append('<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#313244" stroke-dasharray="4"/>' % (PAD,gy,W-PAD,gy))
1104 # x ticks
1105 for i in range(5):
1106 tv = xmin + i*(xmax-xmin)/4
1107 sx2 = _sx2(tv)
1108 svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle">%.3g</text>' % (sx2, H-PAD+14, tv))
1109 # y ticks
1110 for i in range(5):
1111 tv = ymin2 + i*(ymax2-ymin2)/4
1112 sy2 = _sy2(tv)
1113 svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%.3g</text>' % (PAD-4, sy2+4, tv))
1114 pts = list(zip(xs_num, ys))
1115 pts.sort(key=lambda p: p[0])
1116 sx_list = [_sx2(x) for x,_ in pts]
1117 sy_list = [_sy2(y) for _,y in pts]
1118 if _ctype != 'scatter' and len(pts) > 1:
1119 path_d = "M %.1f %.1f " % (sx_list[0], sy_list[0])
1120 path_d += " ".join("L %.1f %.1f" % (sx_list[i], sy_list[i]) for i in range(1,len(pts)))
1121 svg_parts.append('<path d="%s" fill="none" stroke="#89b4fa" stroke-width="2"/>' % path_d)
1122 for i in range(len(pts)):
1123 svg_parts.append('<circle cx="%.1f" cy="%.1f" r="3" fill="#cba6f7"/>' % (sx_list[i], sy_list[i]))
1124
1125svg_parts.append('</svg>')
1126svg_content = '\n'.join(svg_parts)
1127
1128with open(_out, 'w', encoding='utf-8') as fh:
1129 fh.write(svg_content)
1130
1131print("Chart saved: %s (%d data points type=%s)" % (_out, len(pairs), _ctype))
1132print("Open in any browser to view.")
1133"####,
1134 hex_path = hex_path,
1135 hex_x = hex_x,
1136 hex_y = hex_y,
1137 hex_type = hex_type,
1138 hex_title = hex_title,
1139 hex_out = hex_out,
1140 );
1141
1142 let sandbox_args = serde_json::json!({
1143 "language": "python",
1144 "code": script,
1145 "timeout_seconds": 30
1146 });
1147 crate::tools::code_sandbox::execute(&sandbox_args).await
1148}
1149
1150pub async fn fourier_analysis(
1154 file_path: &str,
1155 col: &str,
1156 top_n: usize,
1157 sample_rate: f64,
1158) -> Result<String, String> {
1159 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1160 let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
1161
1162 let script = format!(
1163 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
1164
1165_path = bytes.fromhex("{hex_path}").decode().strip()
1166_col = bytes.fromhex("{hex_col}").decode().strip()
1167_top_n = {top_n}
1168_sample_rate = {sample_rate}
1169
1170def _load(path):
1171 ext = os.path.splitext(path)[1].lower().lstrip('.')
1172 if ext in ('csv','tsv'):
1173 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1174 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1175 return list(r)
1176 elif ext == 'json':
1177 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1178 return d if isinstance(d, list) else next(iter(d.values()), [])
1179 elif ext in ('db','sqlite','sqlite3'):
1180 con = _sq.connect(path)
1181 cur = con.cursor()
1182 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1183 t = cur.fetchone()
1184 if not t: return []
1185 cur.execute("SELECT * FROM [%s]" % t[0])
1186 cols2 = [d2[0] for d2 in cur.description]
1187 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1188 con.close()
1189 return rows2
1190 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1191
1192def _tf(v):
1193 try: return float(str(v).replace(',','').strip())
1194 except: return None
1195
1196rows = _load(_path)
1197if not rows:
1198 print("No rows found."); sys.exit(0)
1199
1200all_cols = list(rows[0].keys())
1201if _col:
1202 target_col = next((c for c in all_cols if c.lower() == _col.lower()), None)
1203 if not target_col:
1204 print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols))); sys.exit(1)
1205else:
1206 num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1207 target_col = num_cols[0] if num_cols else None
1208 if not target_col:
1209 print("No numeric column found."); sys.exit(0)
1210
1211vals = [_tf(r.get(target_col,'')) for r in rows]
1212vals = [v for v in vals if v is not None]
1213n = len(vals)
1214if n < 4:
1215 print("Need at least 4 data points for DFT."); sys.exit(0)
1216
1217mean = sum(vals)/n
1218x = [v - mean for v in vals]
1219
1220if n > 512:
1221 x = x[:512]; n = 512
1222 print("Note: DFT computed on first 512 points (large dataset).")
1223
1224def dft(x2):
1225 N = len(x2)
1226 result = []
1227 for k in range(N//2 + 1):
1228 re = sum(x2[t]*math.cos(2*math.pi*k*t/N) for t in range(N))
1229 im = sum(x2[t]*math.sin(2*math.pi*k*t/N) for t in range(N))
1230 amp = math.sqrt(re**2 + im**2) / N
1231 phase = math.atan2(-im, re)
1232 result.append((k, amp, phase))
1233 return result
1234
1235spectrum = dft(x)
1236spectrum_sorted = sorted(spectrum[1:], key=lambda t: -t[1])
1237
1238sr = _sample_rate if _sample_rate > 0 else 1.0
1239top = spectrum_sorted[:min(_top_n, len(spectrum_sorted))]
1240
1241W = 64
1242print("="*W)
1243print(" Fourier / Frequency Analysis: %s" % os.path.basename(_path))
1244print(" Column: %-20s N=%d Sample rate: %g Hz" % (target_col, n, sr))
1245print("-"*W)
1246print(" DC component (mean offset): %.6f" % spectrum[0][1])
1247print()
1248print(" %-5s %-12s %-12s %-10s %-10s" % ("Rank", "Freq (Hz)", "Period", "Amplitude", "Phase (deg)"))
1249print(" " + "-"*58)
1250for i,(k,amp,phase) in enumerate(top):
1251 freq = k * sr / n
1252 period = (1.0/freq) if freq > 0 else float('inf')
1253 period_str = "%.4g" % period if period < 1e10 else "inf"
1254 print(" %-5d %-12.6g %-12s %-10.6f %-10.2f" % (
1255 i+1, freq, period_str, amp, math.degrees(phase)))
1256total_power = sum(t[1]**2 for t in spectrum[1:])
1257top_power = sum(t[1]**2 for t in top)
1258print()
1259print(" Top %d components contain %.1f%% of signal power." % (len(top), 100*top_power/max(total_power,1e-30)))
1260print("="*W)
1261"####,
1262 hex_path = hex_path,
1263 hex_col = hex_col,
1264 top_n = top_n,
1265 sample_rate = sample_rate,
1266 );
1267
1268 let sandbox_args = serde_json::json!({
1269 "language": "python",
1270 "code": script,
1271 "timeout_seconds": 60
1272 });
1273 crate::tools::code_sandbox::execute(&sandbox_args).await
1274}
1275
1276pub async fn cluster_kmeans(
1281 file_path: &str,
1282 k: usize,
1283 cols: &[&str],
1284 max_iter: usize,
1285 output: &str,
1286) -> Result<String, String> {
1287 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1288 let cols_joined = cols.join("\n");
1289 let hex_cols: String = cols_joined.bytes().map(|b| format!("{:02x}", b)).collect();
1290 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1291
1292 let script = format!(
1293 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math, random
1294
1295_path = bytes.fromhex("{hex_path}").decode().strip()
1296_cols_raw = bytes.fromhex("{hex_cols}").decode().strip()
1297_cols = [c.strip() for c in _cols_raw.split('\n') if c.strip()] if _cols_raw else []
1298_k = {k}
1299_max_iter = {max_iter}
1300_output = bytes.fromhex("{hex_output}").decode().strip()
1301
1302def _load(path):
1303 ext = os.path.splitext(path)[1].lower().lstrip('.')
1304 if ext in ('csv','tsv'):
1305 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1306 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1307 return list(r)
1308 elif ext == 'json':
1309 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1310 return d if isinstance(d, list) else next(iter(d.values()), [])
1311 elif ext in ('db','sqlite','sqlite3'):
1312 con = _sq.connect(path)
1313 cur = con.cursor()
1314 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1315 t = cur.fetchone()
1316 if not t: return []
1317 cur.execute("SELECT * FROM [%s]" % t[0])
1318 cols2 = [d2[0] for d2 in cur.description]
1319 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1320 con.close()
1321 return rows2
1322 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1323
1324def _tf(v):
1325 try: return float(str(v).replace(',','').strip())
1326 except: return None
1327
1328rows = _load(_path)
1329if not rows:
1330 print("No rows found."); sys.exit(0)
1331
1332all_cols = list(rows[0].keys())
1333if _cols:
1334 feature_cols = [c for c in all_cols if c in _cols]
1335else:
1336 feature_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1337
1338if len(feature_cols) < 1:
1339 print("No numeric feature columns found."); sys.exit(1)
1340
1341valid_rows = [r for r in rows if all(_tf(r.get(c,'')) is not None for c in feature_cols)]
1342if len(valid_rows) < _k:
1343 print("Fewer valid rows (%d) than clusters (%d)." % (len(valid_rows), _k)); sys.exit(1)
1344
1345X = [[_tf(r[c]) for c in feature_cols] for r in valid_rows]
1346n = len(X); d = len(feature_cols)
1347
1348def _dist(a, b): return math.sqrt(sum((ai-bi)**2 for ai,bi in zip(a,b)))
1349def _centroid(pts): return [sum(p[j] for p in pts)/len(pts) for j in range(d)] if pts else [0.0]*d
1350
1351# k-means++ init
1352random.seed(42)
1353centroids = [X[random.randint(0,n-1)]]
1354for _ in range(_k-1):
1355 dists = [min(_dist(x, c)**2 for c in centroids) for x in X]
1356 total = sum(dists)
1357 r = random.random() * total
1358 cum = 0
1359 for i,dd in enumerate(dists):
1360 cum += dd
1361 if cum >= r: centroids.append(X[i]); break
1362 else: centroids.append(X[-1])
1363
1364labels = [0]*n
1365for _ in range(_max_iter):
1366 new_labels = [min(range(_k), key=lambda c: _dist(x, centroids[c])) for x in X]
1367 if new_labels == labels: break
1368 labels = new_labels
1369 for c in range(_k):
1370 pts = [X[i] for i in range(n) if labels[i]==c]
1371 if pts: centroids[c] = _centroid(pts)
1372
1373inertia = sum(_dist(X[i], centroids[labels[i]])**2 for i in range(n))
1374cluster_sizes = [labels.count(c) for c in range(_k)]
1375
1376W = 64
1377print("="*W)
1378print(" k-Means Clustering: %s (k=%d)" % (os.path.basename(_path), _k))
1379print(" Features: %s" % ', '.join(feature_cols))
1380print(" Rows: %d Inertia: %.4f" % (n, inertia))
1381print("-"*W)
1382for c in range(_k):
1383 centroid_str = ' '.join("%.4g" % v for v in centroids[c])
1384 print(" Cluster %d (%d rows): centroid = [%s]" % (c, cluster_sizes[c], centroid_str))
1385print("="*W)
1386
1387if _output:
1388 with open(_output, 'w', newline='', encoding='utf-8') as fh:
1389 fns2 = list(valid_rows[0].keys()) + ['cluster']
1390 w = _csv.DictWriter(fh, fieldnames=fns2)
1391 w.writeheader()
1392 for i,r in enumerate(valid_rows):
1393 r2 = dict(r); r2['cluster'] = labels[i]
1394 w.writerow(r2)
1395 print("Labeled data saved to: %s" % _output)
1396"####,
1397 hex_path = hex_path,
1398 hex_cols = hex_cols,
1399 hex_output = hex_output,
1400 k = k,
1401 max_iter = max_iter,
1402 );
1403
1404 let sandbox_args = serde_json::json!({
1405 "language": "python",
1406 "code": script,
1407 "timeout_seconds": 60
1408 });
1409 crate::tools::code_sandbox::execute(&sandbox_args).await
1410}
1411
1412pub async fn normalize_dataset(
1417 file_path: &str,
1418 method: &str,
1419 cols: &[&str],
1420 output: &str,
1421) -> Result<String, String> {
1422 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1423 let cols_joined = cols.join("\n");
1424 let hex_cols: String = cols_joined.bytes().map(|b| format!("{:02x}", b)).collect();
1425 let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
1426 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1427
1428 let script = format!(
1429 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
1430
1431_path = bytes.fromhex("{hex_path}").decode().strip()
1432_cols_raw = bytes.fromhex("{hex_cols}").decode().strip()
1433_cols = [c.strip() for c in _cols_raw.split('\n') if c.strip()] if _cols_raw else []
1434_method = bytes.fromhex("{hex_method}").decode().strip().lower() or "minmax"
1435_output = bytes.fromhex("{hex_output}").decode().strip()
1436
1437def _load(path):
1438 ext = os.path.splitext(path)[1].lower().lstrip('.')
1439 if ext in ('csv','tsv'):
1440 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1441 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1442 return list(r), list(r.fieldnames or [])
1443 elif ext == 'json':
1444 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1445 rows2 = d if isinstance(d, list) else next(iter(d.values()), [])
1446 return rows2, list(rows2[0].keys()) if rows2 else []
1447 elif ext in ('db','sqlite','sqlite3'):
1448 con = _sq.connect(path)
1449 cur = con.cursor()
1450 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1451 t = cur.fetchone()
1452 if not t: return [], []
1453 cur.execute("SELECT * FROM [%s]" % t[0])
1454 cols2 = [d2[0] for d2 in cur.description]
1455 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1456 con.close()
1457 return rows2, cols2
1458 print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1459
1460def _tf(v):
1461 try: return float(str(v).replace(',','').strip())
1462 except: return None
1463
1464rows, fieldnames = _load(_path)
1465if not rows:
1466 print("No rows found."); sys.exit(0)
1467
1468all_cols = list(rows[0].keys())
1469if _cols:
1470 target_cols = [c for c in all_cols if c in _cols]
1471else:
1472 target_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1473
1474params = {{}}
1475for c in target_cols:
1476 vals = [_tf(r.get(c,'')) for r in rows if _tf(r.get(c,'')) is not None]
1477 if not vals: continue
1478 mean = sum(vals)/len(vals)
1479 std = math.sqrt(sum((v-mean)**2 for v in vals)/len(vals))
1480 mn = min(vals); mx = max(vals)
1481 params[c] = (mean, std, mn, mx)
1482
1483W = 64
1484print("="*W)
1485print(" Dataset Normalization: %s (method=%s)" % (os.path.basename(_path), _method))
1486print("-"*W)
1487print(" %-20s %-10s %-10s %-10s %-10s" % ("Column", "Min", "Max", "Mean", "Std"))
1488print(" " + "-"*56)
1489for c,( mean,std,mn,mx) in params.items():
1490 print(" %-20s %-10.4g %-10.4g %-10.4g %-10.4g" % (c[:20], mn, mx, mean, std))
1491print("="*W)
1492
1493if _output:
1494 out_rows = []
1495 for r in rows:
1496 out_r = dict(r)
1497 for c,(mean,std,mn,mx) in params.items():
1498 v = _tf(r.get(c,''))
1499 if v is None:
1500 out_r[c] = ''
1501 continue
1502 if _method in ('minmax','min-max','min_max'):
1503 rng = mx-mn
1504 out_r[c] = "%.8f" % ((v-mn)/rng if rng else 0.0)
1505 elif _method in ('zscore','z-score','z_score','standard','standardize'):
1506 out_r[c] = "%.8f" % ((v-mean)/std if std else 0.0)
1507 elif _method in ('robust',):
1508 from functools import reduce
1509 # Use median and IQR
1510 vals2 = sorted(_tf(rr.get(c,'')) for rr in rows if _tf(rr.get(c,'')) is not None)
1511 n2 = len(vals2)
1512 q1 = vals2[n2//4]; q3 = vals2[3*n2//4]
1513 iqr = q3-q1
1514 med = vals2[n2//2]
1515 out_r[c] = "%.8f" % ((v-med)/iqr if iqr else 0.0)
1516 out_rows.append(out_r)
1517 with open(_output, 'w', newline='', encoding='utf-8') as fh:
1518 fns2 = fieldnames if fieldnames else list(out_rows[0].keys()) if out_rows else []
1519 w = _csv.DictWriter(fh, fieldnames=fns2)
1520 w.writeheader(); w.writerows(out_rows)
1521 print("Normalized data (%d rows) saved to: %s" % (len(out_rows), _output))
1522else:
1523 print(" (No --normalize-output specified — use --normalize-output FILE to save scaled CSV)")
1524"####,
1525 hex_path = hex_path,
1526 hex_cols = hex_cols,
1527 hex_method = hex_method,
1528 hex_output = hex_output,
1529 );
1530
1531 let sandbox_args = serde_json::json!({
1532 "language": "python",
1533 "code": script,
1534 "timeout_seconds": 30
1535 });
1536 crate::tools::code_sandbox::execute(&sandbox_args).await
1537}
1538
1539pub async fn pca_analyze(
1545 file_path: &str,
1546 n_components: usize,
1547 cols: &[&str],
1548 output: &str,
1549) -> Result<String, String> {
1550 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1551 let hex_cols: String = cols
1552 .join(",")
1553 .bytes()
1554 .map(|b| format!("{:02x}", b))
1555 .collect();
1556 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1557
1558 let script = format!(
1559 r####"import csv as _csv, os, sys, math
1560
1561_path = bytes.fromhex("{hex_path}").decode().strip()
1562_cstr = bytes.fromhex("{hex_cols}").decode().strip()
1563_output = bytes.fromhex("{hex_output}").decode().strip()
1564_n_comp = {n_components}
1565
1566def _load(path):
1567 ext = os.path.splitext(path)[1].lower().lstrip('.')
1568 if ext in ('csv','tsv'):
1569 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1570 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1571 return list(r)
1572 raise ValueError("Unsupported file type: " + ext + " (CSV/TSV only for PCA)")
1573
1574def _tf(v):
1575 try: return float(v)
1576 except: return None
1577
1578rows = _load(_path)
1579if not rows:
1580 print("No data found."); sys.exit(0)
1581
1582all_cols = list(rows[0].keys())
1583sel = [c.strip() for c in _cstr.split(',') if c.strip()] if _cstr else []
1584num_cols = sel if sel else [c for c in all_cols if any(_tf(r.get(c,'')) is not None for r in rows[:20])]
1585num_cols = [c for c in num_cols if c in all_cols]
1586
1587mat = []
1588for r in rows:
1589 vals = [_tf(r.get(c,'')) for c in num_cols]
1590 if all(v is not None for v in vals):
1591 mat.append(vals)
1592
1593n_rows = len(mat); n_cols = len(num_cols)
1594if n_rows < 2 or n_cols < 2:
1595 print("Need at least 2 rows and 2 numeric columns for PCA."); sys.exit(0)
1596
1597means = [sum(mat[i][j] for i in range(n_rows))/n_rows for j in range(n_cols)]
1598X = [[mat[i][j] - means[j] for j in range(n_cols)] for i in range(n_rows)]
1599
1600def cov_matrix(X, nc, nr):
1601 C = [[0.0]*nc for _ in range(nc)]
1602 for j in range(nc):
1603 for k in range(j, nc):
1604 s = sum(X[i][j]*X[i][k] for i in range(nr)) / (nr-1)
1605 C[j][k] = C[k][j] = s
1606 return C
1607
1608C = cov_matrix(X, n_cols, n_rows)
1609
1610def mat_vec(M, v):
1611 return [sum(M[i][j]*v[j] for j in range(len(v))) for i in range(len(v))]
1612
1613def vec_norm(v): return math.sqrt(sum(x*x for x in v))
1614def vec_scale(v, s): return [x*s for x in v]
1615
1616n_comp = min(_n_comp, n_cols, n_rows-1)
1617total_var = sum(C[j][j] for j in range(n_cols))
1618eigvals = []; eigvecs = []
1619Cd = [row[:] for row in C]
1620
1621for ci in range(n_comp):
1622 v = [1.0 if j == ci % n_cols else 0.1 for j in range(n_cols)]
1623 nrm = vec_norm(v); v = vec_scale(v, 1.0/nrm)
1624 for _it in range(300):
1625 v_new = mat_vec(Cd, v)
1626 nrm = vec_norm(v_new)
1627 if nrm < 1e-14: break
1628 v_new = vec_scale(v_new, 1.0/nrm)
1629 delta = vec_norm([v_new[j]-v[j] for j in range(n_cols)])
1630 v = v_new
1631 if delta < 1e-10: break
1632 lam = sum(mat_vec(Cd, v)[j]*v[j] for j in range(n_cols))
1633 if lam < 0: lam = 0.0
1634 eigvals.append(lam)
1635 eigvecs.append(v[:])
1636 for i in range(n_cols):
1637 for j in range(n_cols):
1638 Cd[i][j] -= lam * v[i] * v[j]
1639
1640projected = []
1641for row_x in X:
1642 projected.append([sum(row_x[j]*eigvecs[c][j] for j in range(n_cols)) for c in range(n_comp)])
1643
1644W = 68
1645print("="*W)
1646print(" PCA — Principal Component Analysis")
1647print(" File : %s" % os.path.basename(_path))
1648print(" Rows : %d | Columns : %d | Components: %d" % (n_rows, n_cols, n_comp))
1649print(" Columns: %s" % ', '.join(num_cols[:6]) + ((' +%d more' % (len(num_cols)-6)) if len(num_cols)>6 else ''))
1650print("="*W)
1651
1652cum = 0.0
1653for ci in range(n_comp):
1654 pct = (eigvals[ci]/total_var*100) if total_var > 0 else 0.0
1655 cum += pct
1656 bar = int(round(pct / 2.5))
1657 bar_str = "â–ˆ"*bar + "â–‘"*(40-bar)
1658 print("\n PC%d eigenvalue %.4f | var %5.1f%% | cumulative %5.1f%%" % (ci+1, eigvals[ci], pct, cum))
1659 print(" %s" % bar_str)
1660 loads = sorted(enumerate(eigvecs[ci]), key=lambda x: -abs(x[1]))
1661 print(" Top loadings:")
1662 for _idx, (fidx, w) in enumerate(loads[:8]):
1663 sign = '+' if w >= 0 else '-'
1664 bar2 = int(abs(w)*20)
1665 print(" %-22s %s%.4f %s" % (num_cols[fidx][:22], sign, abs(w), "▌"*bar2))
1666
1667print()
1668print(" Projected sample (first 5 rows):")
1669print(" " + "".join(" PC%-7d" % (c+1) for c in range(n_comp)))
1670for row_p in projected[:5]:
1671 print(" " + "".join("%+-10.4f" % v for v in row_p))
1672print()
1673print("="*W)
1674
1675if _output:
1676 pc_cols = ["PC%d" % (c+1) for c in range(n_comp)]
1677 with open(_output, 'w', newline='', encoding='utf-8') as fh:
1678 w2 = _csv.writer(fh)
1679 w2.writerow(pc_cols)
1680 for row_p in projected:
1681 w2.writerow(["%.8f" % v for v in row_p])
1682 print(" Projected data (%d rows) saved to: %s" % (len(projected), _output))
1683else:
1684 print(" (Use --pca-output FILE to save projected coordinates as CSV)")
1685"####,
1686 hex_path = hex_path,
1687 hex_cols = hex_cols,
1688 hex_output = hex_output,
1689 n_components = n_components,
1690 );
1691
1692 let sandbox_args = serde_json::json!({
1693 "language": "python",
1694 "code": script,
1695 "timeout_seconds": 60
1696 });
1697 crate::tools::code_sandbox::execute(&sandbox_args).await
1698}
1699
1700pub async fn hypothesis_test(
1704 test_type: &str,
1705 group1_str: &str,
1706 group2_str: &str,
1707 alpha: f64,
1708 mu: f64,
1709) -> Result<String, String> {
1710 let hex_test: String = test_type.bytes().map(|b| format!("{:02x}", b)).collect();
1711 let hex_g1: String = group1_str.bytes().map(|b| format!("{:02x}", b)).collect();
1712 let hex_g2: String = group2_str.bytes().map(|b| format!("{:02x}", b)).collect();
1713
1714 let script = format!(
1715 r####"import math, statistics as _st, sys
1716
1717_test = bytes.fromhex("{hex_test}").decode().strip().lower()
1718_g1s = bytes.fromhex("{hex_g1}").decode().strip()
1719_g2s = bytes.fromhex("{hex_g2}").decode().strip()
1720_alpha = {alpha}
1721_mu0 = {mu}
1722W = 60
1723
1724def parse_nums(s):
1725 if not s: return []
1726 try:
1727 return [float(x.strip()) for x in s.replace(';',',').split(',') if x.strip()]
1728 except:
1729 return []
1730
1731def fmt_p(p):
1732 if p < 0.001: return "< 0.001"
1733 return "%.4f" % p
1734
1735def decision(p, alpha):
1736 if p < alpha:
1737 return "REJECT H0 (p %s < a=%.2f)" % (fmt_p(p), alpha)
1738 return "FAIL TO REJECT H0 (p %s >= a=%.2f)" % (fmt_p(p), alpha)
1739
1740def t_cdf(t, df):
1741 x = df / (df + t*t)
1742 def betainc(a, b, xv):
1743 if xv<=0: return 0.0
1744 if xv>=1: return 1.0
1745 tiny=1e-300; fp_min=tiny
1746 qab=a+b; qap=a+1; qam=a-1
1747 c=1.0; d=1.0-qab*xv/qap
1748 if abs(d)<fp_min: d=fp_min
1749 d=1.0/d; h=d
1750 for m in range(1,201):
1751 m2=2*m; aa=m*(b-m)*xv/((qam+m2)*(a+m2))
1752 d=1.0+aa*d
1753 if abs(d)<fp_min: d=fp_min
1754 c=1.0+aa/c
1755 if abs(c)<fp_min: c=fp_min
1756 d=1.0/d; h*=d*c
1757 aa=-(a+m)*(qab+m)*xv/((a+m2)*(qap+m2))
1758 d=1.0+aa*d
1759 if abs(d)<fp_min: d=fp_min
1760 c=1.0+aa/c
1761 if abs(c)<fp_min: c=fp_min
1762 d=1.0/d; delta=d*c; h*=delta
1763 if abs(delta-1.0)<3e-7: break
1764 return math.exp(math.lgamma(a+b)-math.lgamma(a)-math.lgamma(b)+
1765 a*math.log(xv)+b*math.log(1-xv))*h/a
1766 a=df/2; b=0.5
1767 ibeta=betainc(b,a,x)
1768 return ibeta
1769
1770def normal_cdf_h(z):
1771 return 0.5*(1+math.erf(z/math.sqrt(2)))
1772
1773def chi2_pval(chi2, df):
1774 a=df/2.0; x2=chi2/2.0
1775 if x2<=0: return 1.0
1776 if x2<a+1:
1777 ap=a; delta=s=1.0/a
1778 for _ in range(300):
1779 ap+=1; delta*=x2/ap; s+=delta
1780 if abs(delta)<abs(s)*1e-9: break
1781 p=s*math.exp(-x2+a*math.log(x2)-math.lgamma(a))
1782 return max(0.0,min(1.0,1.0-p))
1783 else:
1784 b=x2+1-a; c=1e300; d=1.0/b; h=d
1785 for i in range(1,301):
1786 an=-i*(i-a); b+=2
1787 d=an*d+b
1788 if abs(d)<1e-300: d=1e-300
1789 c=b+an/c
1790 if abs(c)<1e-300: c=1e-300
1791 d=1.0/d; delta=d*c; h*=delta
1792 if abs(delta-1.0)<1e-9: break
1793 q=math.exp(-x2+a*math.log(x2)-math.lgamma(a))*h
1794 return max(0.0,min(1.0,q))
1795
1796def f_pval(f, df1, df2):
1797 x=df2/(df2+df1*f)
1798 def betainc(a, b, xv):
1799 if xv<=0: return 0.0
1800 if xv>=1: return 1.0
1801 tiny=1e-300; fp_min=tiny
1802 qab=a+b; qap=a+1; qam=a-1
1803 c=1.0; d=1.0-qab*xv/qap
1804 if abs(d)<fp_min: d=fp_min
1805 d=1.0/d; h=d
1806 for m in range(1,201):
1807 m2=2*m; aa=m*(b-m)*xv/((qam+m2)*(a+m2))
1808 d=1.0+aa*d
1809 if abs(d)<fp_min: d=fp_min
1810 c=1.0+aa/c
1811 if abs(c)<fp_min: c=fp_min
1812 d=1.0/d; h*=d*c
1813 aa=-(a+m)*(qab+m)*xv/((a+m2)*(qap+m2))
1814 d=1.0+aa*d
1815 if abs(d)<fp_min: d=fp_min
1816 c=1.0+aa/c
1817 if abs(c)<fp_min: c=fp_min
1818 d=1.0/d; delta=d*c; h*=delta
1819 if abs(delta-1.0)<3e-7: break
1820 return math.exp(math.lgamma(a+b)-math.lgamma(a)-math.lgamma(b)+
1821 a*math.log(xv)+b*math.log(1-xv))*h/a
1822 return betainc(df2/2,df1/2,x)
1823
1824print("="*W)
1825print(" HYPOTHESIS TEST")
1826print("="*W)
1827
1828if _test in ("one-t","one_t","onesample","one-sample","t1","t-one"):
1829 g1=parse_nums(_g1s)
1830 if len(g1)<2: print(" ERROR: need >=2 values for one-sample t-test"); sys.exit(0)
1831 n=len(g1); xbar=_st.mean(g1); s=_st.stdev(g1)
1832 se=s/math.sqrt(n); t=(xbar-_mu0)/se; df=n-1
1833 p=t_cdf(abs(t),df)
1834 print(" One-Sample t-Test (H0: mu = %.4g)" % _mu0)
1835 print(" n=%-5d xbar=%.4f s=%.4f SE=%.4f" % (n,xbar,s,se))
1836 print(" t=%.4f df=%d p=%s" % (t,df,fmt_p(p)))
1837 from_t=1.96 if df>30 else (2.093 if df>19 else (2.262 if df>9 else 2.776))
1838 lo=xbar-from_t*se; hi=xbar+from_t*se
1839 print(" %.0f%% CI: [%.4f, %.4f]" % ((1-_alpha)*100,lo,hi))
1840 print(); print(" "+decision(p,_alpha))
1841
1842elif _test in ("two-t","two_t","twosample","two-sample","welch","t2","t-two"):
1843 g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1844 if len(g1)<2 or len(g2)<2: print(" ERROR: need >=2 values in each group"); sys.exit(0)
1845 n1=len(g1); n2=len(g2)
1846 x1=_st.mean(g1); x2=_st.mean(g2)
1847 s1=_st.stdev(g1); s2=_st.stdev(g2)
1848 se=math.sqrt(s1**2/n1+s2**2/n2)
1849 t=(x1-x2)/se
1850 df=(s1**2/n1+s2**2/n2)**2/((s1**2/n1)**2/(n1-1)+(s2**2/n2)**2/(n2-1))
1851 p=t_cdf(abs(t),df)
1852 print(" Two-Sample (Welch) t-Test (H0: mu1 = mu2)")
1853 print(" G1: n=%d xbar=%.4f s=%.4f" % (n1,x1,s1))
1854 print(" G2: n=%d xbar=%.4f s=%.4f" % (n2,x2,s2))
1855 print(" delta_xbar=%.4f SE=%.4f t=%.4f df=%.1f p=%s" % (x1-x2,se,t,df,fmt_p(p)))
1856 print(); print(" "+decision(p,_alpha))
1857
1858elif _test in ("paired","paired-t","pairedt","t-paired"):
1859 g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1860 if len(g1)!=len(g2) or len(g1)<2: print(" ERROR: groups must match in length (>=2)"); sys.exit(0)
1861 diffs=[a-b for a,b in zip(g1,g2)]
1862 n=len(diffs); dbar=_st.mean(diffs); sd=_st.stdev(diffs)
1863 se=sd/math.sqrt(n); t=dbar/se; df=n-1
1864 p=t_cdf(abs(t),df)
1865 print(" Paired t-Test (H0: mu_diff = 0)")
1866 print(" n=%d dbar=%.4f sd=%.4f SE=%.4f" % (n,dbar,sd,se))
1867 print(" t=%.4f df=%d p=%s" % (t,df,fmt_p(p)))
1868 print(); print(" "+decision(p,_alpha))
1869
1870elif _test in ("chi2","chi-square","chisquare","chi-sq","goodness"):
1871 observed=parse_nums(_g1s)
1872 if len(observed)<2: print(" ERROR: need >=2 observed counts"); sys.exit(0)
1873 expected_s=_g2s.strip()
1874 if expected_s:
1875 expected=parse_nums(expected_s)
1876 if len(expected)!=len(observed): print(" ERROR: observed/expected length mismatch"); sys.exit(0)
1877 else:
1878 e=sum(observed)/len(observed); expected=[e]*len(observed)
1879 chi2=sum((o-e)**2/e for o,e in zip(observed,expected) if e>0)
1880 df=len(observed)-1; p=chi2_pval(chi2,df)
1881 print(" Chi-Square Goodness-of-Fit (H0: observed ~ expected)")
1882 print(" %-12s %-10s %-10s %-8s" % ("Category","Observed","Expected","(O-E)^2/E"))
1883 for i,(o,e) in enumerate(zip(observed,expected)):
1884 print(" %-12s %-10.2f %-10.2f %.4f" % ("cat%d"%(i+1),o,e,(o-e)**2/e if e>0 else 0))
1885 print(" chi2=%.4f df=%d p=%s" % (chi2,df,fmt_p(p)))
1886 print(); print(" "+decision(p,_alpha))
1887
1888elif _test in ("anova","one-way","oneway","f-test"):
1889 raw_groups=[g.strip() for g in _g1s.split('|')]
1890 groups=[parse_nums(g) for g in raw_groups if g]
1891 groups=[g for g in groups if len(g)>=2]
1892 if len(groups)<2: print(" ERROR: need >=2 groups separated by | for ANOVA"); sys.exit(0)
1893 k=len(groups); N=sum(len(g) for g in groups)
1894 grand=sum(sum(g) for g in groups)/N
1895 SSB=sum(len(g)*(_st.mean(g)-grand)**2 for g in groups)
1896 SSW=sum(sum((x-_st.mean(g))**2 for x in g) for g in groups)
1897 dfB=k-1; dfW=N-k
1898 F=(SSB/dfB)/(SSW/dfW) if dfW>0 and SSW>0 else float('inf')
1899 p=f_pval(F,dfB,dfW)
1900 print(" One-Way ANOVA (H0: all group means equal)")
1901 for i,g in enumerate(groups):
1902 print(" G%d: n=%d xbar=%.4f s=%.4f" % (i+1,len(g),_st.mean(g),_st.stdev(g)))
1903 print(" SSB=%.4f (df=%d) SSW=%.4f (df=%d)" % (SSB,dfB,SSW,dfW))
1904 print(" F=%.4f p=%s" % (F,fmt_p(p)))
1905 print(); print(" "+decision(p,_alpha))
1906
1907elif _test in ("mannwhitney","mann-whitney","mwu","wilcoxon-rank","ranksum"):
1908 g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1909 if len(g1)<1 or len(g2)<1: print(" ERROR: need values in both groups"); sys.exit(0)
1910 n1=len(g1); n2=len(g2)
1911 combined=sorted([(v,'a',i) for i,v in enumerate(g1)]+[(v,'b',i) for i,v in enumerate(g2)])
1912 ranks={{}}
1913 i=0
1914 while i<len(combined):
1915 j=i
1916 while j<len(combined)-1 and combined[j][0]==combined[j+1][0]: j+=1
1917 avg_rank=(i+1+j+1)/2
1918 for kk in range(i,j+1): ranks[(combined[kk][1],combined[kk][2])]=avg_rank
1919 i=j+1
1920 R1=sum(ranks[('a',i)] for i in range(n1))
1921 U1=R1-n1*(n1+1)/2; U2=n1*n2-U1; U=min(U1,U2)
1922 mu_U=n1*n2/2; sigma_U=math.sqrt(n1*n2*(n1+n2+1)/12)
1923 z=(U-mu_U)/sigma_U if sigma_U>0 else 0
1924 p=2*(1-normal_cdf_h(abs(z)))
1925 print(" Mann-Whitney U Test (H0: distributions equal)")
1926 print(" n1=%d n2=%d U=%.1f z=%.4f p=%s" % (n1,n2,U,z,fmt_p(p)))
1927 print(); print(" "+decision(p,_alpha))
1928
1929elif _test in ("pearson","correlation","corr"):
1930 g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1931 if len(g1)!=len(g2) or len(g1)<3: print(" ERROR: need matching vectors of length >=3"); sys.exit(0)
1932 n=len(g1); x1=_st.mean(g1); x2=_st.mean(g2)
1933 num=sum((a-x1)*(b-x2) for a,b in zip(g1,g2))
1934 d1=math.sqrt(sum((a-x1)**2 for a in g1)); d2=math.sqrt(sum((b-x2)**2 for b in g2))
1935 r=num/(d1*d2) if d1*d2>0 else 0
1936 t=r*math.sqrt(n-2)/math.sqrt(1-r**2) if abs(r)<1 else float('inf')
1937 df=n-2; p=t_cdf(abs(t),df)
1938 print(" Pearson Correlation Test (H0: rho = 0)")
1939 print(" n=%d r=%.4f t=%.4f df=%d p=%s" % (n,r,t,df,fmt_p(p)))
1940 strength="negligible" if abs(r)<0.1 else "weak" if abs(r)<0.3 else "moderate" if abs(r)<0.5 else "strong"
1941 print(" Strength: %s (%s)" % (strength,"positive" if r>=0 else "negative"))
1942 print(); print(" "+decision(p,_alpha))
1943
1944elif _test in ("proportion","prop","z-prop","zprop","prop1","one-prop"):
1945 parts=parse_nums(_g1s)
1946 if len(parts)<2: print(" ERROR: provide 'successes,n' as group1"); sys.exit(0)
1947 k=int(parts[0]); n=int(parts[1]); p_hat=k/n; p0=_mu0 if _mu0>0 else 0.5
1948 se=math.sqrt(p0*(1-p0)/n)
1949 z=(p_hat-p0)/se if se>0 else 0
1950 p=2*(1-normal_cdf_h(abs(z)))
1951 ci_se=math.sqrt(p_hat*(1-p_hat)/n); z_crit=1.96
1952 lo=p_hat-z_crit*ci_se; hi=p_hat+z_crit*ci_se
1953 print(" One-Proportion z-Test (H0: p = %.4g)" % p0)
1954 print(" k=%d n=%d p_hat=%.4f SE=%.4f z=%.4f p=%s" % (k,n,p_hat,se,z,fmt_p(p)))
1955 print(" 95%% CI: [%.4f, %.4f]" % (lo,hi))
1956 print(); print(" "+decision(p,_alpha))
1957
1958elif _test in ("prop2","two-prop","twoprop","two-proportion"):
1959 p1=parse_nums(_g1s); p2=parse_nums(_g2s)
1960 if len(p1)<2 or len(p2)<2: print(" ERROR: each group needs 'successes,n'"); sys.exit(0)
1961 k1=int(p1[0]); n1=int(p1[1]); k2=int(p2[0]); n2=int(p2[1])
1962 ph1=k1/n1; ph2=k2/n2; pp=(k1+k2)/(n1+n2)
1963 se=math.sqrt(pp*(1-pp)*(1/n1+1/n2))
1964 z=(ph1-ph2)/se if se>0 else 0
1965 p=2*(1-normal_cdf_h(abs(z)))
1966 print(" Two-Proportion z-Test (H0: p1 = p2)")
1967 print(" G1: %d/%d (p_hat=%.4f) G2: %d/%d (p_hat=%.4f)" % (k1,n1,ph1,k2,n2,ph2))
1968 print(" Pooled p_hat=%.4f SE=%.4f z=%.4f p=%s" % (pp,se,z,fmt_p(p)))
1969 print(); print(" "+decision(p,_alpha))
1970
1971elif _test in ("ci","confidence","conf-interval","interval"):
1972 g1=parse_nums(_g1s)
1973 if len(g1)<2: print(" ERROR: need >=2 values for confidence interval"); sys.exit(0)
1974 n=len(g1); xbar=_st.mean(g1); s=_st.stdev(g1); se=s/math.sqrt(n)
1975 z_crit=1.96 if n>30 else (2.093 if n>19 else (2.262 if n>9 else 2.776))
1976 lo=xbar-z_crit*se; hi=xbar+z_crit*se
1977 print(" Confidence Interval for Mean")
1978 print(" n=%d xbar=%.4f s=%.4f SE=%.4f" % (n,xbar,s,se))
1979 print(" %.0f%% CI: [%.4f, %.4f] (+-%.4f)" % ((1-_alpha)*100,lo,hi,z_crit*se))
1980 if _mu0!=0:
1981 inside=lo<=_mu0<=hi
1982 print(" H0 value (mu=%.4g) is %s the interval" % (_mu0,"INSIDE" if inside else "OUTSIDE"))
1983
1984else:
1985 print(" Available tests:")
1986 print(" one-t One-sample t-test: --hypothesis-mu H0_MEAN")
1987 print(" two-t Two-sample (Welch) t-test (--hypothesis-group2 DATA)")
1988 print(" paired Paired t-test (--hypothesis-group2 DATA)")
1989 print(" chi2 Chi-square goodness-of-fit (--hypothesis-group2 EXPECTED)")
1990 print(" anova One-way ANOVA (groups separated by | in group1)")
1991 print(" mannwhitney Mann-Whitney U (--hypothesis-group2 DATA)")
1992 print(" pearson Pearson correlation test (--hypothesis-group2 DATA)")
1993 print(" proportion One-proportion z-test: 'successes,n' --hypothesis-mu P0")
1994 print(" prop2 Two-proportion z-test (--hypothesis-group2 'k2,n2')")
1995 print(" ci Confidence interval for mean")
1996 print()
1997 print(" Data format: comma-separated numbers, e.g. 3.1,2.8,4.0,3.5")
1998 print(" For ANOVA: groups separated by | e.g. 2.1,2.3|3.4,3.6|1.9,2.0")
1999
2000print("="*W)
2001"####,
2002 hex_test = hex_test,
2003 hex_g1 = hex_g1,
2004 hex_g2 = hex_g2,
2005 alpha = alpha,
2006 mu = mu,
2007 );
2008
2009 let sandbox_args = serde_json::json!({
2010 "language": "python",
2011 "code": script,
2012 "timeout_seconds": 30
2013 });
2014 crate::tools::code_sandbox::execute(&sandbox_args).await
2015}
2016
2017pub async fn describe_stats(
2021 file_path: &str,
2022 cols_str: &str,
2023 output: &str,
2024) -> Result<String, String> {
2025 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2026 let hex_cols: String = cols_str.bytes().map(|b| format!("{:02x}", b)).collect();
2027 let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
2028
2029 let script = format!(
2030 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, math
2031
2032_path = bytes.fromhex("{hex_path}").decode().strip()
2033_cols_s = bytes.fromhex("{hex_cols}").decode().strip()
2034_output = bytes.fromhex("{hex_output}").decode().strip()
2035W = 68
2036
2037def _load(path):
2038 ext = os.path.splitext(path)[1].lower().lstrip('.')
2039 if ext in ('csv','tsv'):
2040 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
2041 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
2042 rows = list(r)
2043 return rows
2044 elif ext == 'json':
2045 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
2046 return d if isinstance(d, list) else next(iter(d.values()), [])
2047 elif ext in ('db','sqlite','sqlite3'):
2048 con = _sq.connect(path)
2049 cur = con.cursor()
2050 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
2051 t = cur.fetchone()
2052 if not t: return []
2053 cur.execute("SELECT * FROM [%s]" % t[0])
2054 cols2 = [d[0] for d in cur.description]
2055 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
2056 con.close()
2057 return rows2
2058 return []
2059
2060rows = _load(_path)
2061if not rows:
2062 print(" ERROR: no rows loaded from", _path)
2063 raise SystemExit(0)
2064
2065all_cols = list(rows[0].keys())
2066if _cols_s:
2067 req = [c.strip() for c in _cols_s.split(',')]
2068 cols = [c for c in req if c in all_cols]
2069 if not cols:
2070 print(" WARNING: none of the specified columns found. Available:", ', '.join(all_cols))
2071 cols = all_cols
2072else:
2073 cols = all_cols
2074
2075def parse_nums(rows, col):
2076 vals = []
2077 for r in rows:
2078 v = r.get(col, '')
2079 if v is None: continue
2080 v = str(v).strip()
2081 if not v: continue
2082 try: vals.append(float(v))
2083 except: pass
2084 return vals
2085
2086def percentile(sorted_v, p):
2087 if not sorted_v: return float('nan')
2088 n = len(sorted_v)
2089 idx = (n - 1) * p / 100.0
2090 lo = int(idx); hi = lo + 1
2091 if hi >= n: return sorted_v[lo]
2092 frac = idx - lo
2093 return sorted_v[lo] * (1 - frac) + sorted_v[hi] * frac
2094
2095def skewness(vals, mean, std):
2096 if std == 0 or len(vals) < 3: return float('nan')
2097 n = len(vals)
2098 s3 = sum((x - mean)**3 for x in vals)
2099 return (n / ((n-1)*(n-2))) * s3 / std**3
2100
2101def kurtosis(vals, mean, std):
2102 if std == 0 or len(vals) < 4: return float('nan')
2103 n = len(vals)
2104 s4 = sum((x - mean)**4 for x in vals)
2105 k = (n*(n+1)/((n-1)*(n-2)*(n-3))) * s4 / std**4 - 3*(n-1)**2/((n-2)*(n-3))
2106 return k
2107
2108def ascii_hist(vals, bins=16, width=36):
2109 if not vals: return []
2110 mn = min(vals); mx = max(vals)
2111 rng = mx - mn
2112 if rng == 0: return [" (all values identical: %.4g)" % mn]
2113 bw = rng / bins
2114 counts = [0]*bins
2115 for v in vals:
2116 idx = min(int((v - mn) / bw), bins - 1)
2117 counts[idx] += 1
2118 max_c = max(counts) or 1
2119 lines = []
2120 for i, c in enumerate(counts):
2121 lo = mn + i*bw; hi = lo+bw
2122 bar = int(c / max_c * width)
2123 lines.append(" [%8.3g, %8.3g) %s %d" % (lo, hi, 'â–ˆ'*bar, c))
2124 return lines
2125
2126print("=" * W)
2127print(" DESCRIPTIVE STATISTICS")
2128print(" File: %s" % os.path.basename(_path))
2129print(" Rows: %d Cols examined: %d" % (len(rows), len(cols)))
2130print("=" * W)
2131
2132results = []
2133for col in cols:
2134 vals = parse_nums(rows, col)
2135 if len(vals) < 2:
2136 print("\n %s: too few numeric values (%d)" % (col, len(vals)))
2137 continue
2138 s = sorted(vals)
2139 n = len(vals)
2140 mean = sum(vals) / n
2141 var = sum((x-mean)**2 for x in vals) / (n-1) if n > 1 else 0
2142 std = math.sqrt(var)
2143 med = percentile(s, 50)
2144 # mode (simple: most frequent rounded value)
2145 from collections import Counter
2146 mode_ctr = Counter(round(v, 4) for v in vals)
2147 mode_val, mode_cnt = mode_ctr.most_common(1)[0]
2148 p5 = percentile(s, 5)
2149 p25 = percentile(s, 25)
2150 p75 = percentile(s, 75)
2151 p95 = percentile(s, 95)
2152 iqr = p75 - p25
2153 skew = skewness(vals, mean, std)
2154 kurt = kurtosis(vals, mean, std)
2155 # Outliers via IQR method
2156 lo_fence = p25 - 1.5*iqr; hi_fence = p75 + 1.5*iqr
2157 outliers = [v for v in vals if v < lo_fence or v > hi_fence]
2158 missing = len(rows) - sum(1 for r in rows if str(r.get(col,'')).strip())
2159
2160 print("\n %s" % col)
2161 print(" " + "-"*50)
2162 print(" n=%-8d missing=%-6d unique=%d" % (n, missing, len(set(round(v,6) for v in vals))))
2163 print(" mean=%11.6g std=%11.6g var=%11.6g" % (mean, std, var))
2164 print(" min=%12.6g max=%12.6g range=%10.6g" % (s[0], s[-1], s[-1]-s[0]))
2165 print(" P5=%12.6g P25=%11.6g median=%9.6g" % (p5, p25, med))
2166 print(" P75=%11.6g P95=%11.6g IQR=%11.6g" % (p75, p95, iqr))
2167 print(" mode=%11.6g (count=%d)" % (mode_val, mode_cnt))
2168 if not math.isnan(skew): print(" skewness=%8.4f kurtosis=%8.4f" % (skew, kurt))
2169 if outliers: print(" outliers (IQR): %d value(s) min=%.4g max=%.4g" % (len(outliers), min(outliers), max(outliers)))
2170 print()
2171 for line in ascii_hist(vals): print(line)
2172 results.append((col, n, mean, std, s[0], s[-1], med))
2173
2174if _output and results:
2175 with open(_output, 'w', newline='', encoding='utf-8') as fh:
2176 w2 = _csv.writer(fh)
2177 w2.writerow(['column','n','mean','std','min','max','median'])
2178 for row in results:
2179 w2.writerow(['%.8g'%v if isinstance(v,float) else v for v in row])
2180 print("\n Summary saved to: %s" % _output)
2181elif _output == '' and results:
2182 print("\n (Use --stats-output FILE to save summary CSV)")
2183
2184print("\n" + "=" * W)
2185"####,
2186 hex_path = hex_path,
2187 hex_cols = hex_cols,
2188 hex_output = hex_output,
2189 );
2190
2191 let sandbox_args = serde_json::json!({
2192 "language": "python",
2193 "code": script,
2194 "timeout_seconds": 30
2195 });
2196 crate::tools::code_sandbox::execute(&sandbox_args).await
2197}
2198
2199pub async fn classify_data(
2203 file_path: &str,
2204 label_col: &str,
2205 feature_cols: &str,
2206 predict_str: &str,
2207 k: usize,
2208 method: &str,
2209) -> Result<String, String> {
2210 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2211 let hex_label: String = label_col.bytes().map(|b| format!("{:02x}", b)).collect();
2212 let hex_feats: String = feature_cols.bytes().map(|b| format!("{:02x}", b)).collect();
2213 let hex_predict: String = predict_str.bytes().map(|b| format!("{:02x}", b)).collect();
2214 let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
2215
2216 let script = format!(
2217 r####"import csv as _csv, json as _js, sqlite3 as _sq, os, math
2218from collections import Counter, defaultdict
2219
2220_path = bytes.fromhex("{hex_path}").decode().strip()
2221_label = bytes.fromhex("{hex_label}").decode().strip()
2222_feats_s = bytes.fromhex("{hex_feats}").decode().strip()
2223_pred_s = bytes.fromhex("{hex_predict}").decode().strip()
2224_k = {k}
2225_method = bytes.fromhex("{hex_method}").decode().strip().lower()
2226W = 60
2227
2228def _load(path):
2229 ext = os.path.splitext(path)[1].lower().lstrip('.')
2230 if ext in ('csv','tsv'):
2231 with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
2232 r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
2233 return list(r)
2234 elif ext == 'json':
2235 with open(path, encoding='utf-8') as fh: d = _js.load(fh)
2236 return d if isinstance(d, list) else next(iter(d.values()), [])
2237 elif ext in ('db','sqlite','sqlite3'):
2238 con = _sq.connect(path)
2239 cur = con.cursor()
2240 cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
2241 t = cur.fetchone()
2242 if not t: return []
2243 cur.execute("SELECT * FROM [%s]" % t[0])
2244 cols2 = [d[0] for d in cur.description]
2245 rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
2246 con.close()
2247 return rows2
2248 return []
2249
2250rows = _load(_path)
2251if not rows:
2252 print(" ERROR: no rows loaded from", _path); raise SystemExit(0)
2253all_cols = list(rows[0].keys())
2254
2255# Determine label and feature columns
2256if not _label:
2257 _label = all_cols[-1]
2258 print(" (No --classify-label specified; using last column: %s)" % _label)
2259
2260if _label not in all_cols:
2261 print(" ERROR: label column '%s' not found. Available: %s" % (_label, ', '.join(all_cols)))
2262 raise SystemExit(0)
2263
2264if _feats_s:
2265 feat_cols = [c.strip() for c in _feats_s.split(',') if c.strip() in all_cols]
2266else:
2267 feat_cols = [c for c in all_cols if c != _label]
2268
2269if not feat_cols:
2270 print(" ERROR: no feature columns found."); raise SystemExit(0)
2271
2272# Extract numeric feature vectors
2273def row_to_vec(row):
2274 v = []
2275 for c in feat_cols:
2276 try: v.append(float(row.get(c, 0) or 0))
2277 except: v.append(0.0)
2278 return v
2279
2280labeled = [(row_to_vec(r), str(r[_label]).strip()) for r in rows if str(r.get(_label,'')).strip()]
2281if len(labeled) < 3:
2282 print(" ERROR: need at least 3 labeled rows."); raise SystemExit(0)
2283
2284X = [v for v,_ in labeled]
2285y = [lbl for _,lbl in labeled]
2286classes = sorted(set(y))
2287
2288print("="*W)
2289print(" CLASSIFICATION")
2290print(" File: %s" % os.path.basename(_path))
2291print(" Label: %s Features: %s" % (_label, ', '.join(feat_cols)))
2292print(" Method: %s Classes: %s" % (_method, ', '.join(classes)))
2293print(" Samples: %d" % len(labeled))
2294print("="*W)
2295
2296# ── k-NN ──────────────────────────────────────────────────────────────────────
2297def knn_predict(X_train, y_train, x_q, k):
2298 dists = [(math.sqrt(sum((a-b)**2 for a,b in zip(x_q, xi))), yi)
2299 for xi, yi in zip(X_train, y_train)]
2300 dists.sort(key=lambda d: d[0])
2301 top = [yi for _, yi in dists[:k]]
2302 return Counter(top).most_common(1)[0][0]
2303
2304# ── Gaussian Naive Bayes ───────────────────────────────────────────────────────
2305def gnb_fit(X_train, y_train):
2306 classes_t = sorted(set(y_train))
2307 stats = {{}}
2308 priors = {{}}
2309 n = len(y_train)
2310 for c in classes_t:
2311 idx = [i for i,yi in enumerate(y_train) if yi == c]
2312 priors[c] = len(idx) / n
2313 vecs = [X_train[i] for i in idx]
2314 m = [sum(vecs[j][f] for j in range(len(vecs)))/len(vecs) for f in range(len(feat_cols))]
2315 v = [sum((vecs[j][f]-m[f])**2 for j in range(len(vecs)))/max(len(vecs)-1,1) for f in range(len(feat_cols))]
2316 stats[c] = (m, v)
2317 return priors, stats
2318
2319def gnb_predict(priors, stats, x_q):
2320 best_c = None; best_log = float('-inf')
2321 for c, (m, v) in stats.items():
2322 log_p = math.log(priors[c] + 1e-300)
2323 for xi, mi, vi in zip(x_q, m, v):
2324 vi = max(vi, 1e-9)
2325 log_p += -0.5 * math.log(2*math.pi*vi) - (xi-mi)**2/(2*vi)
2326 if log_p > best_log: best_log = log_p; best_c = c
2327 return best_c
2328
2329# ── LOO cross-validation ──────────────────────────────────────────────────────
2330correct = 0
2331confusion = defaultdict(lambda: defaultdict(int))
2332for i in range(len(labeled)):
2333 Xt = [X[j] for j in range(len(X)) if j != i]
2334 yt = [y[j] for j in range(len(y)) if j != i]
2335 x_q = X[i]; true = y[i]
2336 if _method == 'nb' or _method == 'naive_bayes' or _method == 'gnb':
2337 p, s = gnb_fit(Xt, yt); pred = gnb_predict(p, s, x_q)
2338 else:
2339 pred = knn_predict(Xt, yt, x_q, _k)
2340 confusion[true][pred] += 1
2341 if pred == true: correct += 1
2342
2343acc = correct / len(labeled)
2344print("\n Leave-One-Out Cross-Validation")
2345print(" Accuracy: %d/%d = %.2f%%" % (correct, len(labeled), acc*100))
2346print()
2347
2348# Confusion matrix
2349print(" Confusion Matrix (actual=rows, predicted=cols):")
2350max_w = max(len(c) for c in classes) + 2
2351print(" " + " "*(max_w) + " " + " ".join(c.ljust(max_w) for c in classes))
2352for actual in classes:
2353 row = " " + actual.ljust(max_w) + " "
2354 row += " ".join(str(confusion[actual].get(pred, 0)).ljust(max_w) for pred in classes)
2355 print(row)
2356
2357# Per-class precision/recall
2358print()
2359print(" Per-class metrics:")
2360print(" %-15s %-10s %-10s %-10s" % ("Class","Precision","Recall","F1"))
2361print(" " + "-"*48)
2362for c in classes:
2363 tp = confusion[c].get(c, 0)
2364 fp = sum(confusion[other].get(c,0) for other in classes if other != c)
2365 fn = sum(confusion[c].get(other,0) for other in classes if other != c)
2366 prec = tp/(tp+fp) if tp+fp > 0 else 0
2367 rec = tp/(tp+fn) if tp+fn > 0 else 0
2368 f1 = 2*prec*rec/(prec+rec) if prec+rec > 0 else 0
2369 print(" %-15s %-10.3f %-10.3f %-10.3f" % (c[:15], prec, rec, f1))
2370
2371# Predict new sample if provided
2372if _pred_s:
2373 print()
2374 p_vals = [float(v.strip()) for v in _pred_s.split(',') if v.strip()]
2375 if len(p_vals) != len(feat_cols):
2376 print(" WARNING: --classify-predict has %d values but %d features expected" % (len(p_vals), len(feat_cols)))
2377 else:
2378 if _method in ('nb','naive_bayes','gnb'):
2379 p2, s2 = gnb_fit(X, y); pred_new = gnb_predict(p2, s2, p_vals)
2380 else:
2381 pred_new = knn_predict(X, y, p_vals, _k)
2382 print(" Prediction for [%s]:" % ', '.join('%.4g'%v for v in p_vals))
2383 print(" => %s" % pred_new)
2384
2385print("="*W)
2386"####,
2387 hex_path = hex_path,
2388 hex_label = hex_label,
2389 hex_feats = hex_feats,
2390 hex_predict = hex_predict,
2391 hex_method = hex_method,
2392 k = k,
2393 );
2394
2395 let sandbox_args = serde_json::json!({
2396 "language": "python",
2397 "code": script,
2398 "timeout_seconds": 30
2399 });
2400 crate::tools::code_sandbox::execute(&sandbox_args).await
2401}
2402
2403pub async fn regression_analysis(
2405 file_path: &str,
2406 x_col: &str,
2407 y_col: &str,
2408 degree: usize,
2409 predict_x: &str,
2410) -> Result<String, String> {
2411 let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2412 let hex_x_col: String = x_col.bytes().map(|b| format!("{:02x}", b)).collect();
2413 let hex_y_col: String = y_col.bytes().map(|b| format!("{:02x}", b)).collect();
2414 let hex_predict: String = predict_x.bytes().map(|b| format!("{:02x}", b)).collect();
2415 let deg = degree.clamp(1, 10);
2416
2417 let script = format!(
2418 r####"import csv, sys, math
2419
2420_path = bytes.fromhex("{hex_path}").decode().strip()
2421_x_col = bytes.fromhex("{hex_x_col}").decode().strip()
2422_y_col = bytes.fromhex("{hex_y_col}").decode().strip()
2423_degree = {deg}
2424_pred_s = bytes.fromhex("{hex_predict}").decode().strip()
2425W = 64
2426
2427# ── load CSV ──────────────────────────────────────────────────────────────────
2428with open(_path, newline="", encoding="utf-8-sig") as f:
2429 reader = csv.DictReader(f)
2430 rows = list(reader)
2431 header = reader.fieldnames or []
2432
2433if not header:
2434 print("ERROR: empty file or no header"); sys.exit(1)
2435
2436x_col = _x_col if _x_col else header[0]
2437y_col = _y_col if _y_col else ([c for c in header if c != x_col] or [header[-1]])[-1]
2438
2439try:
2440 xs = [float(r[x_col]) for r in rows]
2441 ys = [float(r[y_col]) for r in rows]
2442except (KeyError, ValueError) as e:
2443 print(f"ERROR: {{e}}"); sys.exit(1)
2444
2445n = len(xs)
2446if n < 2:
2447 print("ERROR: need at least 2 data points"); sys.exit(1)
2448
2449deg = max(1, min(_degree, 10))
2450
2451# ── Vandermonde least-squares via Gaussian elimination ────────────────────────
2452def poly_fit(xs, ys, deg):
2453 d = deg + 1
2454 ATA = [[0.0]*d for _ in range(d)]
2455 ATy = [0.0]*d
2456 for x, y in zip(xs, ys):
2457 pows = [x**k for k in range(d)]
2458 for i in range(d):
2459 ATy[i] += pows[i] * y
2460 for j in range(d):
2461 ATA[i][j] += pows[i] * pows[j]
2462 mat = [ATA[i][:] + [ATy[i]] for i in range(d)]
2463 for col in range(d):
2464 pivot = max(range(col, d), key=lambda r: abs(mat[r][col]))
2465 mat[col], mat[pivot] = mat[pivot], mat[col]
2466 if abs(mat[col][col]) < 1e-12:
2467 continue
2468 for row in range(col+1, d):
2469 f = mat[row][col] / mat[col][col]
2470 for k in range(col, d+1):
2471 mat[row][k] -= f * mat[col][k]
2472 coeffs = [0.0]*d
2473 for row in range(d-1, -1, -1):
2474 coeffs[row] = mat[row][d]
2475 for k in range(row+1, d):
2476 coeffs[row] -= mat[row][k] * coeffs[k]
2477 if abs(mat[row][row]) > 1e-12:
2478 coeffs[row] /= mat[row][row]
2479 return coeffs
2480
2481def poly_eval(coeffs, x):
2482 return sum(c * x**k for k, c in enumerate(coeffs))
2483
2484coeffs = poly_fit(xs, ys, deg)
2485
2486# ── metrics ───────────────────────────────────────────────────────────────────
2487y_mean = sum(ys) / n
2488ss_res = sum((y - poly_eval(coeffs, x))**2 for x, y in zip(xs, ys))
2489ss_tot = sum((y - y_mean)**2 for y in ys)
2490r2 = 1.0 - ss_res / ss_tot if ss_tot > 1e-12 else 1.0
2491rmse = math.sqrt(ss_res / n)
2492mae = sum(abs(y - poly_eval(coeffs, x)) for x, y in zip(xs, ys)) / n
2493
2494if deg == 1 and n > 1:
2495 sx = math.sqrt(sum((x - sum(xs)/n)**2 for x in xs) / (n-1))
2496 sy = math.sqrt(sum((y - y_mean)**2 for y in ys) / (n-1))
2497 sxy = sum((x - sum(xs)/n)*(y - y_mean) for x, y in zip(xs, ys)) / (n-1)
2498 pearson_r = sxy / (sx * sy) if sx * sy > 1e-12 else 0.0
2499else:
2500 pearson_r = None
2501
2502# ── header ────────────────────────────────────────────────────────────────────
2503print("=" * W)
2504label = "LINEAR" if deg == 1 else f"POLYNOMIAL (degree {{deg}})"
2505print(f" REGRESSION ANALYSIS — {{label}}")
2506print(f" X: {{x_col}} Y: {{y_col}} N: {{n}}")
2507print("=" * W)
2508terms = []
2509for k, c in enumerate(coeffs):
2510 if abs(c) < 1e-12: continue
2511 if k == 0: terms.append(f"{{c:.6g}}")
2512 elif k == 1: terms.append(f"{{c:+.6g}}*x")
2513 else: terms.append(f"{{c:+.6g}}*x^{{k}}")
2514print(" y = " + " ".join(terms) if terms else " y = 0")
2515print()
2516print(f" R2 : {{r2:.6f}}")
2517if pearson_r is not None:
2518 print(f" Pearson : {{pearson_r:.6f}}")
2519print(f" RMSE : {{rmse:.6g}}")
2520print(f" MAE : {{mae:.6g}}")
2521if r2 >= 0.95: qual = "Excellent fit (R2 >= 0.95)"
2522elif r2 >= 0.80: qual = "Good fit (R2 >= 0.80)"
2523elif r2 >= 0.60: qual = "Moderate fit (R2 >= 0.60)"
2524else: qual = "Weak fit (R2 < 0.60)"
2525print(f" Quality : {{qual}}")
2526print()
2527
2528# ── ASCII scatter + fit curve ─────────────────────────────────────────────────
2529ROWS, COLS = 16, W - 6
2530x_min, x_max = min(xs), max(xs)
2531cx_list = [x_min + (x_max - x_min)*i/(COLS-1) for i in range(COLS)] if COLS > 1 else [x_min]
2532cy_list = [poly_eval(coeffs, x) for x in cx_list]
2533y_min2 = min(list(ys) + cy_list)
2534y_max2 = max(list(ys) + cy_list)
2535
2536def to_col(x):
2537 return int((x - x_min) / (x_max - x_min) * (COLS-1)) if x_max != x_min else 0
2538
2539def to_row(y):
2540 return int((y_max2 - y) / (y_max2 - y_min2) * (ROWS-1)) if y_max2 != y_min2 else ROWS//2
2541
2542grid = [[" "]*COLS for _ in range(ROWS)]
2543for cx, cy in zip(cx_list, cy_list):
2544 r, c = to_row(cy), to_col(cx)
2545 if 0 <= r < ROWS and 0 <= c < COLS and grid[r][c] == " ":
2546 grid[r][c] = "-"
2547for x, y in zip(xs, ys):
2548 r, c = to_row(y), to_col(x)
2549 if 0 <= r < ROWS and 0 <= c < COLS:
2550 grid[r][c] = "*"
2551
2552print(" Scatter (* = data, - = fit curve):")
2553for i, row in enumerate(grid):
2554 if i == 0: lbl = f"{{y_max2:.3g}}"
2555 elif i == ROWS-1: lbl = f"{{y_min2:.3g}}"
2556 else: lbl = ""
2557 print(f" {{lbl:>8}} |{{''.join(row)}}")
2558print(f" {{' ':>8}} +" + "-"*COLS)
2559xl, xr = f"{{x_min:.3g}}", f"{{x_max:.3g}}"
2560pad_w = max(0, COLS - len(xl) - len(xr))
2561print(" " + " "*9 + xl + " "*pad_w + xr)
2562print()
2563
2564# ── residuals ─────────────────────────────────────────────────────────────────
2565resids = [y - poly_eval(coeffs, x) for x, y in zip(xs, ys)]
2566r_min, r_max = min(resids), max(resids)
2567rRs, rCs = 6, W - 6
2568grid_r = [[" "]*rCs for _ in range(rRs)]
2569rng = max(abs(r_min), abs(r_max), 1e-12)
2570mid_r = rRs // 2
2571for c in range(rCs):
2572 grid_r[mid_r][c] = "."
2573for x, res in zip(xs, resids):
2574 c = int((x - x_min)/(x_max - x_min)*(rCs-1)) if x_max != x_min else 0
2575 r = int((rng - res)/(2*rng)*(rRs-1))
2576 if 0 <= r < rRs and 0 <= c < rCs:
2577 grid_r[r][c] = "o"
2578print(" Residuals (o = data, . = zero line):")
2579for row in grid_r:
2580 print(" " + "".join(row))
2581print(" " + "-"*rCs)
2582print(f" Range: [{{r_min:.4g}}, {{r_max:.4g}}]")
2583print()
2584
2585# ── predictions ───────────────────────────────────────────────────────────────
2586if _pred_s:
2587 try:
2588 pred_xs = [float(v.strip()) for v in _pred_s.split(",")]
2589 print(" Predictions:")
2590 for px in pred_xs:
2591 py = poly_eval(coeffs, px)
2592 print(f" x = {{px:.6g}} => y = {{py:.6g}}")
2593 print()
2594 except ValueError:
2595 print(f" WARNING: bad --regression-predict value: {{_pred_s}}")
2596 print()
2597print("=" * W)
2598"####,
2599 hex_path = hex_path,
2600 hex_x_col = hex_x_col,
2601 hex_y_col = hex_y_col,
2602 hex_predict = hex_predict,
2603 deg = deg,
2604 );
2605
2606 let sandbox_args = serde_json::json!({
2607 "language": "python",
2608 "code": script,
2609 "timeout_seconds": 30
2610 });
2611 crate::tools::code_sandbox::execute(&sandbox_args).await
2612}