hematite/tools/
data_tools.rs

1// â”€â”€â”€ Data analysis tools â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2// Random sampling, correlation matrix, time-series analysis.
3// All use the Python code sandbox â€” no external deps, no model required.
4
5// â”€â”€ Random data sampling â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
6// Draws N rows (or a fraction) from a CSV/TSV/JSON/SQLite file.
7// Optionally splits into train/test sets.
8
9pub async fn sample_data(
10    file_path: &str,
11    n: usize,
12    fraction: f64,
13    seed: u64,
14    split: f64,
15    output: &str,
16) -> Result<String, String> {
17    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
18    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
19
20    let script = format!(
21        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, random
22
23_path   = bytes.fromhex("{hex_path}").decode().strip()
24_outdir = bytes.fromhex("{hex_output}").decode().strip()
25_n      = {n}
26_frac   = {fraction}
27_seed   = {seed}
28_split  = {split}
29
30random.seed(_seed)
31
32def _load(path):
33    ext = os.path.splitext(path)[1].lower().lstrip('.')
34    if ext in ('csv','tsv'):
35        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
36            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
37            return list(r), ext
38    elif ext == 'json':
39        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
40        rows = d if isinstance(d, list) else next(iter(d.values()), [])
41        return rows, 'json'
42    elif ext in ('db','sqlite','sqlite3'):
43        con = _sq.connect(path)
44        cur = con.cursor()
45        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
46        t = cur.fetchone()
47        if not t: return [], 'csv'
48        cur.execute("SELECT * FROM [%s]" % t[0])
49        cols2 = [d[0] for d in cur.description]
50        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
51        con.close()
52        return rows2, 'csv'
53    print("Unsupported format: "+ext, file=sys.stderr); sys.exit(1)
54
55rows, ext = _load(_path)
56total = len(rows)
57if total == 0:
58    print("No rows found."); sys.exit(0)
59
60if _frac > 0 and _frac <= 1:
61    k = max(1, int(total * _frac))
62elif _n > 0:
63    k = min(_n, total)
64else:
65    k = min(100, total)
66
67sample = random.sample(rows, k)
68
69def _write_csv(data, path):
70    if not data: return
71    fieldnames = list(data[0].keys())
72    with open(path, 'w', newline='', encoding='utf-8') as fh:
73        w = _csv.DictWriter(fh, fieldnames=fieldnames)
74        w.writeheader(); w.writerows(data)
75
76if _split > 0 and _split < 1 and _outdir:
77    split_n = int(k * _split)
78    train = sample[:split_n]
79    test  = sample[split_n:]
80    base = os.path.splitext(os.path.basename(_path))[0]
81    train_path = os.path.join(_outdir, base + '_train.csv')
82    test_path  = os.path.join(_outdir, base + '_test.csv')
83    os.makedirs(_outdir, exist_ok=True)
84    _write_csv(train, train_path)
85    _write_csv(test,  test_path)
86    print("Sampled %d rows (seed=%d) â†’ %d%% split" % (k, _seed, int(_split*100)))
87    print("Train: %d rows â†’ %s" % (len(train), train_path))
88    print("Test:  %d rows â†’ %s" % (len(test), test_path))
89elif _outdir:
90    base = os.path.splitext(os.path.basename(_path))[0]
91    out_path = os.path.join(_outdir, base + '_sample%d.csv' % k)
92    os.makedirs(_outdir, exist_ok=True)
93    _write_csv(sample, out_path)
94    print("Sampled %d / %d rows (seed=%d) â†’ %s" % (k, total, _seed, out_path))
95else:
96    # Print sample to stdout as CSV
97    fieldnames = list(sample[0].keys())
98    print(','.join(fieldnames))
99    for row in sample:
100        print(','.join(str(row.get(f,'')) for f in fieldnames))
101    print()
102    print("# Sampled %d / %d rows  (seed=%d)" % (k, total, _seed))
103    print("# Use --sample-output DIR to save to file, or --split 0.8 for train/test split")
104"####,
105        hex_path = hex_path,
106        hex_output = hex_output,
107        n = n,
108        fraction = fraction,
109        seed = seed,
110        split = split,
111    );
112
113    let sandbox_args = serde_json::json!({
114        "language": "python",
115        "code": script,
116        "timeout_seconds": 30
117    });
118    crate::tools::code_sandbox::execute(&sandbox_args).await
119}
120
121// â”€â”€ Correlation matrix â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
122
123pub async fn correlation_matrix(file_path: &str, method: &str) -> Result<String, String> {
124    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
125    let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
126
127    let script = format!(
128        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
129
130_path   = bytes.fromhex("{hex_path}").decode().strip()
131_method = bytes.fromhex("{hex_method}").decode().strip().lower() or "pearson"
132
133def _load(path):
134    ext = os.path.splitext(path)[1].lower().lstrip('.')
135    if ext in ('csv','tsv'):
136        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
137            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
138            return list(r)
139    elif ext == 'json':
140        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
141        return d if isinstance(d, list) else next(iter(d.values()), [])
142    elif ext in ('db','sqlite','sqlite3'):
143        con = _sq.connect(path)
144        cur = con.cursor()
145        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
146        t = cur.fetchone()
147        if not t: return []
148        cur.execute("SELECT * FROM [%s]" % t[0])
149        cols2 = [d[0] for d in cur.description]
150        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
151        con.close()
152        return rows2
153    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
154
155def _tf(v):
156    try: return float(str(v).replace(',','').strip())
157    except: return None
158
159rows = _load(_path)
160if not rows:
161    print("No rows found."); sys.exit(0)
162
163all_cols = list(rows[0].keys())
164# Keep columns that are at least 50% numeric
165num_cols = [c for c in all_cols
166            if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
167
168if len(num_cols) < 2:
169    print("Need at least 2 numeric columns. Found: %s" % ', '.join(num_cols or ['(none)']))
170    sys.exit(0)
171
172# Build column vectors (paired â€” both must be non-null for each row)
173def _col_vec(c): return [_tf(r.get(c,'')) for r in rows]
174
175vecs = {{c: _col_vec(c) for c in num_cols}}
176
177def _pearson(a, b):
178    pairs = [(x,y) for x,y in zip(a,b) if x is not None and y is not None]
179    n = len(pairs)
180    if n < 3: return float('nan')
181    mx = sum(x for x,_ in pairs)/n
182    my = sum(y for _,y in pairs)/n
183    num = sum((x-mx)*(y-my) for x,y in pairs)
184    dx  = math.sqrt(sum((x-mx)**2 for x,_ in pairs))
185    dy  = math.sqrt(sum((y-my)**2 for _,y in pairs))
186    return num/(dx*dy) if dx*dy else float('nan')
187
188def _spearman(a, b):
189    pairs = [(x,y) for x,y in zip(a,b) if x is not None and y is not None]
190    n = len(pairs)
191    if n < 3: return float('nan')
192    def _rank(vs):
193        sorted_vs = sorted(enumerate(vs), key=lambda x: x[1])
194        ranks = [0.0]*n
195        i = 0
196        while i < n:
197            j = i
198            while j < n-1 and sorted_vs[j+1][1] == sorted_vs[i][1]: j+=1
199            avg_rank = (i + j)/2 + 1
200            for k in range(i,j+1): ranks[sorted_vs[k][0]] = avg_rank
201            i = j+1
202        return ranks
203    ra = _rank([p[0] for p in pairs])
204    rb = _rank([p[1] for p in pairs])
205    return _pearson(ra, rb)
206
207corr_fn = _spearman if _method.startswith('sp') else _pearson
208
209nc = len(num_cols)
210matrix = [[corr_fn(vecs[a], vecs[b]) for b in num_cols] for a in num_cols]
211
212W = 64
213print("="*W)
214print(" Correlation Matrix (%s)  â€”  %s" % (_method.capitalize(), os.path.basename(_path)))
215print("-"*W)
216# Print header
217col_w = 8
218print("%*s" % (20, ""), end="")
219for c in num_cols:
220    print("  %*s" % (col_w, c[:col_w]), end="")
221print()
222print("-"*W)
223for i, ra in enumerate(num_cols):
224    print("%-20s" % ra[:20], end="")
225    for j in range(nc):
226        v = matrix[i][j]
227        if math.isnan(v): s = "   nan  "
228        else: s = " %7.4f" % v
229        # Highlight strong correlations
230        if i != j and not math.isnan(v) and abs(v) >= 0.7:
231            s = s + "*"
232        else:
233            s = s + " "
234        print(" %s" % s[:col_w+1], end="")
235    print()
236print("="*W)
237print("  * |r| >= 0.7  (strong correlation)")
238print()
239# Report top correlations
240pairs_flat = []
241for i in range(nc):
242    for j in range(i+1, nc):
243        v = matrix[i][j]
244        if not math.isnan(v):
245            pairs_flat.append((abs(v), v, num_cols[i], num_cols[j]))
246pairs_flat.sort(reverse=True)
247if pairs_flat:
248    print("Top correlations:")
249    for _abs, v, a, b in pairs_flat[:min(5, len(pairs_flat))]:
250        direction = "positive" if v > 0 else "negative"
251        strength = "strong" if abs(v)>=0.7 else "moderate" if abs(v)>=0.4 else "weak"
252        print("  %s  %-15s  Ã—  %-15s" % (("r=%+.4f"%v), a[:15], b[:15]))
253        print("         (%s %s)" % (strength, direction))
254"####,
255        hex_path = hex_path,
256        hex_method = hex_method,
257    );
258
259    let sandbox_args = serde_json::json!({
260        "language": "python",
261        "code": script,
262        "timeout_seconds": 30
263    });
264    crate::tools::code_sandbox::execute(&sandbox_args).await
265}
266
267// â”€â”€ Time-series basic analysis â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
268
269pub async fn timeseries_analyze(
270    file_path: &str,
271    date_col: &str,
272    value_col: &str,
273    window: usize,
274) -> Result<String, String> {
275    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
276    let hex_date_col: String = date_col.bytes().map(|b| format!("{:02x}", b)).collect();
277    let hex_val_col: String = value_col.bytes().map(|b| format!("{:02x}", b)).collect();
278
279    let script = format!(
280        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math, re
281
282_path     = bytes.fromhex("{hex_path}").decode().strip()
283_date_col = bytes.fromhex("{hex_date_col}").decode().strip()
284_val_col  = bytes.fromhex("{hex_val_col}").decode().strip()
285_window   = {window}
286if _window < 2: _window = 7
287
288def _load(path):
289    ext = os.path.splitext(path)[1].lower().lstrip('.')
290    if ext in ('csv','tsv'):
291        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
292            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
293            return list(r)
294    elif ext == 'json':
295        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
296        return d if isinstance(d, list) else next(iter(d.values()), [])
297    elif ext in ('db','sqlite','sqlite3'):
298        con = _sq.connect(path)
299        cur = con.cursor()
300        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
301        t = cur.fetchone()
302        if not t: return []
303        cur.execute("SELECT * FROM [%s]" % t[0])
304        cols2 = [d[0] for d in cur.description]
305        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
306        con.close()
307        return rows2
308    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
309
310def _tf(v):
311    try: return float(str(v).replace(',','').strip())
312    except: return None
313
314# Detect date columns if not specified
315def _looks_like_date(v):
316    return bool(re.match(r'\d{{4}}[-/]\d{{1,2}}[-/]\d{{1,2}}', str(v)))
317
318rows = _load(_path)
319if not rows:
320    print("No rows found."); sys.exit(0)
321
322all_cols = list(rows[0].keys())
323
324if not _date_col:
325    date_candidates = [c for c in all_cols if sum(1 for r in rows[:50] if _looks_like_date(r.get(c,''))) > 20]
326    _date_col = date_candidates[0] if date_candidates else all_cols[0]
327
328num_cols = [c for c in all_cols
329            if c != _date_col and sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
330
331if not _val_col and num_cols:
332    _val_col = num_cols[0]
333
334if not _val_col:
335    print("No numeric value column found. Use --ts-value COL to specify one."); sys.exit(0)
336
337# Extract and sort by date string (lexicographic â€” works for ISO dates)
338pairs = []
339for r in rows:
340    d = str(r.get(_date_col,'')).strip()
341    v = _tf(r.get(_val_col,''))
342    if d and v is not None:
343        pairs.append((d, v))
344pairs.sort(key=lambda p: p[0])
345
346if len(pairs) < 3:
347    print("Need at least 3 data points. Found: %d" % len(pairs)); sys.exit(0)
348
349dates = [p[0] for p in pairs]
350vals  = [p[1] for p in pairs]
351n = len(vals)
352
353# Rolling mean
354def _roll(vs, w):
355    return [sum(vs[max(0,i-w+1):i+1])/len(vs[max(0,i-w+1):i+1]) for i in range(len(vs))]
356
357roll_mean = _roll(vals, _window)
358
359# Linear trend (least squares)
360xs = list(range(n))
361xm = sum(xs)/n; ym = sum(vals)/n
362b  = sum((x-xm)*(y-ym) for x,y in zip(xs,vals)) / sum((x-xm)**2 for x in xs)
363a  = ym - b*xm
364trend_line = [a + b*x for x in xs]
365
366# Peak/valley detection
367peaks   = [i for i in range(1,n-1) if vals[i]>vals[i-1] and vals[i]>vals[i+1]]
368valleys = [i for i in range(1,n-1) if vals[i]<vals[i-1] and vals[i]<vals[i+1]]
369
370W = 64
371print("="*W)
372print(" Time-Series Analysis: %s" % os.path.basename(_path))
373print(" Date column:  %s    Value column: %s" % (_date_col, _val_col))
374print("-"*W)
375print("  Points:  %d   Range: %s â†’ %s" % (n, dates[0][:16], dates[-1][:16]))
376print("  Min:     %g  (at %s)" % (min(vals), dates[vals.index(min(vals))][:16]))
377print("  Max:     %g  (at %s)" % (max(vals), dates[vals.index(max(vals))][:16]))
378print("  Mean:    %.4f   Std: %.4f" % (ym, math.sqrt(sum((v-ym)**2 for v in vals)/n)))
379print("  Trend:   %.4f per step  (%s)" % (b, "â†‘ upward" if b>0 else "â†“ downward" if b<0 else "â†’ flat"))
380print("  Peaks:   %d local maxima   Valleys: %d local minima" % (len(peaks), len(valleys)))
381print("-"*W)
382print("  Rolling mean (window=%d):" % _window)
383# Compact sparkline using ASCII
384W2 = 50
385rng = max(vals) - min(vals) if max(vals) != min(vals) else 1
386bar_chars = " â–â–‚â–ƒâ–„â–…â–†â–‡â–ˆ"
387spark = ''.join(bar_chars[min(8,int((v-min(vals))/rng*8))] for v in vals)
388# Wrap
389for i in range(0, len(spark), W2):
390    chunk = spark[i:i+W2]
391    print("  [%s]  %sâ€“%s" % (chunk, dates[i][:10], dates[min(i+W2-1,n-1)][:10]))
392print("-"*W)
393# Last few rolling values
394print("  Recent rolling mean (%d-period):" % _window)
395for i in range(max(0,n-5), n):
396    flag = " â† latest" if i==n-1 else ""
397    print("    %-16s  value=%g   roll_mean=%.4f%s" % (dates[i][:16], vals[i], roll_mean[i], flag))
398print("="*W)
399"####,
400        hex_path = hex_path,
401        hex_date_col = hex_date_col,
402        hex_val_col = hex_val_col,
403        window = window,
404    );
405
406    let sandbox_args = serde_json::json!({
407        "language": "python",
408        "code": script,
409        "timeout_seconds": 30
410    });
411    crate::tools::code_sandbox::execute(&sandbox_args).await
412}
413
414// â”€â”€ Percentile / quantile report â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
415// Computes P1 P5 P10 P25 P50 P75 P90 P95 P99 for each numeric column
416// (or a specific column if col is non-empty).
417
418pub async fn percentile_report(file_path: &str, col: &str) -> Result<String, String> {
419    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
420    let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
421
422    let script = format!(
423        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
424
425_path   = bytes.fromhex("{hex_path}").decode().strip()
426_col    = bytes.fromhex("{hex_col}").decode().strip()
427
428def _load(path):
429    ext = os.path.splitext(path)[1].lower().lstrip('.')
430    if ext in ('csv','tsv'):
431        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
432            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
433            return list(r)
434    elif ext == 'json':
435        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
436        return d if isinstance(d, list) else next(iter(d.values()), [])
437    elif ext in ('db','sqlite','sqlite3'):
438        con = _sq.connect(path)
439        cur = con.cursor()
440        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
441        t = cur.fetchone()
442        if not t: return []
443        cur.execute("SELECT * FROM [%s]" % t[0])
444        cols2 = [d[0] for d in cur.description]
445        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
446        con.close()
447        return rows2
448    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
449
450def _tf(v):
451    try: return float(str(v).replace(',','').strip())
452    except: return None
453
454def _percentile(data, p):
455    if not data: return float('nan')
456    s = sorted(data)
457    n = len(s)
458    idx = (p/100.0) * (n-1)
459    lo = int(idx); hi = lo + 1
460    frac = idx - lo
461    if hi >= n: return s[-1]
462    return s[lo] + frac*(s[hi]-s[lo])
463
464rows = _load(_path)
465if not rows:
466    print("No rows found."); sys.exit(0)
467
468all_cols = list(rows[0].keys())
469if _col:
470    target_cols = [c for c in all_cols if c.lower() == _col.lower()]
471    if not target_cols:
472        print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols)))
473        sys.exit(1)
474else:
475    target_cols = [c for c in all_cols
476                   if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
477    if not target_cols:
478        print("No numeric columns found."); sys.exit(0)
479
480W = 72
481print("="*W)
482print(" Percentile Report â€” %s  (%d rows)" % (os.path.basename(_path), len(rows)))
483print("-"*W)
484hdr = "%-20s %8s %8s %8s %8s %8s %8s %8s" % ("Column", "P25", "P50", "P75", "P90", "P99", "Min", "Max")
485print(hdr)
486print("-"*W)
487for c in target_cols:
488    vals = [_tf(r.get(c,'')) for r in rows]
489    vals = [v for v in vals if v is not None]
490    if not vals: continue
491    p25=_percentile(vals,25); p50=_percentile(vals,50); p75=_percentile(vals,75)
492    p90=_percentile(vals,90); p99=_percentile(vals,99)
493    mn=min(vals); mx=max(vals)
494    def _f(v): return "%8g" % v
495    print("%-20s %s %s %s %s %s %s %s" % (c[:20], _f(p25), _f(p50), _f(p75), _f(p90), _f(p99), _f(mn), _f(mx)))
496print("="*W)
497print()
498if len(target_cols) == 1:
499    c = target_cols[0]
500    vals = [_tf(r.get(c,'')) for r in rows if _tf(r.get(c,'')) is not None]
501    print("Detailed percentile table for '%s':" % c)
502    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
503        v = _percentile(vals, p)
504        print("  P%-3d  %g" % (p, v))
505    mean = sum(vals)/len(vals)
506    std  = math.sqrt(sum((x-mean)**2 for x in vals)/len(vals))
507    iqr  = _percentile(vals,75) - _percentile(vals,25)
508    print()
509    print("  Mean: %g   Std: %g   IQR: %g   N: %d" % (mean, std, iqr, len(vals)))
510"####,
511        hex_path = hex_path,
512        hex_col = hex_col,
513    );
514
515    let sandbox_args = serde_json::json!({
516        "language": "python",
517        "code": script,
518        "timeout_seconds": 30
519    });
520    crate::tools::code_sandbox::execute(&sandbox_args).await
521}
522
523// â”€â”€ Pivot table â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
524// Groups rows by row_col Ã— col_col and aggregates value_col.
525// Agg: count (default), sum, mean, min, max.
526
527pub async fn pivot_table(
528    file_path: &str,
529    row_col: &str,
530    col_col: &str,
531    value_col: &str,
532    agg: &str,
533) -> Result<String, String> {
534    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
535    let hex_row_col: String = row_col.bytes().map(|b| format!("{:02x}", b)).collect();
536    let hex_col_col: String = col_col.bytes().map(|b| format!("{:02x}", b)).collect();
537    let hex_val_col: String = value_col.bytes().map(|b| format!("{:02x}", b)).collect();
538    let hex_agg: String = agg.bytes().map(|b| format!("{:02x}", b)).collect();
539
540    let script = format!(
541        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys
542
543_path    = bytes.fromhex("{hex_path}").decode().strip()
544_row_col = bytes.fromhex("{hex_row_col}").decode().strip()
545_col_col = bytes.fromhex("{hex_col_col}").decode().strip()
546_val_col = bytes.fromhex("{hex_val_col}").decode().strip()
547_agg     = bytes.fromhex("{hex_agg}").decode().strip().lower() or "count"
548
549def _load(path):
550    ext = os.path.splitext(path)[1].lower().lstrip('.')
551    if ext in ('csv','tsv'):
552        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
553            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
554            return list(r)
555    elif ext == 'json':
556        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
557        return d if isinstance(d, list) else next(iter(d.values()), [])
558    elif ext in ('db','sqlite','sqlite3'):
559        con = _sq.connect(path)
560        cur = con.cursor()
561        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
562        t = cur.fetchone()
563        if not t: return []
564        cur.execute("SELECT * FROM [%s]" % t[0])
565        cols2 = [d[0] for d in cur.description]
566        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
567        con.close()
568        return rows2
569    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
570
571def _tf(v):
572    try: return float(str(v).replace(',','').strip())
573    except: return None
574
575rows = _load(_path)
576if not rows:
577    print("No rows found."); sys.exit(0)
578
579all_cols = list(rows[0].keys())
580
581if not _row_col:
582    cat_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is None) > len(rows)*0.3]
583    _row_col = cat_cols[0] if cat_cols else all_cols[0]
584if not _col_col:
585    cat_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is None) > len(rows)*0.3]
586    _col_col = cat_cols[1] if len(cat_cols) > 1 else (all_cols[1] if len(all_cols) > 1 else _row_col)
587if not _val_col and _agg != 'count':
588    num_cols = [c for c in all_cols if c not in (_row_col, _col_col) and
589                sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
590    _val_col = num_cols[0] if num_cols else ''
591
592data = {{}}
593for r in rows:
594    rk = str(r.get(_row_col, '')).strip()
595    ck = str(r.get(_col_col, '')).strip()
596    v  = _tf(r.get(_val_col, '')) if _val_col else 1.0
597    if rk not in data: data[rk] = {{}}
598    if ck not in data[rk]: data[rk][ck] = []
599    if v is not None: data[rk][ck].append(v)
600
601row_keys = sorted(data.keys())
602col_keys = sorted({{ck for rv in data.values() for ck in rv}})
603
604def _agg_fn(vals):
605    if not vals: return ''
606    if _agg == 'count':  return str(len(vals))
607    if _agg == 'sum':    return "%.4g" % sum(vals)
608    if _agg == 'mean':   return "%.4g" % (sum(vals)/len(vals))
609    if _agg == 'min':    return "%.4g" % min(vals)
610    if _agg == 'max':    return "%.4g" % max(vals)
611    return str(len(vals))
612
613CW = 10
614RW = 16
615print("Pivot: %s x %s  (%s of %s)  |  rows=%d  cols=%d" % (
616    _row_col, _col_col, _agg, _val_col or 'rows', len(row_keys), len(col_keys)))
617print()
618print("%-*s" % (RW, _row_col[:RW]), end="")
619for ck in col_keys: print("  %-*s" % (CW, ck[:CW]), end="")
620print()
621print("-" * (RW + len(col_keys)*(CW+2)))
622for rk in row_keys:
623    print("%-*s" % (RW, rk[:RW]), end="")
624    for ck in col_keys:
625        vals = data.get(rk, {{}}).get(ck, [])
626        cell = _agg_fn(vals) if vals else '-'
627        print("  %-*s" % (CW, cell[:CW]), end="")
628    print()
629"####,
630        hex_path = hex_path,
631        hex_row_col = hex_row_col,
632        hex_col_col = hex_col_col,
633        hex_val_col = hex_val_col,
634        hex_agg = hex_agg,
635    );
636
637    let sandbox_args = serde_json::json!({
638        "language": "python",
639        "code": script,
640        "timeout_seconds": 30
641    });
642    crate::tools::code_sandbox::execute(&sandbox_args).await
643}
644
645// â”€â”€ Multivariate linear regression â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
646// OLS via normal equations: Î² = (Xáµ€X)â»Â¹Xáµ€y
647// Supports one or more predictor columns. Reports coefficients, RÂ², RMSE,
648// and predicted vs actual for first 10 rows.
649
650pub async fn linear_regression(
651    file_path: &str,
652    predictors: &[&str],
653    target: &str,
654) -> Result<String, String> {
655    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
656    let hex_target: String = target.bytes().map(|b| format!("{:02x}", b)).collect();
657    let preds_joined = predictors.join("\n");
658    let hex_preds: String = preds_joined.bytes().map(|b| format!("{:02x}", b)).collect();
659
660    let script = format!(
661        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
662
663_path   = bytes.fromhex("{hex_path}").decode().strip()
664_target = bytes.fromhex("{hex_target}").decode().strip()
665_preds_raw = bytes.fromhex("{hex_preds}").decode().strip()
666_preds  = [p.strip() for p in _preds_raw.split('\n') if p.strip()] if _preds_raw else []
667
668def _load(path):
669    ext = os.path.splitext(path)[1].lower().lstrip('.')
670    if ext in ('csv','tsv'):
671        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
672            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
673            return list(r)
674    elif ext == 'json':
675        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
676        return d if isinstance(d, list) else next(iter(d.values()), [])
677    elif ext in ('db','sqlite','sqlite3'):
678        con = _sq.connect(path)
679        cur = con.cursor()
680        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
681        t = cur.fetchone()
682        if not t: return []
683        cur.execute("SELECT * FROM [%s]" % t[0])
684        cols2 = [d[0] for d in cur.description]
685        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
686        con.close()
687        return rows2
688    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
689
690def _tf(v):
691    try: return float(str(v).replace(',','').strip())
692    except: return None
693
694rows = _load(_path)
695if not rows:
696    print("No rows found."); sys.exit(0)
697
698all_cols = list(rows[0].keys())
699num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
700
701if not _target:
702    _target = num_cols[-1] if num_cols else ''
703if not _preds:
704    _preds = [c for c in num_cols if c != _target]
705
706if not _target:
707    print("No target column. Use --regression-target COL"); sys.exit(1)
708if not _preds:
709    print("No predictor columns. Use --regression-predictors COL1,COL2,..."); sys.exit(1)
710
711valid = []
712for r in rows:
713    y = _tf(r.get(_target,''))
714    xs = [_tf(r.get(pp,'')) for pp in _preds]
715    if y is not None and all(x is not None for x in xs):
716        valid.append((xs, y))
717
718n = len(valid)
719pp = len(_preds)
720if n < pp + 2:
721    print("Not enough valid rows (%d) for %d predictors." % (n, pp)); sys.exit(1)
722
723X = [[1.0] + row[0] for row in valid]
724y = [row[1] for row in valid]
725
726def _mat_mul_sq(A, B):
727    ra,ca = len(A),len(A[0]); cb = len(B[0])
728    return [[sum(A[i][k]*B[k][j] for k in range(ca)) for j in range(cb)] for i in range(ra)]
729
730def _mat_T(A):
731    return [[A[i][j] for i in range(len(A))] for j in range(len(A[0]))]
732
733def _lu_solve(A, b):
734    n2 = len(A)
735    M = [row[:] + [b[i]] for i,row in enumerate(A)]
736    for col in range(n2):
737        pivot = max(range(col,n2), key=lambda r2: abs(M[r2][col]))
738        M[col],M[pivot] = M[pivot],M[col]
739        if abs(M[col][col]) < 1e-12: return None
740        for row in range(col+1,n2):
741            f = M[row][col]/M[col][col]
742            for j in range(col,n2+1): M[row][j] -= f*M[col][j]
743    x2 = [0.0]*n2
744    for i in range(n2-1,-1,-1):
745        x2[i] = (M[i][n2] - sum(M[i][j]*x2[j] for j in range(i+1,n2))) / M[i][i]
746    return x2
747
748Xt = _mat_T(X)
749XtX_sq = _mat_mul_sq(Xt, X)
750Xty = [sum(Xt[i][k]*y[k] for k in range(n)) for i in range(pp+1)]
751beta = _lu_solve(XtX_sq, Xty)
752if beta is None:
753    print("Matrix is singular â€” check for collinear predictors."); sys.exit(1)
754
755preds_vals = [sum(beta[j]*X[i][j] for j in range(pp+1)) for i in range(n)]
756residuals  = [y[i]-preds_vals[i] for i in range(n)]
757ss_res = sum(r**2 for r in residuals)
758ym = sum(y)/n
759ss_tot = sum((v-ym)**2 for v in y)
760r2 = 1 - ss_res/ss_tot if ss_tot else 0
761rmse = math.sqrt(ss_res/n)
762adj_r2 = 1 - (1-r2)*(n-1)/(n-pp-1) if n > pp+1 else float('nan')
763
764W = 64
765print("="*W)
766print(" Linear Regression â€” %s" % os.path.basename(_path))
767print(" Target: %-20s   N=%d   Predictors=%d" % (_target, n, pp))
768print("-"*W)
769print("  Coefficients:")
770print("    %-20s  %12.6f" % ("(intercept)", beta[0]))
771for i2,c2 in enumerate(_preds):
772    print("    %-20s  %12.6f" % (c2[:20], beta[i2+1]))
773print("-"*W)
774print("  RÂ²         = %.6f" % r2)
775print("  Adj. RÂ²    = %.6f" % adj_r2)
776print("  RMSE       = %.6f" % rmse)
777print("  Residuals  min=%.4g  max=%.4g  mean=%.4g" % (min(residuals), max(residuals), sum(residuals)/n))
778print("-"*W)
779terms = ["%.4g" % beta[0]]
780for i2,c2 in enumerate(_preds):
781    sign = "+" if beta[i2+1] >= 0 else "-"
782    terms.append("%s %.4g*%s" % (sign, abs(beta[i2+1]), c2))
783print("  Equation: %s = %s" % (_target, " ".join(terms)))
784print("="*W)
785print()
786print("  First 10 predictions vs actual:")
787print("  %-10s  %-10s  %-10s" % ("Actual", "Predicted", "Residual"))
788for i3 in range(min(10,n)):
789    print("  %-10.4g  %-10.4g  %-10.4g" % (y[i3], preds_vals[i3], residuals[i3]))
790"####,
791        hex_path = hex_path,
792        hex_target = hex_target,
793        hex_preds = hex_preds,
794    );
795
796    let sandbox_args = serde_json::json!({
797        "language": "python",
798        "code": script,
799        "timeout_seconds": 30
800    });
801    crate::tools::code_sandbox::execute(&sandbox_args).await
802}
803
804// â”€â”€ Outlier detection â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
805// IQR (1.5Ã— fence) and Z-score (|z|>3) detection.
806// Optional: output clean CSV with outliers removed.
807
808pub async fn detect_outliers(file_path: &str, col: &str, output: &str) -> Result<String, String> {
809    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
810    let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
811    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
812
813    let script = format!(
814        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
815
816_path   = bytes.fromhex("{hex_path}").decode().strip()
817_col    = bytes.fromhex("{hex_col}").decode().strip()
818_output = bytes.fromhex("{hex_output}").decode().strip()
819
820def _load(path):
821    ext = os.path.splitext(path)[1].lower().lstrip('.')
822    if ext in ('csv','tsv'):
823        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
824            rd = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
825            data = list(rd)
826            fns = list(rd.fieldnames or [])
827            return data, fns
828    elif ext == 'json':
829        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
830        rows2 = d if isinstance(d, list) else next(iter(d.values()), [])
831        fns2 = list(rows2[0].keys()) if rows2 else []
832        return rows2, fns2
833    elif ext in ('db','sqlite','sqlite3'):
834        con = _sq.connect(path)
835        cur = con.cursor()
836        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
837        t = cur.fetchone()
838        if not t: return [], []
839        cur.execute("SELECT * FROM [%s]" % t[0])
840        cols2 = [d2[0] for d2 in cur.description]
841        rows3 = [dict(zip(cols2, r)) for r in cur.fetchall()]
842        con.close()
843        return rows3, cols2
844    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
845
846def _tf(v):
847    try: return float(str(v).replace(',','').strip())
848    except: return None
849
850def _pct(data, p):
851    s = sorted(data); n = len(s)
852    idx = (p/100.0)*(n-1); lo = int(idx); hi = lo+1; frac = idx-lo
853    return s[-1] if hi >= n else s[lo]+frac*(s[hi]-s[lo])
854
855rows, fieldnames = _load(_path)
856if not rows:
857    print("No rows found."); sys.exit(0)
858
859all_cols = list(rows[0].keys())
860if _col:
861    target_cols = [c for c in all_cols if c.lower() == _col.lower()]
862    if not target_cols:
863        print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols))); sys.exit(1)
864else:
865    target_cols = [c for c in all_cols
866                   if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
867
868W = 68
869print("="*W)
870print(" Outlier Detection â€” %s  (%d rows)" % (os.path.basename(_path), len(rows)))
871print("-"*W)
872
873outlier_row_indices = set()
874for c in target_cols:
875    valid = [(i, _tf(r.get(c,''))) for i,r in enumerate(rows)]
876    valid = [(i,v) for i,v in valid if v is not None]
877    if len(valid) < 4: continue
878    vs = [v for _,v in valid]
879    mean = sum(vs)/len(vs)
880    std  = math.sqrt(sum((x-mean)**2 for x in vs)/len(vs))
881    q1 = _pct(vs,25); q3 = _pct(vs,75); iqr = q3-q1
882    lo_fence = q1 - 1.5*iqr; hi_fence = q3 + 1.5*iqr
883    iqr_out = [(i,v) for i,v in valid if v < lo_fence or v > hi_fence]
884    z_out   = [(i,v) for i,v in valid if std > 0 and abs((v-mean)/std) > 3]
885    print()
886    print("  Column: %s  (n=%d  mean=%.4g  std=%.4g)" % (c, len(vs), mean, std))
887    print("  IQR fence: [%.4g, %.4g]    IQR outliers: %d" % (lo_fence, hi_fence, len(iqr_out)))
888    print("  Z-score |z|>3:  Z outliers: %d" % len(z_out))
889    if iqr_out:
890        print("  IQR outliers (row, value):")
891        for i,v in iqr_out[:10]:
892            z = (v-mean)/std if std > 0 else float('nan')
893            print("    row %-5d  value=%-12g  z=%.3f" % (i+1, v, z))
894            outlier_row_indices.add(i)
895        if len(iqr_out) > 10:
896            print("    ... and %d more" % (len(iqr_out)-10))
897    else:
898        print("  No IQR outliers found.")
899
900print()
901print("="*W)
902print("  Total outlier rows (IQR): %d / %d  (%.1f%%)" % (
903    len(outlier_row_indices), len(rows), 100*len(outlier_row_indices)/max(1,len(rows))))
904
905if _output and outlier_row_indices:
906    clean = [r for i,r in enumerate(rows) if i not in outlier_row_indices]
907    fns2 = fieldnames if fieldnames else (list(clean[0].keys()) if clean else [])
908    with open(_output, 'w', newline='', encoding='utf-8') as fh:
909        w = _csv.DictWriter(fh, fieldnames=fns2)
910        w.writeheader(); w.writerows(clean)
911    print("  Clean data (%d rows) saved to: %s" % (len(clean), _output))
912elif _output:
913    print("  No outliers to remove â€” output file not written.")
914"####,
915        hex_path = hex_path,
916        hex_col = hex_col,
917        hex_output = hex_output,
918    );
919
920    let sandbox_args = serde_json::json!({
921        "language": "python",
922        "code": script,
923        "timeout_seconds": 30
924    });
925    crate::tools::code_sandbox::execute(&sandbox_args).await
926}
927
928// â”€â”€ SVG chart generator â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
929// Produces a self-contained SVG file â€” no matplotlib, no external deps.
930// Chart types: line (default), scatter, bar, histogram.
931// Reads CSV/TSV/JSON/SQLite. Auto-opens with --open flag (handled in main.rs).
932
933pub async fn plot_chart(
934    file_path: &str,
935    x_col: &str,
936    y_col: &str,
937    chart_type: &str,
938    title: &str,
939    output: &str,
940) -> Result<String, String> {
941    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
942    let hex_x: String = x_col.bytes().map(|b| format!("{:02x}", b)).collect();
943    let hex_y: String = y_col.bytes().map(|b| format!("{:02x}", b)).collect();
944    let hex_type: String = chart_type.bytes().map(|b| format!("{:02x}", b)).collect();
945    let hex_title: String = title.bytes().map(|b| format!("{:02x}", b)).collect();
946    let hex_out: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
947
948    let script = format!(
949        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
950
951_path  = bytes.fromhex("{hex_path}").decode().strip()
952_xcol  = bytes.fromhex("{hex_x}").decode().strip()
953_ycol  = bytes.fromhex("{hex_y}").decode().strip()
954_ctype = bytes.fromhex("{hex_type}").decode().strip().lower() or "line"
955_title = bytes.fromhex("{hex_title}").decode().strip()
956_out   = bytes.fromhex("{hex_out}").decode().strip()
957
958def _load(path):
959    ext = os.path.splitext(path)[1].lower().lstrip('.')
960    if ext in ('csv','tsv'):
961        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
962            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
963            return list(r)
964    elif ext == 'json':
965        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
966        return d if isinstance(d, list) else next(iter(d.values()), [])
967    elif ext in ('db','sqlite','sqlite3'):
968        con = _sq.connect(path)
969        cur = con.cursor()
970        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
971        t = cur.fetchone()
972        if not t: return []
973        cur.execute("SELECT * FROM [%s]" % t[0])
974        cols2 = [d2[0] for d2 in cur.description]
975        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
976        con.close()
977        return rows2
978    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
979
980def _tf(v):
981    try: return float(str(v).replace(',','').strip())
982    except: return None
983
984rows = _load(_path)
985if not rows:
986    print("No rows found."); sys.exit(0)
987
988all_cols = list(rows[0].keys())
989num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
990
991if not _xcol:
992    _xcol = all_cols[0]
993if not _ycol:
994    _ycol = num_cols[0] if num_cols else (all_cols[1] if len(all_cols)>1 else all_cols[0])
995
996if not _title:
997    _title = "%s â€” %s vs %s" % (os.path.basename(_path), _xcol, _ycol)
998
999if not _out:
1000    base = os.path.splitext(_path)[0]
1001    _out = base + "_plot.svg"
1002
1003# Extract data points
1004def _to_num_or_str(v): return _tf(v) if _tf(v) is not None else str(v).strip()
1005
1006raw_pairs = [(_to_num_or_str(r.get(_xcol,'')), _tf(r.get(_ycol,''))) for r in rows]
1007pairs = [(x,y) for x,y in raw_pairs if y is not None]
1008
1009if not pairs:
1010    print("No plottable data in columns '%s' vs '%s'." % (_xcol, _ycol)); sys.exit(0)
1011
1012# For bar/histogram: bucket string x values
1013xs_raw = [p[0] for p in pairs]
1014ys = [p[1] for p in pairs]
1015
1016# SVG dimensions
1017W = 800; H = 500; PAD = 70; TW = W-2*PAD; TH = H-2*PAD
1018
1019def _esc(s): return str(s).replace('&','&amp;').replace('<','&lt;').replace('>','&gt;').replace('"','&quot;')
1020
1021def _scale(vals, lo, hi, out_lo, out_hi):
1022    if hi == lo: return [out_lo + (out_hi-out_lo)/2 for _ in vals]
1023    return [out_lo + (v-lo)/(hi-lo)*(out_hi-out_lo) for v in vals]
1024
1025svg_parts = []
1026svg_parts.append('<?xml version="1.0" encoding="UTF-8"?>')
1027svg_parts.append('<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" style="background:#1e1e2e">' % (W, H))
1028svg_parts.append('<style>text{{font-family:monospace;fill:#cdd6f4}}line{{stroke:#45475a}}circle{{opacity:0.8}}</style>')
1029# Title
1030svg_parts.append('<text x="%d" y="28" font-size="15" text-anchor="middle" font-weight="bold">%s</text>' % (W//2, _esc(_title)))
1031# Axes
1032svg_parts.append('<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#89b4fa" stroke-width="1.5"/>' % (PAD, PAD, PAD, H-PAD))
1033svg_parts.append('<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#89b4fa" stroke-width="1.5"/>' % (PAD, H-PAD, W-PAD, H-PAD))
1034# Axis labels
1035svg_parts.append('<text x="%d" y="%d" font-size="12" text-anchor="middle">%s</text>' % (W//2, H-10, _esc(_xcol)))
1036svg_parts.append('<text x="15" y="%d" font-size="12" text-anchor="middle" transform="rotate(-90,15,%d)">%s</text>' % (H//2, H//2, _esc(_ycol)))
1037
1038if _ctype == 'bar' or (not all(isinstance(x, (int,float)) for x in xs_raw)):
1039    # Bar chart: group by string x
1040    from collections import OrderedDict
1041    groups = OrderedDict()
1042    for x,y in pairs:
1043        k = str(x)
1044        groups.setdefault(k, []).append(y)
1045    labels = list(groups.keys())[:30]
1046    bar_vals = [sum(groups[k])/len(groups[k]) for k in labels]
1047    bw = TW / max(len(labels),1) * 0.8
1048    x_positions = [PAD + (i+0.5) * TW / max(len(labels),1) for i in range(len(labels))]
1049    ymin = min(0, min(bar_vals)); ymax = max(bar_vals) if bar_vals else 1
1050    if ymin == ymax: ymax = ymin + 1
1051    def _sy(v): return H-PAD - (v-ymin)/(ymax-ymin)*TH
1052    for i,(lbl,v) in enumerate(zip(labels,bar_vals)):
1053        x0 = x_positions[i] - bw/2
1054        y0 = _sy(max(v,0)); y1 = _sy(min(v,0))
1055        bar_h = abs(y0-y1)
1056        svg_parts.append('<rect x="%.1f" y="%.1f" width="%.1f" height="%.1f" fill="#89b4fa" rx="2"/>' % (x0, min(y0,y1), bw, max(bar_h,1)))
1057        if len(labels) <= 15:
1058            svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle" transform="rotate(-45,%.1f,%d)">%s</text>' % (x_positions[i], H-PAD+14, x_positions[i], H-PAD+14, _esc(lbl[:12])))
1059    # y-axis ticks
1060    for tick in [ymin, (ymin+ymax)/2, ymax]:
1061        sy = _sy(tick)
1062        svg_parts.append('<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#45475a"/>' % (PAD, sy, W-PAD, sy))
1063        svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%.3g</text>' % (PAD-4, sy+4, tick))
1064
1065elif _ctype == 'histogram':
1066    n_bins = min(30, max(5, int(math.sqrt(len(ys)))))
1067    ymin_h = min(ys); ymax_h = max(ys)
1068    if ymin_h == ymax_h: ymax_h = ymin_h + 1
1069    bin_w = (ymax_h-ymin_h)/n_bins
1070    counts = [0]*n_bins
1071    for v in ys:
1072        idx = min(int((v-ymin_h)/bin_w), n_bins-1)
1073        counts[idx] += 1
1074    bar_w = TW/n_bins
1075    cmax = max(counts) if counts else 1
1076    for i,c in enumerate(counts):
1077        x0 = PAD + i*bar_w
1078        bar_h2 = c/cmax * TH
1079        y0 = H-PAD-bar_h2
1080        svg_parts.append('<rect x="%.1f" y="%.1f" width="%.1f" height="%.1f" fill="#a6e3a1" rx="1"/>' % (x0, y0, bar_w-1, bar_h2))
1081    for i in range(5):
1082        tick_v = ymin_h + i*(ymax_h-ymin_h)/4
1083        sx = PAD + (tick_v-ymin_h)/(ymax_h-ymin_h)*TW
1084        svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle">%.3g</text>' % (sx, H-PAD+14, tick_v))
1085    for i in range(5):
1086        tick_c = i*cmax/4
1087        sy = H-PAD - tick_c/cmax*TH
1088        svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%d</text>' % (PAD-4, sy+4, int(tick_c)))
1089
1090else:
1091    # Line or scatter: numeric x required
1092    xs_num = [p[0] if isinstance(p[0],(int,float)) else i for i,p in enumerate(pairs)]
1093    xmin = min(xs_num); xmax = max(xs_num)
1094    ymin2 = min(ys); ymax2 = max(ys)
1095    if xmin == xmax: xmax = xmin+1
1096    if ymin2 == ymax2: ymax2 = ymin2+1
1097    def _sx2(v): return PAD + (v-xmin)/(xmax-xmin)*TW
1098    def _sy2(v): return H-PAD - (v-ymin2)/(ymax2-ymin2)*TH
1099    # Grid
1100    for i in range(5):
1101        gx = PAD + i*TW/4; gy = H-PAD - i*TH/4
1102        svg_parts.append('<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#313244" stroke-dasharray="4"/>' % (gx,PAD,gx,H-PAD))
1103        svg_parts.append('<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#313244" stroke-dasharray="4"/>' % (PAD,gy,W-PAD,gy))
1104    # x ticks
1105    for i in range(5):
1106        tv = xmin + i*(xmax-xmin)/4
1107        sx2 = _sx2(tv)
1108        svg_parts.append('<text x="%.1f" y="%d" font-size="10" text-anchor="middle">%.3g</text>' % (sx2, H-PAD+14, tv))
1109    # y ticks
1110    for i in range(5):
1111        tv = ymin2 + i*(ymax2-ymin2)/4
1112        sy2 = _sy2(tv)
1113        svg_parts.append('<text x="%d" y="%.1f" font-size="10" text-anchor="end">%.3g</text>' % (PAD-4, sy2+4, tv))
1114    pts = list(zip(xs_num, ys))
1115    pts.sort(key=lambda p: p[0])
1116    sx_list = [_sx2(x) for x,_ in pts]
1117    sy_list = [_sy2(y) for _,y in pts]
1118    if _ctype != 'scatter' and len(pts) > 1:
1119        path_d = "M %.1f %.1f " % (sx_list[0], sy_list[0])
1120        path_d += " ".join("L %.1f %.1f" % (sx_list[i], sy_list[i]) for i in range(1,len(pts)))
1121        svg_parts.append('<path d="%s" fill="none" stroke="#89b4fa" stroke-width="2"/>' % path_d)
1122    for i in range(len(pts)):
1123        svg_parts.append('<circle cx="%.1f" cy="%.1f" r="3" fill="#cba6f7"/>' % (sx_list[i], sy_list[i]))
1124
1125svg_parts.append('</svg>')
1126svg_content = '\n'.join(svg_parts)
1127
1128with open(_out, 'w', encoding='utf-8') as fh:
1129    fh.write(svg_content)
1130
1131print("Chart saved: %s  (%d data points  type=%s)" % (_out, len(pairs), _ctype))
1132print("Open in any browser to view.")
1133"####,
1134        hex_path = hex_path,
1135        hex_x = hex_x,
1136        hex_y = hex_y,
1137        hex_type = hex_type,
1138        hex_title = hex_title,
1139        hex_out = hex_out,
1140    );
1141
1142    let sandbox_args = serde_json::json!({
1143        "language": "python",
1144        "code": script,
1145        "timeout_seconds": 30
1146    });
1147    crate::tools::code_sandbox::execute(&sandbox_args).await
1148}
1149
1150// â”€â”€ Discrete Fourier Transform / frequency analysis â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1151// Pure-Python DFT. Reads numeric column, reports top-N frequency components.
1152
1153pub async fn fourier_analysis(
1154    file_path: &str,
1155    col: &str,
1156    top_n: usize,
1157    sample_rate: f64,
1158) -> Result<String, String> {
1159    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1160    let hex_col: String = col.bytes().map(|b| format!("{:02x}", b)).collect();
1161
1162    let script = format!(
1163        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
1164
1165_path        = bytes.fromhex("{hex_path}").decode().strip()
1166_col         = bytes.fromhex("{hex_col}").decode().strip()
1167_top_n       = {top_n}
1168_sample_rate = {sample_rate}
1169
1170def _load(path):
1171    ext = os.path.splitext(path)[1].lower().lstrip('.')
1172    if ext in ('csv','tsv'):
1173        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1174            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1175            return list(r)
1176    elif ext == 'json':
1177        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1178        return d if isinstance(d, list) else next(iter(d.values()), [])
1179    elif ext in ('db','sqlite','sqlite3'):
1180        con = _sq.connect(path)
1181        cur = con.cursor()
1182        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1183        t = cur.fetchone()
1184        if not t: return []
1185        cur.execute("SELECT * FROM [%s]" % t[0])
1186        cols2 = [d2[0] for d2 in cur.description]
1187        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1188        con.close()
1189        return rows2
1190    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1191
1192def _tf(v):
1193    try: return float(str(v).replace(',','').strip())
1194    except: return None
1195
1196rows = _load(_path)
1197if not rows:
1198    print("No rows found."); sys.exit(0)
1199
1200all_cols = list(rows[0].keys())
1201if _col:
1202    target_col = next((c for c in all_cols if c.lower() == _col.lower()), None)
1203    if not target_col:
1204        print("Column '%s' not found. Available: %s" % (_col, ', '.join(all_cols))); sys.exit(1)
1205else:
1206    num_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1207    target_col = num_cols[0] if num_cols else None
1208    if not target_col:
1209        print("No numeric column found."); sys.exit(0)
1210
1211vals = [_tf(r.get(target_col,'')) for r in rows]
1212vals = [v for v in vals if v is not None]
1213n = len(vals)
1214if n < 4:
1215    print("Need at least 4 data points for DFT."); sys.exit(0)
1216
1217mean = sum(vals)/n
1218x = [v - mean for v in vals]
1219
1220if n > 512:
1221    x = x[:512]; n = 512
1222    print("Note: DFT computed on first 512 points (large dataset).")
1223
1224def dft(x2):
1225    N = len(x2)
1226    result = []
1227    for k in range(N//2 + 1):
1228        re = sum(x2[t]*math.cos(2*math.pi*k*t/N) for t in range(N))
1229        im = sum(x2[t]*math.sin(2*math.pi*k*t/N) for t in range(N))
1230        amp = math.sqrt(re**2 + im**2) / N
1231        phase = math.atan2(-im, re)
1232        result.append((k, amp, phase))
1233    return result
1234
1235spectrum = dft(x)
1236spectrum_sorted = sorted(spectrum[1:], key=lambda t: -t[1])
1237
1238sr = _sample_rate if _sample_rate > 0 else 1.0
1239top = spectrum_sorted[:min(_top_n, len(spectrum_sorted))]
1240
1241W = 64
1242print("="*W)
1243print(" Fourier / Frequency Analysis: %s" % os.path.basename(_path))
1244print(" Column: %-20s   N=%d   Sample rate: %g Hz" % (target_col, n, sr))
1245print("-"*W)
1246print("  DC component (mean offset): %.6f" % spectrum[0][1])
1247print()
1248print("  %-5s  %-12s  %-12s  %-10s  %-10s" % ("Rank", "Freq (Hz)", "Period", "Amplitude", "Phase (deg)"))
1249print("  " + "-"*58)
1250for i,(k,amp,phase) in enumerate(top):
1251    freq = k * sr / n
1252    period = (1.0/freq) if freq > 0 else float('inf')
1253    period_str = "%.4g" % period if period < 1e10 else "inf"
1254    print("  %-5d  %-12.6g  %-12s  %-10.6f  %-10.2f" % (
1255        i+1, freq, period_str, amp, math.degrees(phase)))
1256total_power = sum(t[1]**2 for t in spectrum[1:])
1257top_power   = sum(t[1]**2 for t in top)
1258print()
1259print("  Top %d components contain %.1f%% of signal power." % (len(top), 100*top_power/max(total_power,1e-30)))
1260print("="*W)
1261"####,
1262        hex_path = hex_path,
1263        hex_col = hex_col,
1264        top_n = top_n,
1265        sample_rate = sample_rate,
1266    );
1267
1268    let sandbox_args = serde_json::json!({
1269        "language": "python",
1270        "code": script,
1271        "timeout_seconds": 60
1272    });
1273    crate::tools::code_sandbox::execute(&sandbox_args).await
1274}
1275
1276// â”€â”€ k-means clustering â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1277// Lloyd's algorithm, pure Python, no sklearn.
1278// Reports cluster centroids, sizes, inertia, and per-row assignments.
1279
1280pub async fn cluster_kmeans(
1281    file_path: &str,
1282    k: usize,
1283    cols: &[&str],
1284    max_iter: usize,
1285    output: &str,
1286) -> Result<String, String> {
1287    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1288    let cols_joined = cols.join("\n");
1289    let hex_cols: String = cols_joined.bytes().map(|b| format!("{:02x}", b)).collect();
1290    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1291
1292    let script = format!(
1293        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math, random
1294
1295_path    = bytes.fromhex("{hex_path}").decode().strip()
1296_cols_raw = bytes.fromhex("{hex_cols}").decode().strip()
1297_cols    = [c.strip() for c in _cols_raw.split('\n') if c.strip()] if _cols_raw else []
1298_k       = {k}
1299_max_iter = {max_iter}
1300_output  = bytes.fromhex("{hex_output}").decode().strip()
1301
1302def _load(path):
1303    ext = os.path.splitext(path)[1].lower().lstrip('.')
1304    if ext in ('csv','tsv'):
1305        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1306            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1307            return list(r)
1308    elif ext == 'json':
1309        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1310        return d if isinstance(d, list) else next(iter(d.values()), [])
1311    elif ext in ('db','sqlite','sqlite3'):
1312        con = _sq.connect(path)
1313        cur = con.cursor()
1314        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1315        t = cur.fetchone()
1316        if not t: return []
1317        cur.execute("SELECT * FROM [%s]" % t[0])
1318        cols2 = [d2[0] for d2 in cur.description]
1319        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1320        con.close()
1321        return rows2
1322    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1323
1324def _tf(v):
1325    try: return float(str(v).replace(',','').strip())
1326    except: return None
1327
1328rows = _load(_path)
1329if not rows:
1330    print("No rows found."); sys.exit(0)
1331
1332all_cols = list(rows[0].keys())
1333if _cols:
1334    feature_cols = [c for c in all_cols if c in _cols]
1335else:
1336    feature_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1337
1338if len(feature_cols) < 1:
1339    print("No numeric feature columns found."); sys.exit(1)
1340
1341valid_rows = [r for r in rows if all(_tf(r.get(c,'')) is not None for c in feature_cols)]
1342if len(valid_rows) < _k:
1343    print("Fewer valid rows (%d) than clusters (%d)." % (len(valid_rows), _k)); sys.exit(1)
1344
1345X = [[_tf(r[c]) for c in feature_cols] for r in valid_rows]
1346n = len(X); d = len(feature_cols)
1347
1348def _dist(a, b): return math.sqrt(sum((ai-bi)**2 for ai,bi in zip(a,b)))
1349def _centroid(pts): return [sum(p[j] for p in pts)/len(pts) for j in range(d)] if pts else [0.0]*d
1350
1351# k-means++ init
1352random.seed(42)
1353centroids = [X[random.randint(0,n-1)]]
1354for _ in range(_k-1):
1355    dists = [min(_dist(x, c)**2 for c in centroids) for x in X]
1356    total = sum(dists)
1357    r = random.random() * total
1358    cum = 0
1359    for i,dd in enumerate(dists):
1360        cum += dd
1361        if cum >= r: centroids.append(X[i]); break
1362    else: centroids.append(X[-1])
1363
1364labels = [0]*n
1365for _ in range(_max_iter):
1366    new_labels = [min(range(_k), key=lambda c: _dist(x, centroids[c])) for x in X]
1367    if new_labels == labels: break
1368    labels = new_labels
1369    for c in range(_k):
1370        pts = [X[i] for i in range(n) if labels[i]==c]
1371        if pts: centroids[c] = _centroid(pts)
1372
1373inertia = sum(_dist(X[i], centroids[labels[i]])**2 for i in range(n))
1374cluster_sizes = [labels.count(c) for c in range(_k)]
1375
1376W = 64
1377print("="*W)
1378print(" k-Means Clustering: %s  (k=%d)" % (os.path.basename(_path), _k))
1379print(" Features: %s" % ', '.join(feature_cols))
1380print(" Rows: %d   Inertia: %.4f" % (n, inertia))
1381print("-"*W)
1382for c in range(_k):
1383    centroid_str = '  '.join("%.4g" % v for v in centroids[c])
1384    print("  Cluster %d  (%d rows): centroid = [%s]" % (c, cluster_sizes[c], centroid_str))
1385print("="*W)
1386
1387if _output:
1388    with open(_output, 'w', newline='', encoding='utf-8') as fh:
1389        fns2 = list(valid_rows[0].keys()) + ['cluster']
1390        w = _csv.DictWriter(fh, fieldnames=fns2)
1391        w.writeheader()
1392        for i,r in enumerate(valid_rows):
1393            r2 = dict(r); r2['cluster'] = labels[i]
1394            w.writerow(r2)
1395    print("Labeled data saved to: %s" % _output)
1396"####,
1397        hex_path = hex_path,
1398        hex_cols = hex_cols,
1399        hex_output = hex_output,
1400        k = k,
1401        max_iter = max_iter,
1402    );
1403
1404    let sandbox_args = serde_json::json!({
1405        "language": "python",
1406        "code": script,
1407        "timeout_seconds": 60
1408    });
1409    crate::tools::code_sandbox::execute(&sandbox_args).await
1410}
1411
1412// â”€â”€ Normalize / standardize dataset â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1413// Applies min-max scaling or z-score standardization to numeric columns.
1414// Outputs a new CSV with scaled values and reports the scaling parameters.
1415
1416pub async fn normalize_dataset(
1417    file_path: &str,
1418    method: &str,
1419    cols: &[&str],
1420    output: &str,
1421) -> Result<String, String> {
1422    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1423    let cols_joined = cols.join("\n");
1424    let hex_cols: String = cols_joined.bytes().map(|b| format!("{:02x}", b)).collect();
1425    let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
1426    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1427
1428    let script = format!(
1429        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, sys, math
1430
1431_path    = bytes.fromhex("{hex_path}").decode().strip()
1432_cols_raw = bytes.fromhex("{hex_cols}").decode().strip()
1433_cols    = [c.strip() for c in _cols_raw.split('\n') if c.strip()] if _cols_raw else []
1434_method  = bytes.fromhex("{hex_method}").decode().strip().lower() or "minmax"
1435_output  = bytes.fromhex("{hex_output}").decode().strip()
1436
1437def _load(path):
1438    ext = os.path.splitext(path)[1].lower().lstrip('.')
1439    if ext in ('csv','tsv'):
1440        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1441            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1442            return list(r), list(r.fieldnames or [])
1443    elif ext == 'json':
1444        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
1445        rows2 = d if isinstance(d, list) else next(iter(d.values()), [])
1446        return rows2, list(rows2[0].keys()) if rows2 else []
1447    elif ext in ('db','sqlite','sqlite3'):
1448        con = _sq.connect(path)
1449        cur = con.cursor()
1450        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
1451        t = cur.fetchone()
1452        if not t: return [], []
1453        cur.execute("SELECT * FROM [%s]" % t[0])
1454        cols2 = [d2[0] for d2 in cur.description]
1455        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
1456        con.close()
1457        return rows2, cols2
1458    print("Unsupported: "+ext, file=sys.stderr); sys.exit(1)
1459
1460def _tf(v):
1461    try: return float(str(v).replace(',','').strip())
1462    except: return None
1463
1464rows, fieldnames = _load(_path)
1465if not rows:
1466    print("No rows found."); sys.exit(0)
1467
1468all_cols = list(rows[0].keys())
1469if _cols:
1470    target_cols = [c for c in all_cols if c in _cols]
1471else:
1472    target_cols = [c for c in all_cols if sum(1 for r in rows if _tf(r.get(c,'')) is not None) >= len(rows)*0.5]
1473
1474params = {{}}
1475for c in target_cols:
1476    vals = [_tf(r.get(c,'')) for r in rows if _tf(r.get(c,'')) is not None]
1477    if not vals: continue
1478    mean = sum(vals)/len(vals)
1479    std  = math.sqrt(sum((v-mean)**2 for v in vals)/len(vals))
1480    mn   = min(vals); mx = max(vals)
1481    params[c] = (mean, std, mn, mx)
1482
1483W = 64
1484print("="*W)
1485print(" Dataset Normalization: %s  (method=%s)" % (os.path.basename(_path), _method))
1486print("-"*W)
1487print("  %-20s  %-10s  %-10s  %-10s  %-10s" % ("Column", "Min", "Max", "Mean", "Std"))
1488print("  " + "-"*56)
1489for c,( mean,std,mn,mx) in params.items():
1490    print("  %-20s  %-10.4g  %-10.4g  %-10.4g  %-10.4g" % (c[:20], mn, mx, mean, std))
1491print("="*W)
1492
1493if _output:
1494    out_rows = []
1495    for r in rows:
1496        out_r = dict(r)
1497        for c,(mean,std,mn,mx) in params.items():
1498            v = _tf(r.get(c,''))
1499            if v is None:
1500                out_r[c] = ''
1501                continue
1502            if _method in ('minmax','min-max','min_max'):
1503                rng = mx-mn
1504                out_r[c] = "%.8f" % ((v-mn)/rng if rng else 0.0)
1505            elif _method in ('zscore','z-score','z_score','standard','standardize'):
1506                out_r[c] = "%.8f" % ((v-mean)/std if std else 0.0)
1507            elif _method in ('robust',):
1508                from functools import reduce
1509                # Use median and IQR
1510                vals2 = sorted(_tf(rr.get(c,'')) for rr in rows if _tf(rr.get(c,'')) is not None)
1511                n2 = len(vals2)
1512                q1 = vals2[n2//4]; q3 = vals2[3*n2//4]
1513                iqr = q3-q1
1514                med = vals2[n2//2]
1515                out_r[c] = "%.8f" % ((v-med)/iqr if iqr else 0.0)
1516        out_rows.append(out_r)
1517    with open(_output, 'w', newline='', encoding='utf-8') as fh:
1518        fns2 = fieldnames if fieldnames else list(out_rows[0].keys()) if out_rows else []
1519        w = _csv.DictWriter(fh, fieldnames=fns2)
1520        w.writeheader(); w.writerows(out_rows)
1521    print("Normalized data (%d rows) saved to: %s" % (len(out_rows), _output))
1522else:
1523    print("  (No --normalize-output specified â€” use --normalize-output FILE to save scaled CSV)")
1524"####,
1525        hex_path = hex_path,
1526        hex_cols = hex_cols,
1527        hex_method = hex_method,
1528        hex_output = hex_output,
1529    );
1530
1531    let sandbox_args = serde_json::json!({
1532        "language": "python",
1533        "code": script,
1534        "timeout_seconds": 30
1535    });
1536    crate::tools::code_sandbox::execute(&sandbox_args).await
1537}
1538
1539// â”€â”€ PCA â€” Principal Component Analysis â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1540// Pure-Python power-iteration covariance PCA.  No numpy.
1541// Reports top-N components: eigenvalue, variance explained, loadings bar chart.
1542// Optionally writes a projected-coordinates CSV.
1543
1544pub async fn pca_analyze(
1545    file_path: &str,
1546    n_components: usize,
1547    cols: &[&str],
1548    output: &str,
1549) -> Result<String, String> {
1550    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
1551    let hex_cols: String = cols
1552        .join(",")
1553        .bytes()
1554        .map(|b| format!("{:02x}", b))
1555        .collect();
1556    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
1557
1558    let script = format!(
1559        r####"import csv as _csv, os, sys, math
1560
1561_path   = bytes.fromhex("{hex_path}").decode().strip()
1562_cstr   = bytes.fromhex("{hex_cols}").decode().strip()
1563_output = bytes.fromhex("{hex_output}").decode().strip()
1564_n_comp = {n_components}
1565
1566def _load(path):
1567    ext = os.path.splitext(path)[1].lower().lstrip('.')
1568    if ext in ('csv','tsv'):
1569        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
1570            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
1571            return list(r)
1572    raise ValueError("Unsupported file type: " + ext + " (CSV/TSV only for PCA)")
1573
1574def _tf(v):
1575    try: return float(v)
1576    except: return None
1577
1578rows = _load(_path)
1579if not rows:
1580    print("No data found."); sys.exit(0)
1581
1582all_cols = list(rows[0].keys())
1583sel = [c.strip() for c in _cstr.split(',') if c.strip()] if _cstr else []
1584num_cols = sel if sel else [c for c in all_cols if any(_tf(r.get(c,'')) is not None for r in rows[:20])]
1585num_cols = [c for c in num_cols if c in all_cols]
1586
1587mat = []
1588for r in rows:
1589    vals = [_tf(r.get(c,'')) for c in num_cols]
1590    if all(v is not None for v in vals):
1591        mat.append(vals)
1592
1593n_rows = len(mat); n_cols = len(num_cols)
1594if n_rows < 2 or n_cols < 2:
1595    print("Need at least 2 rows and 2 numeric columns for PCA."); sys.exit(0)
1596
1597means = [sum(mat[i][j] for i in range(n_rows))/n_rows for j in range(n_cols)]
1598X = [[mat[i][j] - means[j] for j in range(n_cols)] for i in range(n_rows)]
1599
1600def cov_matrix(X, nc, nr):
1601    C = [[0.0]*nc for _ in range(nc)]
1602    for j in range(nc):
1603        for k in range(j, nc):
1604            s = sum(X[i][j]*X[i][k] for i in range(nr)) / (nr-1)
1605            C[j][k] = C[k][j] = s
1606    return C
1607
1608C = cov_matrix(X, n_cols, n_rows)
1609
1610def mat_vec(M, v):
1611    return [sum(M[i][j]*v[j] for j in range(len(v))) for i in range(len(v))]
1612
1613def vec_norm(v): return math.sqrt(sum(x*x for x in v))
1614def vec_scale(v, s): return [x*s for x in v]
1615
1616n_comp = min(_n_comp, n_cols, n_rows-1)
1617total_var = sum(C[j][j] for j in range(n_cols))
1618eigvals = []; eigvecs = []
1619Cd = [row[:] for row in C]
1620
1621for ci in range(n_comp):
1622    v = [1.0 if j == ci % n_cols else 0.1 for j in range(n_cols)]
1623    nrm = vec_norm(v); v = vec_scale(v, 1.0/nrm)
1624    for _it in range(300):
1625        v_new = mat_vec(Cd, v)
1626        nrm = vec_norm(v_new)
1627        if nrm < 1e-14: break
1628        v_new = vec_scale(v_new, 1.0/nrm)
1629        delta = vec_norm([v_new[j]-v[j] for j in range(n_cols)])
1630        v = v_new
1631        if delta < 1e-10: break
1632    lam = sum(mat_vec(Cd, v)[j]*v[j] for j in range(n_cols))
1633    if lam < 0: lam = 0.0
1634    eigvals.append(lam)
1635    eigvecs.append(v[:])
1636    for i in range(n_cols):
1637        for j in range(n_cols):
1638            Cd[i][j] -= lam * v[i] * v[j]
1639
1640projected = []
1641for row_x in X:
1642    projected.append([sum(row_x[j]*eigvecs[c][j] for j in range(n_cols)) for c in range(n_comp)])
1643
1644W = 68
1645print("="*W)
1646print("  PCA  â€”  Principal Component Analysis")
1647print("  File   : %s" % os.path.basename(_path))
1648print("  Rows   : %d  |  Columns : %d  |  Components: %d" % (n_rows, n_cols, n_comp))
1649print("  Columns: %s" % ', '.join(num_cols[:6]) + (('  +%d more' % (len(num_cols)-6)) if len(num_cols)>6 else ''))
1650print("="*W)
1651
1652cum = 0.0
1653for ci in range(n_comp):
1654    pct = (eigvals[ci]/total_var*100) if total_var > 0 else 0.0
1655    cum += pct
1656    bar = int(round(pct / 2.5))
1657    bar_str = "â–ˆ"*bar + "â–‘"*(40-bar)
1658    print("\n  PC%d  eigenvalue %.4f  |  var %5.1f%%  |  cumulative %5.1f%%" % (ci+1, eigvals[ci], pct, cum))
1659    print("  %s" % bar_str)
1660    loads = sorted(enumerate(eigvecs[ci]), key=lambda x: -abs(x[1]))
1661    print("  Top loadings:")
1662    for _idx, (fidx, w) in enumerate(loads[:8]):
1663        sign = '+' if w >= 0 else '-'
1664        bar2 = int(abs(w)*20)
1665        print("    %-22s  %s%.4f  %s" % (num_cols[fidx][:22], sign, abs(w), "â–Œ"*bar2))
1666
1667print()
1668print("  Projected sample (first 5 rows):")
1669print("  " + "".join("  PC%-7d" % (c+1) for c in range(n_comp)))
1670for row_p in projected[:5]:
1671    print("  " + "".join("%+-10.4f" % v for v in row_p))
1672print()
1673print("="*W)
1674
1675if _output:
1676    pc_cols = ["PC%d" % (c+1) for c in range(n_comp)]
1677    with open(_output, 'w', newline='', encoding='utf-8') as fh:
1678        w2 = _csv.writer(fh)
1679        w2.writerow(pc_cols)
1680        for row_p in projected:
1681            w2.writerow(["%.8f" % v for v in row_p])
1682    print("  Projected data (%d rows) saved to: %s" % (len(projected), _output))
1683else:
1684    print("  (Use --pca-output FILE to save projected coordinates as CSV)")
1685"####,
1686        hex_path = hex_path,
1687        hex_cols = hex_cols,
1688        hex_output = hex_output,
1689        n_components = n_components,
1690    );
1691
1692    let sandbox_args = serde_json::json!({
1693        "language": "python",
1694        "code": script,
1695        "timeout_seconds": 60
1696    });
1697    crate::tools::code_sandbox::execute(&sandbox_args).await
1698}
1699
1700// â”€â”€ Statistical hypothesis testing â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1701// t-tests, chi-square, ANOVA, Mann-Whitney, Pearson, proportion z-test,
1702// confidence intervals â€” all via Python stdlib (statistics, math only).
1703pub async fn hypothesis_test(
1704    test_type: &str,
1705    group1_str: &str,
1706    group2_str: &str,
1707    alpha: f64,
1708    mu: f64,
1709) -> Result<String, String> {
1710    let hex_test: String = test_type.bytes().map(|b| format!("{:02x}", b)).collect();
1711    let hex_g1: String = group1_str.bytes().map(|b| format!("{:02x}", b)).collect();
1712    let hex_g2: String = group2_str.bytes().map(|b| format!("{:02x}", b)).collect();
1713
1714    let script = format!(
1715        r####"import math, statistics as _st, sys
1716
1717_test  = bytes.fromhex("{hex_test}").decode().strip().lower()
1718_g1s   = bytes.fromhex("{hex_g1}").decode().strip()
1719_g2s   = bytes.fromhex("{hex_g2}").decode().strip()
1720_alpha = {alpha}
1721_mu0   = {mu}
1722W = 60
1723
1724def parse_nums(s):
1725    if not s: return []
1726    try:
1727        return [float(x.strip()) for x in s.replace(';',',').split(',') if x.strip()]
1728    except:
1729        return []
1730
1731def fmt_p(p):
1732    if p < 0.001: return "< 0.001"
1733    return "%.4f" % p
1734
1735def decision(p, alpha):
1736    if p < alpha:
1737        return "REJECT H0  (p %s < a=%.2f)" % (fmt_p(p), alpha)
1738    return "FAIL TO REJECT H0  (p %s >= a=%.2f)" % (fmt_p(p), alpha)
1739
1740def t_cdf(t, df):
1741    x = df / (df + t*t)
1742    def betainc(a, b, xv):
1743        if xv<=0: return 0.0
1744        if xv>=1: return 1.0
1745        tiny=1e-300; fp_min=tiny
1746        qab=a+b; qap=a+1; qam=a-1
1747        c=1.0; d=1.0-qab*xv/qap
1748        if abs(d)<fp_min: d=fp_min
1749        d=1.0/d; h=d
1750        for m in range(1,201):
1751            m2=2*m; aa=m*(b-m)*xv/((qam+m2)*(a+m2))
1752            d=1.0+aa*d
1753            if abs(d)<fp_min: d=fp_min
1754            c=1.0+aa/c
1755            if abs(c)<fp_min: c=fp_min
1756            d=1.0/d; h*=d*c
1757            aa=-(a+m)*(qab+m)*xv/((a+m2)*(qap+m2))
1758            d=1.0+aa*d
1759            if abs(d)<fp_min: d=fp_min
1760            c=1.0+aa/c
1761            if abs(c)<fp_min: c=fp_min
1762            d=1.0/d; delta=d*c; h*=delta
1763            if abs(delta-1.0)<3e-7: break
1764        return math.exp(math.lgamma(a+b)-math.lgamma(a)-math.lgamma(b)+
1765                        a*math.log(xv)+b*math.log(1-xv))*h/a
1766    a=df/2; b=0.5
1767    ibeta=betainc(b,a,x)
1768    return ibeta
1769
1770def normal_cdf_h(z):
1771    return 0.5*(1+math.erf(z/math.sqrt(2)))
1772
1773def chi2_pval(chi2, df):
1774    a=df/2.0; x2=chi2/2.0
1775    if x2<=0: return 1.0
1776    if x2<a+1:
1777        ap=a; delta=s=1.0/a
1778        for _ in range(300):
1779            ap+=1; delta*=x2/ap; s+=delta
1780            if abs(delta)<abs(s)*1e-9: break
1781        p=s*math.exp(-x2+a*math.log(x2)-math.lgamma(a))
1782        return max(0.0,min(1.0,1.0-p))
1783    else:
1784        b=x2+1-a; c=1e300; d=1.0/b; h=d
1785        for i in range(1,301):
1786            an=-i*(i-a); b+=2
1787            d=an*d+b
1788            if abs(d)<1e-300: d=1e-300
1789            c=b+an/c
1790            if abs(c)<1e-300: c=1e-300
1791            d=1.0/d; delta=d*c; h*=delta
1792            if abs(delta-1.0)<1e-9: break
1793        q=math.exp(-x2+a*math.log(x2)-math.lgamma(a))*h
1794        return max(0.0,min(1.0,q))
1795
1796def f_pval(f, df1, df2):
1797    x=df2/(df2+df1*f)
1798    def betainc(a, b, xv):
1799        if xv<=0: return 0.0
1800        if xv>=1: return 1.0
1801        tiny=1e-300; fp_min=tiny
1802        qab=a+b; qap=a+1; qam=a-1
1803        c=1.0; d=1.0-qab*xv/qap
1804        if abs(d)<fp_min: d=fp_min
1805        d=1.0/d; h=d
1806        for m in range(1,201):
1807            m2=2*m; aa=m*(b-m)*xv/((qam+m2)*(a+m2))
1808            d=1.0+aa*d
1809            if abs(d)<fp_min: d=fp_min
1810            c=1.0+aa/c
1811            if abs(c)<fp_min: c=fp_min
1812            d=1.0/d; h*=d*c
1813            aa=-(a+m)*(qab+m)*xv/((a+m2)*(qap+m2))
1814            d=1.0+aa*d
1815            if abs(d)<fp_min: d=fp_min
1816            c=1.0+aa/c
1817            if abs(c)<fp_min: c=fp_min
1818            d=1.0/d; delta=d*c; h*=delta
1819            if abs(delta-1.0)<3e-7: break
1820        return math.exp(math.lgamma(a+b)-math.lgamma(a)-math.lgamma(b)+
1821                        a*math.log(xv)+b*math.log(1-xv))*h/a
1822    return betainc(df2/2,df1/2,x)
1823
1824print("="*W)
1825print("  HYPOTHESIS TEST")
1826print("="*W)
1827
1828if _test in ("one-t","one_t","onesample","one-sample","t1","t-one"):
1829    g1=parse_nums(_g1s)
1830    if len(g1)<2: print("  ERROR: need >=2 values for one-sample t-test"); sys.exit(0)
1831    n=len(g1); xbar=_st.mean(g1); s=_st.stdev(g1)
1832    se=s/math.sqrt(n); t=(xbar-_mu0)/se; df=n-1
1833    p=t_cdf(abs(t),df)
1834    print("  One-Sample t-Test  (H0: mu = %.4g)" % _mu0)
1835    print("  n=%-5d  xbar=%.4f  s=%.4f  SE=%.4f" % (n,xbar,s,se))
1836    print("  t=%.4f  df=%d  p=%s" % (t,df,fmt_p(p)))
1837    from_t=1.96 if df>30 else (2.093 if df>19 else (2.262 if df>9 else 2.776))
1838    lo=xbar-from_t*se; hi=xbar+from_t*se
1839    print("  %.0f%% CI: [%.4f, %.4f]" % ((1-_alpha)*100,lo,hi))
1840    print(); print("  "+decision(p,_alpha))
1841
1842elif _test in ("two-t","two_t","twosample","two-sample","welch","t2","t-two"):
1843    g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1844    if len(g1)<2 or len(g2)<2: print("  ERROR: need >=2 values in each group"); sys.exit(0)
1845    n1=len(g1); n2=len(g2)
1846    x1=_st.mean(g1); x2=_st.mean(g2)
1847    s1=_st.stdev(g1); s2=_st.stdev(g2)
1848    se=math.sqrt(s1**2/n1+s2**2/n2)
1849    t=(x1-x2)/se
1850    df=(s1**2/n1+s2**2/n2)**2/((s1**2/n1)**2/(n1-1)+(s2**2/n2)**2/(n2-1))
1851    p=t_cdf(abs(t),df)
1852    print("  Two-Sample (Welch) t-Test  (H0: mu1 = mu2)")
1853    print("  G1: n=%d  xbar=%.4f  s=%.4f" % (n1,x1,s1))
1854    print("  G2: n=%d  xbar=%.4f  s=%.4f" % (n2,x2,s2))
1855    print("  delta_xbar=%.4f  SE=%.4f  t=%.4f  df=%.1f  p=%s" % (x1-x2,se,t,df,fmt_p(p)))
1856    print(); print("  "+decision(p,_alpha))
1857
1858elif _test in ("paired","paired-t","pairedt","t-paired"):
1859    g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1860    if len(g1)!=len(g2) or len(g1)<2: print("  ERROR: groups must match in length (>=2)"); sys.exit(0)
1861    diffs=[a-b for a,b in zip(g1,g2)]
1862    n=len(diffs); dbar=_st.mean(diffs); sd=_st.stdev(diffs)
1863    se=sd/math.sqrt(n); t=dbar/se; df=n-1
1864    p=t_cdf(abs(t),df)
1865    print("  Paired t-Test  (H0: mu_diff = 0)")
1866    print("  n=%d  dbar=%.4f  sd=%.4f  SE=%.4f" % (n,dbar,sd,se))
1867    print("  t=%.4f  df=%d  p=%s" % (t,df,fmt_p(p)))
1868    print(); print("  "+decision(p,_alpha))
1869
1870elif _test in ("chi2","chi-square","chisquare","chi-sq","goodness"):
1871    observed=parse_nums(_g1s)
1872    if len(observed)<2: print("  ERROR: need >=2 observed counts"); sys.exit(0)
1873    expected_s=_g2s.strip()
1874    if expected_s:
1875        expected=parse_nums(expected_s)
1876        if len(expected)!=len(observed): print("  ERROR: observed/expected length mismatch"); sys.exit(0)
1877    else:
1878        e=sum(observed)/len(observed); expected=[e]*len(observed)
1879    chi2=sum((o-e)**2/e for o,e in zip(observed,expected) if e>0)
1880    df=len(observed)-1; p=chi2_pval(chi2,df)
1881    print("  Chi-Square Goodness-of-Fit  (H0: observed ~ expected)")
1882    print("  %-12s  %-10s  %-10s  %-8s" % ("Category","Observed","Expected","(O-E)^2/E"))
1883    for i,(o,e) in enumerate(zip(observed,expected)):
1884        print("  %-12s  %-10.2f  %-10.2f  %.4f" % ("cat%d"%(i+1),o,e,(o-e)**2/e if e>0 else 0))
1885    print("  chi2=%.4f  df=%d  p=%s" % (chi2,df,fmt_p(p)))
1886    print(); print("  "+decision(p,_alpha))
1887
1888elif _test in ("anova","one-way","oneway","f-test"):
1889    raw_groups=[g.strip() for g in _g1s.split('|')]
1890    groups=[parse_nums(g) for g in raw_groups if g]
1891    groups=[g for g in groups if len(g)>=2]
1892    if len(groups)<2: print("  ERROR: need >=2 groups separated by | for ANOVA"); sys.exit(0)
1893    k=len(groups); N=sum(len(g) for g in groups)
1894    grand=sum(sum(g) for g in groups)/N
1895    SSB=sum(len(g)*(_st.mean(g)-grand)**2 for g in groups)
1896    SSW=sum(sum((x-_st.mean(g))**2 for x in g) for g in groups)
1897    dfB=k-1; dfW=N-k
1898    F=(SSB/dfB)/(SSW/dfW) if dfW>0 and SSW>0 else float('inf')
1899    p=f_pval(F,dfB,dfW)
1900    print("  One-Way ANOVA  (H0: all group means equal)")
1901    for i,g in enumerate(groups):
1902        print("  G%d: n=%d  xbar=%.4f  s=%.4f" % (i+1,len(g),_st.mean(g),_st.stdev(g)))
1903    print("  SSB=%.4f (df=%d)  SSW=%.4f (df=%d)" % (SSB,dfB,SSW,dfW))
1904    print("  F=%.4f  p=%s" % (F,fmt_p(p)))
1905    print(); print("  "+decision(p,_alpha))
1906
1907elif _test in ("mannwhitney","mann-whitney","mwu","wilcoxon-rank","ranksum"):
1908    g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1909    if len(g1)<1 or len(g2)<1: print("  ERROR: need values in both groups"); sys.exit(0)
1910    n1=len(g1); n2=len(g2)
1911    combined=sorted([(v,'a',i) for i,v in enumerate(g1)]+[(v,'b',i) for i,v in enumerate(g2)])
1912    ranks={{}}
1913    i=0
1914    while i<len(combined):
1915        j=i
1916        while j<len(combined)-1 and combined[j][0]==combined[j+1][0]: j+=1
1917        avg_rank=(i+1+j+1)/2
1918        for kk in range(i,j+1): ranks[(combined[kk][1],combined[kk][2])]=avg_rank
1919        i=j+1
1920    R1=sum(ranks[('a',i)] for i in range(n1))
1921    U1=R1-n1*(n1+1)/2; U2=n1*n2-U1; U=min(U1,U2)
1922    mu_U=n1*n2/2; sigma_U=math.sqrt(n1*n2*(n1+n2+1)/12)
1923    z=(U-mu_U)/sigma_U if sigma_U>0 else 0
1924    p=2*(1-normal_cdf_h(abs(z)))
1925    print("  Mann-Whitney U Test  (H0: distributions equal)")
1926    print("  n1=%d  n2=%d  U=%.1f  z=%.4f  p=%s" % (n1,n2,U,z,fmt_p(p)))
1927    print(); print("  "+decision(p,_alpha))
1928
1929elif _test in ("pearson","correlation","corr"):
1930    g1=parse_nums(_g1s); g2=parse_nums(_g2s)
1931    if len(g1)!=len(g2) or len(g1)<3: print("  ERROR: need matching vectors of length >=3"); sys.exit(0)
1932    n=len(g1); x1=_st.mean(g1); x2=_st.mean(g2)
1933    num=sum((a-x1)*(b-x2) for a,b in zip(g1,g2))
1934    d1=math.sqrt(sum((a-x1)**2 for a in g1)); d2=math.sqrt(sum((b-x2)**2 for b in g2))
1935    r=num/(d1*d2) if d1*d2>0 else 0
1936    t=r*math.sqrt(n-2)/math.sqrt(1-r**2) if abs(r)<1 else float('inf')
1937    df=n-2; p=t_cdf(abs(t),df)
1938    print("  Pearson Correlation Test  (H0: rho = 0)")
1939    print("  n=%d  r=%.4f  t=%.4f  df=%d  p=%s" % (n,r,t,df,fmt_p(p)))
1940    strength="negligible" if abs(r)<0.1 else "weak" if abs(r)<0.3 else "moderate" if abs(r)<0.5 else "strong"
1941    print("  Strength: %s (%s)" % (strength,"positive" if r>=0 else "negative"))
1942    print(); print("  "+decision(p,_alpha))
1943
1944elif _test in ("proportion","prop","z-prop","zprop","prop1","one-prop"):
1945    parts=parse_nums(_g1s)
1946    if len(parts)<2: print("  ERROR: provide 'successes,n' as group1"); sys.exit(0)
1947    k=int(parts[0]); n=int(parts[1]); p_hat=k/n; p0=_mu0 if _mu0>0 else 0.5
1948    se=math.sqrt(p0*(1-p0)/n)
1949    z=(p_hat-p0)/se if se>0 else 0
1950    p=2*(1-normal_cdf_h(abs(z)))
1951    ci_se=math.sqrt(p_hat*(1-p_hat)/n); z_crit=1.96
1952    lo=p_hat-z_crit*ci_se; hi=p_hat+z_crit*ci_se
1953    print("  One-Proportion z-Test  (H0: p = %.4g)" % p0)
1954    print("  k=%d  n=%d  p_hat=%.4f  SE=%.4f  z=%.4f  p=%s" % (k,n,p_hat,se,z,fmt_p(p)))
1955    print("  95%% CI: [%.4f, %.4f]" % (lo,hi))
1956    print(); print("  "+decision(p,_alpha))
1957
1958elif _test in ("prop2","two-prop","twoprop","two-proportion"):
1959    p1=parse_nums(_g1s); p2=parse_nums(_g2s)
1960    if len(p1)<2 or len(p2)<2: print("  ERROR: each group needs 'successes,n'"); sys.exit(0)
1961    k1=int(p1[0]); n1=int(p1[1]); k2=int(p2[0]); n2=int(p2[1])
1962    ph1=k1/n1; ph2=k2/n2; pp=(k1+k2)/(n1+n2)
1963    se=math.sqrt(pp*(1-pp)*(1/n1+1/n2))
1964    z=(ph1-ph2)/se if se>0 else 0
1965    p=2*(1-normal_cdf_h(abs(z)))
1966    print("  Two-Proportion z-Test  (H0: p1 = p2)")
1967    print("  G1: %d/%d (p_hat=%.4f)  G2: %d/%d (p_hat=%.4f)" % (k1,n1,ph1,k2,n2,ph2))
1968    print("  Pooled p_hat=%.4f  SE=%.4f  z=%.4f  p=%s" % (pp,se,z,fmt_p(p)))
1969    print(); print("  "+decision(p,_alpha))
1970
1971elif _test in ("ci","confidence","conf-interval","interval"):
1972    g1=parse_nums(_g1s)
1973    if len(g1)<2: print("  ERROR: need >=2 values for confidence interval"); sys.exit(0)
1974    n=len(g1); xbar=_st.mean(g1); s=_st.stdev(g1); se=s/math.sqrt(n)
1975    z_crit=1.96 if n>30 else (2.093 if n>19 else (2.262 if n>9 else 2.776))
1976    lo=xbar-z_crit*se; hi=xbar+z_crit*se
1977    print("  Confidence Interval for Mean")
1978    print("  n=%d  xbar=%.4f  s=%.4f  SE=%.4f" % (n,xbar,s,se))
1979    print("  %.0f%% CI: [%.4f, %.4f]  (+-%.4f)" % ((1-_alpha)*100,lo,hi,z_crit*se))
1980    if _mu0!=0:
1981        inside=lo<=_mu0<=hi
1982        print("  H0 value (mu=%.4g) is %s the interval" % (_mu0,"INSIDE" if inside else "OUTSIDE"))
1983
1984else:
1985    print("  Available tests:")
1986    print("  one-t       One-sample t-test:  --hypothesis-mu H0_MEAN")
1987    print("  two-t       Two-sample (Welch) t-test (--hypothesis-group2 DATA)")
1988    print("  paired      Paired t-test (--hypothesis-group2 DATA)")
1989    print("  chi2        Chi-square goodness-of-fit (--hypothesis-group2 EXPECTED)")
1990    print("  anova       One-way ANOVA (groups separated by | in group1)")
1991    print("  mannwhitney Mann-Whitney U (--hypothesis-group2 DATA)")
1992    print("  pearson     Pearson correlation test (--hypothesis-group2 DATA)")
1993    print("  proportion  One-proportion z-test: 'successes,n' --hypothesis-mu P0")
1994    print("  prop2       Two-proportion z-test (--hypothesis-group2 'k2,n2')")
1995    print("  ci          Confidence interval for mean")
1996    print()
1997    print("  Data format: comma-separated numbers, e.g.  3.1,2.8,4.0,3.5")
1998    print("  For ANOVA: groups separated by |  e.g.  2.1,2.3|3.4,3.6|1.9,2.0")
1999
2000print("="*W)
2001"####,
2002        hex_test = hex_test,
2003        hex_g1 = hex_g1,
2004        hex_g2 = hex_g2,
2005        alpha = alpha,
2006        mu = mu,
2007    );
2008
2009    let sandbox_args = serde_json::json!({
2010        "language": "python",
2011        "code": script,
2012        "timeout_seconds": 30
2013    });
2014    crate::tools::code_sandbox::execute(&sandbox_args).await
2015}
2016
2017// â”€â”€ Descriptive statistics â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2018// Full per-column stats: mean, median, mode, std, variance, skewness, kurtosis,
2019// percentiles (P5/P25/P50/P75/P95), IQR, outliers, ASCII histogram.
2020pub async fn describe_stats(
2021    file_path: &str,
2022    cols_str: &str,
2023    output: &str,
2024) -> Result<String, String> {
2025    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2026    let hex_cols: String = cols_str.bytes().map(|b| format!("{:02x}", b)).collect();
2027    let hex_output: String = output.bytes().map(|b| format!("{:02x}", b)).collect();
2028
2029    let script = format!(
2030        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, math
2031
2032_path   = bytes.fromhex("{hex_path}").decode().strip()
2033_cols_s = bytes.fromhex("{hex_cols}").decode().strip()
2034_output = bytes.fromhex("{hex_output}").decode().strip()
2035W = 68
2036
2037def _load(path):
2038    ext = os.path.splitext(path)[1].lower().lstrip('.')
2039    if ext in ('csv','tsv'):
2040        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
2041            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
2042            rows = list(r)
2043        return rows
2044    elif ext == 'json':
2045        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
2046        return d if isinstance(d, list) else next(iter(d.values()), [])
2047    elif ext in ('db','sqlite','sqlite3'):
2048        con = _sq.connect(path)
2049        cur = con.cursor()
2050        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
2051        t = cur.fetchone()
2052        if not t: return []
2053        cur.execute("SELECT * FROM [%s]" % t[0])
2054        cols2 = [d[0] for d in cur.description]
2055        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
2056        con.close()
2057        return rows2
2058    return []
2059
2060rows = _load(_path)
2061if not rows:
2062    print("  ERROR: no rows loaded from", _path)
2063    raise SystemExit(0)
2064
2065all_cols = list(rows[0].keys())
2066if _cols_s:
2067    req = [c.strip() for c in _cols_s.split(',')]
2068    cols = [c for c in req if c in all_cols]
2069    if not cols:
2070        print("  WARNING: none of the specified columns found. Available:", ', '.join(all_cols))
2071        cols = all_cols
2072else:
2073    cols = all_cols
2074
2075def parse_nums(rows, col):
2076    vals = []
2077    for r in rows:
2078        v = r.get(col, '')
2079        if v is None: continue
2080        v = str(v).strip()
2081        if not v: continue
2082        try: vals.append(float(v))
2083        except: pass
2084    return vals
2085
2086def percentile(sorted_v, p):
2087    if not sorted_v: return float('nan')
2088    n = len(sorted_v)
2089    idx = (n - 1) * p / 100.0
2090    lo = int(idx); hi = lo + 1
2091    if hi >= n: return sorted_v[lo]
2092    frac = idx - lo
2093    return sorted_v[lo] * (1 - frac) + sorted_v[hi] * frac
2094
2095def skewness(vals, mean, std):
2096    if std == 0 or len(vals) < 3: return float('nan')
2097    n = len(vals)
2098    s3 = sum((x - mean)**3 for x in vals)
2099    return (n / ((n-1)*(n-2))) * s3 / std**3
2100
2101def kurtosis(vals, mean, std):
2102    if std == 0 or len(vals) < 4: return float('nan')
2103    n = len(vals)
2104    s4 = sum((x - mean)**4 for x in vals)
2105    k = (n*(n+1)/((n-1)*(n-2)*(n-3))) * s4 / std**4 - 3*(n-1)**2/((n-2)*(n-3))
2106    return k
2107
2108def ascii_hist(vals, bins=16, width=36):
2109    if not vals: return []
2110    mn = min(vals); mx = max(vals)
2111    rng = mx - mn
2112    if rng == 0: return ["  (all values identical: %.4g)" % mn]
2113    bw = rng / bins
2114    counts = [0]*bins
2115    for v in vals:
2116        idx = min(int((v - mn) / bw), bins - 1)
2117        counts[idx] += 1
2118    max_c = max(counts) or 1
2119    lines = []
2120    for i, c in enumerate(counts):
2121        lo = mn + i*bw; hi = lo+bw
2122        bar = int(c / max_c * width)
2123        lines.append("  [%8.3g, %8.3g)  %s %d" % (lo, hi, 'â–ˆ'*bar, c))
2124    return lines
2125
2126print("=" * W)
2127print("  DESCRIPTIVE STATISTICS")
2128print("  File:   %s" % os.path.basename(_path))
2129print("  Rows:   %d    Cols examined: %d" % (len(rows), len(cols)))
2130print("=" * W)
2131
2132results = []
2133for col in cols:
2134    vals = parse_nums(rows, col)
2135    if len(vals) < 2:
2136        print("\n  %s: too few numeric values (%d)" % (col, len(vals)))
2137        continue
2138    s = sorted(vals)
2139    n = len(vals)
2140    mean = sum(vals) / n
2141    var = sum((x-mean)**2 for x in vals) / (n-1) if n > 1 else 0
2142    std = math.sqrt(var)
2143    med = percentile(s, 50)
2144    # mode (simple: most frequent rounded value)
2145    from collections import Counter
2146    mode_ctr = Counter(round(v, 4) for v in vals)
2147    mode_val, mode_cnt = mode_ctr.most_common(1)[0]
2148    p5  = percentile(s, 5)
2149    p25 = percentile(s, 25)
2150    p75 = percentile(s, 75)
2151    p95 = percentile(s, 95)
2152    iqr = p75 - p25
2153    skew = skewness(vals, mean, std)
2154    kurt = kurtosis(vals, mean, std)
2155    # Outliers via IQR method
2156    lo_fence = p25 - 1.5*iqr; hi_fence = p75 + 1.5*iqr
2157    outliers = [v for v in vals if v < lo_fence or v > hi_fence]
2158    missing = len(rows) - sum(1 for r in rows if str(r.get(col,'')).strip())
2159
2160    print("\n  %s" % col)
2161    print("  " + "-"*50)
2162    print("  n=%-8d  missing=%-6d  unique=%d" % (n, missing, len(set(round(v,6) for v in vals))))
2163    print("  mean=%11.6g  std=%11.6g  var=%11.6g" % (mean, std, var))
2164    print("  min=%12.6g  max=%12.6g  range=%10.6g" % (s[0], s[-1], s[-1]-s[0]))
2165    print("  P5=%12.6g  P25=%11.6g  median=%9.6g" % (p5, p25, med))
2166    print("  P75=%11.6g  P95=%11.6g  IQR=%11.6g" % (p75, p95, iqr))
2167    print("  mode=%11.6g (count=%d)" % (mode_val, mode_cnt))
2168    if not math.isnan(skew): print("  skewness=%8.4f  kurtosis=%8.4f" % (skew, kurt))
2169    if outliers: print("  outliers (IQR): %d value(s)  min=%.4g  max=%.4g" % (len(outliers), min(outliers), max(outliers)))
2170    print()
2171    for line in ascii_hist(vals): print(line)
2172    results.append((col, n, mean, std, s[0], s[-1], med))
2173
2174if _output and results:
2175    with open(_output, 'w', newline='', encoding='utf-8') as fh:
2176        w2 = _csv.writer(fh)
2177        w2.writerow(['column','n','mean','std','min','max','median'])
2178        for row in results:
2179            w2.writerow(['%.8g'%v if isinstance(v,float) else v for v in row])
2180    print("\n  Summary saved to: %s" % _output)
2181elif _output == '' and results:
2182    print("\n  (Use --stats-output FILE to save summary CSV)")
2183
2184print("\n" + "=" * W)
2185"####,
2186        hex_path = hex_path,
2187        hex_cols = hex_cols,
2188        hex_output = hex_output,
2189    );
2190
2191    let sandbox_args = serde_json::json!({
2192        "language": "python",
2193        "code": script,
2194        "timeout_seconds": 30
2195    });
2196    crate::tools::code_sandbox::execute(&sandbox_args).await
2197}
2198
2199// â”€â”€ Classification (k-NN and Naive Bayes) â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2200// Trains on labeled CSV data, predicts a class label for new input, and runs
2201// leave-one-out cross-validation to report accuracy. No external libraries.
2202pub async fn classify_data(
2203    file_path: &str,
2204    label_col: &str,
2205    feature_cols: &str,
2206    predict_str: &str,
2207    k: usize,
2208    method: &str,
2209) -> Result<String, String> {
2210    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2211    let hex_label: String = label_col.bytes().map(|b| format!("{:02x}", b)).collect();
2212    let hex_feats: String = feature_cols.bytes().map(|b| format!("{:02x}", b)).collect();
2213    let hex_predict: String = predict_str.bytes().map(|b| format!("{:02x}", b)).collect();
2214    let hex_method: String = method.bytes().map(|b| format!("{:02x}", b)).collect();
2215
2216    let script = format!(
2217        r####"import csv as _csv, json as _js, sqlite3 as _sq, os, math
2218from collections import Counter, defaultdict
2219
2220_path    = bytes.fromhex("{hex_path}").decode().strip()
2221_label   = bytes.fromhex("{hex_label}").decode().strip()
2222_feats_s = bytes.fromhex("{hex_feats}").decode().strip()
2223_pred_s  = bytes.fromhex("{hex_predict}").decode().strip()
2224_k       = {k}
2225_method  = bytes.fromhex("{hex_method}").decode().strip().lower()
2226W = 60
2227
2228def _load(path):
2229    ext = os.path.splitext(path)[1].lower().lstrip('.')
2230    if ext in ('csv','tsv'):
2231        with open(path, encoding='utf-8-sig', errors='replace', newline='') as fh:
2232            r = _csv.DictReader(fh, delimiter='\t' if ext=='tsv' else ',')
2233            return list(r)
2234    elif ext == 'json':
2235        with open(path, encoding='utf-8') as fh: d = _js.load(fh)
2236        return d if isinstance(d, list) else next(iter(d.values()), [])
2237    elif ext in ('db','sqlite','sqlite3'):
2238        con = _sq.connect(path)
2239        cur = con.cursor()
2240        cur.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1")
2241        t = cur.fetchone()
2242        if not t: return []
2243        cur.execute("SELECT * FROM [%s]" % t[0])
2244        cols2 = [d[0] for d in cur.description]
2245        rows2 = [dict(zip(cols2, r)) for r in cur.fetchall()]
2246        con.close()
2247        return rows2
2248    return []
2249
2250rows = _load(_path)
2251if not rows:
2252    print("  ERROR: no rows loaded from", _path); raise SystemExit(0)
2253all_cols = list(rows[0].keys())
2254
2255# Determine label and feature columns
2256if not _label:
2257    _label = all_cols[-1]
2258    print("  (No --classify-label specified; using last column: %s)" % _label)
2259
2260if _label not in all_cols:
2261    print("  ERROR: label column '%s' not found. Available: %s" % (_label, ', '.join(all_cols)))
2262    raise SystemExit(0)
2263
2264if _feats_s:
2265    feat_cols = [c.strip() for c in _feats_s.split(',') if c.strip() in all_cols]
2266else:
2267    feat_cols = [c for c in all_cols if c != _label]
2268
2269if not feat_cols:
2270    print("  ERROR: no feature columns found."); raise SystemExit(0)
2271
2272# Extract numeric feature vectors
2273def row_to_vec(row):
2274    v = []
2275    for c in feat_cols:
2276        try: v.append(float(row.get(c, 0) or 0))
2277        except: v.append(0.0)
2278    return v
2279
2280labeled = [(row_to_vec(r), str(r[_label]).strip()) for r in rows if str(r.get(_label,'')).strip()]
2281if len(labeled) < 3:
2282    print("  ERROR: need at least 3 labeled rows."); raise SystemExit(0)
2283
2284X = [v for v,_ in labeled]
2285y = [lbl for _,lbl in labeled]
2286classes = sorted(set(y))
2287
2288print("="*W)
2289print("  CLASSIFICATION")
2290print("  File:    %s" % os.path.basename(_path))
2291print("  Label:   %s   Features: %s" % (_label, ', '.join(feat_cols)))
2292print("  Method:  %s   Classes: %s" % (_method, ', '.join(classes)))
2293print("  Samples: %d" % len(labeled))
2294print("="*W)
2295
2296# â”€â”€ k-NN â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2297def knn_predict(X_train, y_train, x_q, k):
2298    dists = [(math.sqrt(sum((a-b)**2 for a,b in zip(x_q, xi))), yi)
2299             for xi, yi in zip(X_train, y_train)]
2300    dists.sort(key=lambda d: d[0])
2301    top = [yi for _, yi in dists[:k]]
2302    return Counter(top).most_common(1)[0][0]
2303
2304# â”€â”€ Gaussian Naive Bayes â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2305def gnb_fit(X_train, y_train):
2306    classes_t = sorted(set(y_train))
2307    stats = {{}}
2308    priors = {{}}
2309    n = len(y_train)
2310    for c in classes_t:
2311        idx = [i for i,yi in enumerate(y_train) if yi == c]
2312        priors[c] = len(idx) / n
2313        vecs = [X_train[i] for i in idx]
2314        m = [sum(vecs[j][f] for j in range(len(vecs)))/len(vecs) for f in range(len(feat_cols))]
2315        v = [sum((vecs[j][f]-m[f])**2 for j in range(len(vecs)))/max(len(vecs)-1,1) for f in range(len(feat_cols))]
2316        stats[c] = (m, v)
2317    return priors, stats
2318
2319def gnb_predict(priors, stats, x_q):
2320    best_c = None; best_log = float('-inf')
2321    for c, (m, v) in stats.items():
2322        log_p = math.log(priors[c] + 1e-300)
2323        for xi, mi, vi in zip(x_q, m, v):
2324            vi = max(vi, 1e-9)
2325            log_p += -0.5 * math.log(2*math.pi*vi) - (xi-mi)**2/(2*vi)
2326        if log_p > best_log: best_log = log_p; best_c = c
2327    return best_c
2328
2329# â”€â”€ LOO cross-validation â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2330correct = 0
2331confusion = defaultdict(lambda: defaultdict(int))
2332for i in range(len(labeled)):
2333    Xt = [X[j] for j in range(len(X)) if j != i]
2334    yt = [y[j] for j in range(len(y)) if j != i]
2335    x_q = X[i]; true = y[i]
2336    if _method == 'nb' or _method == 'naive_bayes' or _method == 'gnb':
2337        p, s = gnb_fit(Xt, yt); pred = gnb_predict(p, s, x_q)
2338    else:
2339        pred = knn_predict(Xt, yt, x_q, _k)
2340    confusion[true][pred] += 1
2341    if pred == true: correct += 1
2342
2343acc = correct / len(labeled)
2344print("\n  Leave-One-Out Cross-Validation")
2345print("  Accuracy: %d/%d = %.2f%%" % (correct, len(labeled), acc*100))
2346print()
2347
2348# Confusion matrix
2349print("  Confusion Matrix (actual=rows, predicted=cols):")
2350max_w = max(len(c) for c in classes) + 2
2351print("  " + " "*(max_w) + "  " + "  ".join(c.ljust(max_w) for c in classes))
2352for actual in classes:
2353    row = "  " + actual.ljust(max_w) + "  "
2354    row += "  ".join(str(confusion[actual].get(pred, 0)).ljust(max_w) for pred in classes)
2355    print(row)
2356
2357# Per-class precision/recall
2358print()
2359print("  Per-class metrics:")
2360print("  %-15s  %-10s  %-10s  %-10s" % ("Class","Precision","Recall","F1"))
2361print("  " + "-"*48)
2362for c in classes:
2363    tp = confusion[c].get(c, 0)
2364    fp = sum(confusion[other].get(c,0) for other in classes if other != c)
2365    fn = sum(confusion[c].get(other,0) for other in classes if other != c)
2366    prec = tp/(tp+fp) if tp+fp > 0 else 0
2367    rec  = tp/(tp+fn) if tp+fn > 0 else 0
2368    f1   = 2*prec*rec/(prec+rec) if prec+rec > 0 else 0
2369    print("  %-15s  %-10.3f  %-10.3f  %-10.3f" % (c[:15], prec, rec, f1))
2370
2371# Predict new sample if provided
2372if _pred_s:
2373    print()
2374    p_vals = [float(v.strip()) for v in _pred_s.split(',') if v.strip()]
2375    if len(p_vals) != len(feat_cols):
2376        print("  WARNING: --classify-predict has %d values but %d features expected" % (len(p_vals), len(feat_cols)))
2377    else:
2378        if _method in ('nb','naive_bayes','gnb'):
2379            p2, s2 = gnb_fit(X, y); pred_new = gnb_predict(p2, s2, p_vals)
2380        else:
2381            pred_new = knn_predict(X, y, p_vals, _k)
2382        print("  Prediction for [%s]:" % ', '.join('%.4g'%v for v in p_vals))
2383        print("  => %s" % pred_new)
2384
2385print("="*W)
2386"####,
2387        hex_path = hex_path,
2388        hex_label = hex_label,
2389        hex_feats = hex_feats,
2390        hex_predict = hex_predict,
2391        hex_method = hex_method,
2392        k = k,
2393    );
2394
2395    let sandbox_args = serde_json::json!({
2396        "language": "python",
2397        "code": script,
2398        "timeout_seconds": 30
2399    });
2400    crate::tools::code_sandbox::execute(&sandbox_args).await
2401}
2402
2403// â”€â”€ Regression analysis â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2404pub async fn regression_analysis(
2405    file_path: &str,
2406    x_col: &str,
2407    y_col: &str,
2408    degree: usize,
2409    predict_x: &str,
2410) -> Result<String, String> {
2411    let hex_path: String = file_path.bytes().map(|b| format!("{:02x}", b)).collect();
2412    let hex_x_col: String = x_col.bytes().map(|b| format!("{:02x}", b)).collect();
2413    let hex_y_col: String = y_col.bytes().map(|b| format!("{:02x}", b)).collect();
2414    let hex_predict: String = predict_x.bytes().map(|b| format!("{:02x}", b)).collect();
2415    let deg = degree.clamp(1, 10);
2416
2417    let script = format!(
2418        r####"import csv, sys, math
2419
2420_path   = bytes.fromhex("{hex_path}").decode().strip()
2421_x_col  = bytes.fromhex("{hex_x_col}").decode().strip()
2422_y_col  = bytes.fromhex("{hex_y_col}").decode().strip()
2423_degree = {deg}
2424_pred_s = bytes.fromhex("{hex_predict}").decode().strip()
2425W = 64
2426
2427# â”€â”€ load CSV â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2428with open(_path, newline="", encoding="utf-8-sig") as f:
2429    reader = csv.DictReader(f)
2430    rows   = list(reader)
2431    header = reader.fieldnames or []
2432
2433if not header:
2434    print("ERROR: empty file or no header"); sys.exit(1)
2435
2436x_col = _x_col if _x_col else header[0]
2437y_col = _y_col if _y_col else ([c for c in header if c != x_col] or [header[-1]])[-1]
2438
2439try:
2440    xs = [float(r[x_col]) for r in rows]
2441    ys = [float(r[y_col]) for r in rows]
2442except (KeyError, ValueError) as e:
2443    print(f"ERROR: {{e}}"); sys.exit(1)
2444
2445n = len(xs)
2446if n < 2:
2447    print("ERROR: need at least 2 data points"); sys.exit(1)
2448
2449deg = max(1, min(_degree, 10))
2450
2451# â”€â”€ Vandermonde least-squares via Gaussian elimination â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2452def poly_fit(xs, ys, deg):
2453    d = deg + 1
2454    ATA = [[0.0]*d for _ in range(d)]
2455    ATy = [0.0]*d
2456    for x, y in zip(xs, ys):
2457        pows = [x**k for k in range(d)]
2458        for i in range(d):
2459            ATy[i] += pows[i] * y
2460            for j in range(d):
2461                ATA[i][j] += pows[i] * pows[j]
2462    mat = [ATA[i][:] + [ATy[i]] for i in range(d)]
2463    for col in range(d):
2464        pivot = max(range(col, d), key=lambda r: abs(mat[r][col]))
2465        mat[col], mat[pivot] = mat[pivot], mat[col]
2466        if abs(mat[col][col]) < 1e-12:
2467            continue
2468        for row in range(col+1, d):
2469            f = mat[row][col] / mat[col][col]
2470            for k in range(col, d+1):
2471                mat[row][k] -= f * mat[col][k]
2472    coeffs = [0.0]*d
2473    for row in range(d-1, -1, -1):
2474        coeffs[row] = mat[row][d]
2475        for k in range(row+1, d):
2476            coeffs[row] -= mat[row][k] * coeffs[k]
2477        if abs(mat[row][row]) > 1e-12:
2478            coeffs[row] /= mat[row][row]
2479    return coeffs
2480
2481def poly_eval(coeffs, x):
2482    return sum(c * x**k for k, c in enumerate(coeffs))
2483
2484coeffs = poly_fit(xs, ys, deg)
2485
2486# â”€â”€ metrics â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2487y_mean = sum(ys) / n
2488ss_res = sum((y - poly_eval(coeffs, x))**2 for x, y in zip(xs, ys))
2489ss_tot = sum((y - y_mean)**2 for y in ys)
2490r2     = 1.0 - ss_res / ss_tot if ss_tot > 1e-12 else 1.0
2491rmse   = math.sqrt(ss_res / n)
2492mae    = sum(abs(y - poly_eval(coeffs, x)) for x, y in zip(xs, ys)) / n
2493
2494if deg == 1 and n > 1:
2495    sx  = math.sqrt(sum((x - sum(xs)/n)**2 for x in xs) / (n-1))
2496    sy  = math.sqrt(sum((y - y_mean)**2 for y in ys) / (n-1))
2497    sxy = sum((x - sum(xs)/n)*(y - y_mean) for x, y in zip(xs, ys)) / (n-1)
2498    pearson_r = sxy / (sx * sy) if sx * sy > 1e-12 else 0.0
2499else:
2500    pearson_r = None
2501
2502# â”€â”€ header â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2503print("=" * W)
2504label = "LINEAR" if deg == 1 else f"POLYNOMIAL (degree {{deg}})"
2505print(f"  REGRESSION ANALYSIS â€” {{label}}")
2506print(f"  X: {{x_col}}   Y: {{y_col}}   N: {{n}}")
2507print("=" * W)
2508terms = []
2509for k, c in enumerate(coeffs):
2510    if abs(c) < 1e-12: continue
2511    if k == 0:   terms.append(f"{{c:.6g}}")
2512    elif k == 1: terms.append(f"{{c:+.6g}}*x")
2513    else:        terms.append(f"{{c:+.6g}}*x^{{k}}")
2514print("  y = " + " ".join(terms) if terms else "  y = 0")
2515print()
2516print(f"  R2      : {{r2:.6f}}")
2517if pearson_r is not None:
2518    print(f"  Pearson : {{pearson_r:.6f}}")
2519print(f"  RMSE    : {{rmse:.6g}}")
2520print(f"  MAE     : {{mae:.6g}}")
2521if   r2 >= 0.95: qual = "Excellent fit (R2 >= 0.95)"
2522elif r2 >= 0.80: qual = "Good fit (R2 >= 0.80)"
2523elif r2 >= 0.60: qual = "Moderate fit (R2 >= 0.60)"
2524else:            qual = "Weak fit (R2 < 0.60)"
2525print(f"  Quality : {{qual}}")
2526print()
2527
2528# â”€â”€ ASCII scatter + fit curve â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2529ROWS, COLS = 16, W - 6
2530x_min, x_max = min(xs), max(xs)
2531cx_list = [x_min + (x_max - x_min)*i/(COLS-1) for i in range(COLS)] if COLS > 1 else [x_min]
2532cy_list = [poly_eval(coeffs, x) for x in cx_list]
2533y_min2  = min(list(ys) + cy_list)
2534y_max2  = max(list(ys) + cy_list)
2535
2536def to_col(x):
2537    return int((x - x_min) / (x_max - x_min) * (COLS-1)) if x_max != x_min else 0
2538
2539def to_row(y):
2540    return int((y_max2 - y) / (y_max2 - y_min2) * (ROWS-1)) if y_max2 != y_min2 else ROWS//2
2541
2542grid = [[" "]*COLS for _ in range(ROWS)]
2543for cx, cy in zip(cx_list, cy_list):
2544    r, c = to_row(cy), to_col(cx)
2545    if 0 <= r < ROWS and 0 <= c < COLS and grid[r][c] == " ":
2546        grid[r][c] = "-"
2547for x, y in zip(xs, ys):
2548    r, c = to_row(y), to_col(x)
2549    if 0 <= r < ROWS and 0 <= c < COLS:
2550        grid[r][c] = "*"
2551
2552print("  Scatter (* = data, - = fit curve):")
2553for i, row in enumerate(grid):
2554    if i == 0:       lbl = f"{{y_max2:.3g}}"
2555    elif i == ROWS-1: lbl = f"{{y_min2:.3g}}"
2556    else:             lbl = ""
2557    print(f"  {{lbl:>8}} |{{''.join(row)}}")
2558print(f"  {{' ':>8}} +" + "-"*COLS)
2559xl, xr = f"{{x_min:.3g}}", f"{{x_max:.3g}}"
2560pad_w = max(0, COLS - len(xl) - len(xr))
2561print("  " + " "*9 + xl + " "*pad_w + xr)
2562print()
2563
2564# â”€â”€ residuals â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2565resids = [y - poly_eval(coeffs, x) for x, y in zip(xs, ys)]
2566r_min, r_max = min(resids), max(resids)
2567rRs, rCs = 6, W - 6
2568grid_r = [[" "]*rCs for _ in range(rRs)]
2569rng    = max(abs(r_min), abs(r_max), 1e-12)
2570mid_r  = rRs // 2
2571for c in range(rCs):
2572    grid_r[mid_r][c] = "."
2573for x, res in zip(xs, resids):
2574    c = int((x - x_min)/(x_max - x_min)*(rCs-1)) if x_max != x_min else 0
2575    r = int((rng - res)/(2*rng)*(rRs-1))
2576    if 0 <= r < rRs and 0 <= c < rCs:
2577        grid_r[r][c] = "o"
2578print("  Residuals (o = data, . = zero line):")
2579for row in grid_r:
2580    print("  " + "".join(row))
2581print("  " + "-"*rCs)
2582print(f"  Range: [{{r_min:.4g}}, {{r_max:.4g}}]")
2583print()
2584
2585# â”€â”€ predictions â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
2586if _pred_s:
2587    try:
2588        pred_xs = [float(v.strip()) for v in _pred_s.split(",")]
2589        print("  Predictions:")
2590        for px in pred_xs:
2591            py = poly_eval(coeffs, px)
2592            print(f"    x = {{px:.6g}}  =>  y = {{py:.6g}}")
2593        print()
2594    except ValueError:
2595        print(f"  WARNING: bad --regression-predict value: {{_pred_s}}")
2596        print()
2597print("=" * W)
2598"####,
2599        hex_path = hex_path,
2600        hex_x_col = hex_x_col,
2601        hex_y_col = hex_y_col,
2602        hex_predict = hex_predict,
2603        deg = deg,
2604    );
2605
2606    let sandbox_args = serde_json::json!({
2607        "language": "python",
2608        "code": script,
2609        "timeout_seconds": 30
2610    });
2611    crate::tools::code_sandbox::execute(&sandbox_args).await
2612}
hematite/tools/data_tools.rs

hematite/tools/
data_tools.rs