Skip to main content

hedl_cli/commands/
stats.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
//! Stats command - shows size and token savings comparison.
//!
//! This module provides parallel statistics generation for HEDL files,
//! comparing size and token efficiency against JSON, YAML, and XML formats.
//!
//! # Performance
//!
//! Uses rayon for parallel format conversion and token estimation:
//! - Format conversions run in parallel (JSON, YAML, XML)
//! - Token estimates are computed in parallel when enabled
//! - Typical speedup: 3-5x on multi-core systems
//!
//! # Thread Safety
//!
//! All format conversions are independent and thread-safe. No shared mutable state.
33
34use super::read_file;
35use crate::error::CliError;
36use hedl_core::{parse, Document};
37use hedl_json::{to_json_value, ToJsonConfig};
38use hedl_xml::{to_xml as hedl_to_xml, ToXmlConfig};
39use hedl_yaml::{to_yaml as hedl_to_yaml, ToYamlConfig};
40use rayon::prelude::*;
41use std::sync::Arc;
42
/// Divisor applied to the non-whitespace character count when estimating tokens
/// (heuristic: roughly four content characters per token).
const CHARS_PER_CONTENT_TOKEN: usize = 4;
/// Divisor applied to the whitespace character count when estimating tokens
/// (heuristic: roughly three whitespace characters per token).
const WHITESPACE_PER_TOKEN: usize = 3;

/// Estimate the token count of `text` with a cl100k_base-like heuristic.
///
/// The text is split into two buckets of Unicode scalar values (`char`s):
/// whitespace (per [`char::is_whitespace`]) and everything else. Each bucket
/// is divided by its own constant, because whitespace-heavy formats inflate
/// the character count more than they inflate the token count.
///
/// Both buckets are counted in `char`s, not bytes, so multi-byte UTF-8 text
/// is not over-weighted and multi-byte whitespace (e.g. U+3000) is never
/// partially attributed to the content bucket.
///
/// # Returns
///
/// `non_whitespace / CHARS_PER_CONTENT_TOKEN + whitespace / WHITESPACE_PER_TOKEN`,
/// using integer (floor) division. Very short inputs can therefore yield `0`.
fn estimate_tokens(text: &str) -> usize {
    // Single pass over the characters: tally whitespace and content together
    // so both counts are guaranteed to use the same unit.
    let (whitespace, non_whitespace) =
        text.chars()
            .fold((0usize, 0usize), |(ws, content), c| {
                if c.is_whitespace() {
                    (ws + 1, content)
                } else {
                    (ws, content + 1)
                }
            });

    let content_tokens = non_whitespace / CHARS_PER_CONTENT_TOKEN;
    let whitespace_tokens = whitespace / WHITESPACE_PER_TOKEN;

    content_tokens + whitespace_tokens
}
67
68/// Format statistics computed in parallel
69#[derive(Debug, Clone)]
70struct FormatStats {
71    json_compact: String,
72    json_pretty: String,
73    yaml: String,
74    xml_compact: String,
75    xml_pretty: String,
76}
77
78impl FormatStats {
79    /// Compute all format conversions in parallel.
80    ///
81    /// Uses rayon to parallelize conversion to JSON (compact/pretty), YAML,
82    /// and XML (compact/pretty) formats for maximum performance.
83    ///
84    /// # Arguments
85    ///
86    /// * `doc` - The HEDL document to convert
87    ///
88    /// # Returns
89    ///
90    /// Returns `FormatStats` containing all converted formats.
91    ///
92    /// # Errors
93    ///
94    /// Returns `Err` if any format conversion fails.
95    ///
96    /// # Performance
97    ///
98    /// Achieves 3-5x speedup on multi-core systems by running conversions
99    /// in parallel threads.
100    fn compute_parallel(doc: &Document) -> Result<Self, CliError> {
101        // Use Arc to share the document across threads safely
102        let doc = Arc::new(doc.clone());
103
104        // Define conversion tasks as closures
105        let tasks: Vec<Box<dyn Fn() -> Result<String, CliError> + Send + Sync>> = vec![
106            // JSON compact
107            Box::new({
108                let doc = Arc::clone(&doc);
109                move || {
110                    let config = ToJsonConfig::default();
111                    let value = to_json_value(&doc, &config).map_err(|e| {
112                        CliError::json_conversion(format!("JSON conversion error: {e}"))
113                    })?;
114                    serde_json::to_string(&value).map_err(|e| {
115                        CliError::json_conversion(format!("JSON serialization error: {e}"))
116                    })
117                }
118            }),
119            // JSON pretty
120            Box::new({
121                let doc = Arc::clone(&doc);
122                move || {
123                    let config = ToJsonConfig::default();
124                    let value = to_json_value(&doc, &config).map_err(|e| {
125                        CliError::json_conversion(format!("JSON conversion error: {e}"))
126                    })?;
127                    serde_json::to_string_pretty(&value).map_err(|e| {
128                        CliError::json_conversion(format!("JSON pretty serialization error: {e}"))
129                    })
130                }
131            }),
132            // YAML
133            Box::new({
134                let doc = Arc::clone(&doc);
135                move || {
136                    let config = ToYamlConfig::default();
137                    hedl_to_yaml(&doc, &config).map_err(|e| {
138                        CliError::yaml_conversion(format!("YAML conversion error: {e}"))
139                    })
140                }
141            }),
142            // XML compact
143            Box::new({
144                let doc = Arc::clone(&doc);
145                move || {
146                    let config = ToXmlConfig {
147                        pretty: false,
148                        ..Default::default()
149                    };
150                    hedl_to_xml(&doc, &config)
151                        .map_err(|e| CliError::xml_conversion(format!("XML conversion error: {e}")))
152                }
153            }),
154            // XML pretty
155            Box::new({
156                let doc = Arc::clone(&doc);
157                move || {
158                    let config = ToXmlConfig {
159                        pretty: true,
160                        ..Default::default()
161                    };
162                    hedl_to_xml(&doc, &config).map_err(|e| {
163                        CliError::xml_conversion(format!("XML pretty conversion error: {e}"))
164                    })
165                }
166            }),
167        ];
168
169        // Execute all conversions in parallel
170        let results: Result<Vec<String>, CliError> = tasks.par_iter().map(|task| task()).collect();
171
172        let outputs = results?;
173
174        // Extract results in order (using indices for safety)
175        // We expect exactly 5 outputs from our 5 format conversion tasks
176        if outputs.len() != 5 {
177            return Err(CliError::parse(format!(
178                "Internal error: expected 5 format conversions, got {}",
179                outputs.len()
180            )));
181        }
182
183        // Use into_iter to consume the vector and extract in order
184        let mut iter = outputs.into_iter();
185        Ok(FormatStats {
186            // SAFETY: We verified length == 5 above
187            json_compact: iter.next().expect("length verified"),
188            json_pretty: iter.next().expect("length verified"),
189            yaml: iter.next().expect("length verified"),
190            xml_compact: iter.next().expect("length verified"),
191            xml_pretty: iter.next().expect("length verified"),
192        })
193    }
194}
195
196/// Generate comprehensive size and efficiency statistics for a HEDL file.
197///
198/// Parses a HEDL file and compares its size and token efficiency against equivalent
199/// JSON, YAML, and XML representations. Optionally estimates LLM token counts for
200/// context window optimization.
201///
202/// # Arguments
203///
204/// * `file` - Path to the HEDL file to analyze
205/// * `show_tokens` - If `true`, includes estimated token counts for LLM context
206///
207/// # Returns
208///
209/// Returns `Ok(())` on success.
210///
211/// # Errors
212///
213/// Returns `Err` if:
214/// - The file cannot be read
215/// - The file contains syntax errors
216/// - Format conversions fail
217///
218/// # Examples
219///
220/// ```no_run
221/// use hedl_cli::commands::stats;
222///
223/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
224/// // Show byte size comparison
225/// stats("data.hedl", false)?;
226///
227/// // Show byte and token comparison
228/// stats("data.hedl", true)?;
229/// # Ok(())
230/// # }
231/// ```
232///
233/// # Output
234///
235/// Displays a formatted table showing:
236/// - File sizes in bytes for HEDL, JSON (compact/pretty), YAML, XML (compact/pretty)
237/// - Size savings (absolute and percentage)
238/// - Estimated token counts (if enabled)
239/// - Token savings compared to other formats
240///
241/// # Performance
242///
243/// Uses parallel processing (rayon) to compute all format conversions simultaneously,
244/// achieving 3-5x speedup on multi-core systems. Token estimation is also parallelized.
245///
246/// # Token Estimation
247///
248/// Token counts use a heuristic approximation:
249/// - ~4 characters per content token
250/// - ~3 whitespace characters per token
251/// - Based on empirical averages for structured data formats
252pub fn stats(file: &str, show_tokens: bool) -> Result<(), CliError> {
253    let content = read_file(file)?;
254    let hedl_bytes = content.len();
255
256    // Parse HEDL
257    let doc =
258        parse(content.as_bytes()).map_err(|e| CliError::parse(format!("Parse error: {e}")))?;
259
260    // Compute all format conversions in parallel
261    let formats = FormatStats::compute_parallel(&doc)?;
262
263    // Extract byte sizes
264    let json_bytes = formats.json_compact.len();
265    let json_pretty_bytes = formats.json_pretty.len();
266    let yaml_bytes = formats.yaml.len();
267    let xml_bytes = formats.xml_compact.len();
268    let xml_pretty_bytes = formats.xml_pretty.len();
269
270    // Calculate savings
271    let calc_savings = |other: usize| -> (i64, f64) {
272        let diff = other as i64 - hedl_bytes as i64;
273        let pct = if other > 0 {
274            (diff as f64 / other as f64) * 100.0
275        } else {
276            0.0
277        };
278        (diff, pct)
279    };
280
281    println!("HEDL Size Comparison");
282    println!("====================");
283    println!();
284    println!("Input: {file}");
285    println!();
286
287    // Byte comparison table
288    println!("Bytes:");
289    println!(
290        "  {:<20} {:>10} {:>12} {:>10}",
291        "Format", "Size", "Savings", "%"
292    );
293    println!("  {:-<20} {:-^10} {:-^12} {:-^10}", "", "", "", "");
294
295    println!("  {:<20} {:>10}", "HEDL", format_bytes(hedl_bytes));
296
297    let (json_diff, json_pct) = calc_savings(json_bytes);
298    println!(
299        "  {:<20} {:>10} {:>12} {:>9.1}%",
300        "JSON (minified)",
301        format_bytes(json_bytes),
302        format_diff(json_diff),
303        json_pct
304    );
305
306    let (json_pretty_diff, json_pretty_pct) = calc_savings(json_pretty_bytes);
307    println!(
308        "  {:<20} {:>10} {:>12} {:>9.1}%",
309        "JSON (pretty)",
310        format_bytes(json_pretty_bytes),
311        format_diff(json_pretty_diff),
312        json_pretty_pct
313    );
314
315    let (yaml_diff, yaml_pct) = calc_savings(yaml_bytes);
316    println!(
317        "  {:<20} {:>10} {:>12} {:>9.1}%",
318        "YAML",
319        format_bytes(yaml_bytes),
320        format_diff(yaml_diff),
321        yaml_pct
322    );
323
324    let (xml_diff, xml_pct) = calc_savings(xml_bytes);
325    println!(
326        "  {:<20} {:>10} {:>12} {:>9.1}%",
327        "XML (minified)",
328        format_bytes(xml_bytes),
329        format_diff(xml_diff),
330        xml_pct
331    );
332
333    let (xml_pretty_diff, xml_pretty_pct) = calc_savings(xml_pretty_bytes);
334    println!(
335        "  {:<20} {:>10} {:>12} {:>9.1}%",
336        "XML (pretty)",
337        format_bytes(xml_pretty_bytes),
338        format_diff(xml_pretty_diff),
339        xml_pretty_pct
340    );
341
342    // Token estimation (parallel)
343    if show_tokens {
344        println!();
345        println!("Estimated Tokens (LLM context):");
346
347        // Compute token estimates in parallel
348        let texts = vec![
349            &content,
350            &formats.json_compact,
351            &formats.json_pretty,
352            &formats.yaml,
353            &formats.xml_compact,
354            &formats.xml_pretty,
355        ];
356
357        let token_counts: Vec<usize> = texts.par_iter().map(|text| estimate_tokens(text)).collect();
358
359        let hedl_tokens = token_counts[0];
360        let json_tokens = token_counts[1];
361        let json_pretty_tokens = token_counts[2];
362        let yaml_tokens = token_counts[3];
363        let xml_tokens = token_counts[4];
364        let xml_pretty_tokens = token_counts[5];
365
366        let calc_token_savings = |other: usize| -> (i64, f64) {
367            let diff = other as i64 - hedl_tokens as i64;
368            let pct = if other > 0 {
369                (diff as f64 / other as f64) * 100.0
370            } else {
371                0.0
372            };
373            (diff, pct)
374        };
375
376        println!(
377            "  {:<20} {:>10} {:>12} {:>10}",
378            "Format", "Tokens", "Savings", "%"
379        );
380        println!("  {:-<20} {:-^10} {:-^12} {:-^10}", "", "", "", "");
381
382        println!("  {:<20} {:>10}", "HEDL", format_number(hedl_tokens));
383
384        let (json_tok_diff, json_tok_pct) = calc_token_savings(json_tokens);
385        println!(
386            "  {:<20} {:>10} {:>12} {:>9.1}%",
387            "JSON (minified)",
388            format_number(json_tokens),
389            format_diff(json_tok_diff),
390            json_tok_pct
391        );
392
393        let (json_pretty_tok_diff, json_pretty_tok_pct) = calc_token_savings(json_pretty_tokens);
394        println!(
395            "  {:<20} {:>10} {:>12} {:>9.1}%",
396            "JSON (pretty)",
397            format_number(json_pretty_tokens),
398            format_diff(json_pretty_tok_diff),
399            json_pretty_tok_pct
400        );
401
402        let (yaml_tok_diff, yaml_tok_pct) = calc_token_savings(yaml_tokens);
403        println!(
404            "  {:<20} {:>10} {:>12} {:>9.1}%",
405            "YAML",
406            format_number(yaml_tokens),
407            format_diff(yaml_tok_diff),
408            yaml_tok_pct
409        );
410
411        let (xml_tok_diff, xml_tok_pct) = calc_token_savings(xml_tokens);
412        println!(
413            "  {:<20} {:>10} {:>12} {:>9.1}%",
414            "XML (minified)",
415            format_number(xml_tokens),
416            format_diff(xml_tok_diff),
417            xml_tok_pct
418        );
419
420        let (xml_pretty_tok_diff, xml_pretty_tok_pct) = calc_token_savings(xml_pretty_tokens);
421        println!(
422            "  {:<20} {:>10} {:>12} {:>9.1}%",
423            "XML (pretty)",
424            format_number(xml_pretty_tokens),
425            format_diff(xml_pretty_tok_diff),
426            xml_pretty_tok_pct
427        );
428
429        println!();
430        println!("Note: Token estimates use ~4 chars/token heuristic for structured data.");
431    }
432
433    Ok(())
434}
435
/// Render a byte count for display using decimal (base-1000) units.
///
/// Values below 1,000 are printed exactly with a ` B` suffix; larger values
/// are scaled to ` KB` or ` MB` and shown with one decimal place.
fn format_bytes(bytes: usize) -> String {
    match bytes {
        0..=999 => format!("{bytes} B"),
        1_000..=999_999 => format!("{:.1} KB", bytes as f64 / 1_000.0),
        _ => format!("{:.1} MB", bytes as f64 / 1_000_000.0),
    }
}
445
/// Render a count compactly for display.
///
/// Values below 1,000 are printed exactly; larger values are scaled by
/// 1,000 (`K`) or 1,000,000 (`M`) and shown with one decimal place.
fn format_number(n: usize) -> String {
    if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1_000_000.0)
    } else if n >= 1_000 {
        format!("{:.1}K", n as f64 / 1_000.0)
    } else {
        format!("{n}")
    }
}

/// Render a signed difference with an explicit `+` or `-` sign.
///
/// Zero is printed as a bare `0`. The magnitude is formatted by
/// [`format_number`], so large differences are abbreviated with `K`/`M`.
///
/// The magnitude is taken with `unsigned_abs`, which is defined for every
/// `i64` including `i64::MIN` (plain negation would overflow there). If the
/// magnitude does not fit in `usize` on the target, it saturates at
/// `usize::MAX`.
fn format_diff(diff: i64) -> String {
    if diff == 0 {
        return "0".to_string();
    }

    let sign = if diff > 0 { '+' } else { '-' };
    let magnitude = usize::try_from(diff.unsigned_abs()).unwrap_or(usize::MAX);
    format!("{sign}{}", format_number(magnitude))
}