sarif_rust 0.3.0

A comprehensive Rust library for parsing, generating, and manipulating SARIF (Static Analysis Results Interchange Format) v2.1.0 files
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
//! Advanced indexing system for fast SARIF lookups
//!
//! This module provides efficient indexing and lookup capabilities for SARIF data,
//! enabling fast cross-reference resolution, rule lookups, and result filtering.

use crate::parser::SarifResult as ParseResult;
use crate::types::{Artifact, ReportingDescriptor, Result as SarifResult, Run, SarifLog};
use std::collections::{HashMap, HashSet};
use std::path::Path;

/// Comprehensive index for fast SARIF data lookups
///
/// The index owns clones of the rules, artifacts, and results it was built
/// from, so it does not borrow from the originating `SarifLog`.
#[derive(Debug, Clone)]
pub struct SarifIndex {
    /// Maps rule IDs to their ReportingDescriptor objects
    ///
    /// Inserting a rule whose ID is already present replaces the earlier entry.
    pub rules: HashMap<String, ReportingDescriptor>,

    /// Maps artifact URIs to their Artifact objects and indices
    ///
    /// The index is the artifact's position within its run's `artifacts` list.
    /// Only artifacts that carry a location with a URI are recorded.
    pub artifacts: HashMap<String, (Artifact, usize)>,

    /// Maps result keys to their Result objects and locations
    ///
    /// The key is the result's GUID when it has one; otherwise it is the
    /// synthetic string `"<run_index>:<result_index>"`.
    pub results: HashMap<String, (SarifResult, ResultLocation)>,

    /// Maps rule IDs to lists of results that reference them
    ///
    /// The listed strings are keys into `results` (GUID or synthetic key).
    pub rule_to_results: HashMap<String, Vec<String>>, // rule_id -> result keys

    /// Maps artifact URIs to lists of results that reference them
    ///
    /// Only the primary artifact URI of each result is used; the listed
    /// strings are keys into `results`.
    pub artifact_to_results: HashMap<String, Vec<String>>, // artifact_uri -> result keys

    /// Maps tool names to their runs
    ///
    /// The tool name is the run's driver name.
    pub tool_to_runs: HashMap<String, Vec<usize>>, // tool_name -> run indices

    /// Statistical information about the index
    pub stats: IndexStats,

    /// Metadata about the indexed SARIF log
    pub metadata: IndexMetadata,
}

/// Location information for a result within the SARIF structure
///
/// Stored next to every indexed result so a match can be traced back to its
/// run and its position inside that run.
#[derive(Debug, Clone, PartialEq)]
pub struct ResultLocation {
    /// Index of the run containing this result
    pub run_index: usize,

    /// Index of the result within the run
    pub result_index: usize,

    /// GUID of the result (if present)
    pub guid: Option<String>,

    /// Rule ID associated with the result (if present)
    pub rule_id: Option<String>,

    /// Primary artifact URI (if present)
    ///
    /// Taken from the artifact location of the result's first location entry.
    pub primary_artifact_uri: Option<String>,
}

/// Statistical information about the indexed SARIF data
///
/// All counters start at zero (`Default`) and are updated while indexing.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Total number of runs
    ///
    /// Set from the length of the log's `runs` list each time a log is indexed.
    pub run_count: usize,

    /// Total number of results across all runs
    pub result_count: usize,

    /// Count of rules recorded while indexing (see `SarifIndex::rules`)
    pub rule_count: usize,

    /// Count of artifacts recorded while indexing (see `SarifIndex::artifacts`)
    pub artifact_count: usize,

    /// Number of results with GUIDs
    pub results_with_guids: usize,

    /// Number of results with rule IDs
    pub results_with_rule_ids: usize,

    /// Number of results with locations
    ///
    /// Counts results whose `locations` field is present, even if the list is empty.
    pub results_with_locations: usize,

    /// Distribution of results by level
    ///
    /// Keys are the `Debug` rendering of the level, or `"None"` when the
    /// result has no level.
    pub results_by_level: HashMap<String, usize>,

    /// Distribution of results by kind
    ///
    /// Keys are the `Debug` rendering of the kind, or `"None"` when the
    /// result has no kind.
    pub results_by_kind: HashMap<String, usize>,
}

/// Metadata about the indexed SARIF log
#[derive(Debug, Clone)]
pub struct IndexMetadata {
    /// SARIF version
    ///
    /// `SarifIndex::new` initialises this to `"2.1.0"`; building from a log
    /// copies the log's own version.
    pub version: String,

    /// Schema URI (if present)
    pub schema: Option<String>,

    /// Tools used in the runs
    ///
    /// Filled from the keys of `SarifIndex::tool_to_runs` after indexing.
    pub tools: Vec<String>,

    /// Creation timestamp of the index
    ///
    /// NOTE(review): `SarifIndex::new` currently writes a fixed placeholder
    /// string here, not the actual time of indexing.
    pub indexed_at: String, // Using String instead of chrono for now

    /// Original file path (if known)
    ///
    /// Only set by `SarifIndex::from_file`.
    pub source_path: Option<String>,
}

/// Query builder for searching indexed SARIF data
///
/// Every filter is optional; `None` means "do not filter on this field".
/// The default value has every field set to `None`.
#[derive(Debug, Clone, Default)]
pub struct SarifQuery {
    /// Filter by rule IDs
    ///
    /// A result without a rule ID never matches when this filter is set.
    pub rule_ids: Option<HashSet<String>>,

    /// Filter by artifact URIs
    ///
    /// Compared against the result's primary artifact URI; a result without
    /// one never matches when this filter is set.
    pub artifact_uris: Option<HashSet<String>>,

    /// Filter by result levels
    ///
    /// Values are compared with the `Debug` rendering of the result's level;
    /// use `"None"` to select results that carry no level.
    pub levels: Option<HashSet<String>>,

    /// Filter by result kinds
    ///
    /// Values are compared with the `Debug` rendering of the result's kind;
    /// use `"None"` to select results that carry no kind.
    pub kinds: Option<HashSet<String>>,

    /// Filter by tool names
    pub tools: Option<HashSet<String>>,

    /// Filter by run indices
    pub run_indices: Option<HashSet<usize>>,

    /// Text search in result messages
    ///
    /// Case-insensitive substring match on the message text; a result whose
    /// message has no text never matches when this filter is set.
    pub message_contains: Option<String>,

    /// Filter by presence of GUID
    pub has_guid: Option<bool>,

    /// Filter by presence of fingerprints
    ///
    /// A result counts as having fingerprints when either `fingerprints` or
    /// `partial_fingerprints` is present.
    pub has_fingerprints: Option<bool>,

    /// Maximum number of results to return
    pub limit: Option<usize>,
}

/// Results of a SARIF query
#[derive(Debug, Clone)]
pub struct QueryResults {
    /// Matching results with their locations
    ///
    /// These are owned clones of the indexed entries, truncated to the
    /// query's `limit` when one is set.
    pub results: Vec<(SarifResult, ResultLocation)>,

    /// Total number of matches (before limit applied)
    pub total_matches: usize,

    /// Whether the results were limited
    ///
    /// `true` only when a limit was set and more matches existed than the limit allows.
    pub limited: bool,

    /// Query execution statistics
    pub execution_stats: QueryExecutionStats,
}

/// Statistics about query execution performance
#[derive(Debug, Clone)]
pub struct QueryExecutionStats {
    /// Time taken to execute the query
    ///
    /// Measured in whole milliseconds, so fast queries report `0`.
    pub execution_time_ms: u64,

    /// Number of results scanned
    ///
    /// Counts every indexed result that was tested against the query,
    /// whether or not it matched.
    pub results_scanned: usize,

    /// Names of the lookup maps used to narrow the scan
    ///
    /// Contains `"rule_to_results"` or `"artifact_to_results"`; empty when
    /// the query fell back to a full scan.
    pub indices_used: Vec<String>,
}

impl SarifIndex {
    /// Construct an index that contains no entries.
    ///
    /// Every lookup map starts empty and the statistics are zeroed. The
    /// metadata defaults to SARIF version `"2.1.0"` with no schema, no tools,
    /// and no source path; `indexed_at` is a fixed placeholder string rather
    /// than the current time.
    pub fn new() -> Self {
        let metadata = IndexMetadata {
            version: String::from("2.1.0"),
            schema: None,
            tools: Vec::new(),
            indexed_at: String::from("2024-01-01T00:00:00Z"), // Placeholder timestamp
            source_path: None,
        };

        Self {
            metadata,
            stats: IndexStats::default(),
            tool_to_runs: HashMap::new(),
            artifact_to_results: HashMap::new(),
            rule_to_results: HashMap::new(),
            results: HashMap::new(),
            artifacts: HashMap::new(),
            rules: HashMap::new(),
        }
    }

    /// Create an index populated from an in-memory SARIF log.
    ///
    /// The log's version and schema are copied into the index metadata, then
    /// every run in the log is indexed.
    pub fn from_sarif_log(sarif_log: &SarifLog) -> Self {
        let mut built = Self::new();

        // Record where the data came from before walking the runs.
        built.metadata.schema = sarif_log.schema.clone();
        built.metadata.version = sarif_log.version.clone();

        built.index_sarif_log(sarif_log);
        built
    }

    /// Read a SARIF file from disk, parse it, and index its contents.
    ///
    /// The path is stored (lossily converted to UTF-8) in
    /// `metadata.source_path`.
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be read as a string or when its
    /// contents fail to parse as a SARIF log.
    pub fn from_file<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
        let source = path.as_ref();

        let raw = std::fs::read_to_string(source)?;
        let parsed: SarifLog = crate::from_str(&raw)?;

        let mut built = Self::from_sarif_log(&parsed);
        built.metadata.source_path = Some(source.to_string_lossy().into_owned());
        Ok(built)
    }

    /// Index every run of a SARIF log into this index.
    ///
    /// `stats.run_count` is set to the number of runs in `sarif_log`, each run
    /// is indexed under its position in the log, and `metadata.tools` is
    /// refreshed from the tool names seen so far (sorted, so the list is
    /// stable across invocations).
    ///
    /// This method performs no console output; callers that want timing
    /// information should measure around the call themselves.
    pub fn index_sarif_log(&mut self, sarif_log: &SarifLog) {
        self.stats.run_count = sarif_log.runs.len();

        for (run_index, run) in sarif_log.runs.iter().enumerate() {
            self.index_run(run, run_index);
        }

        // HashMap key order is arbitrary; sort so the tool list is deterministic.
        let mut tools: Vec<String> = self.tool_to_runs.keys().cloned().collect();
        tools.sort_unstable();
        self.metadata.tools = tools;
    }

    /// Index a single run: its tool, rules, artifacts, and results.
    ///
    /// * `run` - the run to index.
    /// * `run_index` - position of the run within its log; recorded for the
    ///   tool and passed on to every result of the run.
    ///
    /// `stats.rule_count` and `stats.artifact_count` are only incremented when
    /// a rule ID or artifact URI is seen for the first time, so they stay equal
    /// to the number of distinct entries even when several runs share rules or
    /// artifacts. A repeated ID/URI still replaces the stored entry.
    fn index_run(&mut self, run: &Run, run_index: usize) {
        // Index tool
        let tool_name = run.tool.driver.name.clone();
        self.tool_to_runs
            .entry(tool_name)
            .or_default()
            .push(run_index);

        // Index rules
        if let Some(ref rules) = run.tool.driver.rules {
            for rule in rules {
                // `insert` returns the previous value; `None` means the ID is new.
                if self.rules.insert(rule.id.clone(), rule.clone()).is_none() {
                    self.stats.rule_count += 1;
                }
            }
        }

        // Index artifacts (only those that carry a location URI)
        if let Some(ref artifacts) = run.artifacts {
            for (artifact_index, artifact) in artifacts.iter().enumerate() {
                let uri = artifact
                    .location
                    .as_ref()
                    .and_then(|location| location.uri.as_ref());

                if let Some(uri) = uri {
                    let previous = self
                        .artifacts
                        .insert(uri.clone(), (artifact.clone(), artifact_index));
                    if previous.is_none() {
                        self.stats.artifact_count += 1;
                    }
                }
            }
        }

        // Index results
        if let Some(ref results) = run.results {
            for (result_index, result) in results.iter().enumerate() {
                self.index_result(result, run_index, result_index);
            }
        }
    }

    /// Record one result in the lookup maps and update the statistics.
    ///
    /// * `result` - the result to index (cloned into the index).
    /// * `run_index` - index of the run that contains the result.
    /// * `result_index` - index of the result within that run.
    ///
    /// The result is stored under its GUID when it has one, otherwise under
    /// the synthetic key `"<run_index>:<result_index>"`. The same key is
    /// appended to the per-rule and per-artifact lists.
    fn index_result(&mut self, result: &SarifResult, run_index: usize, result_index: usize) {
        let primary_artifact_uri = self.extract_primary_artifact_uri(result);

        // Results without a GUID still need a unique handle.
        let key = match result.guid {
            Some(ref guid) => guid.clone(),
            None => format!("{}:{}", run_index, result_index),
        };

        let location = ResultLocation {
            run_index,
            result_index,
            guid: result.guid.clone(),
            rule_id: result.rule_id.clone(),
            primary_artifact_uri,
        };

        // Plain counters.
        self.stats.result_count += 1;
        if result.guid.is_some() {
            self.stats.results_with_guids += 1;
        }
        if result.locations.is_some() {
            self.stats.results_with_locations += 1;
        }

        // Reverse lookup: rule ID -> result keys.
        if let Some(rule_id) = result.rule_id.as_ref() {
            self.rule_to_results
                .entry(rule_id.clone())
                .or_default()
                .push(key.clone());
            self.stats.results_with_rule_ids += 1;
        }

        // Reverse lookup: primary artifact URI -> result keys.
        if let Some(uri) = location.primary_artifact_uri.as_ref() {
            self.artifact_to_results
                .entry(uri.clone())
                .or_default()
                .push(key.clone());
        }

        // Histogram by level; absent levels are bucketed under "None".
        let level_label = match result.level.as_ref() {
            Some(level) => format!("{:?}", level),
            None => "None".to_string(),
        };
        *self.stats.results_by_level.entry(level_label).or_default() += 1;

        // Histogram by kind; absent kinds are bucketed under "None".
        let kind_label = match result.kind.as_ref() {
            Some(kind) => format!("{:?}", kind),
            None => "None".to_string(),
        };
        *self.stats.results_by_kind.entry(kind_label).or_default() += 1;

        // Store the result itself (whether it has a GUID or not).
        self.results.insert(key, (result.clone(), location));
    }

    /// Return the artifact URI of the result's first location, if any.
    ///
    /// Yields `None` when the result has no locations, the list is empty, or
    /// the first location lacks a physical location, an artifact location, or
    /// a URI.
    fn extract_primary_artifact_uri(&self, result: &SarifResult) -> Option<String> {
        let first_location = result.locations.as_ref()?.first()?;
        let physical = first_location.physical_location.as_ref()?;
        let artifact_location = physical.artifact_location.as_ref()?;
        artifact_location.uri.clone()
    }

    /// Query the index
    pub fn query(&self, query: &SarifQuery) -> QueryResults {
        let start_time = std::time::Instant::now();
        let mut matching_results = Vec::new();
        let mut results_scanned = 0;
        let mut indices_used = Vec::new();

        // Optimize query based on available indices
        if let Some(ref rule_ids) = query.rule_ids {
            indices_used.push("rule_to_results".to_string());
            for rule_id in rule_ids {
                if let Some(result_guids) = self.rule_to_results.get(rule_id) {
                    for guid in result_guids {
                        if let Some((result, location)) = self.results.get(guid) {
                            if self.matches_query(result, location, query) {
                                matching_results.push((result.clone(), location.clone()));
                            }
                            results_scanned += 1;
                        }
                    }
                }
            }
        } else if let Some(ref artifact_uris) = query.artifact_uris {
            indices_used.push("artifact_to_results".to_string());
            for uri in artifact_uris {
                if let Some(result_guids) = self.artifact_to_results.get(uri) {
                    for guid in result_guids {
                        if let Some((result, location)) = self.results.get(guid) {
                            if self.matches_query(result, location, query) {
                                matching_results.push((result.clone(), location.clone()));
                            }
                            results_scanned += 1;
                        }
                    }
                }
            }
        } else {
            // Full scan if no optimizable filters
            for (result, location) in self.results.values() {
                if self.matches_query(result, location, query) {
                    matching_results.push((result.clone(), location.clone()));
                }
                results_scanned += 1;
            }
        }

        let total_matches = matching_results.len();
        let limited = if let Some(limit) = query.limit {
            if matching_results.len() > limit {
                matching_results.truncate(limit);
                true
            } else {
                false
            }
        } else {
            false
        };

        let execution_time = start_time.elapsed();

        QueryResults {
            results: matching_results,
            total_matches,
            limited,
            execution_stats: QueryExecutionStats {
                execution_time_ms: execution_time.as_millis() as u64,
                results_scanned,
                indices_used,
            },
        }
    }

    /// Check whether a result satisfies every filter set on a query.
    ///
    /// * `result` - the indexed result to test.
    /// * `location` - where that result lives (run index, primary artifact URI).
    /// * `query` - the filters to apply; unset (`None`) filters are ignored.
    ///
    /// Returns `true` only if all set filters accept the result. `limit` is
    /// not a filter and is not examined here.
    ///
    /// The `tools` filter is evaluated via `tool_to_runs`: a result matches
    /// when its run was produced by at least one of the requested tools.
    fn matches_query(
        &self,
        result: &SarifResult,
        location: &ResultLocation,
        query: &SarifQuery,
    ) -> bool {
        // Check rule IDs (a result without a rule ID cannot match)
        if let Some(ref rule_ids) = query.rule_ids {
            let rule_matches = result
                .rule_id
                .as_ref()
                .is_some_and(|rule_id| rule_ids.contains(rule_id));
            if !rule_matches {
                return false;
            }
        }

        // Check artifact URIs (a result without a primary URI cannot match)
        if let Some(ref artifact_uris) = query.artifact_uris {
            let artifact_matches = location
                .primary_artifact_uri
                .as_ref()
                .is_some_and(|uri| artifact_uris.contains(uri));
            if !artifact_matches {
                return false;
            }
        }

        // Check levels (absent level is represented as "None")
        if let Some(ref levels) = query.levels {
            let result_level = result
                .level
                .as_ref()
                .map(|l| format!("{:?}", l))
                .unwrap_or_else(|| "None".to_string());
            if !levels.contains(&result_level) {
                return false;
            }
        }

        // Check kinds (absent kind is represented as "None")
        if let Some(ref kinds) = query.kinds {
            let result_kind = result
                .kind
                .as_ref()
                .map(|k| format!("{:?}", k))
                .unwrap_or_else(|| "None".to_string());
            if !kinds.contains(&result_kind) {
                return false;
            }
        }

        // Check tools: the result's run must belong to one of the requested tools
        if let Some(ref tools) = query.tools {
            let produced_by_requested_tool = tools.iter().any(|tool| {
                self.tool_to_runs
                    .get(tool)
                    .is_some_and(|runs| runs.contains(&location.run_index))
            });
            if !produced_by_requested_tool {
                return false;
            }
        }

        // Check run indices
        if let Some(ref run_indices) = query.run_indices
            && !run_indices.contains(&location.run_index)
        {
            return false;
        }

        // Check GUID presence
        if let Some(has_guid) = query.has_guid
            && has_guid != result.guid.is_some()
        {
            return false;
        }

        // Check fingerprints presence (full or partial fingerprints both count)
        if let Some(has_fingerprints) = query.has_fingerprints {
            let result_has_fingerprints =
                result.fingerprints.is_some() || result.partial_fingerprints.is_some();
            if has_fingerprints != result_has_fingerprints {
                return false;
            }
        }

        // Check message content (case-insensitive; a missing text cannot match)
        if let Some(ref message_contains) = query.message_contains {
            let needle = message_contains.to_lowercase();
            let text_matches = result
                .message
                .text
                .as_ref()
                .is_some_and(|text| text.to_lowercase().contains(&needle));
            if !text_matches {
                return false;
            }
        }

        true
    }

    /// Get results for a specific rule ID
    pub fn get_results_for_rule(&self, rule_id: &str) -> Vec<(SarifResult, ResultLocation)> {
        let mut query = SarifQuery::default();
        query.rule_ids = Some([rule_id.to_string()].into_iter().collect());
        self.query(&query).results
    }

    /// Get results for a specific artifact URI
    pub fn get_results_for_artifact(
        &self,
        artifact_uri: &str,
    ) -> Vec<(SarifResult, ResultLocation)> {
        let mut query = SarifQuery::default();
        query.artifact_uris = Some([artifact_uri.to_string()].into_iter().collect());
        self.query(&query).results
    }

    /// List the IDs of all indexed rules.
    ///
    /// The IDs are cloned from the keys of `rules`; their order follows the
    /// map's iteration order and is therefore unspecified.
    pub fn get_rule_ids(&self) -> Vec<String> {
        let mut ids = Vec::with_capacity(self.rules.len());
        ids.extend(self.rules.keys().cloned());
        ids
    }

    /// List the URIs of all indexed artifacts.
    ///
    /// The URIs are cloned from the keys of `artifacts`; their order follows
    /// the map's iteration order and is therefore unspecified.
    pub fn get_artifact_uris(&self) -> Vec<String> {
        let mut uris = Vec::with_capacity(self.artifacts.len());
        uris.extend(self.artifacts.keys().cloned());
        uris
    }

    /// Get statistics about the index
    ///
    /// Returns a shared reference to the `stats` field; no copy is made.
    pub fn get_stats(&self) -> &IndexStats {
        &self.stats
    }

    /// Get index metadata
    ///
    /// Returns a shared reference to the `metadata` field; no copy is made.
    pub fn get_metadata(&self) -> &IndexMetadata {
        &self.metadata
    }

    // Legacy methods for backward compatibility

    /// Look up the stored index of the artifact with the given URI.
    ///
    /// Returns the artifact's position within its run's artifact list as
    /// recorded at indexing time, or `None` when the URI is not indexed.
    pub fn find_artifact_by_uri(&self, uri: &str) -> Option<usize> {
        let (_, artifact_index) = self.artifacts.get(uri)?;
        Some(*artifact_index)
    }

    /// Find rule index by ID
    ///
    /// Returns the position of `rule_id` among the keys of `rules`, or `None`
    /// when the rule is not indexed.
    ///
    /// NOTE(review): `rules` is a `HashMap`, so the returned position follows
    /// the map's iteration order. It is not the rule's index inside the tool
    /// driver's rule list and may differ between index instances; treat it as
    /// a presence indicator only.
    pub fn find_rule_by_id(&self, rule_id: &str) -> Option<usize> {
        // This method is less useful in the new design since we store the actual objects
        // But we maintain it for backward compatibility
        self.rules.keys().position(|id| id == rule_id)
    }

    /// Look up a result by its key and return its index within its run.
    ///
    /// `guid` is matched against the keys of `results`, i.e. the result's
    /// GUID (or its synthetic `"<run_index>:<result_index>"` key when it has
    /// no GUID). Returns `None` when no such key is indexed.
    pub fn find_result_by_guid(&self, guid: &str) -> Option<usize> {
        let (_, location) = self.results.get(guid)?;
        Some(location.result_index)
    }
}

impl SarifQuery {
    /// Create a new empty query
    pub fn new() -> Self {
        Self::default()
    }

    /// Filter by rule ID
    pub fn with_rule_id(mut self, rule_id: impl Into<String>) -> Self {
        self.rule_ids = Some([rule_id.into()].into_iter().collect());
        self
    }

    /// Filter by multiple rule IDs
    pub fn with_rule_ids(mut self, rule_ids: impl IntoIterator<Item = String>) -> Self {
        self.rule_ids = Some(rule_ids.into_iter().collect());
        self
    }

    /// Filter by artifact URI
    pub fn with_artifact_uri(mut self, uri: impl Into<String>) -> Self {
        self.artifact_uris = Some([uri.into()].into_iter().collect());
        self
    }

    /// Filter by level
    pub fn with_level(mut self, level: impl Into<String>) -> Self {
        self.levels = Some([level.into()].into_iter().collect());
        self
    }

    /// Filter by tool name
    pub fn with_tool(mut self, tool: impl Into<String>) -> Self {
        self.tools = Some([tool.into()].into_iter().collect());
        self
    }

    /// Filter by message content
    pub fn with_message_containing(mut self, text: impl Into<String>) -> Self {
        self.message_contains = Some(text.into());
        self
    }

    /// Limit the number of results
    pub fn with_limit(mut self, limit: usize) -> Self {
        self.limit = Some(limit);
        self
    }

    /// Filter by GUID presence
    pub fn with_guid_presence(mut self, has_guid: bool) -> Self {
        self.has_guid = Some(has_guid);
        self
    }
}

impl Default for SarifIndex {
    fn default() -> Self {
        Self::new()
    }
}

// Unit tests for index construction, querying, and statistics. All fixtures
// are produced with `SarifLogBuilder` and contain a single run with a single
// result.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::builder::SarifLogBuilder;

    /// A freshly created index has zeroed counters and empty maps.
    #[test]
    fn test_index_creation() {
        let index = SarifIndex::new();
        assert_eq!(index.stats.run_count, 0);
        assert_eq!(index.stats.result_count, 0);
        assert!(index.rules.is_empty());
        assert!(index.artifacts.is_empty());
    }

    /// Indexing a one-result log records the run, the result, the version,
    /// and the tool name.
    #[test]
    fn test_index_from_sarif_log() {
        let sarif = SarifLogBuilder::single_error("test-tool", "Test error message", "test.rs", 42)
            .build_unchecked();

        let index = SarifIndex::from_sarif_log(&sarif);

        assert_eq!(index.stats.run_count, 1);
        assert_eq!(index.stats.result_count, 1);
        assert_eq!(index.metadata.version, "2.1.0");
        assert!(index.tool_to_runs.contains_key("test-tool"));
    }

    /// A rule-ID query returns the single matching result and is not limited.
    #[test]
    fn test_query_by_rule_id() {
        let sarif = SarifLogBuilder::error_finding(
            "eslint",
            "no-unused-vars",
            "Variable 'x' is assigned but never used",
            "src/app.js",
            15,
            5,
            15,
            6,
        )
        .build_unchecked();

        let index = SarifIndex::from_sarif_log(&sarif);

        let query = SarifQuery::new().with_rule_id("no-unused-vars");
        let results = index.query(&query);

        assert_eq!(results.results.len(), 1);
        assert_eq!(results.total_matches, 1);
        assert!(!results.limited);
    }

    /// The convenience lookup by rule returns the result with its rule ID
    /// recorded in the location.
    #[test]
    fn test_get_results_for_rule() {
        let sarif = SarifLogBuilder::error_finding(
            "clippy",
            "unused_variable",
            "unused variable: `x`",
            "src/main.rs",
            10,
            5,
            10,
            6,
        )
        .build_unchecked();

        let index = SarifIndex::from_sarif_log(&sarif);
        let results = index.get_results_for_rule("unused_variable");

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1.rule_id, Some("unused_variable".to_string()));
    }

    /// Statistics count the run and result and bucket the result under the
    /// "Error" level key.
    #[test]
    fn test_statistics() {
        let sarif = SarifLogBuilder::error_finding(
            "tool",
            "RULE001",
            "Error message",
            "file.rs",
            1,
            1,
            1,
            10,
        )
        .build_unchecked();

        let index = SarifIndex::from_sarif_log(&sarif);
        let stats = index.get_stats();

        assert_eq!(stats.run_count, 1);
        assert_eq!(stats.result_count, 1);
        assert_eq!(stats.results_by_level.get("Error"), Some(&1));
    }

    /// Artifact and rule lookups work even when the log declares neither
    /// artifacts nor rules explicitly.
    #[test]
    fn test_legacy_compatibility() {
        let sarif = SarifLogBuilder::error_finding(
            "test-tool",
            "TEST001",
            "Test error message",
            "test.rs",
            42,
            1,
            42,
            10,
        )
        .build_unchecked();

        let index = SarifIndex::from_sarif_log(&sarif);

        // Test that we can query results even without artifacts in the artifacts list
        let results = index.get_results_for_artifact("test.rs");
        assert_eq!(results.len(), 1);

        // Test that we can find results by rule ID
        let rule_results = index.get_results_for_rule("TEST001");
        assert_eq!(rule_results.len(), 1);

        // The builders don't create rules in the tool driver by default,
        // so we test that the indexing system still works correctly
        assert_eq!(index.stats.result_count, 1);
        assert_eq!(index.stats.results_with_rule_ids, 1);
    }
}