use std::collections::HashMap;
use std::sync::Arc;
#[cfg(feature = "serde-extras")]
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde-extras", derive(Serialize, Deserialize))]
pub struct ApiPatternConfig {
pub min_support: f64,
pub min_support_count: usize,
pub max_pattern_length: usize,
pub min_pattern_length: usize,
pub closed_only: bool,
pub max_patterns: usize,
pub allow_gaps: bool,
pub max_gap: usize,
}
impl Default for ApiPatternConfig {
fn default() -> Self {
Self {
min_support: 0.1,
min_support_count: 2,
max_pattern_length: 10,
min_pattern_length: 2,
closed_only: false,
max_patterns: 1000,
allow_gaps: true,
max_gap: 3,
}
}
}
impl ApiPatternConfig {
pub fn new() -> Self {
Self::default()
}
pub fn with_min_support(mut self, support: f64) -> Self {
self.min_support = support;
self
}
pub fn with_min_support_count(mut self, count: usize) -> Self {
self.min_support_count = count;
self
}
pub fn with_max_pattern_length(mut self, length: usize) -> Self {
self.max_pattern_length = length;
self
}
pub fn with_min_pattern_length(mut self, length: usize) -> Self {
self.min_pattern_length = length;
self
}
pub fn with_closed_only(mut self, closed: bool) -> Self {
self.closed_only = closed;
self
}
pub fn with_max_gap(mut self, gap: usize) -> Self {
self.max_gap = gap;
self.allow_gaps = gap > 0;
self
}
pub fn strict() -> Self {
Self {
allow_gaps: false,
max_gap: 0,
..Default::default()
}
}
pub fn lenient() -> Self {
Self {
max_gap: 10,
allow_gaps: true,
min_support: 0.05,
..Default::default()
}
}
}
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde-extras", derive(Serialize, Deserialize))]
pub struct ApiPattern {
pub sequence: Vec<Arc<str>>,
pub support: usize,
pub support_ratio: f64,
pub is_closed: bool,
pub avg_position: f64,
pub confidence: Option<f64>,
}
impl ApiPattern {
pub fn new(sequence: Vec<Arc<str>>, support: usize, total_sequences: usize) -> Self {
Self {
sequence,
support,
support_ratio: support as f64 / total_sequences.max(1) as f64,
is_closed: false,
avg_position: 0.5,
confidence: None,
}
}
pub fn len(&self) -> usize {
self.sequence.len()
}
pub fn is_empty(&self) -> bool {
self.sequence.is_empty()
}
pub fn to_string_pattern(&self) -> String {
self.sequence
.iter()
.map(|s| s.as_ref())
.collect::<Vec<_>>()
.join(" -> ")
}
}
#[derive(Clone, Debug)]
struct SequencePosition {
sequence_idx: usize,
position: usize,
}
#[derive(Clone)]
struct ProjectedDatabase {
positions: Vec<SequencePosition>,
prefix: Vec<Arc<str>>,
}
#[derive(Clone)]
pub struct ApiPatternMiner {
config: ApiPatternConfig,
string_cache: HashMap<String, Arc<str>>,
}
impl ApiPatternMiner {
pub fn new(config: ApiPatternConfig) -> Self {
Self {
config,
string_cache: HashMap::new(),
}
}
pub fn with_defaults() -> Self {
Self::new(ApiPatternConfig::default())
}
fn intern(&mut self, s: &str) -> Arc<str> {
if let Some(cached) = self.string_cache.get(s) {
Arc::clone(cached)
} else {
let arc: Arc<str> = Arc::from(s);
self.string_cache.insert(s.to_string(), Arc::clone(&arc));
arc
}
}
pub fn mine<S, T>(&mut self, sequences: &[S]) -> Vec<ApiPattern>
where
S: AsRef<[T]>,
T: AsRef<str>,
{
if sequences.is_empty() {
return Vec::new();
}
let total_sequences = sequences.len();
let min_support = ((self.config.min_support * total_sequences as f64).ceil() as usize)
.max(self.config.min_support_count);
let interned_sequences: Vec<Vec<Arc<str>>> = sequences
.iter()
.map(|seq| {
seq.as_ref()
.iter()
.map(|s| self.intern(s.as_ref()))
.collect()
})
.collect();
let mut item_counts: HashMap<Arc<str>, usize> = HashMap::new();
for seq in &interned_sequences {
let mut seen = std::collections::HashSet::new();
for item in seq {
if seen.insert(Arc::clone(item)) {
*item_counts.entry(Arc::clone(item)).or_insert(0) += 1;
}
}
}
let frequent_items: Vec<Arc<str>> = item_counts
.into_iter()
.filter(|(_, count)| *count >= min_support)
.map(|(item, _)| item)
.collect();
if frequent_items.is_empty() {
return Vec::new();
}
let mut patterns = Vec::new();
for item in &frequent_items {
let mut positions = Vec::new();
for (seq_idx, seq) in interned_sequences.iter().enumerate() {
for (pos, s) in seq.iter().enumerate() {
if s == item {
positions.push(SequencePosition {
sequence_idx: seq_idx,
position: pos + 1, });
break; }
}
}
if positions.len() >= min_support {
let prefix = vec![Arc::clone(item)];
let projected = ProjectedDatabase { positions, prefix };
self.prefix_span_recursive(
&interned_sequences,
projected,
min_support,
total_sequences,
&mut patterns,
);
}
}
patterns.sort_by(|a, b| {
b.support
.cmp(&a.support)
.then_with(|| b.len().cmp(&a.len()))
});
patterns.truncate(self.config.max_patterns);
if self.config.closed_only {
patterns = self.filter_closed_patterns(patterns);
}
patterns
}
fn prefix_span_recursive(
&self,
sequences: &[Vec<Arc<str>>],
projected: ProjectedDatabase,
min_support: usize,
total_sequences: usize,
patterns: &mut Vec<ApiPattern>,
) {
if projected.prefix.len() >= self.config.min_pattern_length {
let support = self.count_unique_sequences(&projected.positions);
let pattern = ApiPattern::new(projected.prefix.clone(), support, total_sequences);
patterns.push(pattern);
}
if projected.prefix.len() >= self.config.max_pattern_length {
return;
}
let mut item_counts: HashMap<Arc<str>, Vec<SequencePosition>> = HashMap::new();
for pos in &projected.positions {
let seq = &sequences[pos.sequence_idx];
let max_pos = if self.config.allow_gaps {
(pos.position + self.config.max_gap + 1).min(seq.len())
} else {
(pos.position + 1).min(seq.len())
};
let mut seen_in_seq = std::collections::HashSet::new();
for i in pos.position..max_pos {
let item = &seq[i];
if seen_in_seq.insert(Arc::clone(item)) {
item_counts
.entry(Arc::clone(item))
.or_default()
.push(SequencePosition {
sequence_idx: pos.sequence_idx,
position: i + 1,
});
}
}
}
for (item, positions) in item_counts {
let support = self.count_unique_sequences(&positions);
if support >= min_support {
let mut new_prefix = projected.prefix.clone();
new_prefix.push(item);
let new_projected = ProjectedDatabase {
positions,
prefix: new_prefix,
};
self.prefix_span_recursive(
sequences,
new_projected,
min_support,
total_sequences,
patterns,
);
}
}
}
fn count_unique_sequences(&self, positions: &[SequencePosition]) -> usize {
let mut seen = std::collections::HashSet::new();
for pos in positions {
seen.insert(pos.sequence_idx);
}
seen.len()
}
fn filter_closed_patterns(&self, patterns: Vec<ApiPattern>) -> Vec<ApiPattern> {
let mut closed: Vec<ApiPattern> = Vec::new();
'outer: for pattern in patterns {
for other in &closed {
if other.support == pattern.support
&& other.len() > pattern.len()
&& self.is_subsequence(&pattern.sequence, &other.sequence)
{
continue 'outer;
}
}
let support = pattern.support;
let len = pattern.len();
let seq = &pattern.sequence;
closed.retain(|p: &ApiPattern| {
!(p.support == support && p.len() < len && self.is_subsequence(&p.sequence, seq))
});
closed.push(pattern);
}
closed
}
fn is_subsequence(&self, sub: &[Arc<str>], super_seq: &[Arc<str>]) -> bool {
if sub.len() > super_seq.len() {
return false;
}
let mut sub_idx = 0;
for item in super_seq {
if sub_idx < sub.len() && item == &sub[sub_idx] {
sub_idx += 1;
}
}
sub_idx == sub.len()
}
}
impl std::fmt::Debug for ApiPatternMiner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ApiPatternMiner")
.field("config", &self.config)
.field("cache_size", &self.string_cache.len())
.finish()
}
}
#[derive(Clone, Debug, Default)]
#[cfg_attr(feature = "serde-extras", derive(Serialize, Deserialize))]
pub struct MiningStats {
pub sequences_processed: usize,
pub items_processed: usize,
pub patterns_found: usize,
pub time_us: u64,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = ApiPatternConfig::default();
assert!(config.min_support > 0.0);
assert!(config.max_pattern_length > 0);
}
#[test]
fn test_config_builder() {
let config = ApiPatternConfig::new()
.with_min_support(0.2)
.with_max_pattern_length(5)
.with_max_gap(2);
assert_eq!(config.min_support, 0.2);
assert_eq!(config.max_pattern_length, 5);
assert_eq!(config.max_gap, 2);
}
#[test]
fn test_empty_sequences() {
let mut miner = ApiPatternMiner::with_defaults();
let sequences: Vec<Vec<&str>> = vec![];
let patterns = miner.mine(&sequences);
assert!(patterns.is_empty());
}
#[test]
fn test_simple_pattern() {
let mut miner = ApiPatternMiner::new(ApiPatternConfig {
min_support: 0.5,
min_support_count: 2,
min_pattern_length: 2,
..Default::default()
});
let sequences = vec![
vec!["open", "read", "close"],
vec!["open", "write", "close"],
vec!["open", "read", "seek", "close"],
];
let patterns = miner.mine(&sequences);
assert!(!patterns.is_empty());
let open_close = patterns.iter().find(|p| {
p.sequence.len() == 2
&& p.sequence[0].as_ref() == "open"
&& p.sequence[1].as_ref() == "close"
});
assert!(open_close.is_some());
assert_eq!(open_close.unwrap().support, 3);
}
#[test]
fn test_longer_patterns() {
let mut miner = ApiPatternMiner::new(ApiPatternConfig {
min_support: 0.5,
min_support_count: 2,
min_pattern_length: 3,
max_pattern_length: 5,
..Default::default()
});
let sequences = vec![
vec!["connect", "query", "fetch", "close"],
vec!["connect", "query", "fetch", "close"],
vec!["connect", "query", "fetch", "process", "close"],
];
let patterns = miner.mine(&sequences);
let long_pattern = patterns.iter().find(|p| p.sequence.len() >= 4);
assert!(long_pattern.is_some());
}
#[test]
fn test_closed_patterns() {
let mut miner = ApiPatternMiner::new(ApiPatternConfig {
min_support: 0.5,
min_support_count: 2,
closed_only: true,
..Default::default()
});
let sequences = vec![
vec!["a", "b", "c"],
vec!["a", "b", "c"],
vec!["a", "b", "c"],
];
let patterns = miner.mine(&sequences);
let has_abc = patterns
.iter()
.any(|p| p.sequence.len() == 3 && p.sequence[0].as_ref() == "a");
let has_ab = patterns.iter().any(|p| {
p.sequence.len() == 2 && p.sequence[0].as_ref() == "a" && p.sequence[1].as_ref() == "b"
});
assert!(has_abc);
assert!(!has_ab);
}
#[test]
fn test_strict_mining() {
let mut miner = ApiPatternMiner::new(
ApiPatternConfig::strict()
.with_min_support(0.5)
.with_min_support_count(2),
);
let sequences = vec![
vec!["a", "x", "b"], vec!["a", "b"], vec!["a", "b"], ];
let patterns = miner.mine(&sequences);
let ab_pattern = patterns.iter().find(|p| {
p.sequence.len() == 2 && p.sequence[0].as_ref() == "a" && p.sequence[1].as_ref() == "b"
});
assert!(ab_pattern.is_some());
assert_eq!(ab_pattern.unwrap().support, 2);
}
#[test]
fn test_gap_mining() {
let mut miner = ApiPatternMiner::new(ApiPatternConfig {
min_support: 0.5,
min_support_count: 2,
allow_gaps: true,
max_gap: 2,
..Default::default()
});
let sequences = vec![
vec!["a", "x", "b"], vec!["a", "x", "y", "b"], vec!["a", "b"], ];
let patterns = miner.mine(&sequences);
let ab_pattern = patterns.iter().find(|p| {
p.sequence.len() == 2 && p.sequence[0].as_ref() == "a" && p.sequence[1].as_ref() == "b"
});
assert!(ab_pattern.is_some());
assert_eq!(ab_pattern.unwrap().support, 3);
}
#[test]
fn test_pattern_to_string() {
let pattern = ApiPattern::new(
vec![Arc::from("open"), Arc::from("read"), Arc::from("close")],
10,
100,
);
assert_eq!(pattern.to_string_pattern(), "open -> read -> close");
}
#[test]
fn test_is_subsequence() {
let miner = ApiPatternMiner::with_defaults();
let sub: Vec<Arc<str>> = vec![Arc::from("a"), Arc::from("c")];
let super_seq: Vec<Arc<str>> = vec![Arc::from("a"), Arc::from("b"), Arc::from("c")];
assert!(miner.is_subsequence(&sub, &super_seq));
assert!(!miner.is_subsequence(&super_seq, &sub));
}
#[test]
fn test_real_api_sequences() {
let mut miner = ApiPatternMiner::new(ApiPatternConfig {
min_support: 0.3,
min_support_count: 2,
min_pattern_length: 2,
..Default::default()
});
let sequences = vec![
vec!["fopen", "fread", "fclose"],
vec!["fopen", "fwrite", "fflush", "fclose"],
vec!["fopen", "fread", "fseek", "fread", "fclose"],
vec!["fopen", "fgets", "fclose"],
vec!["fopen", "fprintf", "fclose"],
];
let patterns = miner.mine(&sequences);
let open_close = patterns.iter().find(|p| {
p.sequence.len() == 2
&& p.sequence[0].as_ref() == "fopen"
&& p.sequence[1].as_ref() == "fclose"
});
assert!(open_close.is_some());
assert_eq!(open_close.unwrap().support, 5);
}
}