#![feature(iter_intersperse)]
#![feature(hash_raw_entry)]
use std::fmt;
use json_in_type::list::ToJSONList;
use json_in_type::*;
use log::{debug, error};
use rustc_hash::FxHashMap;
use std::iter::zip;
use thiserror::Error;
/// Mutable parser state shared across all lines of an input.
///
/// Holds the prefix tree of known templates and the id that will be handed to
/// the next newly created cluster.
#[derive(Default)]
pub struct ParseTree {
    /// Prefix tree of known templates, keyed first by token count.
    root: TreeRoot,
    /// Id assigned to the next cluster that gets created.
    next_cluster_id: usize,
}
/// Collects the variable parts of a log line into `results`.
///
/// `results` is cleared first. The template and the log tokens are then walked
/// pairwise (stopping at the shorter of the two); for every template position
/// that is a wildcard the corresponding log token text is appended, regardless
/// of whether the token was masked or plain. Static template positions
/// contribute nothing.
fn zip_tokens_and_template<'c>(
    templatetokens: &[LogTemplateItem],
    logtokens: &[TokenParse<'c>],
    results: &mut Vec<&'c str>,
) {
    results.clear();
    for pair in templatetokens.iter().zip(logtokens.iter()) {
        match pair {
            (LogTemplateItem::StaticToken(_), _) => {}
            (LogTemplateItem::Value, TokenParse::Token(value))
            | (LogTemplateItem::Value, TokenParse::MaskedValue(value)) => results.push(*value),
        }
    }
}
/// One position of a log template.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum LogTemplateItem {
    /// A literal token that must appear verbatim at this position.
    StaticToken(String),
    /// A wildcard position holding a variable value.
    Value,
}
impl fmt::Display for LogTemplateItem {
    /// Renders a static token as its text and a wildcard as `<*>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::StaticToken(text) => write!(f, "{}", text),
            Self::Value => write!(f, "<*>"),
        }
    }
}
/// A single token of a log line, borrowed from the input text.
#[derive(Debug)]
enum TokenParse<'a> {
    /// A plain whitespace-delimited token.
    Token(&'a str),
    /// A token that preprocessing already classified as a variable value.
    MaskedValue(&'a str),
}
/// A piece of log content produced by the preprocessing step, before
/// tokenization.
#[derive(Debug)]
enum Preprocessed<'a> {
    /// Raw text that still has to be split into tokens.
    Segment(&'a str),
    /// Text recognised as a variable value; becomes a single masked token.
    Value(&'a str),
}
/// Errors reported for individual log lines.
#[derive(Error, Debug)]
pub enum ParseError {
    /// The line could not be split, or it contained no tokens at all.
    #[error("couldn't parse line with user defined template, multiline log msg?")]
    NoTokensInRecord,
}
/// The result of matching one log line against a template.
#[derive(Debug)]
pub struct RecordParsed<'a> {
    /// Id of the cluster (template) the line was assigned to.
    pub template_id: usize,
    /// Texts found at the wildcard positions of the template, in order.
    pub values: Vec<&'a str>,
}
/// Emitted when a log line did not match any known template and a new one
/// was created for it.
#[derive(Debug)]
pub struct NewTemplate<'a> {
    /// The freshly created template.
    pub template: LogTemplate,
    /// The parse of the line that triggered the creation of the template.
    pub first_parse: RecordParsed<'a>,
}
/// Outcome of processing a single log line.
#[derive(Debug)]
pub enum RecordsParsedResult<'a> {
    /// The line created a new template; carries the template and its first parse.
    NewTemplate(NewTemplate<'a>),
    /// The line matched an existing template.
    RecordParsed(RecordParsed<'a>),
    /// The line could not be parsed.
    ParseError(ParseError),
}
/// Iterator that parses a block of text line by line, updating the shared
/// [`ParseTree`] as new templates are discovered.
pub struct RecordsParsedIter<'a, 'b: 'a> {
    /// The part of the input that has not been consumed yet.
    pub input: &'a str,
    /// Template tree that is searched and extended while iterating.
    pub state: &'b mut ParseTree,
    /// Scratch buffer holding the tokens of the line currently being processed.
    tokens: Vec<TokenParse<'a>>,
    /// Caller-provided scratch buffer for the extracted values of a line.
    parsed: &'a mut Vec<&'a str>,
}
impl<'a, 'b> RecordsParsedIter<'a, 'b> {
pub fn from(
input: &'a str,
state: &'b mut ParseTree,
parsed_buffer: &'a mut Vec<&'a str>,
) -> RecordsParsedIter<'a, 'b> {
RecordsParsedIter {
input,
state,
tokens: Vec::new(),
parsed: parsed_buffer,
}
}
}
impl<'a, 'b> Iterator for RecordsParsedIter<'a, 'b> {
    type Item = RecordsParsedResult<'a>;

    /// Consumes the next non-blank line of the input and parses it.
    ///
    /// Lines are separated by `\n`; a trailing `\r` is stripped. Blank lines
    /// are skipped, so `None` is returned only once the whole input has been
    /// consumed (a blank line in the middle of the input no longer ends the
    /// iteration and drops the remaining records).
    ///
    /// For each line the result is one of:
    /// * `ParseError(NoTokensInRecord)` if the line cannot be split or holds
    ///   no tokens (e.g. it consists of spaces/tabs only),
    /// * `RecordParsed` if the tokens match a known template,
    /// * `NewTemplate` if a new template had to be created for the line.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Numeric pre-masking is currently switched off. The branch is kept so
        // it can be re-enabled; note that it compiles its regex per line.
        const MASK_NUMBERS: bool = false;

        let line = loop {
            if self.input.is_empty() {
                return None;
            }
            let (raw_line, rest) = match self.input.split_once('\n') {
                Some((raw_line, rest)) => (raw_line, rest),
                None => (self.input, &self.input[0..0]),
            };
            self.input = rest;
            let trimmed = raw_line.strip_suffix('\r').unwrap_or(raw_line);
            if !trimmed.is_empty() {
                break trimmed;
            }
        };

        let line_chunks = match split_line_provided(line) {
            Some(chunks) => chunks,
            None => {
                return Some(RecordsParsedResult::ParseError(
                    ParseError::NoTokensInRecord,
                ))
            }
        };
        // The free-text message is the last chunk of the split line.
        let log_content = match line_chunks.last() {
            Some(content) => *content,
            None => {
                return Some(RecordsParsedResult::ParseError(
                    ParseError::NoTokensInRecord,
                ))
            }
        };

        let mut preprocessed = Vec::new();
        if MASK_NUMBERS {
            let re = Regex::new(r"\d+").unwrap();
            let mut last_index = 0;
            for mmatch in re.find_iter(log_content) {
                if mmatch.start() > last_index {
                    preprocessed.push(Preprocessed::Segment(
                        &log_content[last_index..mmatch.start()],
                    ));
                }
                preprocessed.push(Preprocessed::Value(mmatch.as_str()));
                last_index = mmatch.end();
            }
            if last_index != log_content.len() {
                preprocessed.push(Preprocessed::Segment(&log_content[last_index..]));
            }
        } else {
            preprocessed.push(Preprocessed::Segment(log_content));
        }

        let tokens = &mut self.tokens;
        tokens.clear();
        debug!("preprocessed={:?}", preprocessed);
        for elem in preprocessed {
            match elem {
                // Segments are split on spaces and tabs; empty pieces are dropped.
                Preprocessed::Segment(s) => tokens.extend(
                    s.split([' ', '\t'])
                        .filter(|s| !s.is_empty())
                        .map(TokenParse::Token),
                ),
                Preprocessed::Value(v) => tokens.push(TokenParse::MaskedValue(v)),
            }
        }
        if tokens.is_empty() {
            return Some(RecordsParsedResult::ParseError(
                ParseError::NoTokensInRecord,
            ));
        }

        let match_cluster = match tree_search(&self.state.root, tokens) {
            Some(cluster) => cluster,
            None => {
                // No sufficiently similar template yet: create one from this line.
                let new_cluster = add_seq_to_prefix_tree(
                    &mut self.state.root,
                    &tokens,
                    &mut self.state.next_cluster_id,
                );
                // `zip_tokens_and_template` clears the buffer before filling it.
                zip_tokens_and_template(&new_cluster.template, &tokens, self.parsed);
                return Some(RecordsParsedResult::NewTemplate(NewTemplate {
                    template: new_cluster.template.to_vec(),
                    first_parse: RecordParsed {
                        values: self.parsed.to_vec(),
                        template_id: new_cluster.cluster_id,
                    },
                }));
            }
        };
        debug!("Line {} matched cluster: {:?}", line, match_cluster);
        zip_tokens_and_template(&match_cluster.template, &tokens, self.parsed);
        Some(RecordsParsedResult::RecordParsed(RecordParsed {
            values: self.parsed.to_vec(),
            template_id: match_cluster.cluster_id,
        }))
    }
}
/// Returns `true` if `s` contains at least one numeric character.
///
/// Uses `char::is_numeric`, so Unicode numeric characters count, not only
/// ASCII digits.
fn has_numbers(s: &str) -> bool {
    for ch in s.chars() {
        if ch.is_numeric() {
            return true;
        }
    }
    false
}
/// Splits a raw log line into chunks.
///
/// The current implementation performs no splitting: it always returns a
/// single chunk containing the whole line.
fn split_line_provided(_line: &str) -> Option<Vec<&str>> {
    let mut chunks = Vec::with_capacity(1);
    chunks.push(_line);
    Some(chunks)
}
/// A group of log lines sharing one template, stored in a leaf of the tree.
#[derive(Debug)]
struct LogCluster {
    /// Template describing the lines of this cluster.
    template: LogTemplate,
    /// Unique id of the cluster.
    cluster_id: usize,
}
fn sequence_distance(seq1: &[LogTemplateItem], seq2: &[TokenParse]) -> (f64, i64) {
assert!(seq1.len() == seq2.len());
if seq1.is_empty() {
return (1.0, 0);
}
let mut sim_tokens: i64 = 0;
let mut num_of_par = 0;
for (token1, token2) in seq1.iter().zip(seq2.iter()) {
match token1 {
LogTemplateItem::Value => num_of_par += 1,
LogTemplateItem::StaticToken(token1) => match token2 {
TokenParse::Token(token2) => {
if token1 == token2 {
sim_tokens += 1
}
}
TokenParse::MaskedValue(_) => num_of_par += 1,
},
}
}
let ret_val = sim_tokens as f64 / seq1.len() as f64;
(ret_val, num_of_par)
}
/// Minimum similarity (as computed by `sequence_distance`) the best candidate
/// cluster must reach in `fast_match` to be accepted as a match.
const SIMILARITY_THRESHOLD: f64 = 0.4;
/// Picks the cluster from `logclusts` whose template best fits `tokens`.
///
/// Candidates are ranked by similarity first and by parameter count second;
/// on a complete tie the earliest cluster wins. The winner is returned only if
/// its similarity reaches `SIMILARITY_THRESHOLD`; otherwise (or when there are
/// no candidates) the result is `None`.
fn fast_match<'a>(logclusts: &'a Vec<LogCluster>, tokens: &[TokenParse]) -> Option<&'a LogCluster> {
    // Sentinels below any real score, so the first candidate always wins.
    let mut best_score: (f64, i64) = (-1.0, -1);
    let mut best_cluster = None;
    for candidate in logclusts.iter() {
        let score = sequence_distance(&candidate.template, tokens);
        let improves =
            score.0 > best_score.0 || (score.0 == best_score.0 && score.1 > best_score.1);
        if improves {
            best_score = score;
            best_cluster = Some(candidate);
        }
    }
    if best_score.0 >= SIMILARITY_THRESHOLD {
        best_cluster
    } else {
        None
    }
}
/// Depth limit of the prefix tree: nodes created at depth `MAX_DEPTH - 1` are
/// leaves, and lookups stop descending once the depth reaches `MAX_DEPTH`.
const MAX_DEPTH: usize = 4;
/// Once a middle node has this many children, tokens without an exact child
/// are stored under the wildcard child instead of getting their own branch.
const MAX_CHILDREN: usize = 100;
/// Creates a new cluster for `tokens`, inserts it into the prefix tree and
/// returns a reference to it.
///
/// The cluster receives the id currently stored in `num_clusters`, which is
/// then incremented. The tree is keyed by token count first and then by the
/// leading tokens of the sequence; missing nodes along the path are created.
/// A child becomes a leaf when it sits at depth `MAX_DEPTH - 1` or at the
/// second-to-last token, whichever comes first. A sequence consisting of a
/// single token is stored in a leaf directly below the root (previously such
/// a sequence triggered an assertion failure).
///
/// Tokens containing numeric characters, and masked values, become wildcards
/// in the stored template.
///
/// # Panics
///
/// Panics if `tokens` is empty.
fn add_seq_to_prefix_tree<'a>(
    root: &'a mut TreeRoot,
    tokens: &Vec<TokenParse>,
    num_clusters: &mut usize,
) -> &'a LogCluster {
    let clust_id = *num_clusters;
    *num_clusters += 1;
    debug!("Adding seq {} to tree: {:?}", clust_id, tokens);
    let token_count = tokens.len();
    assert!(token_count >= 1);
    let mut cur_node = root.entry(token_count).or_insert_with(|| {
        // With a single token there is nothing to branch on, so the clusters
        // hang directly off the per-length root entry.
        if token_count == 1 {
            GraphNodeContents::LeafNode(Vec::new())
        } else {
            GraphNodeContents::MiddleNode(MiddleNode {
                child_d: FxHashMap::default(),
            })
        }
    });
    let mut current_depth = 1;
    for token in tokens {
        cur_node = match cur_node {
            GraphNodeContents::MiddleNode(middle) => {
                assert!(current_depth < MAX_DEPTH && current_depth < token_count);
                let child_is_leaf =
                    current_depth == MAX_DEPTH - 1 || current_depth == token_count - 1;
                let key = match token {
                    TokenParse::MaskedValue(_v) => LogTemplateItem::Value,
                    TokenParse::Token(text) => {
                        let exact = LogTemplateItem::StaticToken(text.to_string());
                        // An existing exact branch always wins. Otherwise numeric
                        // tokens, and any token once the node is full, go to the
                        // wildcard branch.
                        let must_generalize =
                            has_numbers(text) || middle.child_d.len() >= MAX_CHILDREN;
                        if must_generalize && !middle.child_d.contains_key(&exact) {
                            LogTemplateItem::Value
                        } else {
                            exact
                        }
                    }
                };
                middle.child_d.entry(key).or_insert_with(|| {
                    if child_is_leaf {
                        GraphNodeContents::LeafNode(Vec::new())
                    } else {
                        GraphNodeContents::MiddleNode(MiddleNode {
                            child_d: FxHashMap::default(),
                        })
                    }
                })
            }
            GraphNodeContents::LeafNode(leaf) => {
                assert!(current_depth >= MAX_DEPTH || current_depth >= token_count);
                leaf.push(LogCluster {
                    template: tokens
                        .iter()
                        .map(|tp| match tp {
                            TokenParse::Token(t) if !has_numbers(t) => {
                                LogTemplateItem::StaticToken(t.to_string())
                            }
                            TokenParse::Token(_) | TokenParse::MaskedValue(_) => {
                                LogTemplateItem::Value
                            }
                        })
                        .collect(),
                    cluster_id: clust_id,
                });
                debug!("tree: {:?}", leaf);
                return &leaf[leaf.len() - 1];
            }
        };
        current_depth += 1
    }
    // A leaf is always reached before the tokens run out.
    unreachable!();
}
/// Looks up the cluster that best matches `tokens` in the prefix tree.
///
/// The search starts at the subtree for the token count and follows one child
/// per token: a masked value follows the wildcard child; a plain token follows
/// its exact child if present and the wildcard child otherwise. Descent stops
/// at a leaf or once the depth reaches `MAX_DEPTH`, and the clusters of the
/// reached leaf are ranked with `fast_match`.
///
/// Returns `None` if there is no subtree for this token count, if a required
/// child is missing, or if no cluster in the leaf is similar enough.
///
/// # Panics
///
/// Panics if `tokens` is empty.
fn tree_search<'a>(root: &'a TreeRoot, tokens: &[TokenParse]) -> Option<&'a LogCluster> {
    let token_count = tokens.len();
    assert!(token_count != 0);
    let mut node = root.get(&token_count)?;
    for (index, token) in tokens.iter().enumerate() {
        // Depth is 1-based: the per-length root entry sits at depth 1.
        let depth = index + 1;
        if depth >= MAX_DEPTH {
            break;
        }
        let children = match node {
            GraphNodeContents::LeafNode(_) => {
                assert!(depth == token_count);
                break;
            }
            GraphNodeContents::MiddleNode(middle) => &middle.child_d,
        };
        let next = match token {
            TokenParse::MaskedValue(_v) => children.get(&LogTemplateItem::Value),
            TokenParse::Token(text) => {
                let exact = LogTemplateItem::StaticToken(text.to_string());
                match children.get(&exact) {
                    Some(child) => Some(child),
                    None => children.get(&LogTemplateItem::Value),
                }
            }
        };
        match next {
            Some(child) => node = child,
            None => return None,
        }
    }
    match node {
        GraphNodeContents::LeafNode(clusters) => fast_match(clusters, tokens),
        GraphNodeContents::MiddleNode(_) => unreachable!("Mistake."),
    }
}
/// Inner node of the prefix tree.
#[derive(Debug)]
struct MiddleNode {
    /// Children keyed by the template item (static token or wildcard) that
    /// leads to them.
    child_d: FxHashMap<LogTemplateItem, GraphNodeContents>,
}
/// A node of the prefix tree.
#[derive(Debug)]
enum GraphNodeContents {
    /// Inner node that branches on the next token.
    MiddleNode(MiddleNode),
    /// Leaf holding the clusters reachable through this path.
    LeafNode(Vec<LogCluster>),
}
/// Top level of the prefix tree, keyed by the number of tokens in a log line.
type TreeRoot = FxHashMap<usize, GraphNodeContents>;
/// A log template: a sequence of static tokens and wildcard positions.
pub type LogTemplate = Vec<LogTemplateItem>;
use regex::Regex;
use std::fs::read_to_string;
pub fn print_log(filename: &str, actually_print: bool) {
let s: _ = read_to_string(filename).unwrap();
let mut tree = ParseTree::default();
let mut template_names = Vec::new();
let handle_parse = |template_names: &[String], rp: &RecordParsed| {
let typ = &template_names[rp.template_id];
let obj = json_object! {
template: typ,
values: ToJSONList(rp.values.to_vec())};
if actually_print {
println!("{}", obj.to_json_string());
}
};
for record in RecordsParsedIter::from(&s, &mut tree, &mut Vec::new()) {
match record {
RecordsParsedResult::NewTemplate(template) => {
template_names.push(
template
.template
.iter()
.map(|t| t.to_string())
.intersperse(" ".to_string())
.collect::<String>(),
);
handle_parse(&template_names, &template.first_parse);
}
crate::RecordsParsedResult::RecordParsed(rp) => handle_parse(&template_names, &rp),
crate::RecordsParsedResult::ParseError(e) => error!("err: {}", e),
}
}
}