dachshund 0.1.9

Dachshund is a graph mining library written in Rust. It provides high performance data structures for multiple kinds of graphs, from simple undirected graphs to typed hypergraphs. Dachshund also provides algorithms for common tasks for graph mining and analysis, ranging from shortest paths to graph spectral analysis.
Documentation
/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this core tree.
 */
extern crate clap;
extern crate serde_json;

use clap::ArgMatches;

use crate::dachshund::beam::{Beam, BeamSearchResult};
use crate::dachshund::error::{CLQError, CLQResult};
use crate::dachshund::graph_base::GraphBase;
use crate::dachshund::graph_builder_base::GraphBuilderBase;
use crate::dachshund::id_types::{GraphId, NodeTypeId};
use crate::dachshund::line_processor::LineProcessorBase;
use crate::dachshund::non_core_type_ids::NonCoreTypeIds;
use crate::dachshund::row::{CliqueRow, EdgeRow, Row};
use crate::dachshund::search_problem::SearchProblem;
use crate::dachshund::transformer_base::TransformerBase;
use crate::dachshund::typed_graph::{LabeledGraph, TypedGraph};
use crate::dachshund::typed_graph_builder::TypedGraphBuilder;
use crate::dachshund::typed_graph_line_processor::TypedGraphLineProcessor;
use std::rc::Rc;
use std::sync::mpsc::Sender;
use std::sync::Arc;

/// Used to set up the typed graph clique mining algorithm.
pub struct Transformer {
    pub core_type: String,
    pub non_core_type_ids: Rc<NonCoreTypeIds>,
    pub non_core_types: Rc<Vec<String>>,
    pub edge_types: Rc<Vec<String>>,
    pub num_non_core_types: usize,
    pub line_processor: Arc<TypedGraphLineProcessor>,
    pub search_problem: Rc<SearchProblem>,
    pub debug: bool,
    pub long_format: bool,

    edge_rows: Vec<EdgeRow>,
    clique_rows: Vec<CliqueRow>,
}
impl TransformerBase for Transformer {
    fn get_line_processor(&self) -> Arc<dyn LineProcessorBase> {
        self.line_processor.clone()
    }
    fn process_row(&mut self, row: Box<dyn Row>) -> CLQResult<()> {
        if let Some(edge_row) = row.as_edge_row() {
            self.edge_rows.push(edge_row);
        }
        if let Some(clique_row) = row.as_clique_row() {
            self.clique_rows.push(clique_row);
        }
        Ok(())
    }
    fn reset(&mut self) -> CLQResult<()> {
        self.edge_rows.clear();
        self.clique_rows.clear();
        Ok(())
    }
    fn process_batch(
        &mut self,
        graph_id: GraphId,
        output: &Sender<(Option<String>, bool)>,
    ) -> CLQResult<()> {
        let drained_rows = self.edge_rows.drain(..).collect::<Vec<_>>();
        let graph: TypedGraph = self.build_pruned_graph(graph_id, drained_rows)?;
        self.process_clique_rows(
            &graph,
            &self.clique_rows,
            graph_id,
            // verbose
            self.debug,
            output,
        )?;
        Ok(())
    }
}
impl Transformer {
    /// processes a "typespec", a command-line argument, of the form:
    /// [["author", "published_in", "journal"], ["author", "co-authored", "article"]].
    /// This sets up the semantics related to the set of relations contained in the
    /// typed graph. A requirement is that all relations share a "core" type, in this
    /// case, "author". Non-core types must be listed in a vector, which is used to
    /// index the non core-types. The function creates a vector of NonCoreTypeIds, which
    /// will then be used to process input rows.
    pub fn process_typespec(
        typespec: Vec<Vec<String>>,
        core_type: &str,
        non_core_types: Vec<String>,
    ) -> CLQResult<NonCoreTypeIds> {
        let mut non_core_type_ids = NonCoreTypeIds::new();
        non_core_type_ids.insert(core_type, NodeTypeId::from(0_usize));

        let should_be_only_this_core_type = &typespec[0][0].clone();
        for (non_core_type_ix, non_core_type) in non_core_types.iter().enumerate() {
            non_core_type_ids.insert(non_core_type, NodeTypeId::from(non_core_type_ix + 1));
        }
        for item in typespec {
            let core_type = &item[0];
            let non_core_type = &item[2];
            assert_eq!(core_type, should_be_only_this_core_type);
            let non_core_type_id: &mut NodeTypeId = non_core_type_ids.require_mut(non_core_type)?;
            non_core_type_id.increment_possible_edge_count();
        }
        Ok(non_core_type_ids)
    }
    /// Called by main.rs module to set up the beam search. Parameters are as follows:
    ///     - `typespec`: a command-line argument, of the form:
    ///     [["author", "published_in", "journal"], ["author", "co-authored", "article"]].
    ///     This sets up the semantics related to the set of relations contained in the
    ///     typed graph. A requirement is that all relations share a "core" type, in this
    ///     case, "author".
    ///     - `beam_size`: Beam construction parameter. The number of top candidates to
    ///     maintain as potential future cores for expansion in the "beam" (i.e., the list of top candidates).
    ///     - `alpha`: `Scorer` constructor parameter. Controls the contribution of density
    ///     - `global_thresh`: `Scorer` constructor parameter. If provided, candidates must be at
    ///     least this dense to be considered valid (quasi-)cliques.
    ///     - `local_thresh`: `Scorer` constructor parameter. if provided, each node in the candidate
    ///     must have at least `local_thresh` proportion of ties to other nodes in the candidate,
    ///     for the candidate to be considered valid.
    ///     - `num_to_search`: number of expansion candidates to consider for each candidate in the
    ///     beam.
    ///     - `num_epochs`: maximum number of epochs to run search for.
    ///     - `max_repeated_prior_scores`: maximum number of times for which the top score can be
    ///     repeated in consecutive epochs, before the search gets shut down early.
    ///     - `debug`: whether to produce verbose output in the search process.
    ///     - `min_degree`: minimum degree required for each node in a (quasi-)clique in order for
    ///     the subgraph to be considered interesting.
    ///     - `core_type`: the core type, as found in the typespec.
    ///     - `long_format`: whether to output results in long format, of the form:
    ///     `graph_id\tnode_id\tnode_type`, instead of the more user-friendly (but
    ///     machine-unfriendly) wide format.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        typespec: Vec<Vec<String>>,
        beam_size: usize,
        alpha: f32,
        global_thresh: Option<f32>,
        local_thresh: Option<f32>,
        num_to_search: usize,
        num_epochs: usize,
        max_repeated_prior_scores: usize,
        debug: bool,
        min_degree: usize,
        core_type: String,
        long_format: bool,
    ) -> CLQResult<Self> {
        let search_problem = Rc::new(SearchProblem::new(
            beam_size,
            alpha,
            global_thresh,
            local_thresh,
            num_to_search,
            num_epochs,
            max_repeated_prior_scores,
            min_degree,
        ));
        let mut edge_types_v: Vec<String> = typespec.iter().map(|x| x[1].clone()).collect();
        edge_types_v.sort();
        let edge_types = Rc::new(edge_types_v);

        let mut non_core_types_v: Vec<String> = typespec.iter().map(|x| x[2].clone()).collect();
        non_core_types_v.sort();
        let non_core_types = Rc::new(non_core_types_v);

        let num_non_core_types: usize = non_core_types.len();
        let non_core_type_ids: Rc<NonCoreTypeIds> = Rc::new(Transformer::process_typespec(
            typespec,
            &core_type,
            non_core_types.to_vec(),
        )?);
        let line_processor = Arc::new(TypedGraphLineProcessor::new(
            core_type.clone(),
            non_core_type_ids.clone(),
            non_core_types.clone(),
            edge_types.clone(),
        ));
        let transformer = Self {
            core_type,
            non_core_type_ids,
            non_core_types,
            edge_types,
            num_non_core_types,
            line_processor,
            search_problem,
            debug,
            long_format,
            edge_rows: Vec::new(),
            clique_rows: Vec::new(),
        };
        Ok(transformer)
    }

    /// constructs a transformer from an ArgMatches object (to help with command line arguments).
    pub fn from_argmatches(matches: ArgMatches) -> CLQResult<Self> {
        let arg_value = |name: &str| -> CLQResult<&str> {
            matches
                .value_of(name)
                .ok_or_else(|| CLQError::from(format!("Missing required argument: {name}")))
        };
        let typespec_str: &str = arg_value("typespec")?;
        let typespec: Vec<Vec<String>> = serde_json::from_str(typespec_str)?;
        let beam_size: usize = arg_value("beam_size")?.parse::<usize>()?;
        let alpha: f32 = arg_value("alpha")?.parse::<f32>()?;
        let global_thresh: Option<f32> = Some(arg_value("global_thresh")?.parse::<f32>()?);
        let local_thresh: Option<f32> = Some(arg_value("local_thresh")?.parse::<f32>()?);
        let num_to_search: usize = arg_value("num_to_search")?.parse::<usize>()?;
        let num_epochs: usize = arg_value("epochs")?.parse::<usize>()?;
        let max_repeated_prior_scores: usize =
            arg_value("max_repeated_prior_scores")?.parse::<usize>()?;
        let debug: bool = arg_value("debug_mode")?.parse::<bool>()?;
        let min_degree: usize = arg_value("min_degree")?.parse::<usize>()?;
        let core_type: String = arg_value("core_type")?.parse::<String>()?;
        let long_format: bool = arg_value("long_format")?.parse::<bool>()?;

        let transformer = Transformer::new(
            typespec,
            beam_size,
            alpha,
            global_thresh,
            local_thresh,
            num_to_search,
            num_epochs,
            max_repeated_prior_scores,
            debug,
            min_degree,
            core_type,
            long_format,
        )?;
        Ok(transformer)
    }

    /// builds graph, pruned to ensure all nodes have at least self.min_degree degree
    /// with other nodes in the graph. This is done via a greedy algorithm which removes
    /// low-degree nodes iteratively.
    #[allow(clippy::ptr_arg)]
    pub fn build_pruned_graph(
        &self,
        graph_id: GraphId,
        rows: Vec<EdgeRow>,
    ) -> CLQResult<TypedGraph> {
        TypedGraphBuilder {
            graph_id,
            min_degree: Some(self.search_problem.min_degree),
        }
        .from_vector(rows)
    }

    /// Given a properly-built graph, runs the quasi-clique detection beam search on it.
    pub fn process_graph<'a>(
        &'a self,
        graph: &'a TypedGraph,
        clique_rows: &'a Vec<CliqueRow>,
        graph_id: GraphId,
        verbose: bool,
    ) -> CLQResult<BeamSearchResult<'a, TypedGraph>> {
        let mut beam: Beam<TypedGraph> = Beam::new(
            graph,
            clique_rows,
            verbose,
            &self.non_core_types,
            self.search_problem.clone(),
            graph_id,
        )?;
        beam.run_search()
    }
    /// Used to "seed" the beam search with an existing best (quasi-)clique (if any provided),
    /// and then run the search under the parameters specified in the constructor.
    pub fn process_clique_rows<'a>(
        &'a self,
        graph: &'a TypedGraph,
        clique_rows: &'a Vec<CliqueRow>,
        graph_id: GraphId,
        verbose: bool,
        output: &Sender<(Option<String>, bool)>,
    ) -> CLQResult<Option<BeamSearchResult<'a, TypedGraph>>> {
        if graph.get_core_ids().is_empty() || graph.get_non_core_ids().unwrap().is_empty() {
            // still have to send an acknowledgement to the output channel
            // that we have actually processed this graph, otherwise
            // we lose track of how many graphs have been processed so
            // far!
            output.send((None, false)).unwrap();
            return Ok(None);
        }
        let result: BeamSearchResult<TypedGraph> =
            self.process_graph(graph, clique_rows, graph_id, verbose)?;
        // only print if this is a conforming clique
        if result.top_candidate.get_score()? > 0.0 {
            if !self.long_format {
                let line: String = format!(
                    "{}\t{}",
                    graph_id.value(),
                    result
                        .top_candidate
                        .to_printable_row(&self.non_core_types, graph.get_reverse_labels_map())?,
                );
                output.send((Some(line), false)).unwrap();
            } else {
                result.top_candidate.print(
                    graph_id,
                    &self.non_core_types,
                    &self.core_type,
                    output,
                )?;
            }
        }
        Ok(Some(result))
    }
}