palate 0.3.7

File type detection combining tft and hyperpolyglot
Documentation
/
 * Helper function for k means clustering
\
hlpr:{[t;k;means]
 f:{[t;x] sqrt (+/) each xexp[;2] each (x -) each (value each t)};
 r:f[t;] each means;
 zipped:{[k;x] (til k) ,' x}[k;] each flip r;
 cluster:first flip ({$[last x < last y;x;y]} over) each zipped;
 / 1st column keeps count
 m2::(k;(1+count t[0]))#0;
 {m2[first x]+:1,1 _ x} each cluster ,' value each t;
 m2::flip m2;
 (flip 1 _ m2) % first m2}

/
 * k means clustering
 *
 * iris test:
 *   q)iris:flip `sl`sw`pl`pw`class!("FFFFS";",") 0: `:iris.csv
 *   q)\ts kmeans[delete class from iris;3]
 *   25 77184
\
kmeans:{[t;k]
 means:t[k?count t];
 diff:(k;count t[0])#1;
 while[any any diff;
  omeans:means;
  means:hlpr[t;k;means];
  diff:0.01<abs omeans-means];
 flip (cols t)!flip means}

/
 * Entropy
\
entropy:{
 cnt:sum x;
 p:x%cnt;
 -1*sum p*xlog[2;p]}

/
 * Information Gain Helper function
 *
 * t is table where 1st col is attribute and remaining cols are class counts
 * for each value in attribute, e.g.
 *
 * attr1 class1 class2
 * -------------------
 * 0     25     75
 * 1     33     67
 * ...
 *
 * cls is list of classes e.g. `class1`class2
 * clscnt is list of occurances of each class
 * setcnt is count of instances
\
ighlpr:{[t;cls;clscnt;setcnt]
 e:entropy each flip t[cls];
 p:(sum each flip t[cls])%setcnt;
 entropy[clscnt] - sum e*p}

/
 * Information Gain
 *
 * test:
 *   / random integers with class labels:
 *   q)t:flip (`a`b`c`class!) flip {(3?100),1?`A`B`C} each til 10000
 *   q)infogain[t] each `a`b`c
 *   0.01451189 0.01573281 0.01328462
\
infogain:{[tbl;attrib]
 cls:exec distinct class from tbl;
 clscnt:(exec count i by class from tbl)[cls];
 setcnt:count tbl;
 d:distinct tbl[attrib];

 / change attrib colname for ease of use with select syntax
 t2:(`a xcol attrib xcols select from tbl);
 t2:select count i by a, class from t2;
 / create empty table to hold pivot
 t3:flip (`a,cls)!(enlist[d],{[d;x]count[d]#0}[d;] each til count cls);
 / pivot t2, change class column to a column per class value
 t3:{x lj `a xkey flip (`a,y[`class])!(enlist y[`a];enlist y[`x])} over (enlist t3),0!t2;
 ighlpr[t3;cls;clscnt;setcnt]}

/
 *
 * Decision Tree ID3
 *
\

/
 * Generate paths through a decision tree.
 *   data is a table containing entire dataset
 *   d is a dict that contains attribute path (n_) and values (v_) to consider
 *
 * The result contains a table with two columns n_ and v_. The n_ column will
 * match the input d[`n_] except new possible attributes are appended. The
 * prefix of the v_ column should match the input values d[`v_].
 *
 * e.g.
 *
 * q)t:([] a:1 1 2 2;b:3 4 3 4;class:`A`B`A`B)
 * q)nextpaths[t;`n_`v_!(enlist[`a];enlist[])]
 * n_  v_
 * ------
 * a b 1
 * a b 2
\
nextpaths:{[data;d]
 n_:d[`n_];
 attrs:key[first[data]] except `class;
 if[null first n_;:flip `n_`v_!(enlist each attrs;count[attrs]#enlist[])];
 / construct a func. query similar to "select v_:(x,'y) from data"
 clause:{if[1=count x;:(each;enlist;first x)]; {((';,);x;y)} over x}[n_];
 tmp:?[data;();1b;enlist[`v_]!enlist[clause]];
 if[not null first d[`v_];
    tmp:select from tmp where all each =[d[`v_];] each (-1_') v_];
 / tack on the passed-in n_ to each row
 tmp[`n_]:(count[tmp];count[n_])#n_;
 tmp:`n_`v_ xcols tmp;

 / find leaves: select count distinct class by <n_> from data
 leaves:?[data;();n_!n_;enlist[`class]!enlist[(count;(distinct;`class))]];
 / find leaves: select n_1,n_2,... from leaves where class=1
 leaves:(value each) ?[leaves;enlist[(=;`class;1)];0b;n_!n_];

 / internal nodes
 tmpi:select from tmp where not v_ in leaves;
 / leaf nodes
 tmpl:select from tmp where v_ in leaves;

 / dupe rows for each new attr to be appended to n_
 newp:enlist[n_] cross attrs except n_;
 tmpl , (,/){[newp;x] {`n_`v_!(y;x[`v_])}[x;] each newp }[newp;] each tmpi}

/
 * Query a table with a where clause generated by the d argument.
 * e.g. igwrap[t;(`a`b;10 1)] will effectively run the query:
 *   select from t where a=10,b=1
 * TODO: symbols seem to need special treatment in functional queries?
\
igwrap:{[tbl;d] ?[tbl;{(=;x;$[-11=type y;enlist[y]; y])}'[first d;last d];0b;()]};

/
 * Perform one step of the ID3 algorithm.
 *   data is a table containing entire dataset
 *   tree is a n_, v_ table returned from nextpaths
 *   l is level of algorithm to perform (i.e. how many elements in the paths)
 *
 * Returns a sorted table with infogain calculated for each path (e_ column)
\
id3step:{[data;tree;l]
 / c_ contains a chain of attributes already split on
 tree_:update c_:(-1_') n_ from select from tree where l = count each n_;
 / get subsets of data matching c_ and v_
 sub:igwrap[data] each flip tree_`c_`v_;
 / a_ is candidate attribute for next split
 a_:(-1#') tree_`n_;
 / find infogain for each subset and candidate attribute
 tree_[`e_]:{infogain . x} each (enlist each sub),'a_;
 / sort infogain to find attr to split on next
 / sort twice because groupby seems to jumble initial sort
 `e_ xdesc value select first n_, first v_, first e_ by c_, v_ from `e_ xdesc tree_}

/ helper function for recursive calls
id3hlpr:{[data;tree;l]
 if[0=count tree;:tree];
 r:id3step[data;tree;l];
 attrs:key[first[data]] except `class;
 if[(0=count[r]) or l=count attrs;:r];
 / recurse
 np_:(,/) nextpaths[data] each r;
 r:id3hlpr[data;np_;l+1];
 / select paths that that have length l
 tmpl:select from np_ where l = count each n_;
 tmpl uj r}

/
 * ID3
 *
 * test:
 *   q)t:flip (`a`b`c`d`class!) flip {(4?10),1?`A`B`C} each til 100
 *   q)id3[t]
 *
\
id3:{[data]
 r:id3hlpr[data;nextpaths[data;`n_`v_!(enlist[];enlist[])];1];
 / for each id3 path find the most common class, i.e. run query like:
 /   select count i by class from data where (attr1=val1)&(attr2=val2)...
 clauses:{{(&;x;y)} over {(=;first x;enlist last x)} each flip x`n_`v_} each r;
 classes:{[data;clause]
  r:?[data;enlist[clause];enlist[`class]!enlist[`class];enlist[`x]!enlist[(#:;`i)]];
  first exec class from `x xdesc r}[data;] each clauses;
 r[`class]:classes;
 delete e_ from r}

/
 * Classic weather dataset
 *
\
weatherdata:{
 outlook:`sunny`sunny`overcast`rain`rain`rain`overcast`sunny`sunny`rain`sunny`overcast`overcast`rain;
 temp:`hot`hot`hot`mild`cool`cool`cool`mild`cool`mild`mild`mild`hot`mild;
 humidity:`high`high`high`high`normal`normal`normal`high`normal`normal`normal`high`normal`high;
 wind:`weak`strong`weak`weak`weak`strong`strong`weak`weak`weak`strong`strong`weak`strong;
 class:`no`no`yes`yes`yes`no`yes`no`yes`yes`yes`yes`yes`no;
 t:([] outlook:outlook;temp:temp;humidity:humidity;wind:wind;class:class)}