/
* Helper function for k means clustering
\
hlpr:{[t;k;means]
f:{[t;x] sqrt (+/) each xexp[;2] each (x -) each (value each t)};
r:f[t;] each means;
zipped:{[k;x] (til k) ,' x}[k;] each flip r;
cluster:first flip ({$[last x < last y;x;y]} over) each zipped;
/ 1st column keeps count
m2::(k;(1+count t[0]))#0;
{m2[first x]+:1,1 _ x} each cluster ,' value each t;
m2::flip m2;
(flip 1 _ m2) % first m2}
/
* k means clustering
*
* iris test:
* q)iris:flip `sl`sw`pl`pw`class!("FFFFS";",") 0: `:iris.csv
* q)\ts kmeans[delete class from iris;3]
* 25 77184
\
kmeans:{[t;k]
means:t[k?count t];
diff:(k;count t[0])#1;
while[any any diff;
omeans:means;
means:hlpr[t;k;means];
diff:0.01<abs omeans-means];
flip (cols t)!flip means}
/
* Entropy
\
entropy:{
cnt:sum x;
p:x%cnt;
-1*sum p*xlog[2;p]}
/
* Information Gain Helper function
*
* t is table where 1st col is attribute and remaining cols are class counts
* for each value in attribute, e.g.
*
* attr1 class1 class2
* -------------------
* 0 25 75
* 1 33 67
* ...
*
* cls is list of classes e.g. `class1`class2
* clscnt is list of occurances of each class
* setcnt is count of instances
\
ighlpr:{[t;cls;clscnt;setcnt]
e:entropy each flip t[cls];
p:(sum each flip t[cls])%setcnt;
entropy[clscnt] - sum e*p}
/
* Information Gain
*
* test:
* / random integers with class labels:
* q)t:flip (`a`b`c`class!) flip {(3?100),1?`A`B`C} each til 10000
* q)infogain[t] each `a`b`c
* 0.01451189 0.01573281 0.01328462
\
infogain:{[tbl;attrib]
cls:exec distinct class from tbl;
clscnt:(exec count i by class from tbl)[cls];
setcnt:count tbl;
d:distinct tbl[attrib];
/ change attrib colname for ease of use with select syntax
t2:(`a xcol attrib xcols select from tbl);
t2:select count i by a, class from t2;
/ create empty table to hold pivot
t3:flip (`a,cls)!(enlist[d],{[d;x]count[d]#0}[d;] each til count cls);
/ pivot t2, change class column to a column per class value
t3:{x lj `a xkey flip (`a,y[`class])!(enlist y[`a];enlist y[`x])} over (enlist t3),0!t2;
ighlpr[t3;cls;clscnt;setcnt]}
/
*
* Decision Tree ID3
*
\
/
* Generate paths through a decision tree.
* data is a table containing entire dataset
* d is a dict that contains attribute path (n_) and values (v_) to consider
*
* The result contains a table with two columns n_ and v_. The n_ column will
* match the input d[`n_] except new possible attributes are appended. The
* prefix of the v_ column should match the input values d[`v_].
*
* e.g.
*
* q)t:([] a:1 1 2 2;b:3 4 3 4;class:`A`B`A`B)
* q)nextpaths[t;`n_`v_!(enlist[`a];enlist[])]
* n_ v_
* ------
* a b 1
* a b 2
\
nextpaths:{[data;d]
n_:d[`n_];
attrs:key[first[data]] except `class;
if[null first n_;:flip `n_`v_!(enlist each attrs;count[attrs]#enlist[])];
/ construct a func. query similar to "select v_:(x,'y) from data"
clause:{if[1=count x;:(each;enlist;first x)]; {((';,);x;y)} over x}[n_];
tmp:?[data;();1b;enlist[`v_]!enlist[clause]];
if[not null first d[`v_];
tmp:select from tmp where all each =[d[`v_];] each (-1_') v_];
/ tack on the passed-in n_ to each row
tmp[`n_]:(count[tmp];count[n_])#n_;
tmp:`n_`v_ xcols tmp;
/ find leaves: select count distinct class by <n_> from data
leaves:?[data;();n_!n_;enlist[`class]!enlist[(count;(distinct;`class))]];
/ find leaves: select n_1,n_2,... from leaves where class=1
leaves:(value each) ?[leaves;enlist[(=;`class;1)];0b;n_!n_];
/ internal nodes
tmpi:select from tmp where not v_ in leaves;
/ leaf nodes
tmpl:select from tmp where v_ in leaves;
/ dupe rows for each new attr to be appended to n_
newp:enlist[n_] cross attrs except n_;
tmpl , (,/){[newp;x] {`n_`v_!(y;x[`v_])}[x;] each newp }[newp;] each tmpi}
/
* Query a table with a where clause generated by the d argument.
* e.g. igwrap[t;(`a`b;10 1)] will effectively run the query:
* select from t where a=10,b=1
* TODO: symbols seem to need special treatment in functional queries?
\
igwrap:{[tbl;d] ?[tbl;{(=;x;$[-11=type y;enlist[y]; y])}'[first d;last d];0b;()]};
/
* Perform one step of the ID3 algorithm.
* data is a table containing entire dataset
* tree is a n_, v_ table returned from nextpaths
* l is level of algorithm to perform (i.e. how many elements in the paths)
*
* Returns a sorted table with infogain calculated for each path (e_ column)
\
id3step:{[data;tree;l]
/ c_ contains a chain of attributes already split on
tree_:update c_:(-1_') n_ from select from tree where l = count each n_;
/ get subsets of data matching c_ and v_
sub:igwrap[data] each flip tree_`c_`v_;
/ a_ is candidate attribute for next split
a_:(-1#') tree_`n_;
/ find infogain for each subset and candidate attribute
tree_[`e_]:{infogain . x} each (enlist each sub),'a_;
/ sort infogain to find attr to split on next
/ sort twice because groupby seems to jumble initial sort
`e_ xdesc value select first n_, first v_, first e_ by c_, v_ from `e_ xdesc tree_}
/ helper function for recursive calls
id3hlpr:{[data;tree;l]
if[0=count tree;:tree];
r:id3step[data;tree;l];
attrs:key[first[data]] except `class;
if[(0=count[r]) or l=count attrs;:r];
/ recurse
np_:(,/) nextpaths[data] each r;
r:id3hlpr[data;np_;l+1];
/ select paths that that have length l
tmpl:select from np_ where l = count each n_;
tmpl uj r}
/
* ID3
*
* test:
* q)t:flip (`a`b`c`d`class!) flip {(4?10),1?`A`B`C} each til 100
* q)id3[t]
*
\
id3:{[data]
r:id3hlpr[data;nextpaths[data;`n_`v_!(enlist[];enlist[])];1];
/ for each id3 path find the most common class, i.e. run query like:
/ select count i by class from data where (attr1=val1)&(attr2=val2)...
clauses:{{(&;x;y)} over {(=;first x;enlist last x)} each flip x`n_`v_} each r;
classes:{[data;clause]
r:?[data;enlist[clause];enlist[`class]!enlist[`class];enlist[`x]!enlist[(#:;`i)]];
first exec class from `x xdesc r}[data;] each clauses;
r[`class]:classes;
delete e_ from r}
/
* Classic weather dataset
*
\
weatherdata:{
outlook:`sunny`sunny`overcast`rain`rain`rain`overcast`sunny`sunny`rain`sunny`overcast`overcast`rain;
temp:`hot`hot`hot`mild`cool`cool`cool`mild`cool`mild`mild`mild`hot`mild;
humidity:`high`high`high`high`normal`normal`normal`high`normal`normal`normal`high`normal`high;
wind:`weak`strong`weak`weak`weak`strong`strong`weak`weak`weak`strong`strong`weak`strong;
class:`no`no`yes`yes`yes`no`yes`no`yes`yes`yes`yes`yes`no;
t:([] outlook:outlook;temp:temp;humidity:humidity;wind:wind;class:class)}