% vptree.pl - Implement a vantage point tree
% Version 0.1
% Copyright (C) 2006 Matthew Skala

% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 2 of the License, or
% (at your option) any later version.

% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.

% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% This module implements a modified vantage point tree data structure, based
% on the "VP-tree" described in the paper referenced below, extended to make
% it partially dynamic.  This is experimental, unfinished code.

% @InProceedings{Yianilos:VPTree,
%    author =       "Peter N. Yianilos",
%    title =        "Data Structures and Algorithms for Nearest Neighbor
%                    Search in General Metric Spaces",
%    pages =        "311--321",
%    ISBN =         "0-89871-313-7",
%    booktitle =    "Proceedings of the 4th Annual {ACM}-{SIAM} Symposium
%                    on Discrete Algorithms ({SODA} ",
%    address =      "Austin, TX, USA",
%    month =        jan,
%    year =         "1993",
%    publisher =    "SIAM",
% }

% Exported predicates:
%
%   vp_empty(?Tree)
%   Tree is an empty vp-tree.  Use this to both create new trees and
%   test existing ones.
%
%   vp_insert(+DistFunc,+Point,+OldTree,-NewTree).
%   Add Point to OldTree using DistFunc to test distances.  NewTree is
%   result.  call(DistFunc,+X,+Y,-D) with points X and Y should return
%   the distance D, and DistFunc should be a metric, i.e. nonnegative,
%   equal to zero iff X=Y, and obeying the Triangle Inequality.  You should
%   always use the same distance function with any given tree.
%
%   vp_delete_one(-Elt,+OldTree,-NewTree).
%   Remove one element, Elt, from OldTree; NewTree is what remains.  Note
%   that this does not necessarily remove the elements in any particular
%   order and should not be depended on to remove them randomly either.  Its
%   main use is internally, during rebalancing, when we collapse an entire
%   subtree one element at a time.
%
%   vp_range_query(+DistFunc,+QElt,+Range,-FoundElt,+Tree).
%   Search Tree, using DistFunc, for an element FoundElt within distance
%   Range of QElt.  Succeeds multiple times, once for each matching element.
%
%   vp_nearest_query(+DistFunc,+QElt,-FoundElt,+Tree).
%   Search Tree, using DistFunc, for the unique element closest to QElt, and
%   return it in FoundElt.  THIS PREDICATE IS NOT DEFINED YET.
%
%   find_median(+L,-M).
%   Set M to the median element of L, which should be a list of numbers.
%
%   euclidean(+X,+Y,-D).
%   Compute the Eucliden distance D between two vectors X and Y, which
%   should be same-length lists of numbers.  This function is suitable for
%   use as a DistFunc in the vp-tree predicates.
%
%   hamming(+X,+Y,-D).
%   Compute the Hamming distance D between two same-length lists X and Y;
%   that is, a count of the number of pairs of same-index (corresponding)
%   elements in the two lists that do NOT unify.  This function is suitable
%   for use as a DistFunc in the vp-tree predicates.

:-module(vptree,[
   vp_empty/1,
   vp_insert/4,
   vp_delete_one/3,
   vp_range_query/5,
   vp_nearest_query/4,
   find_median/2,
   euclidean/3,
   hamming/3
]).

:-discontiguous vp_test/1.

vp_test(all):-
   vp_test(vp_tree),
   vp_test(vp_empty),
   vp_test(euclidean),
   vp_test(vp_insert).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% use this to recognize tree
vp_tree(vp_leaf(_,_)).
vp_tree(vp_node(_,_,_,_,_,_)).

vp_test(vp_tree):-
   vp_tree(vp_leaf([],0)),
   vp_tree(vp_leaf([x],0)),
   vp_tree(vp_node(x,1,vp_leaf([],0),vp_leaf([],0),0,0)),
   \+(vp_tree(garbage)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% use this both to test trees, and create new ones!
vp_empty(vp_leaf([],0)).

vp_test(vp_empty):-
   vp_empty(vp_leaf([],0)),
   \+(vp_empty(vp_leaf([x],1))).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

euclidean(X,Y,D):-
   sum_squared_diff(X,Y,SS),D is sqrt(SS).

sum_squared_diff([],[],0).
sum_squared_diff([XH|XT],[YH|YT],S):-
   sum_squared_diff(XT,YT,SS),
   S is SS+(XH-YH)*(XH-YH).

vp_test(euclidean):-
   euclidean([1],[2],1.0),
   euclidean([3,3,3],[4,4,4],R3),R3=:=sqrt(3).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

hamming([],[],0).
hamming([H|TA],[H|TB],NN):-!,hamming(TA,TB,N),succ(N,NN).
hamming([_|TA],[_|TB],N):-hamming(TA,TB,N).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% we mostly test insert inside of the query tests, this is just a quick
% sanity check

vp_test(vp_insert):-
   vp_empty(E),
   vpt_i(euclidean,E,O,0,1000),
   vp_tree(O).

vpt_i(_,O,O,N,N).
vpt_i(DF,OldT,NewT,I,N):-
   I<N,II is I+1,
   vp_insert(DF,[I,I,II,I],OldT,MidT),!,
   vpt_i(DF,MidT,NewT,II,N).

vp_insert(DistFunc,Point,OldTree,NewTree):-
   vp_insert_noreb(DistFunc,Point,OldTree,UB),
   vp_rebalance(DistFunc,UB,NewTree),!.

vp_insert_noreb(_,Point,vp_leaf(L,C),vp_leaf([Point|L],CC)):-
   succ(C,CC).
vp_insert_noreb(DistFunc,Point,
 vp_node(ThisPoint,Threshold,Left,Right,LeftChildren,RightChildren),
 vp_node(ThisPoint,Threshold,NewLeft,Right,NewLeftChildren,RightChildren)):-
   call(DistFunc,Point,ThisPoint,D),
   (D<Threshold;D=Threshold,LeftChildren<RightChildren),!,
   vp_insert(DistFunc,Point,Left,NewLeft),
   succ(LeftChildren,NewLeftChildren).
vp_insert_noreb(DistFunc,Point,
 vp_node(ThisPoint,Threshold,Left,Right,LeftChildren,RightChildren),
 vp_node(ThisPoint,Threshold,Left,NewRight,LeftChildren,NewRightChildren)):-
   vp_insert(DistFunc,Point,Right,NewRight),
   succ(RightChildren,NewRightChildren).

% note that rebalance is called on insert, not on delete, so it only has
% to deal with rebalancing, not contracting vp_nodes into vp_lists

% tree is already balanced enough
vp_rebalance(_,
 vp_node(ThisPoint,Threshold,Left,Right,LeftChildren,RightChildren),
 vp_node(ThisPoint,Threshold,Left,Right,LeftChildren,RightChildren)):-
   LeftChildren=<3*RightChildren,RightChildren=<3*LeftChildren,!.

% leaf is small enough to leave alone
vp_rebalance(_,vp_leaf(L,N),vp_leaf(L,N)):-N=<5,!.

% change a leaf into a tree
vp_rebalance(DistFunc,vp_leaf([H|L],_),
 vp_node(H,Threshold,Left,Right,LeftChildren,RightChildren)):-
   maplist(call(DistFunc,H),L,DistList),
   find_median(DistList,Threshold),!,
   vp_rebalance_i(DistFunc,L,
                  vp_node(H,Threshold,vp_leaf([],0),vp_leaf([],0),0,0),
                  vp_node(H,Threshold,Left,Right,LeftChildren,RightChildren)).

% Left too big, reset threshold
vp_rebalance(DistFunc,
 vp_node(ThisPoint,_,Left,Right,LeftChildren,RightChildren),
 NewTree):-
    LeftChildren>3*RightChildren,
    vp_delete_one(H,Left,_),
    call(DistFunc,ThisPoint,H,Threshold),!,
    vp_rebalance_i(DistFunc,Left,
                   vp_node(ThisPoint,Threshold,
                           vp_leaf([],0),Right,0,RightChildren),
                   NewTree).

% Right too big, reset threshold
vp_rebalance(DistFunc,
 vp_node(ThisPoint,_,Left,Right,LeftChildren,RightChildren),
 NewTree):-
    RightChildren>3*LeftChildren,
    vp_delete_one(H,Right,_),
    call(DistFunc,ThisPoint,H,Threshold),!,
    vp_rebalance_i(DistFunc,Right,
                   vp_node(ThisPoint,Threshold,
                           Left,vp_leaf([],0),LeftChildren,0),
                   NewTree).

% should never happen - catch other rebalancing cases
vp_rebalance(_,T,T).

vp_rebalance_i(_,[],Tree,Tree).
vp_rebalance_i(DistFunc,[H|T],OldTree,NewTree):-
   vp_insert_noreb(DistFunc,H,OldTree,TmpTree),
   !,vp_rebalance_i(DistFunc,T,TmpTree,NewTree).
vp_rebalance_i(_,In,Tree,Tree):-vp_empty(In),!.
vp_rebalance_i(DistFunc,In,OldTree,NewTree):-
   vp_delete_one(H,In,T),
   vp_insert_noreb(DistFunc,H,OldTree,TmpTree),
   !,vp_rebalance_i(DistFunc,T,TmpTree,NewTree).

% caution - this algorithm misbehaves on pathological inputs
find_median([H|T],M):-
   partition_count(H,T,L,R,Lc,Rc),
   N is (Lc+Rc)//2,
   find_orderstat_i(H,L,R,Lc,N,M).

% find_orderstat(L,N,E) - E is N'th element, zero-indexed, of sorted L, but
% less work than actually sorting L
find_orderstat([X],0,X).
find_orderstat([H|T],N,E):-
   partition_count(H,T,L,R,Lc,_),
   find_orderstat_i(H,L,R,Lc,N,E).

find_orderstat_i(H,_L,_R,N,N,H):-!.
find_orderstat_i(_H,L,_R,Lc,N,E):-
   N<Lc,!,find_orderstat(L,N,E).
find_orderstat_i(_H,_L,R,Lc,N,E):-
   N>Lc,!,NN is N-Lc-1,
   find_orderstat(R,NN,E).

partition_count(_,[],[],[],0,0):-!.
partition_count(P,[H|T],[H|L],R,Lc,Rc):-
   H @< P,partition_count(P,T,L,R,Lcc,Rc),succ(Lcc,Lc).
partition_count(P,[H|T],L,[H|R],Lc,Rc):-
   partition_count(P,T,L,R,Lc,Rcc),succ(Rcc,Rc).

vp_delete_one(H,vp_leaf([H|T],CC),vp_leaf(T,C)):-succ(C,CC).
vp_delete_one(H,vp_node(H,_,_,_,0,0),vp_leaf([],0)).
vp_delete_one(H,
 vp_node(ThisPoint,Threshold,LL,Right,LCp,RC),
 vp_node(ThisPoint,Threshold,L,Right,LC,RC)):-
   LCp>RC,succ(LC,LCp),!,vp_delete_one(H,LL,L).
vp_delete_one(H,
 vp_node(ThisPoint,Threshold,Left,RR,LC,RCp),
 vp_node(ThisPoint,Threshold,Left,R,LC,RC)):-
   succ(RC,RCp),!,vp_delete_one(H,RR,R).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% vp_range_query(DistFunc,QElt,Range,FoundElt,Tree).
% vp_nearest_query(DistFunc,QElt,FoundElt,Tree).

vp_range_query(DistFunc,Q,QR,F,vp_leaf([H|T],_)):-
   call(DistFunc,Q,H,R),!,
   (R=<QR,F=H;
    vp_range_query(DistFunc,Q,QR,F,vp_leaf(T,_))).
vp_range_query(DistFunc,Q,QR,F,vp_node(V,R,Left,Right,_,_)):-
   call(DistFunc,Q,V,DQV),!,
   (DQV=<QR,F=V;
    DQV-R=<QR,vp_range_query(DistFunc,Q,QR,F,Left);
    R-DQV=<QR,vp_range_query(DistFunc,Q,QR,F,Right)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

vp_perftest(NDims,NPoints,NQueries,Time):-
   TStart is cputime,
      vp_empty(E),
      gen_random_points(NDims,NPoints,E,Tree),
      make_queries(NDims,NQueries,Tree),
   Time is cputime-TStart.

gen_random_points(_,0,Tree,Tree).
gen_random_points(NDims,N,InTree,OutTree):-
   succ(NN,N),
   gen_random_point(NDims,X),
   vp_insert(euclidean,X,InTree,MidTree),
   gen_random_points(NDims,NN,MidTree,OutTree).

gen_random_point(0,[]).
gen_random_point(NDims,[H|T]):-
   succ(NNDims,NDims),
   H is random(1000000)/100000,
   gen_random_point(NNDims,T).

make_queries(_,0,_).
make_queries(NDims,NQueries,Tree):-
   succ(NNQueries,NQueries),
   gen_random_point(NDims,Q),
   R is random(100000)/100000,
   (vp_range_query(euclidean,Q,R,_,Tree),fail;true),
   !,make_queries(NDims,NNQueries,Tree).
