% vptree.pl - Implement a vantage point tree % Version 0.1 % Copyright (C) 2006 Matthew Skala % This program is free software; you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation; either version 2 of the License, or % (at your option) any later version. % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % You should have received a copy of the GNU General Public License % along with this program; if not, write to the Free Software % Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % This module implements a modified vantage point tree data structure, based % on the "VP-tree" described in the paper referenced below, extended to make % it partially dynamic. This is experimental, unfinished code. % @InProceedings{Yianilos:VPTree, % author = "Peter N. Yianilos", % title = "Data Structures and Algorithms for Nearest Neighbor % Search in General Metric Spaces", % pages = "311--321", % ISBN = "0-89871-313-7", % booktitle = "Proceedings of the 4th Annual {ACM}-{SIAM} Symposium % on Discrete Algorithms ({SODA} ", % address = "Austin, TX, USA", % month = jan, % year = "1993", % publisher = "SIAM", % } % Exported predicates: % % vp_empty(?Tree) % Tree is an empty vp-tree. Use this to both create new trees and % test existing ones. % % vp_insert(+DistFunc,+Point,+OldTree,-NewTree). % Add Point to OldTree using DistFunc to test distances. NewTree is % result. call(DistFunc,+X,+Y,-D) with points X and Y should return % the distance D, and DistFunc should be a metric, i.e. nonnegative, % equal to zero iff X=Y, and obeying the Triangle Inequality. You should % always use the same distance function with any given tree. % % vp_delete_one(-Elt,+OldTree,-NewTree). % Remove one element, Elt, from OldTree; NewTree is what remains. Note % that this does not necessarily remove the elements in any particular % order and should not be depended on to remove them randomly either. Its % main use is internally, during rebalancing, when we collapse an entire % subtree one element at a time. % % vp_range_query(+DistFunc,+QElt,+Range,-FoundElt,+Tree). % Search Tree, using DistFunc, for an element FoundElt within distance % Range of QElt. Succeeds multiple times, once for each matching element. % % vp_nearest_query(+DistFunc,+QElt,-FoundElt,+Tree). % Search Tree, using DistFunc, for the unique element closest to QElt, and % return it in FoundElt. THIS PREDICATE IS NOT DEFINED YET. % % find_median(+L,-M). % Set M to the median element of L, which should be a list of numbers. % % euclidean(+X,+Y,-D). % Compute the Eucliden distance D between two vectors X and Y, which % should be same-length lists of numbers. This function is suitable for % use as a DistFunc in the vp-tree predicates. % % hamming(+X,+Y,-D). % Compute the Hamming distance D between two same-length lists X and Y; % that is, a count of the number of pairs of same-index (corresponding) % elements in the two lists that do NOT unify. This function is suitable % for use as a DistFunc in the vp-tree predicates. :-module(vptree,[ vp_empty/1, vp_insert/4, vp_delete_one/3, vp_range_query/5, vp_nearest_query/4, find_median/2, euclidean/3, hamming/3 ]). :-discontiguous vp_test/1. vp_test(all):- vp_test(vp_tree), vp_test(vp_empty), vp_test(euclidean), vp_test(vp_insert). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % use this to recognize tree vp_tree(vp_leaf(_,_)). vp_tree(vp_node(_,_,_,_,_,_)). vp_test(vp_tree):- vp_tree(vp_leaf([],0)), vp_tree(vp_leaf([x],0)), vp_tree(vp_node(x,1,vp_leaf([],0),vp_leaf([],0),0,0)), \+(vp_tree(garbage)). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % use this both to test trees, and create new ones! vp_empty(vp_leaf([],0)). vp_test(vp_empty):- vp_empty(vp_leaf([],0)), \+(vp_empty(vp_leaf([x],1))). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% euclidean(X,Y,D):- sum_squared_diff(X,Y,SS),D is sqrt(SS). sum_squared_diff([],[],0). sum_squared_diff([XH|XT],[YH|YT],S):- sum_squared_diff(XT,YT,SS), S is SS+(XH-YH)*(XH-YH). vp_test(euclidean):- euclidean([1],[2],1.0), euclidean([3,3,3],[4,4,4],R3),R3=:=sqrt(3). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% hamming([],[],0). hamming([H|TA],[H|TB],NN):-!,hamming(TA,TB,N),succ(N,NN). hamming([_|TA],[_|TB],N):-hamming(TA,TB,N). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % we mostly test insert inside of the query tests, this is just a quick % sanity check vp_test(vp_insert):- vp_empty(E), vpt_i(euclidean,E,O,0,1000), vp_tree(O). vpt_i(_,O,O,N,N). vpt_i(DF,OldT,NewT,I,N):- I3*RightChildren, vp_delete_one(H,Left,_), call(DistFunc,ThisPoint,H,Threshold),!, vp_rebalance_i(DistFunc,Left, vp_node(ThisPoint,Threshold, vp_leaf([],0),Right,0,RightChildren), NewTree). % Right too big, reset threshold vp_rebalance(DistFunc, vp_node(ThisPoint,_,Left,Right,LeftChildren,RightChildren), NewTree):- RightChildren>3*LeftChildren, vp_delete_one(H,Right,_), call(DistFunc,ThisPoint,H,Threshold),!, vp_rebalance_i(DistFunc,Right, vp_node(ThisPoint,Threshold, Left,vp_leaf([],0),LeftChildren,0), NewTree). % should never happen - catch other rebalancing cases vp_rebalance(_,T,T). vp_rebalance_i(_,[],Tree,Tree). vp_rebalance_i(DistFunc,[H|T],OldTree,NewTree):- vp_insert_noreb(DistFunc,H,OldTree,TmpTree), !,vp_rebalance_i(DistFunc,T,TmpTree,NewTree). vp_rebalance_i(_,In,Tree,Tree):-vp_empty(In),!. vp_rebalance_i(DistFunc,In,OldTree,NewTree):- vp_delete_one(H,In,T), vp_insert_noreb(DistFunc,H,OldTree,TmpTree), !,vp_rebalance_i(DistFunc,T,TmpTree,NewTree). % caution - this algorithm misbehaves on pathological inputs find_median([H|T],M):- partition_count(H,T,L,R,Lc,Rc), N is (Lc+Rc)//2, find_orderstat_i(H,L,R,Lc,N,M). % find_orderstat(L,N,E) - E is N'th element, zero-indexed, of sorted L, but % less work than actually sorting L find_orderstat([X],0,X). find_orderstat([H|T],N,E):- partition_count(H,T,L,R,Lc,_), find_orderstat_i(H,L,R,Lc,N,E). find_orderstat_i(H,_L,_R,N,N,H):-!. find_orderstat_i(_H,L,_R,Lc,N,E):- NLc,!,NN is N-Lc-1, find_orderstat(R,NN,E). partition_count(_,[],[],[],0,0):-!. partition_count(P,[H|T],[H|L],R,Lc,Rc):- H @< P,partition_count(P,T,L,R,Lcc,Rc),succ(Lcc,Lc). partition_count(P,[H|T],L,[H|R],Lc,Rc):- partition_count(P,T,L,R,Lc,Rcc),succ(Rcc,Rc). vp_delete_one(H,vp_leaf([H|T],CC),vp_leaf(T,C)):-succ(C,CC). vp_delete_one(H,vp_node(H,_,_,_,0,0),vp_leaf([],0)). vp_delete_one(H, vp_node(ThisPoint,Threshold,LL,Right,LCp,RC), vp_node(ThisPoint,Threshold,L,Right,LC,RC)):- LCp>RC,succ(LC,LCp),!,vp_delete_one(H,LL,L). vp_delete_one(H, vp_node(ThisPoint,Threshold,Left,RR,LC,RCp), vp_node(ThisPoint,Threshold,Left,R,LC,RC)):- succ(RC,RCp),!,vp_delete_one(H,RR,R). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % vp_range_query(DistFunc,QElt,Range,FoundElt,Tree). % vp_nearest_query(DistFunc,QElt,FoundElt,Tree). vp_range_query(DistFunc,Q,QR,F,vp_leaf([H|T],_)):- call(DistFunc,Q,H,R),!, (R=