top_k.pl

    1/*  Part of Extended Libraries for SWI-Prolog
    2
    3    Author:        Edison Mera
    4    E-mail:        efmera@gmail.com
    5    WWW:           https://github.com/edisonm/xlibrary
    6    Copyright (C): 2026, Process Design Center, Breda, The Netherlands.
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(top_k,
   36          [ top_k/3, % +Options, :Goal, -Result
   37            group_by_sorted/4
   38          ]).   39
   40:- use_module(library(heaps)).   41:- use_module(library(assoc)).   42:- use_module(library(option)).   43:- use_module(library(solution_sequences)).   44:- use_module(library(local_dynamic)).   45
   46/* <Top-k selection problem>
   47
   48This module provides a predicate top_k/3, which is an efficient replacement of
   49the library(solution_sequences) when several sequencing options have to be
   50combined.
   51
   52The very well-known top-k selection problem says:
   53
   54- “Given a set of items, find the k items with the highest scores”
   55
   56Options:
   57  - limit(N)                        Optional, maximum number of solutions to process.
   58                                    Default is infinite.
   59  - order_by(asc(Key) | desc(Key))  Optional, Key is a term evaluated in Goal's context
   60  - distinct(W)                     term; skip solutions that bounds W to the same value.
   61                                    If not provided the original order is preserved
   62  - group_by(G)                     optional term; if given, do top-K per group, if not
   63                                    provided, it equates to put all in one single group
   64  - return(list(Term)|backtrack)    Returns the solutions as list of Term or backtracking
   65
   66semantically equivalent to:
   67    group_by(G, Goal, limit(N, distinct(W, order_by(OrderSpec, Goal))), GKGoalL),
   68    member(GK-GoalL, GKGoalL),
   69    member(Goal, GoalL).
   70
   71- Note that the chosen order of operations is not casual.
   72
   73- Performs better in terms of memory consumption.
   74
   75*/
   76
   77% Test:
   78
   79% ?- top_k([limit(3),distinct(Y),order_by(asc(X)), return(list(X-Y))], member(X-Y, [6-z,3-a,2-a,1-a,3-b,4-c,5-d]), L).
   80% L = [1-a, 3-b, 4-c].
   81
   82% To compare performance wrt library(solution_sequences):
   83
   84% whoo:
   85% ?- findall(E, between(1,1000,E),L),findall(E1-E2, limit(10, distinct(E1-E2, order_by([asc(E2)], ( append(_, [E1|T], L), member(E2, T) )))),L2),length(L2,S2).
   86% ERROR: Stack limit (1.0Gb) exceeded
   87
   88% whee:
   89% ?- time((findall(E, between(1,1000,E),L),top_k([limit(10),distinct(E1-E2),order(asc(E2)), return(list(E1-E2))],( append(_, [E1|T], L), member(E2, T) ), L2), length(L2, S2))).
   90% % 21,981,983 inferences, 0.825 CPU in 0.825 seconds (100% CPU, 26650647 Lips)
   91% ...
   92
   93:- meta_predicate top_k(+, 0, -).   94
   95top_k(Options1, Goal, Result) :-
   96    select_option(return(Return), Options1, Options, backtrack),
   97    top_k(Return, Options, Goal, Result).
   98
   99top_k(Return, Options, Goal, Result) :-
  100    (   Options = [_, _|_]
  101    ->  run_optimized(Return, Options, Goal, Result)
  102    ;   dispatch_singles(Return, Options, Goal, Result)
  103    ).
  104
  105dispatch_singles(backtrack,  Opts, Goal, Goal) :- dispatch_singles(Opts, Goal, _).
  106dispatch_singles(list(Term), Opts, Goal, List) :-
  107    option(group_by(Group), Opts, ungrouped),
  108    (   group_by(GK, Term, dispatch_singles(Opts, Goal, GK), List)
  109    *-> Group = GK % instantiate Group
  110    ;   ground(Group),
  111        List = [] % prevent failure if no solutions where found
  112    ).
  113
  114dispatch_singles([],    Goal, ungrouped) :- call(Goal).
  115dispatch_singles([Opt], Goal, GK) :- dispatch_single(Opt, Goal, GK).
  116
  117ordered_term_variables(Term, Vars) :-
  118    term_variables(Term, UVars),
  119    sort(UVars, Vars).
  120
  121dispatch_single(order_by(Spec),  Goal, ungrouped) :- order_by([Spec], Goal).
  122dispatch_single(limit(K),        Goal, ungrouped) :- limit(K, Goal).
  123dispatch_single(distinct(W),     Goal, ungrouped) :- distinct(W, Goal).
  124dispatch_single(group_by(Group), Goal, Group) :- dispatch_group_by(Goal, Group).
  125
  126dispatch_group_by(Goal, Group) :-
  127    ordered_term_variables(Goal, GVars),
  128    ordered_term_variables(Group, KVars),
  129    ord_subtract(GVars, KVars, TVars),
  130    Term =.. [v|TVars],
  131    bagof(Term, Goal, List),
  132    member(Term, List).
  133
  134run_optimized(Return, Opts, Goal, Result) :-
  135    option(limit(Count), Opts, inf),
  136    option(order_by(OrderSpec), Opts, asc(unordered)),
  137    option(group_by(Group), Opts, ungrouped),
  138    (   option(distinct(Witness), Opts)
  139    ->  Distinct = true
  140    ;   Distinct = false
  141    ),
  142    priority_for(OrderSpec, Pri, Key),
  143    run_optimized(Goal, Count, Pri, Key, Distinct, Witness, Group, Return, Result).
  144
  145/* ---------- ordering ---------- */
  146
  147% We store entries with a "priority" such that the heap root is the WORST
  148% among kept ones. Then replacement is cheap.
  149%
  150% For asc(Key): "best" = smallest Key. "worst" = largest Key.
  151% So make heap a max-heap on Key by using priority = Key and using a max-heap?
  152% library(heaps) is a min-heap, so emulate max-heap by negating an order key
  153% or by wrapping with a reversed term order.
  154%
  155% Easiest: map Key to Priority so that worse == smaller Priority (min-heap root).
  156% For asc: worse = larger Key, so Priority = key_rank(Key) where larger Key => smaller Priority
  157% We can use Priority = rev(Key) where rev/1 compares reverse via standard order.
  158
  159priority_for(asc(Key),  @=<, Key).
  160priority_for(desc(Key), @>=, Key).
  161
  162% Because root is WORST, a candidate is "better" if its priority is GREATER
  163% than worst priority (min-heap root means smallest = worst).
  164better_than(@>=, P1, P2) :- P1 @> P2.
  165better_than(@=<, P1, P2) :- P1 @< P2.
  166
  167setup_state(false, none).
  168setup_state(true,  state(DictHolder)) :-
  169    empty_assoc(D0),
  170    DictHolder = holder(D0).
  171
  172seen_hash(state(holder(D)), Hash, Key) :-
  173    get_assoc(Hash, D, Key).
  174
  175mark_hash(state(DictHolder), Hash, Key) :-
  176    DictHolder = holder(D0),
  177    put_assoc(Hash, D0, Key, D1),
  178    nb_setarg(1, DictHolder, D1),
  179    true.
  180
  181update_topk(Count, Pri, Key, Entry, HHolder) :-
  182    HHolder = holder(N0, H0),
  183    (   N0 < Count
  184    ->  add_to_heap(H0, Key, Entry, H1),
  185        N1 is N0 + 1,
  186        nb_setarg(1, HHolder, N1),
  187        nb_setarg(2, HHolder, H1)
  188    ;   % Heap full: compare with worst (root because root is "worst")
  189        replace_topk(_WorstKey, Pri, Key, Entry, HHolder)
  190    ).
  191
  192revdel_from_heap(Q0,Px,X,Q) :-
  193    get_from_heap(Q0,Py,Y,Q1),
  194    revdel_from_heap(Q1,Px,X,Q2),
  195    add_to_heap(Q2,Py,Y,Q),
  196    !.
  197revdel_from_heap(Q0,P,X,Q) :-
  198    get_from_heap(Q0,P,X,Q).
  199
  200pri_del_from_heap(Pri, H0, Key, Entry, HRest) :-
  201    (   var(Key),
  202        Pri == (@=<)
  203    ->  revdel_from_heap(H0, Key, Entry, HRest)
  204    ;   delete_from_heap(H0, Key, Entry, HRest)
  205    ).
  206
  207replace_topk(Key1, Pri, Key, Entry, HHolder) :-
  208    HHolder = holder(_, H0),
  209    pri_del_from_heap(Pri, H0, Key1, _, HRest),
  210    (   better_than(Pri, Key, Key1)
  211    ->  add_to_heap(HRest, Key, Entry, H1),
  212        nb_setarg(2, HHolder, H1)
  213    ;   fail
  214    ).
  215
  216heap_to_list(holder(_N, H), Pri, SortedKeyVars) :-
  217    heap_to_list(H, KV0),
  218    ( Pri == (@=<)
  219    ->% To avoid the reverse we need a max-heap, SWI-Prolog only provides a
  220      % min-heap implementation
  221      reverse(KV0, KV1)
  222    ; KV1 = KV0
  223    ),
  224    sort(1, Pri, KV1, SortedKeyVars).
  225
  226/* ---------- optimized execution (top-K per group) ---------- */
  227
  228run_optimized(Goal, Count, Pri, Key, Distinct, Witness, Group, Return, Result) :-
  229    term_variables(Goal, Vars),
  230    setup_state(Distinct, State),
  231    empty_assoc(G0),
  232    GHolder = holder(G0),  % maps GroupKey -> holder(N,Heap)
  233    term_variables(Witness, WVars),
  234    WTerm =.. [w|WVars],
  235    ( ground(Group)
  236    ->create_bucket(GHolder, Group, _)
  237    ; true
  238    ),
  239    forall(Goal,
  240           ignore(consider_solution(Count, Pri, Key, Distinct, WTerm, Group, Vars, State, GHolder))),
  241    finalize(Group, Return, Pri, GHolder, Vars, Goal, Result).
  242
  243consider_solution(Count, Pri, Key, Distinct, WTerm, Group, Vars, State, GHolder) :-
  244    Entry = Vars,
  245    ( Distinct == true
  246    ->variant_sha1(WTerm, Hash),
  247      ( seen_hash(State, Hash, Key1)
  248      ->get_or_create_bucket(GHolder, Group, Bucket),
  249        replace_topk(Key1, Pri, Key, Entry, Bucket),
  250        GHolder = holder(G0),
  251        put_assoc(Group, G0, Bucket, G1),
  252        nb_setarg(1, GHolder, G1),
  253        mark_hash(State, Hash, Key),
  254        fail
  255      ; true
  256      )
  257    ; true
  258    ),
  259    get_or_create_bucket(GHolder, Group, Bucket),
  260    update_topk(Count, Pri, Key, Entry, Bucket),
  261    GHolder = holder(G0),
  262    put_assoc(Group, G0, Bucket, G1),
  263    nb_setarg(1, GHolder, G1),
  264    ( Distinct == true
  265    ->mark_hash(State, Hash, Key)
  266    ; true
  267    ).
  268
  269get_or_create_bucket(GHolder, Group, Bucket) :-
  270    GHolder = holder(G0),
  271    (   get_assoc(Group, G0, Bucket)
  272    ->  true
  273    ;   create_bucket(GHolder, Group, Bucket)
  274    ).
  275
  276create_bucket(GHolder, Group, Bucket) :-
  277    GHolder = holder(G0),
  278    empty_heap(H0),
  279    Bucket = holder(0, H0),
  280    put_assoc(Group, G0, Bucket, G1),
  281    nb_setarg(1, GHolder, G1).
  282
  283finalize(Group, Return, Pri, holder(G), Vars, Goal, Result) :-
  284    gen_assoc(Group, G, Bucket),
  285    (Group == ungrouped -> ! ; true), % Minor optimization
  286    heap_to_list(Bucket, Pri, List),
  287    emit_result(Return, Vars, List, Goal, Result).
  288
  289emit_result(list(Term), Vars, List, _, Result) :- findall(Term, member(_Key-Vars, List), Result).
  290emit_result(backtrack, Vars, List, Goal, Goal) :- member(_Key-Vars, List).
  291
  292:- meta_predicate group_by_sorted(+, +, 0, -).

group_by_sorted(+Key, :Value, -Goal, -Bag): Like group_by/4, but assumes that Goal produces solutions ordered by Key. This allows a linear, streaming implementation.

  299group_by_sorted(Key, Value, Goal, Bag) :-
  300    with_local_dynamic([key_value/2], H, group_by_sorted(H, Key, Value, Goal, Bag)).
  301
  302group_by_sorted(H, Key, Value, Goal, Bag) :-
  303    ( copy_term(Goal-Key-Value, Goal1-Key1-Value1),
  304      Goal1,
  305      ld_assertz(H, key_value(Key1, Value1)),
  306      once(ld_call(H, key_value(Key, _))),
  307      Key \= Key1
  308    ; true
  309    ),
  310    group_by(Key, Value, ld_retract(H, key_value(Key, Value)), Bag)