1:- module(lsp_formatter_parser, [ reified_format_for_file/2,
    2                                  emit_reified/2 ]).

/** <module> LSP Parser For Formatter

Module for parsing Prolog source code, for subsequent formatting

@author James Cash

@tbd Files using quasi-quotations currently aren't supported; need to
teach prolog_read_source_term/4 to load correctly

*/

   14:- use_module(library(apply)).   15:- use_module(library(apply_macros)).   16:- use_module(library(clpfd)).   17:- use_module(library(prolog_source)).   18:- use_module(library(readutil), [ read_line_to_codes/2,
   19                                   read_file_to_string/3 ]).   20
   21%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   22% Reading in terms
   23%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%! file_lines_start_end(+Path:text, -LineCharRange:list) is det.
%
%  Construct a mapping of file offsets to line numbers for the file at
%  Path. LineCharRange is a list, in file order, of terms
%  =line_start_end(LineNumber, LineOffsetStart, LineOffsetEnd)=.
%  LineOffsetEnd is one less than the character count reached after
%  reading that line. The read that returns =end_of_file= also
%  contributes one final entry to the list.
file_lines_start_end(Path, LineCharRange) :-
    setup_call_cleanup(
        open(Path, read, Stream),
        line_ranges_from_stream(Stream, 1, 0, LineCharRange),
        close(Stream)).

% line_ranges_from_stream(+Stream, +LineNo, +LineStart, -Ranges)
%
% Read Stream line by line. LineNo and LineStart describe the line about
% to be read; one line_start_end/3 term is produced per read, the last
% one being for the read that hits end_of_file.
line_ranges_from_stream(Stream, LineNo, LineStart, [Range|Ranges]) :-
    read_line_to_codes(Stream, Line),
    once(stream_property(Stream, position(After))),
    stream_position_data(char_count, After, NextStart),
    LineEnd is NextStart - 1,
    Range = line_start_end(LineNo, LineStart, LineEnd),
    (   Line == end_of_file
    ->  Ranges = []
    ;   NextLineNo is LineNo + 1,
        line_ranges_from_stream(Stream, NextLineNo, NextStart, Ranges)
    ).
%! read_term_positions(+Path:text, -TermsWithPositions:list) is det.
%
%  Read in all the terms in the file at Path, using
%  prolog_read_source_term/4, to a list of dictionaries. The list is in
%  file order and its last element is the dictionary for the
%  =end_of_file= term. Each dictionary has the following keys:
%
%    * term
%      The term read in, with variables replaced with the term
%      var(VariableName).
%    * pos
%      The position of the term (see prolog_read_source_term/4).
%    * subterm
%      The position of the subterms in term (see
%      prolog_read_source_term/4).
%    * variable_names
%      List of Name=Var terms for the variables in term. Note that the
%      variables have already been replaced with var(Name).
%    * comments
%      Comments in the term, with the same format as
%      prolog_read_source_term/4.
%
%  Syntax errors are raised as exceptions (=syntax_errors(error)=).
read_term_positions(Path, TermsWithPositions) :-
    % Accumulate in a compound updated with nb_setarg/3 so the collected
    % dictionaries survive the failure-driven repeat loop.
    Acc = data([]),
    prolog_canonical_source(Path, SourceId),
    setup_call_cleanup(
        prolog_open_source(SourceId, Stream),
        ( repeat,
          prolog_read_source_term(Stream, Term, _Ex, [term_position(TermPos),
                                                      subterm_positions(SubTermPos),
                                                      variable_names(VarNames),
                                                      comments(Comments),
                                                      syntax_errors(error)]),
          % Bind each named variable to var(Name) so names survive copying.
          maplist([Name=Var]>>( Var = var(Name) ), VarNames),
          arg(1, Acc, Lst),
          % Key is `variable_names`, as documented (was misspelled).
          nb_setarg(1, Acc, [_{term: Term, pos: TermPos, subterm: SubTermPos,
                               variable_names: VarNames, comments: Comments}|Lst]),
          Term = end_of_file, !
        ),
        prolog_close_source(Stream)),
    arg(1, Acc, TermsWithPositionsRev),
    reverse(TermsWithPositionsRev, TermsWithPositions).
   86
   87:- thread_local current_source_string/1.
 reified_format_for_file(+Path:string, -Reified:list) is det
Read the prolog source file at Path into a flattened list of terms indicating content, comments, and whitespace.
   93reified_format_for_file(Path, Reified) :-
   94    retractall(current_source_string(_)),
   95    read_file_to_string(Path, FileString, []),
   96    read_term_positions(Path, TermsWithPos),
   97    setup_call_cleanup(
   98        assertz(current_source_string(FileString)),
   99        expand_term_positions(TermsWithPos, Reified0),
  100        retractall(current_source_string(_))
  101    ),
  102    sort(1, @=<, Reified0, Reified1),
  103    file_lines_start_end(Path, LinesStartEnd),
  104    InitState = _{last_line: 1, last_char: 0, line_bounds: LinesStartEnd},
  105    add_whitespace_terms(InitState, Reified1, Reified2),
  106    simplify_reified_terms(Reified2, Reified).
  107
  108% Remove no-longer needed positioning information to make things less
  109% annoying for later steps.
  110simplify_reified_terms(In, Out) :-
  111    maplist(simplify_reified_term, In, Out).
  112
  113simplify_reified_term(newline, newline) :- !.
  114simplify_reified_term(white(N), white(N)) :- !.
  115simplify_reified_term(Term, SimpleTerm) :-
  116    % all other terms have two extra args, From & To
  117    compound_name_arguments(Term, Name, [_, _|Args]),
  118    ( Args = []
  119    -> SimpleTerm = Name
  120    ;  compound_name_arguments(SimpleTerm, Name, Args) ).
%! emit_reified(+To, +Reified) is det.
%
%  Output a source file, as read with reified_format_for_file/2, to To.
%  To is passed on as the output argument of format/3 for every token
%  in Reified, in order.
emit_reified(To, Reified) :-
    maplist(emit_reified_(To), Reified).
  130
  131emit_reified_(To, newline) => format(To, "~n", []).
  132emit_reified_(To, white(N)) =>
  133    length(Whites, N),
  134    maplist(=(0' ), Whites),
  135    format(To, "~s", [Whites]).
  136emit_reified_(To, comma) => format(To, ",", []).
  137emit_reified_(To, simple(T)) =>
  138    format(To, "~s", [T]).
  139emit_reified_(To, simple_quoted(T)) =>
  140    format(To, "'~q'", [T]).
  141emit_reified_(To, string(T)), string(T) =>
  142    format(To, "~q", [T]).
  143emit_reified_(To, string(T)) =>
  144    % string term, but not a string, must be codes
  145    format(To, "`~s`", [T]).
  146emit_reified_(To, term_begin(Func, _, Parens)) =>
  147    ( Parens = true
  148    -> Format = "~q("
  149    ;  Format = "~w" ),
  150    format(To, Format, [Func]).
  151emit_reified_(To, term_end(Parens, TermState)) =>
  152    ( Parens = true
  153    -> MaybeClose = ")"
  154    ; MaybeClose = "" ),
  155    ( TermState = toplevel
  156    -> MaybeStop = "."
  157    ; MaybeStop = "" ),
  158    format(To, "~w~w", [MaybeClose, MaybeStop]).
  159emit_reified_(To, list_begin) =>
  160    format(To, "[", []).
  161emit_reified_(To, list_tail) =>
  162    format(To, "|", []).
  163emit_reified_(To, list_end) =>
  164    format(To, "]", []).
  165emit_reified_(To, comment(Text)) =>
  166    format(To, "~s", [Text]).
  167emit_reified_(To, braces_begin) =>
  168    format(To, "{", []).
  169emit_reified_(To, braces_end) =>
  170    format(To, "}", []).
  171emit_reified_(To, parens_begin) =>
  172    format(To, "(", []).
  173emit_reified_(To, parens_end) =>
  174    format(To, ")", []).
  175emit_reified_(To, dict_tag(var(Tag))) =>
  176    format(To, "~w", [Tag]).
  177emit_reified_(To, dict_tag(Tag)), var(Tag) =>
  178    % if Tag is still a var, it must be anonymous
  179    format(To, "_", []).
  180emit_reified_(To, dict_tag(Tag)) =>
  181    % if Tag is still a var, it must be anonymous
  182    format(To, "~w", [Tag]).
  183emit_reified_(To, dict_begin) =>
  184    format(To, "{", []).
  185emit_reified_(To, dict_sep) =>
  186    format(To, ":", []).
  187emit_reified_(To, dict_end) =>
  188    format(To, "}", []).
%! add_whitespace_terms(+State:dict, +Reified:list, -Formatted:list) is det.
%
%  Add terms indicating whitespace and newlines in between positioned
%  terms, as created by reified_format_for_file/2. State holds the line
%  and column where the previous term ended (=last_line=, =last_char=)
%  and the line offset table (=line_bounds=). The result always ends
%  with a =newline= token.
add_whitespace_terms(_State, [], [newline]) :- !.
add_whitespace_terms(State, [Term|Terms], Formatted) :-
    LineBounds = State.line_bounds,
    % every positioned term starts with its From and To offsets
    arg(1, Term, StartOffset),
    arg(2, Term, EndOffset),
    stream_position_at_offset(LineBounds, StartOffset, StartPos),
    stream_position_at_offset(LineBounds, EndOffset, EndPos),
    % emit the gap between the previous term and this one, then the term
    sync_position_whitespace(State, StartPos, Formatted, [Term|AfterTerm]),
    update_state_position(State, EndPos, NextState),
    add_whitespace_terms(NextState, Terms, AfterTerm).
  204
  205expand_term_positions([], []).
  206expand_term_positions([InfoDict|Rest], Expanded0) :-
  207    ( InfoDict.comments \= []
  208    -> expand_comments_positions(InfoDict.comments, Expanded0, Expanded1)
  209    ;  Expanded1 = Expanded0 ),
  210
  211    Term = InfoDict.term,
  212    ( Term \= end_of_file % just for comments at the end
  213    -> expand_subterm_positions(Term, toplevel, InfoDict.subterm,
  214                                Expanded1, Expanded2)
  215    ;  Expanded2 = Expanded1 ),
  216
  217    expand_term_positions(Rest, Expanded2).
  218
  219expand_comments_positions([], Tail, Tail) :- !.
  220expand_comments_positions([Comment|Rest], Expanded, Tail) :-
  221    expand_comment_positions(Comment, Expanded, Tail0),
  222    expand_comments_positions(Rest, Tail0, Tail).
  223
  224expand_comment_positions(CommentPos-Comment, Expanded, ExpandedTail) :-
  225    term_end_position(Comment, CommentEndPosRel),
  226    increment_stream_position(CommentPos, CommentEndPosRel, CommentEndPos),
  227    stream_position_data(char_count, CommentPos, From),
  228    stream_position_data(char_count, CommentEndPos, To),
  229    Expanded = [comment(From, To, Comment)|ExpandedTail].
  230
  231expand_subterm_positions(Term, _TermState, term_position(_From, _To, FFrom, FTo, SubPoses),
  232                         Expanded, ExTail), functor(Term, ',', _, _) =>
  233    % special-case comma terms to be reified as commas
  234    Expanded = [comma(FFrom, FTo)|ExpandedTail0],
  235    functor(Term, _, Arity, _),
  236    expand_term_subterms_positions(false, Term, Arity, 1, SubPoses, ExpandedTail0, ExTail).
  237expand_subterm_positions(Term, TermState, term_position(From, To, FFrom, FTo, SubPoses),
  238                         Expanded, ExTail) =>
  239    % using functor/4 to allow round-tripping zero-arity functors
  240    functor(Term, Func, Arity, TermType),
  241    % better way to tell if term is parenthesized?
  242    % read functor from current_source_string/1 (as with simple below)
  243    % and see if parens are there?
  244    (  From = FFrom, max_subterm_to(SubPoses, SubTermMax), To > SubTermMax
  245    -> ( Parens = true, FTo1 is FTo + 1 ) % add space for the parenthesis
  246    ;  ( Parens = false, FTo1 = FTo )  ),
  247    Expanded = [term_begin(FFrom, FTo1, Func, TermType, Parens)|ExpandedTail0],
  248    expand_term_subterms_positions(Parens, Term, Arity, 1, SubPoses,
  249                                   ExpandedTail0, ExpandedTail1),
  250    succ(To0, To),
  251    ExpandedTail1 = [term_end(To0, To, Parens, TermState)|ExpandedTail2],
  252    maybe_add_comma(TermState, To, ExpandedTail2, ExTail).
  253expand_subterm_positions(Term, TermState, string_position(From, To), Expanded, Tail) =>
  254    Expanded = [string(From, To, Term)|Tail0],
  255    maybe_add_comma(TermState, To, Tail0, Tail).
  256expand_subterm_positions(_Term, TermState, From-To, Expanded, Tail) =>
  257    current_source_string(FileString),
  258    Length is To - From,
  259    sub_string(FileString, From, Length, _, SimpleString),
  260    Expanded = [simple(From, To, SimpleString)|Tail0],
  261    maybe_add_comma(TermState, To, Tail0, Tail).
  262expand_subterm_positions(Term, TermState, list_position(From, To, Elms, HasTail), Expanded, Tail) =>
  263    assertion(is_listish(Term)),
  264    ListBeginTo is From + 1,
  265    Expanded = [list_begin(From, ListBeginTo)|Expanded1],
  266    expand_list_subterms_positions(Term, Elms, Expanded1, Expanded2),
  267    succ(To0, To),
  268    (  HasTail = none
  269    -> Expanded2 = [list_end(To0, To)|Tail0]
  270    ;  ( arg(1, HasTail, TailFrom),
  271         succ(TailBarFrom, TailFrom),
  272         Expanded2 = [list_tail(TailBarFrom, TailFrom)|Expanded3],
  273         list_tail(Term, Elms, ListTail),
  274         expand_subterm_positions(ListTail, false, HasTail, Expanded3, Expanded4),
  275         Expanded4 = [list_end(To0, To)|Tail0] )  ),
  276    maybe_add_comma(TermState, To, Tail0, Tail).
  277expand_subterm_positions(Term, TermState, brace_term_position(From, To, BracesPos), Expanded, Tail) =>
  278    BraceTo is From + 1,
  279    Expanded = [braces_begin(From, BraceTo)|Tail0],
  280    Term = {Term0},
  281    expand_subterm_positions(Term0, false, BracesPos, Tail0, Tail1),
  282    succ(To1, To),
  283    Tail1 = [braces_end(To1, To)|Tail2],
  284    maybe_add_comma(TermState, To1, Tail2, Tail).
  285expand_subterm_positions(Term, TermState, parentheses_term_position(From, To, ContentPos),
  286                         Expanded, Tail) =>
  287    ParenTo is From + 1,
  288    Expanded = [parens_begin(From, ParenTo)|Tail0],
  289    expand_subterm_positions(Term, false, ContentPos, Tail0, Tail1),
  290    succ(To1, To),
  291    Tail1 = [parens_end(To1, To)|Tail2],
  292    maybe_add_comma(TermState, To, Tail2, Tail).
  293expand_subterm_positions(Term, TermState, dict_position(_From, To, TagFrom, TagTo, KeyValPos),
  294                         Expanded, Tail) =>
  295    is_dict(Term, Tag),
  296    DictBraceTo is TagTo + 1,
  297    Expanded = [dict_tag(TagFrom, TagTo, Tag), dict_begin(TagTo, DictBraceTo)|Tail0],
  298    expand_dict_kvs_positions(Term, KeyValPos, Tail0, Tail1),
  299    succ(To1, To),
  300    Tail1 = [dict_end(To1, To)|Tail2],
  301    maybe_add_comma(TermState, To, Tail2, Tail).
  302
  303maybe_add_comma(subterm_item, CommaFrom, Tail0, Tail) :- !,
  304    CommaTo is CommaFrom + 1,
  305    Tail0 = [comma(CommaFrom, CommaTo)|Tail].
  306maybe_add_comma(_, _, Tail, Tail).
  307
  308is_listish(L) :- \+ var(L), !.
  309is_listish([]).
  310is_listish([_|_]).
  311
  312list_tail(Tail, [], Tail) :- !.
  313list_tail([_|Rest], [_|PosRest], Tail) :-
  314    list_tail(Rest, PosRest, Tail).
  315
  316max_subterm_to(SubPoses, SubTermMaxTo) :-
  317    aggregate_all(max(To),
  318                  ( member(Pos, SubPoses),
  319                    arg(2, Pos, To) ),
  320                  SubTermMaxTo).
  321
  322expand_dict_kvs_positions(_, [], Tail, Tail) :- !.
  323expand_dict_kvs_positions(Dict, [Pos|Poses], Expanded0, Tail) :-
  324    Pos = key_value_position(_From, To, SepFrom, SepTo, Key, KeyPos, ValuePos),
  325    get_dict(Key, Dict, Value),
  326    expand_subterm_positions(Key, false, KeyPos, Expanded0, Expanded1),
  327    Expanded1 = [dict_sep(SepFrom, SepTo)|Expanded2],
  328    expand_subterm_positions(Value, false, ValuePos, Expanded2, Expanded3),
  329    CommaTo is To + 1,
  330    ( Poses = [_|_]
  331    -> Expanded3 = [comma(To, CommaTo)|Expanded4]
  332    ;  Expanded3 = Expanded4 ),
  333    expand_dict_kvs_positions(Dict, Poses, Expanded4, Tail).
  334
  335% possible for the list to still have a tail when out of positions
  336expand_list_subterms_positions(_, [], Tail, Tail) :- !.
  337expand_list_subterms_positions([Term|Terms], [Pos|Poses], Expanded, Tail) :-
  338    ( Poses = [_|_]
  339    -> TermState = subterm_item
  340    ;  TermState = false ),
  341    expand_subterm_positions(Term, TermState, Pos, Expanded, Expanded1),
  342    expand_list_subterms_positions(Terms, Poses, Expanded1, Tail).
  343
  344expand_term_subterms_positions(_Parens, _Term, _Arity, _Arg, [], Tail, Tail) :- !.
  345expand_term_subterms_positions(Parens, Term, Arity, Arg, [SubPos|Poses], Expanded, ExpandedTail) :-
  346    assertion(between(1, Arity, Arg)),
  347    arg(Arg, Term, SubTerm),
  348    ( Parens = true, Arg < Arity
  349    -> State = subterm_item
  350    ;  State = false ),
  351    expand_subterm_positions(SubTerm, State, SubPos, Expanded, Expanded0),
  352    succ(Arg, Arg1),
  353    expand_term_subterms_positions(Parens, Term, Arity, Arg1, Poses, Expanded0, ExpandedTail).
  354
  355increment_stream_position(StartPos, RelPos, EndPos) :-
  356    stream_position_data(char_count, StartPos, StartCharCount),
  357    stream_position_data(char_count, RelPos, RelCharCount),
  358    CharCount is StartCharCount + RelCharCount,
  359    stream_position_data(byte_count, StartPos, StartByteCount),
  360    stream_position_data(byte_count, RelPos, RelByteCount),
  361    ByteCount is StartByteCount + RelByteCount,
  362    stream_position_data(line_count, StartPos, StartLineCount),
  363    stream_position_data(line_count, RelPos, RelLineCount),
  364    stream_position_data(line_position, StartPos, StartLinePosition),
  365    stream_position_data(line_position, RelPos, RelLinePosition),
  366    ( RelLineCount == 1
  367    -> LineCount = StartLineCount,
  368       LinePosition is StartLinePosition + RelLinePosition
  369    ; ( LineCount is StartLineCount + RelLineCount - 1,
  370        LinePosition = RelLinePosition ) ),
  371    EndPos = '$stream_position_data'(CharCount, LineCount, LinePosition, ByteCount).
  372
  373update_state_position(State0, EndPos, State2) :-
  374    stream_position_data(line_count, EndPos, EndLineCount),
  375    stream_position_data(line_position, EndPos, EndLinePos),
  376    put_dict(last_line, State0, EndLineCount, State1),
  377    put_dict(last_char, State1, EndLinePos, State2).
  378
  379sync_position_whitespace(State, TermPos, Expanded, ExpandedTail) :-
  380    PrevLineCount = State.last_line,
  381    stream_position_data(line_count, TermPos, NewLineCount),
  382    NewLines is NewLineCount - PrevLineCount,
  383    ( NewLines > 0
  384    -> n_copies_of(NewLines, newline, Expanded, Expanded0),
  385       PrevLinePosition = 0
  386    ;  ( Expanded = Expanded0,
  387         PrevLinePosition = State.last_char )
  388    ),
  389
  390    stream_position_data(line_position, TermPos, NewLinePosition),
  391    Whitespace is NewLinePosition - PrevLinePosition,
  392    ( Whitespace > 0
  393    -> Expanded0 = [white(Whitespace)|ExpandedTail]
  394    ;  Expanded0 = ExpandedTail ).
  395
  396file_offset_line_position(LineCharMap, CharCount, Line, LinePosition) :-
  397    member(line_start_end(Line, Start, End), LineCharMap),
  398    between(Start, End, CharCount),
  399    LinePosition #= CharCount - Start, !.
  400
  401stream_position_at_offset(LineCharMap, To, EndPos) :-
  402    CharCount = To,
  403    ByteCount = To, % need to check for multibyte...
  404    file_offset_line_position(LineCharMap, To, LineCount, LinePosition),
  405    % breaking the rules, building an opaque term
  406    EndPos = '$stream_position_data'(CharCount, LineCount, LinePosition, ByteCount).
  407
  408% Helpers
  409
  410term_end_position(Term, Position) :-
  411    setup_call_cleanup(
  412        open_null_stream(Out),
  413        ( write(Out, Term),
  414          stream_property(Out, position(Position))
  415        ),
  416        close(Out)).
  417
  418n_copies_of(0, _, Tail, Tail) :- !.
  419n_copies_of(N, ToCopy, [ToCopy|Rest], Tail) :-
  420    N1 is N - 1,
  421    n_copies_of(N1, ToCopy, Rest, Tail)