3
7
% Register the `candc` search-path alias.  It is resolved as '..' relative
% to the file currently being loaded, so the aliases below do not depend on
% the working directory.
:- prolog_load_context(file, ThisFile),
   absolute_file_name('..', CandcRoot,
                      [ relative_to(ThisFile),
                        file_type(directory)
                      ]),
   asserta(user:file_search_path(candc, CandcRoot)).

% The shared library and the Boxer sources are addressed below the candc root.
user:file_search_path(semlib, candc(lib)).
user:file_search_path(boxer,  candc(boxer)).
14
% Double-quoted text in this file is read as a list of character codes.
:- set_prolog_flag(double_quotes, codes).

% split/7 and title/1 are filled at start-up with assertz/1
% (see initSplitRules/0 and initTitles/0).
:- dynamic split/7.
:- dynamic title/1.
23
27
28:- use_module(library(lists),[member/2,append/3,reverse/2]). 29:- use_module(library(readutil),[read_stream_to_codes/2]). 30:- use_module(semlib(abbreviations),[iAbb/2,tAbb/2]). 31:- use_module(semlib(errors),[error/2,warning/2]). 32:- use_module(semlib(options),[option/2,parseOptions/2,setOption/3,
33 showOptions/1,setDefaultOptions/1]). 34
35
39
% tokkie/0
%
% Main driver.
%   1. If the '--help' option is set to `do`, show the help text.
%   2. Otherwise open input and output, read the whole input as codes,
%      initialise the rule tables, tokenise sentence by sentence and, when
%      requested, emit the IOB view; both streams are closed afterwards.
%   3. If the streams could not be opened, switch '--help' on and show it.
tokkie :-
   option(Flag, do),
   Flag = '--help', !,
   help.

tokkie :-
   openInput(In),
   openOutput(Out), !,
   read_stream_to_codes(In, Text),
   close(In),
   initTokkie,
   % Character offsets start at 0, sentence numbers at 1.
   readLines(Text, 0, 1, Out, Sentences),
   outputIOB(Text, Sentences, Out),
   close(Out).

tokkie :-
   setOption(tokkie, '--help', do), !,
   help.
58
59
63
% readLines(+Codes, +Pos, +SentNo, +Stream, -Sentences)
%
% Cuts Codes into sentences.  For each sentence: skip leading separators,
% find its end, tokenise it, print it (depending on the output mode) and
% continue with the remaining codes.  Sentences is a list holding one token
% list per sentence.  Pos is the character offset of the first code and
% SentNo the number given to the next sentence.
readLines(Input, Pos0, SentNo, Stream, [SentTokens|More]) :-
   begSent(Input, Pos0, Trimmed, SentStart), !,
   endSent(Trimmed, SentStart, Sentence, SentEnd, Remaining, []),
   % Tokenising starts with an empty difference list as pending token.
   tokenise(Sentence, SentStart, SentStart, Acc-Acc, SentTokens),
   outputTokens(SentTokens, SentNo, Stream),
   NextNo is SentNo + 1,
   readLines(Remaining, SentEnd, NextNo, Stream, More).

% begSent/4 fails once no codes are left: no more sentences.
readLines(_, _, _, _, []).
75
76
80
% begSent(+Codes, +Pos, -Trimmed, -NewPos)
%
% Drops leading separator codes (see sep/1), advancing the position by one
% for every dropped code.  Fails on the empty list.
begSent([Code|Codes], Pos0, Trimmed, Pos) :-
   sep(Code), !,
   Pos1 is Pos0 + 1,
   begSent(Codes, Pos1, Trimmed, Pos).

% First non-separator reached: hand the (non-empty) input back unchanged.
begSent(Input, Pos, Input, Pos) :-
   Input = [_|_].
87
88
100
% endSent(+Codes, +Pos, -Sentence, -EndPos, -Rest, +Last)
%
% Splits Codes at the first sentence boundary.
%   Codes     input codes, starting at character offset Pos
%   Sentence  the codes up to and including the boundary
%   EndPos    offset of the first code of Rest
%   Rest      the codes following the sentence
%   Last      codes of the run read immediately before the current code,
%             most recent first; reset to [] after any code that is neither
%             alphanumeric nor a non-boundary full stop
%
% Clause order matters: the first matching case wins.

% End of input closes the sentence.
endSent([], Pos, [], Pos, [], _) :- !.

% Full stop with nothing accumulated before it: sentence boundary.
endSent([46|Rest], Pos0, [46], Pos, Rest, []) :- !,
   Pos is Pos0 + 1.

% Full stop followed by two quotes and then a non-alphanumeric code:
% the quotes stay with the sentence.
endSent([46,Q1,Q2,Next|Rest], Pos0, [46,Q1,Q2], Pos, [Next|Rest], _) :-
   \+ alphanum(Next), quote(Q1), quote(Q2), !,
   Pos is Pos0 + 3.

% Same with a single quote after the full stop.
endSent([46,Q,Next|Rest], Pos0, [46,Q], Pos, [Next|Rest], _) :-
   \+ alphanum(Next), quote(Q), !,
   Pos is Pos0 + 2.

% Full stop that noSentenceBoundary/3 recognises as not ending the sentence:
% keep it and carry on, remembering it in Last.
endSent([Code|Codes], Pos0, [Code|Sentence], Pos, Rest, Last) :-
   noSentenceBoundary([Code], Codes, Last), !,
   Pos1 is Pos0 + 1,
   endSent(Codes, Pos1, Sentence, Pos, Rest, [Code|Last]).

% Any other full stop (46), question mark (63) or exclamation mark (33)
% ends the sentence.  (A further clause for a bare 46 used to follow here;
% it could never be reached because this clause already commits on 46.)
endSent([End|Rest], Pos0, [End], Pos, Rest, _) :-
   member(End, [46,63,33]), !,
   Pos is Pos0 + 1.

% Alphanumeric code: part of the sentence, added to Last.
endSent([Code|Codes], Pos0, [Code|Sentence], Pos, Rest, Last) :-
   alphanum(Code), !,
   Pos1 is Pos0 + 1,
   endSent(Codes, Pos1, Sentence, Pos, Rest, [Code|Last]).

% Anything else: part of the sentence, and Last starts afresh.
endSent([Code|Codes], Pos0, [Code|Sentence], Pos, Rest, _) :-
   Pos1 is Pos0 + 1,
   endSent(Codes, Pos1, Sentence, Pos, Rest, []).
140
141
% noSentenceBoundary(+Codes, +Following, +Last)
%
% Succeeds when Codes is a single full stop (code 46) that should not be
% taken as the end of a sentence.  Last holds the codes read just before the
% full stop, most recent first; Following holds the codes after it.

% The full stop follows exactly one upper-case letter.
noSentenceBoundary([46], _, [Initial]) :- upper(Initial).

% The codes before the full stop match a stored title/1 entry
% (titles are stored reversed by initTitles/0).
noSentenceBoundary([46], _, Preceding) :- title(Preceding).

% Another full stop already occurs in the codes before this one.
noSentenceBoundary([46], _, Preceding) :- member(46, Preceding).

% The full stop is directly followed by a digit.
noSentenceBoundary([46], [Next|_], _) :- num(Next).
158
159
163
% tokenise(+Codes, +CurPos, +StartPos, +Pending, -Tokens)
%
% Turns the codes of one sentence into a list of tok(From,To,Codes) terms.
%   CurPos    character offset of the first code in Codes
%   StartPos  offset at which the pending token started
%   Pending   the token under construction, as a difference list
%             Codes-Hole (an empty one is written X-X)
%   Tokens    the resulting tokens
%
% The clauses are tried in order; the first one that applies commits.

% End of input, nothing pending.
tokenise([], _, _, []-[], []) :- !.

% End of input: close the pending token and emit it.
tokenise([], Cur, Start, Pending-[], [tok(Start,Cur,Pending)]) :- !.

% Separator while nothing is pending: skip it.
tokenise([Code|Codes], Cur, _, []-[], Tokens) :-
   sep(Code), !,
   Next is Cur + 1,
   tokenise(Codes, Next, Next, Fresh-Fresh, Tokens).

% Separator after a pending token: close and emit the token.
tokenise([Code|Codes], Cur, Start, Pending-Hole, [Tok|Toks]) :-
   sep(Code), !,
   Hole = [],
   Tok = tok(Start,Cur,Pending),
   Next is Cur + 1,
   tokenise(Codes, Next, Next, Fresh-Fresh, Toks).

% Sentence-final material (see final/4) with nothing pending:
% it becomes a token of its own.
tokenise(Input, Cur, _, []-[], [Tok|Toks]) :-
   final(Input, Mark, Rest, Len), !,
   End is Cur + Len,
   Tok = tok(Cur,End,Mark),
   tokenise(Rest, End, End, Fresh-Fresh, Toks).

% Sentence-final material after a pending token: emit the pending token,
% then the final one.
tokenise(Input, Cur, Start, Pending-[], [Word,Tok|Toks]) :-
   final(Input, Mark, Rest, Len), !,
   End is Cur + Len,
   Word = tok(Start,Cur,Pending),
   Tok = tok(Cur,End,Mark),
   tokenise(Rest, End, End, Fresh-Fresh, Toks).

% A sequence that must never be split (see nosplit/2): append it to the
% pending token in one go.
tokenise(Input, Cur, Start, Pending0, Tokens) :-
   dontsplit(Input, Rest, Len, Pending0, Pending), !,
   Next is Cur + Len,
   tokenise(Rest, Next, Start, Pending, Tokens).

% A split rule applies: Left completes the pending token, which is emitted;
% Right becomes the beginning of the next pending token.
tokenise(Input, Cur, Start, Pending-Hole, [Tok|Toks]) :-
   trysplit(Input, Left, Right, Rest, LeftLen, RightLen), !,
   SplitPos is Cur + LeftLen,
   Next is SplitPos + RightLen,
   Hole = Left,
   Tok = tok(Start,SplitPos,Pending),
   append(Right, NewHole, NewPending),
   tokenise(Rest, Next, SplitPos, NewPending-NewHole, Toks).

% Default: add the code to the pending token.
tokenise([Code|Codes], Cur, Start, Pending-Hole, Tokens) :-
   Next is Cur + 1,
   Hole = [Code|NewHole],
   tokenise(Codes, Next, Start, Pending-NewHole, Tokens).
233
234
238
% outputTokens(+Tokens, +SentNo, +Stream)
%
% Prints the tokens of one sentence when '--mode' is `poor` or `rich`
% (token numbering starts at 1); does nothing in any other mode.
outputTokens(Tokens, SentNo, Stream) :-
   (  option('--mode', poor)
   ;  option('--mode', rich)
   ), !,
   printTokens(Tokens, SentNo, 1, Stream).

outputTokens(_, _, _).
248
249
253
% outputIOB(+Codes, +Sentences, +Stream)
%
% When '--mode' is `iob`, prints one tagged line per input code, starting
% at character offset 0; otherwise succeeds without output.
outputIOB(Codes, Sentences, Stream) :-
   (  option('--mode', iob)
   -> printIOB(Codes, 0, Sentences, Stream)
   ;  true
   ).
259
260
264
% printIOB(+Codes, +Pos, +Sentences, +Stream)
%
% Walks over the input codes and writes one tagged tuple per code (see
% tupleIOB/5).  Pos is the character offset of the first code; Sentences is
% the list of per-sentence token lists built by readLines/5.
printIOB([], _, _, _).

printIOB([Code|Codes], Pos, Sentences, Stream) :-
   iobTag(Pos, Sentences, Tag, Tok),
   tupleIOB(Pos, Code, Tag, Tok, Stream),
   Next is Pos + 1,
   printIOB(Codes, Next, Sentences, Stream).

% iobTag(+Pos, +Sentences, -Tag, -Tok)
%
% Picks the tag for the code at offset Pos:
%   'S'  Pos starts the first token of a sentence
%   'T'  Pos starts any other token
%   'I'  Pos lies strictly inside a token
%   'O'  otherwise
% Tok is the token text for 'S' and 'T', and [] for 'I' and 'O'.
iobTag(Pos, Sentences, 'S', Tok) :-
   member([tok(Pos,_,Tok)|_], Sentences), !.
iobTag(Pos, Sentences, 'T', Tok) :-
   member(Sentence, Sentences),
   member(tok(Pos,_,Tok), Sentence), !.
iobTag(Pos, Sentences, 'I', []) :-
   member(Sentence, Sentences),
   member(tok(From,To,_), Sentence),
   Pos > From, Pos < To, !.
iobTag(_, _, 'O', []).
292
293
297
% tupleIOB(+Pos, +Code, +Tag, +Tok, +Stream)
%
% Writes one line for a single input code.
%   '--format' txt:     the code and its tag
%   '--format' prolog:  a tok/2 fact, with the offset and the token text in
%                       a trailing comment
% Fails when '--format' has neither value.
tupleIOB(Pos, Code, Tag, Tok, Stream) :-
   (  option('--format', txt)
   -> format(Stream, '~p ~p~n', [Code,Tag])
   ;  option('--format', prolog)
   -> format(Stream, 'tok(~p,\'~p\'). % ~p ~s~n', [Code,Tag,Pos,Tok])
   ).
305
306
310
% printTokens(+Tokens, +SentNo, +TokNo, +Stream)
%
% Prints the tokens of one sentence.
%   poor mode:  the token texts on one line, separated by a space
%   rich mode:  one line per token with start offset, end offset and the
%               index SentNo*1000+TokNo, written either as a tok/4 term
%               ('--format' prolog) or as plain columns ('--format' txt)
printTokens([], _, _, _).

% Poor mode, last token of the sentence: finish the line.
printTokens([tok(_,_,Word)], _, _, Stream) :-
   option('--mode', poor), !,
   format(Stream, '~s~n', [Word]).

% Rich mode, Prolog format.
printTokens([tok(From,To,Word)|Toks], SentNo, TokNo, Stream) :-
   option('--format', prolog),
   option('--mode', rich), !,
   Id is SentNo*1000 + TokNo,
   format(Stream, 'tok(~p, ~p, ~p, ~s).~n', [From,To,Id,Word]),
   NextNo is TokNo + 1,
   printTokens(Toks, SentNo, NextNo, Stream).

% Rich mode, text format.
printTokens([tok(From,To,Word)|Toks], SentNo, TokNo, Stream) :-
   option('--format', txt),
   option('--mode', rich), !,
   Id is SentNo*1000 + TokNo,
   format(Stream, '~p ~p ~p ~s~n', [From,To,Id,Word]),
   NextNo is TokNo + 1,
   printTokens(Toks, SentNo, NextNo, Stream).

% Poor mode, token followed by more tokens.
printTokens([tok(_,_,Word)|Toks], SentNo, TokNo, Stream) :-
   option('--mode', poor), !,
   format(Stream, '~s ', [Word]),
   printTokens(Toks, SentNo, TokNo, Stream).
337
338
342
% sep(?Code): character codes treated as token separators.
sep(10).      % line feed
sep(13).      % carriage return
sep(32).      % space
sep(9).       % horizontal tab
sep(160).     % no-break space
sep(8194).    % en space
sep(8195).    % em space
% alphanum(?Code): Code is alphabetic (see alpha/1) or a digit.
% Yields at most one solution.
alphanum(Code) :-
   (  alpha(Code) -> true
   ;  num(Code)   -> true
   ).

% alpha(?Code): Code is 62 ('>'), an upper-case or a lower-case ASCII letter.
% Yields at most one solution.
% NOTE(review): why 62 counts as alphabetic is not evident from this file.
alpha(Code) :-
   (  Code = 62   -> true
   ;  upper(Code) -> true
   ;  lower(Code) -> true
   ).
357
% upper(?Code), lower(?Code), num(?Code)
%
% Character classes over ASCII codes.  With a number as argument they test
% it against the range; with an unbound argument they enumerate the codes
% of the class in ascending order.

% 'A'..'Z'
upper(Code) :- number(Code), 64 < Code, Code < 91, !.
upper(Code) :- var(Code), between(0'A, 0'Z, Code).

% 'a'..'z'
lower(Code) :- number(Code), 96 < Code, Code < 123, !.
lower(Code) :- var(Code), between(0'a, 0'z, Code).

% '0'..'9'
num(Code) :- number(Code), 47 < Code, Code < 58, !.
num(Code) :- var(Code), between(0'0, 0'9, Code).
366
367
372
% split(Left, LeftConds, Right, RightConds, RightContext)
%
% Token split rules.  A rule applies where the input reads Left followed by
% Right followed by RightContext and all goals in LeftConds and RightConds
% hold; the token boundary is then placed between Left and Right.  An empty
% Right means the boundary is placed directly after Left.  The rules are
% copied into split/7 by initSplitRules/0 and tried in the order given here.
%
% All literals are double-quoted (read as code lists under the
% double_quotes flag set in this file); the first rule used to be written
% with back-quotes, which is only a code list under SWI-Prolog 7 defaults.

% Negation and clitics.
split("can",[], "not",[], []).
split([_],[], "n't",[], []).
split([_],[], "'ll",[], []).
split([_],[], "'ve",[], []).
split([_],[], "'re",[], []).

split([_],[], "'m",[], []).
split([_],[], "'d",[], []).
split([_],[], "'s",[], []).

% Percent sign after a digit; comma after a percent sign or closing bracket.
split([N],[num(N)], [], [], "%").
split("%",[], ",",[],[]).
split(")",[], ",",[],[]).

% Comma after a digit only when a space (32) or line feed (10) follows;
% comma after a letter; semicolon, colon and closing brackets.
split([N],[num(N)], ",",[], [32]).
split([N],[num(N)], ",",[], [10]).
split([A],[alpha(A)], [], [], ",").
split([_],[], ";",[], []).
split([_],[], ":",[], []).
split([_],[], [],[], ")").
split([_],[], "]",[], []).

% Currency symbols ($, code 163, code 165) before a digit;
% opening brackets before an alphanumeric code.
split("$",[], [N],[num(N)], []).
split([163],[], [N],[num(N)], []).
split([165],[], [N],[num(N)], []).
split("(",[], [X],[alphanum(X)], []).
split("[",[], [X],[alphanum(X)], []).

% Quotes: before any quote, and between a quote and an alphanumeric code.
split([_],[], [Q],[quote(Q)], []).
split([Q],[quote(Q)], [X],[alphanum(X)], []).
404
405
409
% dontsplit(+Input, -Rest, -Len, +Pending0, -Pending)
%
% Succeeds when Input starts with a sequence listed in nosplit/2.  The
% sequence (Len codes) is appended to the pending token, which is kept as a
% difference list, and Rest is the input after it.
dontsplit(Input, Rest, Len, Acc-Hole0, Acc-Hole) :-
   nosplit(Chunk, Len),
   append(Chunk, Rest, Input), !,
   append(Chunk, Hole, Hole0).
414
% nosplit(?Codes, ?Length)
%
% Code sequences that are kept together inside one token, with their length.
nosplit("hi'it", 5).
nosplit("e.g.", 4).
% Capital O (79), a right single quote, then an upper-case letter.
nosplit([79,Quote,Letter], 3) :-
   rsq(Quote),
   upper(Letter).
419
423
% initTokkie/0: fills the dynamic tables used by the tokeniser.
initTokkie :-
   initTitles,
   initSplitRules.

% initTitles/0
%
% Stores every abbreviation that tAbb/2 gives for the selected '--language'
% as a title/1 fact, with its codes reversed.  Fails when '--language' is
% not set.
initTitles :-
   option('--language', Language), !,
   forall( ( tAbb(Language, Title),
             reverse(Title, Reversed)
           ),
           assertz(title(Reversed)) ).

% initSplitRules/0
%
% Copies every split/5 rule into split/7, adding the lengths of the left
% and right parts.
initSplitRules :-
   forall( ( split(Left, LeftConds, Right, RightConds, Context),
             length(Left, LeftLen),
             length(Right, RightLen)
           ),
           assertz(split(Left, LeftLen, LeftConds,
                         Right, RightLen, RightConds, Context)) ).
443
444
448
% final(+Input, -Token, -Rest, -Length)
%
% Sentence-final material that becomes a token of its own.  Input must be
% exactly the remaining codes of the sentence.  Token is the final token,
% Rest what is left to tokenise after it, and Length the length of Token.
%
% endSent/6 ends a sentence on '?', '!' and '.'; all three are detached
% here (the '!' case was missing, which left "Hello!" as a single token).
final("?", "?", [], 1).
final("!", "!", [], 1).
final(".", ".", [], 1).

% Full stop followed by one closing quote: the quote is tokenised next.
final([46,Q], [46], [Q], 1) :- quote(Q).
453
454
458
% trysplit(+Input, -Left, -Right, -Rest, -LeftLen, -RightLen)
%
% Finds the first split/7 rule that applies at the start of Input: Input
% must read Left, then Right, then the rule's right context, and the rule's
% conditions on Left and Right must hold.  Rest is the input after Right.
trysplit(Input, Left, Right, Rest, LeftLen, RightLen) :-
   split(Left, LeftLen, LeftConds, Right, RightLen, RightConds, Context),
   append(Left, AfterLeft, Input),
   checkConds(LeftConds),
   append(Right, Rest, AfterLeft),
   checkConds(RightConds),
   append(Context, _, Rest), !.
466
467
471
% checkConds(+Goals)
%
% Succeeds when every goal in the list can be proved; each goal is
% committed to its first solution.
checkConds([]).
checkConds([Cond|Conds]) :-
   once(Cond),
   checkConds(Conds).
474
475
479
% rsq(?Code): codes accepted as a right single quote.
rsq(39).      % apostrophe
rsq(8217).    % right single quotation mark
482
483
487
% quote(?Code): codes treated as quotation marks.
quote(34).      % quotation mark
quote(39).      % apostrophe
quote(96).      % grave accent
quote(8216).    % left single quotation mark
quote(8217).    % right single quotation mark
quote(8218).    % single low-9 quotation mark
quote(8220).    % left double quotation mark
quote(8221).    % right double quotation mark
quote(8222).    % double low-9 quotation mark
498
502
% quotes(?Code): the single-quote-like codes.
% NOTE(review): not referenced anywhere in the part of the file seen here.
quotes(96).      % grave accent
quotes(39).      % apostrophe
quotes(8216).    % left single quotation mark
quotes(8217).    % right single quotation mark
quotes(8218).    % single low-9 quotation mark
508
509
510
514
% openInput(-Stream)
%
% Opens the input as UTF-8 text.
%   - '--stdin' dont: opens the existing file named by '--input'.
%   - '--stdin' do:   reads from standard input.
% Fails when neither applies (for instance when the input file is missing).
openInput(Stream) :-
   option('--stdin', dont),
   option('--input', File),
   exists_file(File), !,
   open(File, read, Stream, [encoding(utf8)]).

openInput(Stream) :-
   option('--stdin', do),
   % The flag only sets the default for streams opened from now on ...
   set_prolog_flag(encoding, utf8),
   % ... so the already open standard input has to be switched explicitly.
   set_stream(user_input, encoding(utf8)),
   warning('reading from standard input', []),
   prompt(_, ''),
   Stream = user_input.
527
528
532
% openOutput(-Stream)
%
% Opens the file named by '--output' for writing as UTF-8 text.  Standard
% output is used when no such option applies, when it names user_output,
% or (after reporting an error) when the file is not writable.
openOutput(Stream) :-
   option('--output', Target),
   atomic(Target),
   Target \= user_output,
   (  access_file(Target, write)
   -> open(Target, write, Stream, [encoding(utf8)])
   ;  error('cannot write to specified file ~p', [Target]),
      Stream = user_output
   ), !.

openOutput(user_output).
543
544
548
% help/0
%
% Prints the usage line and the option overview on user_error when '--help'
% is set to `do`; succeeds silently when it is set to `dont`; fails
% otherwise.
help :-
   (  option('--help', do)
   -> format(user_error, 'usage: tokkie [options]~n~n', []),
      showOptions(tokkie)
   ;  option('--help', dont)
   -> true
   ).
556
557
561
% start/0
%
% Program entry point: takes the command-line arguments (without the
% program name), installs the default options, parses the arguments, runs
% the tokeniser and halts.
start :-
   current_prolog_flag(argv, [_Program|UserArgs]),
   setDefaultOptions(tokkie),
   parseOptions(tokkie, UserArgs),
   tokkie, !,
   halt.
568
569start:-
570 error('tokkie failed',[]),
571 halt