:- ensure_loaded('$REGULUS/PrologLib/compatibility').

%---------------------------------------------------------------

% Post-editing experiments: reading post-editing data (CSV or XLIFF),
% computing diffs between machine-translated targets and their post-edited
% versions, and writing reports.
%
% The export list names read_xliff/3: that is the arity which is actually
% defined in this file and called by test_postediting/1 (there is no
% read_xliff/2).
:- module(postediting_experiments,
    [read_postediting_csv/2,
     read_postediting_csv/4,
     format_postediting_csv/2,
     analyse_postediting_csv/3,
     read_xliff/3,
     test_postediting/1
    ]
).

%---------------------------------------------------------------

:- use_module('$REGULUS/PrologLib/CorpusTools/tokenize_sents').
:- use_module('$REGULUS/PrologLib/CorpusTools/utils').
:- use_module('$REGULUS/PrologLib/utilities').

:- use_module(library(random)).
:- use_module(library(lists)).
:- use_module(library(xml)).

%---------------------------------------------------------------

% test_postediting(+TestId)
% Canned invocations of the exported predicates on fixed data files.
% TestId is an atom naming the experiment to run.

% Diff of one hard-coded sentence pair, printed to the current output.
test_postediting(smart_diff1) :-
    TargA = 'I have not all versions in mind',
    Post1A = 'I do not have all versions in mind',
    tokenize_sent_atom(TargA, Targ),
    tokenize_sent_atom(Post1A, Post1),
    % As everywhere in this file, the post-edited tokens go first.
    smart_diff(Post1, Targ, Diff1),
    format('~NTarget1: "~w"~n', [TargA]),
    format('~NTarget2: "~w"~n', [Post1A]),
    format('~N Diff: "~w"~n', [Diff1]),
    !.

% Compiles the n-gram file into module user (read back through user:ngram/2
% by get_main_corpus_frequency_for_edit_operation/3).
test_postediting(load_main_corpus_ngram_frequencies) :-
    safe_compile(user, '$ACCEPT/MT/GTFeb2012/CleanFrenchVersions/forum_ngrams.pl').

test_postediting(summit_read) :-
    read_postediting_csv('$ACCEPT/MT/PostEdition/Data/summitPostEdData.csv',
                         '$ACCEPT/MT/PostEdition/Data/summitPostEdData.pl').

test_postediting(summit_analyse) :-
    analyse_postediting_csv('$ACCEPT/MT/PostEdition/Data/summitPostEdData.pl',
                            '$ACCEPT/MT/PostEdition/Results/summitPostEd.txt',
                            4).

% Five-column rows; the first column is matched but not used.
test_postediting(fti_students_read) :-
    read_postediting_csv('$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel.csv',
                         '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel.pl',
                         [_Id, SourceA, TargA, Post1A, Post2A],
                         [pe_record([type=preedited, source=SourceA, target=TargA, pe(1)=Post1A, pe(2)=Post2A])]).
% test_postediting/1 clauses for the FTI student data sets
% (200-sentence, 525-sentence and 1k versions).

test_postediting(fti_students_analyse) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel.html',
    analyse_postediting_csv(RecordFile, ReportFile, 1).

test_postediting(fti_students_format) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_200ph_MarieRachel_simple.html',
    format_postediting_csv(RecordFile, ReportFile).

test_postediting(fti_students_read_525) :-
    CsvFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel.csv',
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel.pl',
    % Five columns per row; the first one is matched but not used.
    RowPattern = [_RowId, Src, MT, PE1, PE2],
    RecordTemplates = [pe_record([type=preedited, source=Src, target=MT, pe(1)=PE1, pe(2)=PE2])],
    read_postediting_csv(CsvFile, RecordFile, RowPattern, RecordTemplates).

test_postediting(fti_students_analyse_525) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel.html',
    analyse_postediting_csv(RecordFile, ReportFile, 1).

test_postediting(fti_students_format_525) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FRdataToPostEdit_525ph_MarieRachel_simple.html',
    format_postediting_csv(RecordFile, ReportFile).

test_postediting(fti_students_read_1k) :-
    CsvFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k.csv',
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k.pl',
    RowPattern = [_RowId, Src, MT, PE1, PE2],
    RecordTemplates = [pe_record([type=preedited, source=Src, target=MT, pe(1)=PE1, pe(2)=PE2])],
    read_postediting_csv(CsvFile, RecordFile, RowPattern, RecordTemplates).

test_postediting(fti_students_analyse_1k) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k.html',
    analyse_postediting_csv(RecordFile, ReportFile, 1).
% test_postediting/1 clauses: 1k formatting, the Lexcelera XLIFF project and
% the four AMT XLIFF files.

test_postediting(fti_students_format_1k) :-
    RecordFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/FTIStudents2014/FR_postEditedData_1k_simple.html',
    format_postediting_csv(RecordFile, ReportFile).

test_postediting(lexcelera_read) :-
    XliffFile = '$ACCEPT/MT/PostEdition/Data/Chapter1BilingualProject/Chapter1_BilingualProject_all.xliff',
    RecordFile = '$ACCEPT/MT/PostEdition/Data/Chapter1BilingualProject/Chapter1_BilingualProject_all.pl',
    read_xliff(XliffFile, RecordFile, raw).

test_postediting(lexcelera_analyse) :-
    RecordFile = '$ACCEPT/MT/PostEdition/Data/Chapter1BilingualProject/Chapter1_BilingualProject_all.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/Data/Chapter1BilingualProject/Chapter1_BilingualProject_all.txt',
    analyse_postediting_csv(RecordFile, ReportFile, 3).

% Converts all four AMT files, then merges the results.
test_postediting(amt1234) :-
    test_postediting(amt1),
    test_postediting(amt2),
    test_postediting(amt3),
    test_postediting(amt4),
    test_postediting(combine_amt1234).

test_postediting(amt1) :-
    XliffFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1.xml',
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1.pl',
    read_xliff(XliffFile, RecordFile, raw).

test_postediting(amt2) :-
    XliffFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT2.xml',
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT2.pl',
    read_xliff(XliffFile, RecordFile, raw).

test_postediting(amt3) :-
    XliffFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT3.xml',
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT3.pl',
    read_xliff(XliffFile, RecordFile, raw).

test_postediting(amt4) :-
    XliffFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT4.xml',
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT4.pl',
    read_xliff(XliffFile, RecordFile, raw).

test_postediting(amt1_analyse) :-
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1.txt',
    analyse_postediting_csv(RecordFile, ReportFile, 2).
% Concatenates the four converted AMT record files into AMT1234.pl.
test_postediting(combine_amt1234) :-
    PartFiles = ['$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1.pl',
                 '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT2.pl',
                 '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT3.pl',
                 '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT4.pl'
                ],
    prolog_file_or_files_to_list(PartFiles, AllRecords, 'UTF-8'),
    length(AllRecords, NRecords),
    safe_absolute_file_name('$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1234.pl', CombinedFile),
    list_to_prolog_file_prettyprint_unicode(AllRecords, CombinedFile),
    format('~N--- Written combined file (~d records), ~w~n', [NRecords, CombinedFile]).

test_postediting(amt1234_analyse) :-
    RecordFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1234.pl',
    ReportFile = '$ACCEPT/MT/PostEdition/Data/AMTXLIFF/AMT1234.html',
    analyse_postediting_csv(RecordFile, ReportFile, 1).

%---------------------------------------------------------------

% read_postediting_csv(+InFile, +OutFile)
% Default layout: ten columns per row. The first five (source, target, three
% post-edits) become a record of type raw, the last five a record of type
% preedited.
read_postediting_csv(InFile, OutFile) :-
    RowPattern = [RawSrc, RawMT, RawPE1, RawPE2, RawPE3,
                  PreSrc, PreMT, PrePE1, PrePE2, PrePE3],
    RecordTemplates = [pe_record([type=raw, source=RawSrc, target=RawMT,
                                  pe(1)=RawPE1, pe(2)=RawPE2, pe(3)=RawPE3]),
                       pe_record([type=preedited, source=PreSrc, target=PreMT,
                                  pe(1)=PrePE1, pe(2)=PrePE2, pe(3)=PrePE3])],
    read_postediting_csv(InFile, OutFile, RowPattern, RecordTemplates).

% read_postediting_csv(+InFile, +OutFile, +Pattern, +Records)
% Pattern is a list of variables matched against each CSV row; Records is a
% list of pe_record/1 templates sharing those variables. The first two rows
% of the CSV are dropped as headers. Each remaining row is instantiated into
% records, diffs are added by process_grouped_lines/3, and the result is
% written to OutFile. Progress messages go to the current output.
read_postediting_csv(InFile, OutFile, Pattern, Records) :-
    safe_absolute_file_name(InFile, CsvPath),
    safe_absolute_file_name(OutFile, RecordPath),
    csv_file_to_list_of_lists(CsvPath, AllRows),
    AllRows = [_Header1, _Header2 | DataRows],
    length(DataRows, NRows),
    format('~N--- Read postediting file (~d records) ~w~n', [NRows, CsvPath]),
    process_postediting_csv_list(DataRows, RawRecords, Pattern, Records),
    format('~N--- Processing grouped lines~n', []),
    process_grouped_lines(RawRecords, FinalRecords, 0),
    format('~N--- Finished processing grouped lines~n', []),
    length(FinalRecords, NFinal),
    list_to_prolog_file_prettyprint(FinalRecords, RecordPath),
    format('~N--- Written postediting file (~d records) ~w~n', [NFinal, RecordPath]),
    !.
%---------------------------------------------------------------

% read_xliff(+InFile, +OutFile, +Type)
% Reads the XLIFF file InFile, parses it with xml_parse/2, converts it to a
% list of records with xliff_prolog_to_list/3 (each record tagged type=Type)
% and writes that list to OutFile. Prints an error and fails if the XML is
% malformed or the conversion fails.
read_xliff(InFile, OutFile, Type) :-
    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(OutFile, AbsOutFile),
    read_unicode_file_to_string(AbsInFile, Str),
    xml_parse(Str, XMLTerm),
    (   XMLTerm = malformed(_, _) ->
        format('~N*** Error: malformed XML~n', []),
        fail
    ;   otherwise ->
        xliff_prolog_to_list(XMLTerm, List, Type),
        length(List, NOut),
        list_to_prolog_file_prettyprint_unicode(List, AbsOutFile),
        format('~N--- Written postediting file (~d records) ~w~n', [NOut, AbsOutFile])
    ),
    !.
% Catch-all so that a failed conversion names the offending file. The original
% failure clause carried a copy-pasted head (internalise_script_file1/3) and
% therefore never fired for read_xliff/3.
read_xliff(InFile, _OutFile, _Type) :-
    format('~N*** Error: unable to read XLIFF file "~w"~n', [InFile]),
    fail.

% NOTE(review): this clause looks like a leftover from another file; nothing
% in the visible part of this file calls it. It is kept unchanged in case it
% is referenced elsewhere.
internalise_script_file1(File, _Script, _LessonId) :-
    format('~N*** Error: unable to internalise script file "~w"~n', [File]),
    fail.

%---------------------------------------------------------------

% format_postediting_csv(+InFile, +OutFile)
% Loads the record file InFile and writes a report of all source/target pairs
% to OutFile (see format_postediting_csv1/1).
format_postediting_csv(InFile, OutFile) :-
    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(OutFile, AbsOutFile),
    load_postediting_csv(AbsInFile),
    format_postediting_csv1(AbsOutFile).

%---------------------------------------------------------------

% analyse_postediting_csv(+InFile, +OutFile, +MinFreq)
% Loads the record file InFile and writes a frequency-ordered report of edit
% operations to OutFile; entries with a frequency below MinFreq are skipped.
analyse_postediting_csv(InFile, OutFile, MinFreq) :-
    safe_absolute_file_name(InFile, AbsInFile),
    safe_absolute_file_name(OutFile, AbsOutFile),
    load_postediting_csv(AbsInFile),
    collect_and_show_basic_edit_operations(AbsOutFile, MinFreq).

% load_postediting_csv(+File)
% Compiles File; presumably this defines the pe_record/1 facts consulted by
% the reporting predicates -- confirm against the generated files.
load_postediting_csv(File) :-
    compile(File).

%---------------------------------------------------------------

% process_postediting_csv_list(+Rows, -RecordsOut, +Pattern, +Records)
% For each row, takes a fresh copy of Pattern/Records, unifies the row with
% the copied pattern and appends the instantiated record templates to the
% output. Fails (after printing a diagnostic) if a row does not match.
process_postediting_csv_list([], [], _Pattern, _Records).
process_postediting_csv_list([F | R], Out, Pattern, Records) :-
    % Copy so that bindings made for one row do not leak into the next.
    copy_term([Pattern, Records], [Pattern1, Records1]),
    process_postediting_csv_line(F, Pattern1, Records1),
    append(Records1, R1, Out),
    !,
    process_postediting_csv_list(R, R1, Pattern, Records).

% process_postediting_csv_line(+Line, ?Pattern, ?Records)
% Succeeds if Line unifies with Pattern; otherwise reports both, with their
% lengths, and fails.
process_postediting_csv_line(Line, Pattern, _Records) :-
    Line = Pattern,
    !.
process_postediting_csv_line(Line, Pattern, _Records) :-
    length(Line, LLen),
    length(Pattern, PLen),
    format('~NLine ~w (length ~d) fails to match~n', [Line, LLen]),
    format('~Npattern ~w (length ~d)~n', [Pattern, PLen]),
    fail.
%---------------------------------------------------------------

% process_grouped_lines(+Records, -RecordsWithDiffs, +CountSoFar)
% Adds diffs to every record. Prints a dot per record on stream user and a
% running count after every 100th record.
process_grouped_lines([], [], _Count).
process_grouped_lines([Record | Records], [Record1 | Records1], Count) :-
    process_grouped_line(Record, Record1),
    format(user, '.', []),
    flush_output(user),
    Count1 is Count + 1,
    (   Count1 mod 100 =:= 0 ->
        format(' (~d) ~n', [Count1]),
        flush_output(user)
    ;   true
    ),
    !,
    process_grouped_lines(Records, Records1, Count1).

%process_grouped_line(F, F1) :-
%	F = pe_record([type=Type, source=SourceA, target=TargA, pe(1)=Post1A, pe(2)=Post2A, pe(3)=Post3A]),
%	%tokenize_sent_atom(SourceA, _Source),
%	tokenize_sent_atom(TargA, Targ),
%	tokenize_sent_atom(Post1A, Post1),
%	tokenize_sent_atom(Post2A, Post2),
%	tokenize_sent_atom(Post3A, Post3),
%	smart_diff(Post1, Targ, Diff1),
%	smart_diff(Post2, Targ, Diff2),
%	smart_diff(Post3, Targ, Diff3),
%	F1 = pe_record([type=Type, source=SourceA, target=TargA,
%			pe(1)=Post1A, pe(2)=Post2A, pe(3)=Post3A,
%			diff(1)=Diff1, diff(2)=Diff2, diff(3)=Diff3]).

% process_grouped_line(+Record, -RecordWithDiffs)
% Record is pe_record(Fields). The output keeps type, source and target,
% followed by all pe(I)=Text entries and then one diff(I)=Diff entry per
% pe(I), where Diff is smart_diff/3 of the tokenised post-edit against the
% tokenised target. Prints an error and fails on any other input.
process_grouped_line(pe_record(Fields), Out) :-
    member(type=Type, Fields),
    member(source=SourceA, Fields),
    member(target=TargetA, Fields),
    tokenize_sent_atom(TargetA, TargetTokens),
    findall([Index, EditedA, Diff],
            (   member(pe(Index)=EditedA, Fields),
                tokenize_sent_atom(EditedA, EditedTokens),
                smart_diff(EditedTokens, TargetTokens, Diff)
            ),
            Triples),
    pe_elements_in_grouped_line_triples(Triples, PEFields),
    diff_elements_in_grouped_line_triples(Triples, DiffFields),
    append_list([[type=Type, source=SourceA, target=TargetA], PEFields, DiffFields], OutFields),
    Out = pe_record(OutFields),
    !.
process_grouped_line(F, F1) :-
    format('~N*** Error: bad call: ~w~n', [process_grouped_line(F, F1)]),
    fail.

% Projects [I, Text, Diff] triples onto pe(I)=Text entries.
pe_elements_in_grouped_line_triples([], []).
pe_elements_in_grouped_line_triples([[Index, EditedA, _Diff] | Triples], [pe(Index)=EditedA | Fields]) :-
    !,
    pe_elements_in_grouped_line_triples(Triples, Fields).

% Projects [I, Text, Diff] triples onto diff(I)=Diff entries.
diff_elements_in_grouped_line_triples([], []).
diff_elements_in_grouped_line_triples([[Index, _EditedA, Diff] | Triples], [diff(Index)=Diff | Fields]) :-
    !,
    diff_elements_in_grouped_line_triples(Triples, Fields).
%---------------------------------------------------------------

% tokenize_sent_atom(+Atom, -Tokens)
% Turns Atom into a code list and tokenises it with tokenize_sent/2.
tokenize_sent_atom(Atom, Tokens) :-
    atom_codes(Atom, Codes),
    tokenize_sent(Codes, Tokens).

% smart_diff(+Tokens, +RefTokens, -Operations)
% Strips ' ' tokens from both lists, aligns them with
% insertions_deletions_substitutions_and_matches/7, groups adjacent edits and
% attaches context (simplify_diff/4), then merges known two-step patterns
% (find_complex_editing_operations/2). Callers in this file pass the
% post-edited tokens first and the target tokens second.
smart_diff(Tokens, RefTokens, Operations) :-
    remove_spaces(Tokens, Words),
    remove_spaces(RefTokens, RefWords),
    insertions_deletions_substitutions_and_matches(Words, RefWords, _, _, _, _, RawDiff),
    simplify_diff(RawDiff, ContextDiff, null, null),
    find_complex_editing_operations(ContextDiff, Operations),
    !.

% remove_spaces(+Tokens, -Words)
% Drops every element equal to the atom ' '.
remove_spaces([], []).
remove_spaces([' ' | Rest], Kept) :-
    !,
    remove_spaces(Rest, Kept).
remove_spaces([Token | Rest], [Token | Kept]) :-
    !,
    remove_spaces(Rest, Kept).

%---------------------------------------------------------------

% simplify_diff(+Diff, -Result, +Pending, +Left)
% Walks a raw diff (same/1, ins/1, del/1, sub/2 elements). Pending is the edit
% being accumulated (null if none), Left the preceding context. Adjacent
% edits are merged with combine_diff/3; each finished edit is emitted as
% Left/Edit/Right, where Right comes from initial_context/2. same/1 elements
% are not emitted; they only update the context.
% Clause order and cuts are significant here.

% End of input: flush a pending edit, if any, with no following context.
simplify_diff([], Result, Pending, Left) :-
    (   Pending = null ->
        Result = []
    ;   Pending = same(_) ->
        Result = []
    ;   Result = [Left/Pending/null]
    ),
    !.
% Match with nothing pending: extend the preceding context.
simplify_diff([same(Word) | Rest], Result, null, Left) :-
    !,
    update_preceding(Left, same(Word), Left1),
    simplify_diff(Rest, Result, null, Left1).
% Match while the pending element is itself a match.
simplify_diff([same(Word) | Rest], Result, same(Prev), _Left) :-
    !,
    update_preceding([Prev], same(Word), Left1),
    simplify_diff(Rest, Result, null, Left1).
% Match closes a pending edit: emit it with its left and right context.
simplify_diff([same(Word) | Rest], [Left/Pending/Right | Result], Pending, Left) :-
    !,
    initial_context([same(Word) | Rest], Right),
    update_preceding(Left, Pending, Left0),
    update_preceding(Left0, same(Word), Left1),
    simplify_diff(Rest, Result, null, Left1).
% Edit that merges with the pending one (or starts one when Pending is null).
simplify_diff([Edit | Rest], Result, Pending, Left) :-
    combine_diff(Pending, Edit, Merged),
    !,
    simplify_diff(Rest, Result, Merged, Left).
% Edit with nothing pending: it becomes the pending edit.
simplify_diff([Edit | Rest], Result, null, Left) :-
    !,
    simplify_diff(Rest, Result, Edit, Left).
% Edit that cannot be merged: emit the pending one and start afresh.
simplify_diff([Edit | Rest], [Left/Pending/Right | Result], Pending, Left) :-
    update_preceding(Left, Pending, Left1),
    !,
    initial_context(Rest, Right),
    simplify_diff(Rest, Result, Edit, Left1).
% update_preceding(+Left, +Operation, -NewLeft)
% Appends Operation (as a list, via coerce_to_list/2) to the context Left and
% trims the result with at_most_last_two/2.
update_preceding(Left, Operation, NewLeft) :-
    coerce_to_list(Left, LeftItems),
    coerce_to_list(Operation, OperationItems),
    append(LeftItems, OperationItems, Combined),
    at_most_last_two(Combined, NewLeft),
    !.

% initial_context(+Following, -Context)
% Context is Following coerced to a list and trimmed with
% at_most_first_two/2, or null if that fails.
initial_context(Following, Context) :-
    coerce_to_list(Following, Items),
    at_most_first_two(Items, Context),
    !.
initial_context(_Other, null).

%---------------------------------------------------------------

% simplify_diff_no_context(+Diff, -Result, +Pending)
% Like simplify_diff/4 but without context: adjacent edits are merged with
% combine_diff/3 and same/1 elements are kept in the output.
simplify_diff_no_context([], Result, Pending) :-
    (   Pending = null ->
        Result = []
    ;   Result = [Pending]
    ),
    !.
simplify_diff_no_context([same(Word) | Rest], [same(Word) | Result], null) :-
    !,
    simplify_diff_no_context(Rest, Result, null).
simplify_diff_no_context([same(Word) | Rest], [Pending, same(Word) | Result], Pending) :-
    !,
    simplify_diff_no_context(Rest, Result, null).
simplify_diff_no_context([Edit | Rest], Result, Pending) :-
    combine_diff(Pending, Edit, Merged),
    !,
    simplify_diff_no_context(Rest, Result, Merged).
simplify_diff_no_context([Edit | Rest], Result, null) :-
    !,
    simplify_diff_no_context(Rest, Result, Edit).
simplify_diff_no_context([Edit | Rest], [Pending | Result], Pending) :-
    !,
    simplify_diff_no_context(Rest, Result, Edit).

%---------------------------------------------------------------

% combine_diff(+Pending, +Next, -Merged)
% Merges two adjacent edits into one, or fails if they cannot be merged
% (e.g. del followed by ins). In sub(X, Y), material from ins/1 is merged
% into X and material from del/1 into Y.
combine_diff(null, Next, Next) :-
    !.
combine_diff(del(Old1), del(Old2), del(Old)) :-
    append_atoms_or_lists(Old1, Old2, Old),
    !.
combine_diff(ins(New1), ins(New2), ins(New)) :-
    append_atoms_or_lists(New1, New2, New),
    !.
combine_diff(sub(New1, Old1), sub(New2, Old2), sub(New, Old)) :-
    append_atoms_or_lists(New1, New2, New),
    append_atoms_or_lists(Old1, Old2, Old),
    !.
combine_diff(sub(New, Old1), del(Old2), sub(New, Old)) :-
    append_atoms_or_lists(Old1, Old2, Old),
    !.
combine_diff(del(Old1), sub(New, Old2), sub(New, Old)) :-
    append_atoms_or_lists(Old1, Old2, Old),
    !.
combine_diff(sub(New1, Old), ins(New2), sub(New, Old)) :-
    append_atoms_or_lists(New1, New2, New),
    !.
combine_diff(ins(New1), sub(New2, Old), sub(New, Old)) :-
    append_atoms_or_lists(New1, New2, New),
    !.
%---------------------------------------------------------------

% find_complex_editing_operations(+Diff, -Diff1)
% Rewrites two known pairs of adjacent context-annotated edits into a single
% operation; everything else is copied unchanged.
find_complex_editing_operations([], []).
% [[and,the]/del(address)/['HP',is],[address,'HP']/ins(address)/[is]] -> [and,the]/exch(address, 'HP')/[is]
find_complex_editing_operations([Before/del(X)/[Y|_], [X,Y]/ins(X)/After | R],
                                [Before/exch(X, Y)/After | R1]) :-
    !,
    find_complex_editing_operations(R, R1).
% "[[*start*,I]/sub(do,have)/[not,all],[have,not]/ins(have)/[all,versions]]"
find_complex_editing_operations([Before/sub(Do,Have)/[Not|_], [Have,Not]/ins(Have)/After | R],
                                [Before/sub([Do,Not,Have], [Have,Not])/After | R1]) :-
    !,
    find_complex_editing_operations(R, R1).
find_complex_editing_operations([F | R], [F | R1]) :-
    !,
    find_complex_editing_operations(R, R1).

%---------------------------------------------------------------

% remove_same(+Diff, -Edits)
% Drops all same/1 elements from Diff.
remove_same([], []).
remove_same([same(_) | R], R1) :-
    !,
    remove_same(R, R1).
remove_same([F | R], [F | R1]) :-
    !,
    remove_same(R, R1).

%===============================================================

% format_postediting_csv1(+File)
% Collects one [Source, Target] pair per pe_record/1 solution and writes the
% report for them to File.
format_postediting_csv1(File) :-
    findall([Source, Target],
            find_source_target_pair(Source, Target),
            Pairs),
    show_source_target_pairs(File, Pairs).

% find_source_target_pair(-Source, -Target)
% Enumerates the source/target values of the loaded pe_record/1 facts.
find_source_target_pair(Source, Target) :-
    pe_record(List),
    member(source=Source, List),
    member(target=Target, List).

% show_source_target_pairs(+File, +Pairs)
% Writes the HTML report for Pairs to File (UTF-8). The stream is closed by
% call_cleanup/2, so it is not left open when writing fails or raises an
% exception; in that case the predicate still fails or rethrows as before.
show_source_target_pairs(File, Pairs) :-
    length(Pairs, NPairs),
    format('~NPrinting data for ~d source-target pairs~n', [NPairs]),
    open(File, write, S, [encoding('UTF-8')]),
    % once/1 keeps the goal determinate so that the cleanup runs on exit.
    call_cleanup(once(( print_html_opening(S),
                        show_source_target_pairs1(Pairs, S),
                        print_html_closing(S)
                      )),
                 close(S)),
    format('~N--- Results written to: ~w~n', [File]).

% show_source_target_pairs1(+Pairs, +S)
% Writes each pair in turn; fails if any single pair cannot be written.
show_source_target_pairs1([], _S).
show_source_target_pairs1([F | R], S) :-
    show_source_target_pair(F, S),
    !,
    show_source_target_pairs1(R, S).
% NOTE(review): in the text this block was taken from, the quoted format atoms
% contained raw line breaks where HTML markup had evidently been lost. A raw
% line break inside a quoted atom is rejected by ISO-mode readers and has no
% visible effect in an HTML report, so each lost break is written here as an
% explicit <br>. The exact original tags are a reconstruction -- confirm
% against the repository version.

% show_source_target_pair(+Pair, +S)
% Pair is [Source, Target]. Uses the first pe_record/1 fact with that source
% and target, builds a marked-up target/edited pair for each pe(I) entry and
% writes them to S.
show_source_target_pair([Source, Target], S) :-
    pe_record(List),
    member(source=Source, List),
    member(target=Target, List),
    findall([I, Target1, Edited1],
            target_edited_pair_in_list(List, Target, I, Target1, Edited1),
            Triples),
    show_source_target_pair1(S, Source, Triples),
    !.

% target_edited_pair_in_list(+List, +Target, -I, -Target1, -Edited1)
% Enumerates the pe(I) entries of List together with the marked-up versions
% of Target and of the post-edited text.
target_edited_pair_in_list(List, Target, I, Target1, Edited1) :-
    member(pe(I)=Edited, List),
    mark_diffs_in_target_and_edited(Target, Edited, Target1, Edited1).

% show_source_target_pair1(+S, +Source, +Triples)
% Writes the source line followed by all [I, Target, Edited] triples.
show_source_target_pair1(S, Source, Triples) :-
    format(S, '~N~n<br><br>Source: ~w<br>~n', [Source]),
    show_target_edited_triples(S, Triples),
    format(S, '~N<br><br>~n', []),
    !.

show_target_edited_triples(_S, []).
show_target_edited_triples(S, [F | R]) :-
    show_target_edited_triple(S, F),
    !,
    show_target_edited_triples(S, R).

% show_target_edited_triple(+S, +Triple)
% Triple is [I, Target1, Edited1]; I is printed with ~d, so it must be an
% integer. Prints an error and fails on any other input.
show_target_edited_triple(S, [I, Target1, Edited1]) :-
    format(S, '~NSubject ID: ~d<br>~n', [I]),
    format(S, '~NTarget: ~w<br>~n', [Target1]),
    format(S, '~NEdited: ~w<br>~n', [Edited1]),
    !.
show_target_edited_triple(S, Triple) :-
    format('~N*** Error: bad call: ~w~n', [show_target_edited_triple(S, Triple)]),
    fail.

%===============================================================

% collect_and_show_basic_edit_operations(+File, +MinFreq)
% Collects every context-free edit operation found in the loaded records and
% writes the frequency report to File.
collect_and_show_basic_edit_operations(File, MinFreq) :-
    findall(EditOperation,
            %find_edit_operation(EditOperation),
            find_context_free_edit_operation(EditOperation),
            EditOperations),
    show_edit_operations(File, EditOperations, MinFreq).

%show_edit_operations(File, EditOperations, MinFreq) :-
%	list_to_ordered_multiset(EditOperations, MultiSet),
%	length(MultiSet, NEditOperations),
%	format('~NPrinting data for ~d possible edit operations~n', [NEditOperations]),
%	open(File, write, S, [encoding('UTF-8')]),
%	print_html_opening(S),
%	format(S, '~N<br><br>Total number of edit operations: ~d<br>~n', [NEditOperations]),
%	format(S, '~NFrequency-ordered list:<br><br>~n', []),
%	show_edit_operations1(MultiSet, MinFreq, S, 1),
%	print_html_closing(S),
%	close(S),
%	format('~N--- Results written to: ~w~n', [File]).

% show_edit_operations(+File, +EditOperations, +MinFreq)
% Groups the edit operations by their left-hand side (the edited string),
% orders the groups by total frequency, highest first, and writes the report
% to File (UTF-8). The stream is closed by call_cleanup/2 so that it is not
% left open when writing fails or raises an exception.
show_edit_operations(File, EditOperations, MinFreq) :-
    store_edit_operation_source_correspondences(EditOperations, SourcesWithCounts),
    length(SourcesWithCounts, NSources),
    format('~NPrinting data for ~d possible edit operation LHSs~n', [NSources]),
    open(File, write, S, [encoding('UTF-8')]),
    % once/1 keeps the goal determinate so that the cleanup runs on exit.
    call_cleanup(once(( print_html_opening(S),
                        format(S, '~N<br><br>Total number of edit operation LHSs: ~d<br>~n', [NSources]),
                        format(S, '~NFrequency-ordered list:<br><br>~n', []),
                        show_edit_operations_for_source(SourcesWithCounts, MinFreq, S, 1),
                        print_html_closing(S)
                      )),
                 close(S)),
    format('~N--- Results written to: ~w~n', [File]).

%--------------------------------------------------

% edit_operation_source_count(Source, EditOperation, Freq)
% Scratch table: Freq occurrences of EditOperation, whose left-hand side
% n-gram is Source.
:- dynamic edit_operation_source_count/3.

% store_edit_operation_source_correspondences(+EditOperations, -SourcesWithCounts)
% Rebuilds the scratch table from scratch and returns Count-Source pairs,
% highest count first.
store_edit_operation_source_correspondences(EditOperations, SourcesWithCounts) :-
    retractall(edit_operation_source_count(_, _, _)),
    list_to_ordered_multiset(EditOperations, MultiSet),
    store_edit_operation_source_correspondences1(MultiSet),
    all_edit_operation_sources(Sources),
    length(Sources, NSources),
    format('~N--- Getting source counts (~d possible sources)~n', [NSources]),
    edit_operation_source_counts(Sources, SourcesWithCounts).

% all_edit_operation_sources(-Sources)
% Sorted, duplicate-free list of the sources in the scratch table.
all_edit_operation_sources(Sources) :-
    findall(Source,
            edit_operation_source_count(Source, _, _),
            Sources0),
    sort(Sources0, Sources).

store_edit_operation_source_correspondences1([]).
store_edit_operation_source_correspondences1([Freq-EditOperation | R]) :-
    matching_ngram_for_edit_operation(EditOperation, Source),
    assertz(edit_operation_source_count(Source, EditOperation, Freq)),
    !,
    store_edit_operation_source_correspondences1(R).

% edit_operation_source_counts(+Sources, -SourcesWithCounts)
% Count-Source pairs in descending order (keysort/2, then reverse/2).
edit_operation_source_counts(Sources, SourcesWithCounts) :-
    edit_operation_source_counts1(Sources, SourcesWithCounts0, 0),
    keysort(SourcesWithCounts0, SourcesWithCounts1),
    reverse(SourcesWithCounts1, SourcesWithCounts).

% Progress count is printed after every 1000th source.
edit_operation_source_counts1([], [], _I).
edit_operation_source_counts1([F | R], [Count-F | R1], I) :-
    edit_operation_source_count(F, Count),
    I1 is I + 1,
    (   0 is I1 mod 1000 ->
        format('~d ', [I1]),
        flush_output(user)
    ;   otherwise ->
        true
    ),
    !,
    edit_operation_source_counts1(R, R1, I1).

% edit_operation_source_count(+Source, -TotalFreq)
% Sum of the frequencies of all operations stored for Source.
edit_operation_source_count(Source, TotalFreq) :-
    findall(Freq,
            edit_operation_source_count(Source, _EditOperation, Freq),
            Freqs),
    safe_sum_list(Freqs, TotalFreq).

%--------------------------------------------------

% print_html_opening(+S) / print_html_closing(+S)
% Write the opening and closing tags of the HTML document to S.
% NOTE(review): the tags are a reconstruction; in the text this block was
% taken from, both predicates wrote only empty lines.
print_html_opening(S) :-
    format(S, '~N<html>~n', []),
    format(S, '~N<body>~n', []),
    !.

print_html_closing(S) :-
    format(S, '~N</body>~n', []),
    format(S, '~N</html>~n', []),
    !.
% NOTE(review): in the text this block was taken from, the quoted format atoms
% contained raw line breaks where HTML markup had evidently been lost. A raw
% line break inside a quoted atom is rejected by ISO-mode readers and has no
% visible effect in an HTML report, so each lost break is written here as an
% explicit <br>. The exact original tags are a reconstruction -- confirm
% against the repository version.

% show_edit_operations1(+FreqOperationPairs, +MinFreq, +S, +I)
% Writes every operation whose frequency is at least MinFreq; prints a
% progress count after every 1000th element.
show_edit_operations1([], _MinFreq, _S, _I).
show_edit_operations1([Freq-EditOperation | R], MinFreq, S, I) :-
    (   Freq < MinFreq ->
        true
    ;   otherwise ->
        show_edit_operation(Freq-EditOperation, S)
    ),
    I1 is I + 1,
    (   0 is I1 mod 1000 ->
        format('~d ', [I1]),
        flush_output(user)
    ;   otherwise ->
        true
    ),
    !,
    show_edit_operations1(R, MinFreq, S, I1).

% show_edit_operations_for_source(+FreqSourcePairs, +MinFreq, +S, +I)
% Same traversal, grouped by edited string; entries below MinFreq and
% trivial sources are skipped.
show_edit_operations_for_source([], _MinFreq, _S, _I).
show_edit_operations_for_source([Freq-EditOperationSource | R], MinFreq, S, I) :-
    (   Freq < MinFreq ->
        true
    ;   trivial_edit_operation_source(EditOperationSource) ->
        true
    ;   otherwise ->
        show_edit_operations_for_source1(Freq-EditOperationSource, S)
    ),
    I1 is I + 1,
    (   0 is I1 mod 1000 ->
        format('~d ', [I1]),
        flush_output(user)
    ;   otherwise ->
        true
    ),
    !,
    show_edit_operations_for_source(R, MinFreq, S, I1).

% The empty n-gram is not worth reporting.
trivial_edit_operation_source([]).

% show_edit_operation(+Freq-EditOperation, +S)
% Writes one operation as "MatchingNGram --> TargetNGram" with its frequency
% and its examples. Operations with an empty matching n-gram are silently
% skipped by the second clause.
show_edit_operation(Freq-EditOperation, S) :-
    %get_main_corpus_frequency_for_edit_operation(EditOperation, MatchingNGram, MainCorpusFreq),
    matching_ngram_for_edit_operation(EditOperation, MatchingNGram),
    target_ngram_for_edit_operation(EditOperation, TargetNGram),
    MatchingNGram \== [],
    %\+ trivial_edit_operation(MatchingNGram, MainCorpusFreq),
    ReadableEditOperation = (MatchingNGram --> TargetNGram),
    format(S, '~N~n<br><br>-----------------------------<br><br>', []),
    %format(S, '~NOperation: ~w, freq: ~d~n', [EditOperation, Freq]),
    format(S, '~NOperation: ~w, freq: ~d<br>~n', [ReadableEditOperation, Freq]),
    %format(S, '~NFreq in training corpus of "~w": ~w<br><br>~n', [MatchingNGram, MainCorpusFreq]),
    find_examples_of_edit_operation(EditOperation, Examples),
    show_examples_of_edit_operation(Examples, S),
    !.
show_edit_operation(_, _S).

% show_edit_operations_for_source1(+Freq-EditOperationSource, +S)
% Writes the heading for one edited string, then the operations stored for it
% in edit_operation_source_count/3, most frequent first.
show_edit_operations_for_source1(Freq-EditOperationSource, S) :-
    format(S, '~N~n<br><br>-----------------------------<br><br>', []),
    format(S, '~NEdited string: ~w, freq: ~d<br>~n', [EditOperationSource, Freq]),
    findall(Freq1-EditOperation,
            edit_operation_source_count(EditOperationSource, EditOperation, Freq1),
            EditOperations0),
    keysort(EditOperations0, EditOperations1),
    reverse(EditOperations1, EditOperations),
    show_edit_operations_list(EditOperations, S).

show_edit_operations_list([], _S).
show_edit_operations_list([F | R], S) :-
    show_edit_operation_short(F, S),
    !,
    show_edit_operations_list(R, S).

% show_edit_operation_short(+Freq-EditOperation, +S)
% Like show_edit_operation/2 but without the separator line and without the
% check for an empty matching n-gram.
show_edit_operation_short(Freq-EditOperation, S) :-
    matching_ngram_for_edit_operation(EditOperation, MatchingNGram),
    target_ngram_for_edit_operation(EditOperation, TargetNGram),
    ReadableEditOperation = (MatchingNGram --> TargetNGram),
    format(S, '~NOperation: ~w, freq: ~d<br>~n', [ReadableEditOperation, Freq]),
    find_examples_of_edit_operation(EditOperation, Examples),
    show_examples_of_edit_operation(Examples, S),
    !.
show_edit_operation_short(_, _S).

% trivial_edit_operation(+MatchingNGram, +MainCorpusFreq)
% True if the main-corpus frequency is unknown/undefined or above 500.
% Only referenced from commented-out code in the visible part of the file.
trivial_edit_operation(_MatchingNGram, MainCorpusFreq) :-
    member(MainCorpusFreq, [unknown, undefined]),
    !.
trivial_edit_operation(_MatchingNGram, MainCorpusFreq) :-
    MainCorpusFreq > 500.

show_examples_of_edit_operation([], _S).
show_examples_of_edit_operation([F | R], S) :-
    show_example_of_edit_operation(F, S),
    !,
    show_examples_of_edit_operation(R, S).

% show_example_of_edit_operation(+Example, +S)
% Example is a list containing type=, source=, target= and edited= entries.
show_example_of_edit_operation(Example, S) :-
    member(type=Type, Example),
    member(source=Source, Example),
    member(target=Target, Example),
    member(edited=Edited, Example),
    format(S, '~N~n<br><br>Source: ~w (~w)<br>~n', [Source, Type]),
    format(S, '~NTarget: ~w<br>~n', [Target]),
    format(S, '~NEdited: ~w<br><br>~n', [Edited]),
    !.

%---------------------------------------------------------------

/*
Example record:

pe_record([(type=raw),
           (source='Je ne peux donc plus utiliser mon téléphone suite à un pb de carte SIM.'),
           (target='I can no longer use my phone pb following a SIM card.'),
           (pe(1)='I can no longer use my phone following a SIM card problem.'),
           (pe(2)='I can no longer use my phone following a problem with the SIM card.'),
           (pe(3)='I can no longer use my phone following a SIM card problem.'),
           (diff(1)=[del(pb),ins(problem)]),
           (diff(2)=[del(pb),ins([problem,with,the])]),
           (diff(3)=[del(pb),ins(problem)])]).
*/

% find_context_free_edit_operation(-EditOperation)
% Enumerates the abstracted edit operations whose contexts are both
% abstracted away.
find_context_free_edit_operation(EditOperation) :-
    find_edit_operation(EditOperation),
    context_free_free_edit_operation(EditOperation).

% find_edit_operation(-EditOperation)
% Enumerates, for every diff element of every loaded record, each of its
% abstractions (see abstract_edit_operation/2), passed through make_ground/1.
find_edit_operation(EditOperation) :-
    pe_record(List),
    member(diff(_I)=Diffs, List),
    member(EditOperation0, Diffs),
    abstract_edit_operation(EditOperation0, EditOperation),
    make_ground(EditOperation).

% abstract_edit_operation(+Operation, -Abstracted)
% On backtracking, yields versions of Before/Edit/After in which the
% contexts are progressively replaced by fresh variables.
abstract_edit_operation(Before/Edit/After, Before/Edit/After).
abstract_edit_operation(_Before/Edit/After, _AnyBefore/Edit/After).
abstract_edit_operation(_Before/Edit/[After1, _After2], _AnyBefore/Edit/[After1, _AnyAfter2]).
abstract_edit_operation(Before/Edit/_After, Before/Edit/_AnyAfter).
abstract_edit_operation([_Before1, Before2]/Edit/_After, [_AnyBefore1, Before2]/Edit/_AnyAfter).
abstract_edit_operation(_Before/Edit/_After, _AnyBefore/Edit/_AnyAfter).

% find_examples_of_edit_operation(+EditOperation, -Examples)
% All examples, as produced by example_of_edit_operation/2.
find_examples_of_edit_operation(EditOperation, Examples) :-
    findall(Example,
            example_of_edit_operation(EditOperation, Example),
            Examples).
% example_of_edit_operation(+GroundedEditOperation, -Example)
% Enumerates the records having a diff element subsumed by the (ungrounded)
% operation. Example is [type=T, source=S, target=Tgt, edited=Ed], where Tgt
% and Ed are the marked-up target and the post-edit with the same index as
% the matching diff.
example_of_edit_operation(GroundedEditOperation, Example) :-
    unground(GroundedEditOperation, Pattern),
    pe_record(Fields),
    member(diff(Index)=Diffs, Fields),
    member(Candidate, Diffs),
    safe_subsumes_chk(Pattern, Candidate),
    member(type=Type, Fields),
    member(source=Source, Fields),
    member(target=PlainTarget, Fields),
    member(pe(Index)=PlainEdited, Fields),
    %( EditOperation = _/sub([as,a],to)/_ -> true ; fail ),
    %mark_edit_operation_in_target(Target0, Edited, EditOperation, Target),
    mark_diffs_in_target_and_edited(PlainTarget, PlainEdited, MarkedTarget, MarkedEdited),
    Example = [type=Type, source=Source, target=MarkedTarget, edited=MarkedEdited].

% context_free_free_edit_operation(+GroundedEditOperation)
% True if, after ungrounding, both contexts of the operation are variables.
context_free_free_edit_operation(GroundedEditOperation) :-
    unground(GroundedEditOperation, Left/_Operation/Right),
    var(Left),
    var(Right).

%---------------------------------------------------------------

% mark_diffs_in_target_and_edited(+Target0, +Edited0, -Target, -Edited)
% Each text is marked up relative to the other; if the mark-up fails, both
% are returned unchanged.
% NOTE(review): the arguments reach colour_mark_diffs/3 without being
% tokenised, whereas smart_diff/3 passes token lists to the same alignment
% predicate -- confirm that it accepts this form.
mark_diffs_in_target_and_edited(Target0, Edited0, Target, Edited) :-
    colour_mark_diffs(Target0, Edited0, Target),
    colour_mark_diffs(Edited0, Target0, Edited),
    !.
mark_diffs_in_target_and_edited(Unmarked, UnmarkedRef, Unmarked, UnmarkedRef).

% colour_mark_diffs(+Atom0, +RefAtom0, -ColouredDiff)
% Aligns Atom0 with RefAtom0 and renders the alignment as a marked-up atom.
colour_mark_diffs(Atom0, RefAtom0, ColouredDiff) :-
    insertions_deletions_substitutions_and_matches(Atom0, RefAtom0, _Total, _Ins, _Del, _Sub, Alignment),
    turn_diff_into_colour_markings(Alignment, ColouredDiff).

% turn_diff_into_colour_markings(+Diff, -ColouredDiff)
% Marks up each element and joins the results with spaces. Prints an error
% and fails if an element cannot be rendered.
turn_diff_into_colour_markings(Diff, ColouredDiff) :-
    turn_diff_into_colour_markings1(Diff, Pieces),
    join_with_spaces(Pieces, ColouredDiff),
    !.
turn_diff_into_colour_markings(Diff, ColouredDiff) :-
    format('~N*** Error: bad call: ~w~n', [turn_diff_into_colour_markings(Diff, ColouredDiff)]),
    fail.

turn_diff_into_colour_markings1([], []).
turn_diff_into_colour_markings1([Element | Elements], [Piece | Pieces]) :-
    turn_diff_element_into_colour_markings(Element, Piece),
    !,
    turn_diff_into_colour_markings1(Elements, Pieces).

% turn_diff_element_into_colour_markings(+Element, -Output)
% Matches are passed through unchanged; the first argument of a
% substitution is marked red.
turn_diff_element_into_colour_markings(same(Word), Word) :-
    !.
turn_diff_element_into_colour_markings(sub(Word, _Replaced), Output) :-
    mark_with_color(Word, red, Output),
    !.
% turn_diff_element_into_colour_markings/2, remaining clauses: insertions are
% marked red, deletions struck through and marked blue; anything else is
% reported as an error.
turn_diff_element_into_colour_markings(ins(X), Output) :-
    mark_with_color(X, red, Output),
    !.
turn_diff_element_into_colour_markings(del(X), Output) :-
    mark_with_strikethrough(X, X1),
    %mark_with_italics(X, X1),
    mark_with_color(X1, blue, Output),
    !.
turn_diff_element_into_colour_markings(Other, Output) :-
    format('~N*** Error: bad call: ~w~n', [turn_diff_element_into_colour_markings(Other, Output)]),
    fail.

% mark_with_color(+X, +ColorID, -Output)
% Wraps X in HTML that renders it in the colour named by ColorID (red, green
% or blue). The format string previously contained a single ~w while two
% arguments were supplied, which is an argument-count mismatch for format;
% it now consumes both the colour and the text.
% NOTE(review): the markup is a reconstruction; the original tags are not
% visible in the text this block was taken from.
mark_with_color(X, ColorID, Output) :-
    color_id_to_html_color(ColorID, HTMLColor),
    format_to_atom('<span style="color:~w">~w</span>', [HTMLColor, X], Output),
    !.

% mark_with_strikethrough(+X, -Output)
% Wraps X in strikethrough markup (previously X was returned without any
% markup; same reconstruction caveat as for mark_with_color/3).
mark_with_strikethrough(X, Output) :-
    format_to_atom('<s>~w</s>', [X], Output),
    !.

% mark_with_italics(+X, -Output)
% Wraps X in italics markup (same reconstruction caveat as above).
mark_with_italics(X, Output) :-
    format_to_atom('<i>~w</i>', [X], Output),
    !.

% #ff0033 red
% #00ff33 green
% #0000cc blue

% color_id_to_html_color(?ColorID, ?HTMLColor)
color_id_to_html_color(red, '#ff0033').    %red
color_id_to_html_color(green, '#00ff33').  %green
color_id_to_html_color(blue, '#0000cc').   %blue

%---------------------------------------------------------------

% Target, Edited and AnnotatedTarget are atoms. EditOperation is an ungrounded edit operation.

% mark_edit_operation_in_target(+Target, +Edited, +EditOperation, -AnnotatedTarget)
% The first clause returns Target unchanged and cuts, so the annotating
% clause below it is currently never reached.
mark_edit_operation_in_target(Target, _Edited, _EditOperation, AnnotatedTarget) :-
    AnnotatedTarget = Target,
    !.
mark_edit_operation_in_target(Target, Edited, EditOperation, AnnotatedTarget) :-
    tokenize_sent_atom(Target, TargetWords0),
    tokenize_sent_atom(Edited, EditedWords0),
    %smart_diff(EditedWords, TargetWords, Diff),
    remove_spaces(TargetWords0, TargetWords),
    remove_spaces(EditedWords0, EditedWords),
    insertions_deletions_substitutions_and_matches(EditedWords, TargetWords, _, _, _, _, Diff0),
    simplify_diff_no_context(Diff0, Diff, null),
    EditOperation = LeftContext/Operation/RightContext,
    coerce_to_list(LeftContext, LeftContextList),
    coerce_to_list(RightContext, RightContextList),
    % Left context, the operation itself, right context: the sequence to be
    % located in the diff.
    EditOperation1 = [context(LeftContextList), Operation, context(RightContextList)],
    mark_edit_operation_in_target1(Diff, EditOperation1, AnnotatedDiff),
    annotated_diff_to_atom(AnnotatedDiff, AnnotatedTarget),
    !.
% Last clause of mark_edit_operation_in_target/4: warn and return the target
% unchanged.
mark_edit_operation_in_target(Target, Edited, EditOperation, AnnotatedTarget) :-
    BadCall = mark_edit_operation_in_target(Target, Edited, EditOperation, AnnotatedTarget),
    format('~N*** Warning: bad call "~w"~n', [BadCall]),
    AnnotatedTarget = Target.

% mark_edit_operation_in_target1(+Diff, +EditOperation, -AnnotatedDiff)
% Finds the first position in Diff at which the component list EditOperation
% matches, and brackets the matched elements with two '' marker atoms.
% Fails if there is no match anywhere in Diff.
mark_edit_operation_in_target1(Diff, EditOperation, AnnotatedDiff) :-
    match_edit_operation_to_diff(EditOperation, Diff-Unmatched, Matched-[]),
    append(['' | Matched], ['' | Unmatched], AnnotatedDiff),
    !.
mark_edit_operation_in_target1([Element | Diff], EditOperation, [Element | AnnotatedDiff]) :-
    !,
    mark_edit_operation_in_target1(Diff, EditOperation, AnnotatedDiff).

% match_edit_operation_to_diff(+Components, +DiffIn-DiffOut, -MatchedIn-MatchedOut)
% Matches the components one after another against the front of DiffIn.
% DiffOut is what remains; the matched elements are collected as a
% difference list.
match_edit_operation_to_diff([], Rest-Rest, Tail-Tail).
match_edit_operation_to_diff([Component | Components], Diff0-Diff, Matched0-Matched) :-
    match_edit_operation_component_to_diff(Component, Diff0-Diff1, Matched0-Matched1),
    !,
    match_edit_operation_to_diff(Components, Diff1-Diff, Matched1-Matched).

% A context(Words) component matches a run of same/1 elements; any other
% component must unify with the next diff element.
match_edit_operation_component_to_diff(context(Words), Diff0-Diff, Matched0-Matched) :-
    !,
    match_literal_list_to_diff(Words, Diff0-Diff, Matched0-Matched).
match_edit_operation_component_to_diff(Operation, [Operation | Diff]-Diff, [Operation | Matched]-Matched) :-
    !.

match_literal_list_to_diff([], Rest-Rest, Tail-Tail).
match_literal_list_to_diff([Word | Words], [same(Word) | Diff1]-Diff, [same(Word) | Matched1]-Matched) :-
    !,
    match_literal_list_to_diff(Words, Diff1-Diff, Matched1-Matched).

% annotated_diff_to_atom(+AnnotatedDiff, -AnnotatedTarget)
% Flattens the annotated diff with coerce_to_list/2, removes the start and
% end markers and joins the remaining items with spaces.
annotated_diff_to_atom(AnnotatedDiff, AnnotatedTarget) :-
    coerce_to_list(AnnotatedDiff, Items),
    remove_start_and_end_markers(Items, InnerItems),
    join_with_spaces(InnerItems, AnnotatedTarget).
%---------------------------------------------------------------

% get_main_corpus_frequency_for_edit_operation(+EditOperation, -NGram, -MainCorpusFreq)
%
% Computes the n-gram for EditOperation and looks it up in user:ngram/2.
% MainCorpusFreq is
%   undefined  if the n-gram is empty,
%   unknown    if user:ngram/2 is not defined or has no entry for the n-gram,
%   otherwise the value stored in user:ngram/2.
get_main_corpus_frequency_for_edit_operation(EditOperation, NGram, MainCorpusFreq) :-
    matching_ngram_for_edit_operation(EditOperation, NGram),
    (   NGram = []
    ->  MainCorpusFreq = undefined
    ;   current_predicate(user:ngram/2),
        user:ngram(NGram, MainCorpusFreq)
    ->  true
    ;   MainCorpusFreq = unknown
    ).

% matching_ngram_for_edit_operation(+EditOperation, -NGram)
%
% EditOperation, after unground/2, must have the shape Before/Operation/After.
% NGram is the lowercased word list obtained by flattening the three parts
% with coerce_to_list/2. Any other shape prints an error and fails.
matching_ngram_for_edit_operation(EditOperation, NGram) :-
    unground(EditOperation, Template),
    Template = Before/Operation/After,
    coerce_to_list([Before, Operation, After], Words),
    lowercase_atom_list(Words, NGram),
    !.
matching_ngram_for_edit_operation(EditOperation, NGram) :-
    format('~N*** Error: bad call "~w"~n',
           [matching_ngram_for_edit_operation(EditOperation, NGram)]),
    fail.

% target_ngram_for_edit_operation(+EditOperation, -NGram)
%
% Same as matching_ngram_for_edit_operation/2, except that the parts are
% flattened with coerce_to_target_list/2.
target_ngram_for_edit_operation(EditOperation, NGram) :-
    unground(EditOperation, Template),
    Template = Before/Operation/After,
    coerce_to_target_list([Before, Operation, After], Words),
    lowercase_atom_list(Words, NGram),
    !.
target_ngram_for_edit_operation(EditOperation, NGram) :-
    format('~N*** Error: bad call "~w"~n',
           [target_ngram_for_edit_operation(EditOperation, NGram)]),
    fail.

%---------------------------------------------------------------

% xliff_prolog_to_list(+XMLTerm, -List, +Type)
%
% Converts a parsed XLIFF document into a list of pe_record/1 terms, each
% tagged with type=Type. Strings are first turned into atoms and XML comments
% are stripped; progress is reported on the current output stream.
xliff_prolog_to_list(XMLTerm, List, Type) :-
    format('~NReplacing strings with atoms... ', []),
    replace_strings_with_atoms_in_xml(XMLTerm, WithAtoms),
    format('~Ndone ', []),
    format('~NRemoving comments... ', []),
    remove_comments_in_xml(WithAtoms, WithoutComments),
    format('~Ndone ', []),
    xliff_prolog_to_list1(WithoutComments, Type, List).

/*
Sample of the text found in one trans-unit (no XML markup is shown here;
presumably the source sentence followed by two target-language versions):

....
Diseases can be divided into a few large broad categories based on their main causes (see Table 1.1).
Les maladies peuvent être divisées en plusieurs grandes catégories en fonction de leurs causes principales (cf. Tableau 1).
Les maladies peuvent être divisés en quelques grands grandes catégories basée sur leurs causes principales (Table 1).
...
*/

% xliff_prolog_to_list1(+XMLTerm, +Type, -List)
%
% Expects a document whose only top-level element is <xliff>, and collects
% the records found in its children.
xliff_prolog_to_list1(xml(_VersionInfo, [element(xliff, _, Files)]), Type, List) :-
    xliff_files_to_list(Files, Type, List-[]),
    !.

% xliff_files_to_list(+Files, +Type, Records0-Records)
%
% Processes every child of <xliff>; records are accumulated in a difference list.
xliff_files_to_list([], _Type, Records-Records).
xliff_files_to_list([File | Files], Type, Records0-Records) :-
    xliff_file_to_list(File, Type, Records0-Records1),
    !,
    xliff_files_to_list(Files, Type, Records1-Records).

% xliff_file_to_list(+File, +Type, Records0-Records)
%
% Only <file> elements are accepted here.
xliff_file_to_list(element(file, _Attrs, Contents), Type, Records0-Records) :-
    xliff_file_contents(Contents, Type, Records0-Records).

% xliff_file_contents(+Elements, +Type, Records0-Records)
xliff_file_contents([], _Type, Records-Records).
xliff_file_contents([Element | Elements], Type, Records0-Records) :-
    xliff_file_element(Element, Type, Records0-Records1),
    !,
    xliff_file_contents(Elements, Type, Records1-Records).

% xliff_file_element(+Element, +Type, Records0-Records)
%
% A <body> element contributes the records of its children. Anything else,
% or a <body> whose processing fails, contributes nothing.
xliff_file_element(Element, Type, Records0-Records) :-
    (   Element = element(body, _Attrs, Children),
        xliff_body(Children, Type, Records0-Records)
    ->  true
    ;   Records = Records0
    ).

% xliff_body(+Elements, +Type, Records0-Records)
xliff_body([], _Type, Records-Records).
xliff_body([Element | Elements], Type, Records0-Records) :-
    xliff_body_element(Element, Type, Records0-Records1),
    !,
    xliff_body(Elements, Type, Records1-Records).

/*
Sample trans-unit text (XML markup not shown):

Diseases can be divided into a few large broad categories based on their main causes (see Table 1.1).
Les maladies peuvent être divisées en plusieurs grandes catégories en fonction de leurs causes principales (cf. Tableau 1).
Les maladies peuvent être divisés en quelques grands grandes catégories basée sur leurs causes principales (Table 1).
*/

% xliff_body_element(+Element, +Type, Records0-Records)
%
% A <trans-unit> whose children can all be converted, and which has a source,
% a target and at least one post-edited version, yields one pe_record/1 with
% type=Type prepended and diffs added. Every other element is skipped.
xliff_body_element(element('trans-unit', _Attrs, Children), Type, Records0-Records) :-
    xliff_trans_unit_elements(Children, Fields),
    trans_unit_with_postediting(Fields),
    add_diffs_to_xliff_record(pe_record([type=Type | Fields]), Record),
    Records0 = [Record | Records],
    !.
xliff_body_element(_Other, _Type, Records-Records).

% xliff_trans_unit_elements(+Elements, -Fields)
%
% Converts each child of a trans-unit to a Key=Value field; fails as soon as
% one child cannot be converted.
xliff_trans_unit_elements([], []).
xliff_trans_unit_elements([Element | Elements], [Field | Fields]) :-
    xliff_trans_unit_element(Element, Field),
    !,
    xliff_trans_unit_elements(Elements, Fields).

% xliff_trans_unit_element(+Element, -Field)
%
% <source> with a single child becomes source=Source.
% (Further clauses of this predicate follow.)
xliff_trans_unit_element(element(source, _Attrs, [Source]), source=Source) :-
    !.
% xliff_trans_unit_element/2, remaining clauses.
%
% A <target> or <alt-trans> element whose only attribute is
% 'phase-name'='mt_baseline' becomes target=Target.
xliff_trans_unit_element(element(TargetOrAlttrans, ['phase-name'='mt_baseline'], [TargetElement]), target=Target) :-
    target_or_alt_trans(TargetOrAlttrans),
    target_element_to_target(TargetElement, Target),
    !.
% A <target> or <alt-trans> element whose only attribute is a phase name of
% the form r<Id>.<N> becomes pe(N)=Target.
xliff_trans_unit_element(element(TargetOrAlttrans, ['phase-name'=Version], [TargetElement]), pe(N)=Target) :-
    target_or_alt_trans(TargetOrAlttrans),
    target_element_to_target(TargetElement, Target),
    atomic(Version),
    atom_codes(Version, Str),
    pe_tag(_VersionNum, N, Str, []),
    !.

% target_or_alt_trans(?Tag): the element names that may carry a translation.
target_or_alt_trans(target).
target_or_alt_trans('alt-trans').

% target_element_to_target(+TargetElement, -Target)
%
% The content is either directly atomic, or a nested <target> element with a
% single atomic child. Any other shape prints an error and fails.
target_element_to_target(TargetElement, Target) :-
    atomic(TargetElement),
    !,
    TargetElement = Target.
target_element_to_target(element(target, _Atr, [Target]), Target) :-
    atomic(Target),
    !.
target_element_to_target(TargetElement, Target) :-
    format('~N*** Error: bad call: ~w~n', [target_element_to_target(TargetElement, Target)]),
    fail.

% trans_unit_with_postediting(+Fields)
%
% Succeeds if Fields contains a source, a target and at least one pe(_) entry.
trans_unit_with_postediting(List) :-
    member(source=_Source, List),
    member(target=_Target, List),
    member(pe(_)=_PE, List),
    !.

% pe_tag(-Id, -N)//
%
% Parses a phase name of the form "r<Id>.<N>", e.g. "r1.2".
pe_tag(Id, N) -->
    "r",
    integer(Id),
    ".",
    integer(N).

% integer(-N)//
%
% Parses one or more digits into the number N.
% At least one digit is required: number_codes/2 raises a syntax error on an
% empty code list, so without the check a phase name such as 'review' or
% 'r.1' would throw an exception instead of simply not matching.
integer(N) -->
    digit_string(Str),
    { Str = [_ | _],
      number_codes(N, Str) }.

% digit_string(-Codes)//
%
% Consumes the longest (possibly empty) prefix accepted by digit_char/1.
digit_string([F | R]) -->
    [F],
    { digit_char(F) },
    !,
    digit_string(R).
digit_string([]) -->
    [].

%----------------------------------------------------------------------

% add_diffs_to_xliff_record(+Record, -RecordWithDiffs)
%
% After every pe(I)=Post field of the record, inserts diff(I)=Diff, where Diff
% is the smart_diff/3 of the tokenized post-edited text against the tokenized
% target of the same record.
add_diffs_to_xliff_record(pe_record(List), pe_record(List1)) :-
    member(target=TargA, List),
    tokenize_sent_atom(TargA, Targ),
    add_diffs_to_xliff_record1(List, Targ, List1),
    !.

% add_diffs_to_xliff_record1(+Fields, +TokenizedTarget, -FieldsWithDiffs)
add_diffs_to_xliff_record1([], _Targ, []).
add_diffs_to_xliff_record1([pe(I)=PostA | R], Targ, [pe(I)=PostA, diff(I)=Diff | R1]) :-
    tokenize_sent_atom(PostA, Post),
    smart_diff(Post, Targ, Diff),
    !,
    add_diffs_to_xliff_record1(R, Targ, R1).
% Any other field (or a pe field whose diff could not be computed) is copied as is.
add_diffs_to_xliff_record1([Other | R], Targ, [Other | R1]) :-
    !,
    add_diffs_to_xliff_record1(R, Targ, R1).

%----------------------------------------------------------------------

% replace_strings_with_atoms_in_xml(+Term, -Term1)
%
% Atomic terms are returned unchanged.
% (Further clauses of this predicate follow.)
replace_strings_with_atoms_in_xml(Atom, Atom) :-
    atomic(Atom),
    !.
% replace_strings_with_atoms_in_xml/2, remaining clauses.
%
% pcdata(String) becomes the atom or number spelled by String ...
replace_strings_with_atoms_in_xml(pcdata(String), Atom) :-
    safe_string_to_atom_or_number(String, Atom),
    !.
% ... or the placeholder 'UNMAPPABLE_STRING' (with a warning) if the
% conversion fails.
replace_strings_with_atoms_in_xml(pcdata(String), Atom) :-
    Atom = 'UNMAPPABLE_STRING',
    show_unmappable_string(String),
    !.
% A bare list of character codes is converted in the same way.
replace_strings_with_atoms_in_xml(String, Atom) :-
    is_list_of_non_negative_integers(String),
    safe_string_to_atom_or_number(String, Atom),
    !.
% Any other compound term is rebuilt with each argument converted.
replace_strings_with_atoms_in_xml(Term, Term1) :-
    functor(Term, F, N),
    functor(Term1, F, N),
    replace_strings_with_atoms_in_xml_args(N, Term, Term1).

% replace_strings_with_atoms_in_xml_args(+I, +Term, ?Term1)
%
% Converts arguments I, I-1, ..., 1 of Term into the corresponding argument
% positions of Term1.
replace_strings_with_atoms_in_xml_args(I, _Term, _Term1) :-
    I =< 0,
    !.
replace_strings_with_atoms_in_xml_args(I, Term, Term1) :-
    I > 0,
    arg(I, Term, Arg),
    arg(I, Term1, Arg1),
    replace_strings_with_atoms_in_xml(Arg, Arg1),
    I1 is I - 1,
    !,
    replace_strings_with_atoms_in_xml_args(I1, Term, Term1).

% warn_if_overlong_string(+String)
%
% Succeeds silently for strings of at most 1000 codes. For longer strings it
% prints a warning and then FAILS, so safe_string_to_atom_or_number/2 rejects
% the string. Only the first 50 codes are shown in the warning; printing the
% whole overlong string would defeat the purpose of the message.
warn_if_overlong_string(String) :-
    length(String, N),
    (   N > 1000 ->
        length(Prefix, 50),
        append(Prefix, _, String),
        format('~N*** Warning: long string (~d characters) "~s..."~n', [N, Prefix]),
        fail
    ;   otherwise ->
        true
    ).

%======================================================================

% safe_string_to_atom_or_number(+String0, -Atom)
%
% String0 must be a list of non-negative integers. Suspicious codes are
% removed (with a warning), overlong strings are rejected, and the remaining
% codes are read as a number if possible, otherwise as an atom.
safe_string_to_atom_or_number(String0, Atom) :-
    %is_prolog_string(String0),
    is_list_of_non_negative_integers(String0),
    remove_bad_chars_and_warn_if_necessary(String0, String),
    warn_if_overlong_string(String),
    (   safe_number_codes(Atom, String)
    ;   atom_codes(Atom, String)
    ),
    !.

% remove_bad_chars_and_warn_if_necessary(+Str, -Str1)
%
% Str1 is Str without the codes rejected by bad_char/1. If any were removed,
% a warning listing them is printed together with the cleaned string.
remove_bad_chars_and_warn_if_necessary(Str, Str1) :-
    remove_bad_chars(Str, Str1, BadChars-[]),
    (   BadChars = [] ->
        true
    ;   format('~N*** Warning: suspicious chars ~w removed from "~s"~n', [BadChars, Str1])
    ),
    !.

% remove_bad_chars(+Codes, -CleanCodes, Bad0-Bad)
%
% Splits Codes into the codes kept and, as a difference list, the codes removed.
remove_bad_chars([], [], Bad-Bad).
remove_bad_chars([BadChar | R], Out, [BadChar | BadNext]-BadOut) :-
    bad_char(BadChar),
    !,
    remove_bad_chars(R, Out, BadNext-BadOut).
remove_bad_chars([F | R], [F | R1], BadIn-BadOut) :-
    !,
    remove_bad_chars(R, R1, BadIn-BadOut).

% We seem to get odd chars with values around 65000 from Excel - probable BOM marks
bad_char(BadChar) :-
    BadChar > 65000.
% show_unmappable_string(+String)
%
% Prints a warning naming a string that could not be turned into an atom.
show_unmappable_string(Unmappable) :-
    format('~N*** Warning: unmappable string: ~w~n', [Unmappable]),
    !.

%======================================================================

% remove_comments_in_xml(+Term, -Term1)
%
% Term1 is a copy of Term in which every comment(_) list element, at any
% depth, has been dropped. Variables and atomic terms are returned unchanged.
remove_comments_in_xml(Term, Term) :-
    var(Term),
    !.
remove_comments_in_xml(Term, Term) :-
    atomic(Term),
    !.
remove_comments_in_xml(Items, Items1) :-
    is_list(Items),
    remove_comments_in_xml_list(Items, Items1),
    !.
remove_comments_in_xml(Compound, Compound1) :-
    functor(Compound, Name, Arity),
    functor(Compound1, Name, Arity),
    remove_comments_in_xml_args(Arity, Compound, Compound1).

% remove_comments_in_xml_list(+Items, -Items1)
%
% Drops comment(_) elements and processes every other element recursively.
remove_comments_in_xml_list([], []).
remove_comments_in_xml_list([comment(_) | Items], Items1) :-
    !,
    remove_comments_in_xml_list(Items, Items1).
remove_comments_in_xml_list([Item | Items], [Item1 | Items1]) :-
    remove_comments_in_xml(Item, Item1),
    !,
    remove_comments_in_xml_list(Items, Items1).

% remove_comments_in_xml_args(+Index, +Compound, ?Compound1)
%
% Processes arguments Index, Index-1, ..., 1 of Compound into the same
% argument positions of Compound1.
remove_comments_in_xml_args(Index, _Compound, _Compound1) :-
    Index =< 0,
    !.
remove_comments_in_xml_args(Index, Compound, Compound1) :-
    Index > 0,
    arg(Index, Compound, Arg),
    arg(Index, Compound1, Arg1),
    remove_comments_in_xml(Arg, Arg1),
    Previous is Index - 1,
    !,
    remove_comments_in_xml_args(Previous, Compound, Compound1).

%---------------------------------------------------------------

% append_atoms_or_lists(+A, +B, -AB)
%
% Concatenates A and B, each of which may be an atomic term or a list.
% The cases are tried in the order atomic/atomic, atomic/list, list/atomic,
% list/list, and the first one that applies (and unifies with AB) is taken.
% Note that [] counts as atomic here whenever atomic/1 accepts it.
append_atoms_or_lists(A, B, AB) :-
    (   atomic(A), atomic(B), AB = [A, B]
    ->  true
    ;   atomic(A), is_list(B), AB = [A | B]
    ->  true
    ;   is_list(A), atomic(B), append(A, [B], AB)
    ->  true
    ;   is_list(A), is_list(B), append(A, B, AB)
    ->  true
    ).

%---------------------------------------------------------------

% coerce_to_list(+DiffTerm, -Words)
%
% Flattens a diff term (or a list of diff terms) into a flat list:
%   unbound, [], null   -> nothing
%   atomic X            -> X
%   same(X), del(X)     -> the words of X
%   ins(_)              -> nothing
%   sub(_, Y)           -> the words of Y
%   exch(X, Y)          -> the words of X, then those of Y
%   anything else       -> nothing
coerce_to_list(Unbound, []) :-
    var(Unbound),
    !.
coerce_to_list([], []) :-
    !.
coerce_to_list(null, []) :-
    !.
coerce_to_list(Word, [Word]) :-
    atomic(Word),
    !.
coerce_to_list(same(Kept), Words) :-
    coerce_to_list(Kept, Words),
    !.
coerce_to_list(del(Deleted), Words) :-
    coerce_to_list(Deleted, Words),
    !.
coerce_to_list(ins(_Inserted), []) :-
    !.
coerce_to_list(sub(_First, Second), Words) :-
    coerce_to_list(Second, Words),
    !.
coerce_to_list(exch(First, Second), Words) :-
    coerce_to_list([First, Second], Words),
    !.
coerce_to_list([Head | Tail], Words) :-
    coerce_to_list(Head, HeadWords),
    coerce_to_list(Tail, TailWords),
    append(HeadWords, TailWords, Words),
    !.
coerce_to_list(_Other, []).
%---------------------------------------------------------------

% coerce_to_target_list(+DiffTerm, -Words)
%
% Flattens a diff term (or a list of diff terms) into a flat list:
%   unbound, [], null   -> nothing
%   atomic X            -> X
%   same(X), ins(X)     -> the words of X
%   del(_)              -> nothing
%   sub(X, _)           -> the words of X
%   exch(X, Y)          -> the words of Y, then those of X
%   anything else       -> nothing
coerce_to_target_list(Unbound, []) :-
    var(Unbound),
    !.
coerce_to_target_list([], []) :-
    !.
coerce_to_target_list(null, []) :-
    !.
coerce_to_target_list(Word, [Word]) :-
    atomic(Word),
    !.
coerce_to_target_list(same(Kept), Words) :-
    coerce_to_target_list(Kept, Words),
    !.
coerce_to_target_list(del(_Deleted), []) :-
    !.
coerce_to_target_list(ins(Inserted), Words) :-
    coerce_to_target_list(Inserted, Words),
    !.
coerce_to_target_list(sub(First, _Second), Words) :-
    coerce_to_target_list(First, Words),
    !.
coerce_to_target_list(exch(First, Second), Words) :-
    coerce_to_target_list([Second, First], Words),
    !.
coerce_to_target_list([Head | Tail], Words) :-
    coerce_to_target_list(Head, HeadWords),
    coerce_to_target_list(Tail, TailWords),
    append(HeadWords, TailWords, Words),
    !.
coerce_to_target_list(_Other, []).

%---------------------------------------------------------------

% at_most_last_two(+List, -LastTwo)
%
% LastTwo is the last two elements of List, or all of List if it has fewer
% than two elements.
at_most_last_two([], []) :-
    !.
at_most_last_two([Only], [Only]) :-
    !.
at_most_last_two([Penultimate, Last], [Penultimate, Last]) :-
    !.
at_most_last_two([_Dropped | Rest], LastTwo) :-
    !,
    at_most_last_two(Rest, LastTwo).

%---------------------------------------------------------------

% at_most_first_two(+List, -FirstTwo)
%
% FirstTwo is the first two elements of List, or all of List if it has fewer
% than two elements.
at_most_first_two([], []) :-
    !.
at_most_first_two([Only], [Only]) :-
    !.
at_most_first_two([First, Second | _Rest], [First, Second]) :-
    !.

%---------------------------------------------------------------