1:- module(pinyin, [
2 word//2,
3 num_dia/2]).
14:- use_module(casing, [ 15 casing/3]). 16 17:- set_prolog_flag(double_quotes, codes). % SWI-7 ready 18 19% TODO support v for ü? 20% TODO support tone number 0/5? 21% TODO What about "Harbin"?
Maximal substrings of Pinyin letters, the numbers 1-4 as well as the
characters '
and -
are converted if they can be parsed as
lower-case, capitalized or all-caps Pinyin words in the input format,
everything else is left alone. Case is preserved.
The assumed input format is diacritics if Num is variable, numbers otherwise.
Example usage:
?- set_prolog_flag(double_quotes, codes). true. ?- num_dia("Wo3 xian4zai4 dui4 jing1ju4 hen3 gan3 xing4qu4.", | Codes), atom_codes(Atom, Codes). Codes = [87, 466, 32, 120, 105, 224, 110, 122, 224|...], Atom = 'Wǒ xiànzài duì jīngjù hěn gǎn xìngqù.'. ?- num_dia(Codes, "Nǐ ne?"), atom_codes(Atom, Codes). Codes = [78, 105, 51, 32, 110, 101, 63], Atom = 'Ni3 ne?'.
51num_dia(Num, Dia) :- 52 var(Num), 53 !, 54 convert(dia, num, Dia, Num). 55num_dia(Num, Dia) :- 56 convert(num, dia, Num, Dia). 57 58% PINYIN WORD GRAMMAR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Morphs is a list of "morphs" (not in the strictest linguistic sense) that make up the word. They take one of three forms:
Initial-Final-Tone
where Initial
and Final
are atoms.
The final takes the "underlying" form, which may be different
from the written form. Tone
is one of the integers from 0
to 4
.r
(for the erhuayin suffix)-
(for word-internal hyphens)
ND is either num
or dia
, depending on how tones are represented
(numbers or diacritics).
The grammar implements the following tricky aspects of Hanyu Pinyin:
The following aspects are currently not supported:
Example usage:
?- set_prolog_flag(double_quotes, codes). true. ?- phrase(word([n-ü-3, ''-er-2], ND), Codes), atom_codes(Atom, Codes). ND = dia, Codes = [110, 474, 39, 233, 114], Atom = 'nǚ\'ér' ; ND = num, Codes = [110, 252, 51, 39, 101, 114, 50], Atom = 'nü3\'er2' ; false. ?- phrase(word(Morphs, ND), "yìjué"). Morphs = [''-i-4, j-üe-2], ND = dia ; false.
120word([Morph], ND) --> 121 syll1(ND, Morph, _). 122% erhuayin 123word([Morph, r], ND) --> 124 syll1(ND, Morph, _), 125 { Morph \= _-e-_ 126 }, 127 "r". 128word([Morph|Morphs], ND) --> 129 syll1(ND, Morph, _), 130 sylls(ND, Morphs). 131word([''-Final-Tone], ND) --> 132 syllabic_nasal(ND, Final, Tone). 133 134% initial syllable 135syll1(ND, ''-Final-Tone, RequiresApostrophe) --> 136 bare_final(ND, Final, Tone, RequiresApostrophe). 137syll1(ND, Initial-Final-Tone, no) --> 138 nonempty_initial(Initial), 139 { requires_modification(Initial, RequiresModification) 140 }, 141 nonbare_final(ND, RequiresModification, Final, Tone). 142 143% sequence of non-initial syllables 144sylls(ND, [Morph]) --> 145 syll(ND, Morph). 146% erhuayin 147sylls(ND, [Morph, r]) --> 148 syll(ND, Morph), 149 { Morph \= _-e-_ 150 }, 151 "r". 152sylls(ND, [Morph|Morphs]) --> 153 syll(ND, Morph), 154 sylls(ND, Morphs). 155 156% non-initial syllable 157syll(ND, [-|Morph]) --> 158 "-", 159 syll1(ND, Morph). 160syll(ND, Initial-Final-Tone) --> 161 syll1(ND, Initial-Final-Tone, no). 162syll(ND, Initial-Final-Tone) --> 163 "'", 164 syll1(ND, Initial-Final-Tone, yes). 165 166% GRAPHEMIC TRANSFORMATIONS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Introduce y resp. w before [iuü]. Also delete [iu] if another vowel follows.
173render_final_bare([Glide, Vowel|Rest],[Substitute, Vowel|Rest]) :- 174 member(Glide, "iu"), 175 vowel_letter(Vowel), 176 !, 177 glide_substitute([Glide], [Substitute]). 178render_final_bare([Glide|Rest], [Substitute, ModifiedGlide|Rest]) :- 179 [Glide] = "ü", 180 !, 181 [Substitute, ModifiedGlide] = "yu". 182render_final_bare([Glide|Rest], [Substitute, Glide|Rest]) :- 183 member(Glide, "iuü"), 184 !, 185 glide_substitute([Glide], [Substitute]). 186% Otherwise, written form is identical to underlying. 187render_final_bare(Final, Final).
Three finals drop a vowel in spelling after a non-empty initial. If JXQ==yes, ü changes to u.
195render_final_nonbare(_, "uei", "ui") :- 196 !. 197render_final_nonbare(_, "iou", "iu") :- 198 !. 199render_final_nonbare(_, "uen", "un") :- 200 !. 201% After j, x, q, render ü as u: 202render_final_nonbare(yes, [Glide|Rest], [ModifiedGlide|Rest]) :- 203 [Glide] = "ü", 204 !, 205 [ModifiedGlide] = "u". 206% Disallow u after j, x, q: 207render_final_nonbare(yes, [Glide|_], _) :- 208 [Glide] = "u", 209 !, 210 fail. 211render_final_nonbare(_, Final, Final).
215tonalize(Final, Tone, dia, Tonalized) :- 216 add_tone_mark(Final, Tone, Tonalized). 217tonalize(Final, 0, num, Final) :- 218 !. 219tonalize(Final, Tone, num, Tonalized) :- 220 atom_codes(Tone, ToneCodes), 221 append(Final, ToneCodes, Tonalized). 222 223% add tone mark to a final 224add_tone_mark("m", Tone, [Tonal]) :- 225 !, 226 tonal(Tone, "m", [Tonal]). % no vowel - place mark on m 227add_tone_mark("n", Tone, [Tonal]) :- 228 !, 229 tonal(Tone, "n", [Tonal]). % no vowel - place mark on n 230add_tone_mark("ng", Tone, [Tonal|"g"]) :- 231 !, 232 tonal(Tone, "n", [Tonal]). % no vowel - place mark on n 233add_tone_mark([First|Rest], Tone, [Tonal|Rest]) :- 234 member(First, "aeo"), % these three greedily attract the tone mark 235 !, 236 tonal(Tone, [First], [Tonal]). 237add_tone_mark([First, Vowel|Rest], Tone, [First|Tonal]) :- 238 vowel_letter(Vowel), % otherwise, do not yet place the mark if another vowel follows 239 !, 240 add_tone_mark([Vowel|Rest], Tone, Tonal). 241add_tone_mark([First|Rest], Tone, [Tonal|Rest]) :- 242 vowel_letter(First), % last vowel - last chance to place the mark 243 !, 244 tonal(Tone, [First], [Tonal]). 245 246% GRAPH/PHONE PROPERTIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 247 248% Indicates whether a final must be preceded by an apostrophe if it is bare (no 249% initial preceding) and non-initial. 250requires_apostrophe([First|_], yes) :- 251 member(First, "aeo"), 252 !. 253requires_apostrophe(_, no). 254 255% Indicates whether ü changes to u after the given initial. 256requires_modification(j, yes) :- 257 !. 258requires_modification(x, yes) :- 259 !. 260requires_modification(q, yes) :- 261 !. 262requires_modification(_, no). 263 264vowel_letter(X) :- 265 member(X, "aeiouü"). 266 267glide_substitute("i", "y"). 268glide_substitute("u", "w"). 269glide_substitute("ü", "y"). 270 271% map between letters with and without tone marks 272% TODO m only available with 0 and 2 in Unicode, are others not needed for 273% Mandarin? Otherwise we have to use combining characters and modify 274% add_tone_mark/3. 275tonal(0, "a", "a"). 276tonal(0, "e", "e"). 277tonal(0, "i", "i"). 278tonal(0, "o", "o"). 279tonal(0, "u", "u"). 280tonal(0, "ü", "ü"). 281tonal(0, "m", "m"). 282tonal(0, "n", "n"). 283tonal(1, "a", "ā"). 284tonal(1, "e", "ē"). 285tonal(1, "i", "ī"). 286tonal(1, "o", "ō"). 287tonal(1, "u", "ū"). 288tonal(1, "ü", "ǖ"). 289tonal(1, "n", "ñ"). 290tonal(2, "a", "á"). 291tonal(2, "e", "é"). 292tonal(2, "i", "í"). 293tonal(2, "o", "ó"). 294tonal(2, "u", "ú"). 295tonal(2, "ü", "ǘ"). 296tonal(2, "m", "ḿ"). 297tonal(2, "n", "ń"). 298tonal(3, "a", "ǎ"). 299tonal(3, "e", "ě"). 300tonal(3, "i", "ǐ"). 301tonal(3, "o", "ǒ"). 302tonal(3, "u", "ǔ"). 303tonal(3, "ü", "ǚ"). 304tonal(3, "n", "ň"). 305tonal(4, "a", "à"). 306tonal(4, "e", "è"). 307tonal(4, "i", "ì"). 308tonal(4, "o", "ò"). 309tonal(4, "u", "ù"). 310tonal(4, "ü", "ǜ"). 311tonal(4, "n", "ǹ"). 312 313% Inventory of tones. 314tone(0). 315tone(1). 316tone(2). 317tone(3). 318tone(4). 319 320% TERM EXPANSION RULES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 321 322% Generate terminal DCG rules for initials. 323term_expansion(nonempty_initial(Initial), 324 (nonempty_initial(InitialAtom) --> Initial)) :- 325 atom_codes(InitialAtom, Initial). 326% Generate terminal DCG rules for each final, depending on the final, whether 327% it follows a nonempty initial ("nonbare"), which tone it carries and how 328% tones are written (diacritic or number). 329term_expansion(final(Final), Rules) :- 330 findall( 331 ( bare_final(ND, FinalAtom, Tone, RequiresApostrophe) --> String ), 332 ( render_final_bare(Final, Rendered), 333 requires_apostrophe(Final, RequiresApostrophe), 334 member(ND, [dia, num]), 335 tone(Tone), 336 tonalize(Rendered, Tone, ND, String), 337 atom_codes(FinalAtom, Final) 338 ), 339 BareRules), 340 findall( 341 ( nonbare_final(ND, JXQ, FinalAtom, Tone) --> String ), 342 ( member(JXQ, [yes, no]), 343 render_final_nonbare(JXQ, Final, Rendered), 344 member(ND, [dia, num]), 345 tone(Tone), 346 tonalize(Rendered, Tone, ND, String), 347 atom_codes(FinalAtom, Final) 348 ), 349 NonbareRules), 350 append(BareRules, NonbareRules, Rules0), 351 % This translation should happen automatically, but doesn't: 352 maplist(dcg_translate_rule, Rules0, Rules). 353% Generate terminal DCG rules for syllabic nasals. 354term_expansion(syllabic_nasal(Final), Rules) :- 355 findall( 356 ( syllabic_nasal(ND, FinalAtom, Tone) --> String ), 357 ( member(ND, [dia, num]), 358 tone(Tone), 359 tonalize(Final, Tone, ND, String), 360 atom_codes(FinalAtom, Final) 361 ), 362 Rules0), 363 maplist(dcg_translate_rule, Rules0, Rules). 364 365% PHONE INVENTORIES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 366% These are translated to DCG rules so are not available in this form in the 367% compiled program. 368 369:- discontiguous bare_final//4. 370:- discontiguous nonbare_final//4. 371 372% Inventory of initials. 373nonempty_initial("b"). 374nonempty_initial("p"). 375nonempty_initial("m"). 376nonempty_initial("f"). 377nonempty_initial("d"). 378nonempty_initial("t"). 379nonempty_initial("n"). 380nonempty_initial("l"). 381nonempty_initial("g"). 382nonempty_initial("k"). 383nonempty_initial("h"). 384nonempty_initial("j"). 385nonempty_initial("q"). 386nonempty_initial("x"). 387nonempty_initial("zh"). 388nonempty_initial("ch"). 389nonempty_initial("sh"). 390nonempty_initial("r"). 391nonempty_initial("z"). 392nonempty_initial("c"). 393nonempty_initial("s"). 394 395% Inventory of finals in canonical form. Actual written form depends on 396% initial. 397final("i"). 398final("u"). 399final("ü"). 400final("a"). 401final("ia"). 402final("ua"). 403final("o"). 404final("io"). % occurs in interjections 405final("uo"). 406final("e"). 407final("ie"). 408final("üe"). 409final("er"). 410final("ai"). 411final("uai"). 412final("ei"). 413final("uei"). 414final("ao"). 415final("iao"). 416final("ou"). 417final("iou"). 418final("an"). 419final("ian"). 420final("uan"). 421final("üan"). 422final("en"). 423final("in"). 424final("uen"). 425final("ün"). 426final("ang"). 427final("iang"). 428final("uang"). 429final("eng"). 430final("ing"). 431final("ueng"). 432final("ong"). 433final("iong"). 434 435% Inventory of syllabic nasals. 436syllabic_nasal("m"). 437syllabic_nasal("n"). 438syllabic_nasal("ng"). 439 440% CONVERSION BETWEEN NUMBERS AND DIACRITICS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 441 442convert(InputFormat, OutputFormat, Input, Output) :- 443 phrase(l(InputSegments), Input), 444 maplist(convert_segment(InputFormat, OutputFormat), InputSegments, 445 OutputSegments), 446 append(OutputSegments, Output). 447 448convert_segment(_, _, o(C), [C]) :- 449 !. 450convert_segment(InputFormat, OutputFormat, w(InputWord), OutputWord) :- 451 casing(InputWord, UncasedInputWord, CasePattern), 452 phrase(word(Morphs, InputFormat), UncasedInputWord), 453 phrase(word(Morphs, OutputFormat), UncasedOutputWord), 454 casing(OutputWord, UncasedOutputWord, CasePattern), 455 !. 456% word does not conform to rules, we leave it alone 457convert_segment(_, _, w(Word), Word). 458 459% TEXT PARSING %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 460% A mini-DCG that parses a string into a list of words and intervening non-word 461% characters. 462 463% string starting with word 464l([w(W)|L]) --> 465 w(W), 466 !, 467 l(L). 468% string starting with non-word character 469l([o(C)|L]) --> 470 [C], 471 !, 472 l(L). 473% empty string 474l([]) --> 475 "". 476 477% word with more than one character 478w([C|W]) --> 479 c(C), 480 w(W), 481 !. 482% word with one character 483w([C]) --> 484 c(C). 485 486% word character 487c(C) --> 488 [C], 489 { member(C, "AÃÁǍÀBCDEĒÉĚÈFGHIĪÍǏÌJKLMNOŌÓǑÒPQRSTUŪÚǓÙÜǕǗǙǛWXYZaãáǎàbcdeēéěèfghiīíǐìjklmḿnñńňǹoōóǒòpqrstuūúǔùüǖǘǚǜwxyz1234'-") 490 }
Parsing and generation of Hanyu Pinyin
This module implements a grammar that parses and generates words written in Hanyu Pinyin, the standard romanization system for Mandarin Chinese. It also provides a utility to convert whole texts between diacritics and numbers for writing tones.
*/