. (utf8) 2:- module( 3 rdf_guess, 4 [ 5 rdf_guess_file/3, % +FileSpec, +Size, -MediaType 6 rdf_guess_stream/3, % +In, +Size, -MediaType 7 rdf_guess_string/2 % +String, -MediaType 8 ] 9).
22:- use_module(library(sgml)). 23:- use_module(library(yall)). 24 25:- use_module(library(archive_ext)). 26:- use_module(library(dcg)). 27:- use_module(library(file_ext)). 28:- use_module(library(media_type)). 29:- use_module(library(rdf_prefix)). 30 31:- meta_predicate 32 n3_lexical_form_codes( , , ).
42rdf_guess_file(Spec, Size, MediaType) :-
43 read_from_file(
44 Spec,
45 {Size,MediaType}/[In0]>>rdf_guess_stream(In0, Size, MediaType)
46 ).
87rdf_guess_stream(In, Size, MediaType) :-
88 peek_string(In, Size, String),
89 rdf_guess_string(String, MediaType).
95rdf_guess_string(String, MediaType) :- 96 rdf_guess_string_(String, Ext), 97 media_type_extension(MediaType, Ext). 98 99rdf_guess_string_(String, jsonld) :- 100 string_phrase(jsonld_format, String), !. 101rdf_guess_string_(String, Ext) :- 102 % We use the information as to whether or not the end of the stream 103 % has been reached. 104 string_phrase(n3_format(Ext), String, _), !. 105rdf_guess_string_(String, Ext) :- 106 setup_call_cleanup( 107 open_string(String, In), 108 sgml_format(In, Ext), 109 close(In) 110 ). 111 112 113 114% JSON-LD %
Whatever the case may be, if a document start with the square opening bracket, it can be neither belong to the Turtle- nor to the SGML-family, so it must be JSON-LD.
If a document start with a curly opening bracket, it can still be TriG or JSON-LD. However, if the opening curly bracket is directly followed by a JSON-conformant string and a colon (the key/value separator in JSON).
132jsonld_format --> 133 blanks, 134 optional_arrays, 135 "{", 136 blanks, 137 json_string, 138 blanks, 139 ":", !, 140 remainder(_). 141 142optional_arrays --> 143 "[", !, 144 blanks, 145 optional_arrays. 146optional_arrays --> "". 147 148% JSON strings start with a double quote. 149json_string --> 150 "\"", 151 json_string_rest. 152 153% JSON strings may contain escaped double quotes. 154json_string_rest --> 155 "\\\"", !, 156 json_string_rest. 157% An unescaped double quote ends a JSON string. 158json_string_rest --> 159 "\"", !. 160% Skip other JSON string content. 161json_string_rest --> 162 [_], !, 163 json_string_rest. 164 165 166 167% N3 FAMILY %
174% skip blanks 175n3_format(Ext) --> 176 n3_blank, !, 177 n3_blanks, 178 n3_format(Ext). 179% N-Quads, N-Triples, TriG, Turtle comment 180n3_format(Ext) --> 181 n3_comment, !, 182 n3_format(Ext). 183% Turtle, TriG base or prefix declaration 184n3_format(trig) --> 185 ("@base" ; "base" ; "@prefix" ; "prefix"), !. 186% TriG default graph 187n3_format(trig) --> 188 "{", !. 189% N-Quads, N-Triples TriG, Turtle triple or quadruple 190n3_format(Ext) --> 191 n3_subject(Ext), 192 n3_blanks, 193 n3_predicate(Ext), 194 n3_blanks, 195 n3_object(Ext), 196 n3_blanks, 197 ( % end of a triple 198 "." 199 -> "" 200 ; % TriG, Turtle object list notation 201 ";" 202 -> {Ext = trig} 203 ; % TriG, Turtle predicate-object pairs list notation 204 "," 205 -> {Ext = trig} 206 ; % N-Quads end of a quadruple 207 n3_graph(Ext), 208 n3_blanks, 209 "." 210 -> {Ext = nq} 211 ), !, 212 {(var(Ext) -> Ext = nq ; true)}. 213% TriG, Turtle anonymous blank node 214n3_format(trig) --> 215 "[", !. 216% TriG, Turtle collection 217n3_format(trig) --> 218 "(", !. 219% TriG named graph 220n3_format(trig) --> 221 n3_graph(_), 222 n3_blanks, 223 "{", !. 224 225% N3 only allows horizontal tab and space, but we skip other blank 226% characters as well, since they may appear in non-conforming 227% documents without telling us anything about which N3 subtype we are 228% parsing. 229n3_blank --> 230 blank. 231n3_blank --> 232 n3_comment. 233 234n3_blanks --> 235 n3_blank, !, 236 n3_blanks. 237n3_blanks --> "". 238 239n3_bnode --> 240 "_:", 241 nonblanks. 242 243n3_comment --> 244 "#", 245 string(_), 246 (eol ; eos). 247 248eol --> "\n". 249eol --> "\r\n". 250 251n3_graph(Ext) --> 252 n3_iriref(Ext). 253 254% N-Quads, N-Triples, TriG, Turtle full IRI 255n3_iriref(_) --> 256 "<", !, 257 n3_iriref0. 258% TriG, Turtle prefixed IRI 259n3_iriref(trig) --> 260 n3_iriref_prefix, 261 ":", 262 nonblanks. 263 264n3_iriref0 --> 265 ">", !. 266n3_iriref0 --> 267 "\\", !, 268 ( "u" 269 -> xdigit(_), xdigit(_), xdigit(_), xdigit(_) 270 ; "U" 271 -> xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_), xdigit(_) 272 ), 273 n3_iriref0. 274n3_iriref0 --> 275 [Code], 276 {\+ non_iri_code(Code)}, 277 n3_iriref0. 278 279non_iri_code(Code) :- 280 between(0x0, 0x20, Code). 281non_iri_code(0'<). 282non_iri_code(0'>). 283non_iri_code(0'").%" 284non_iri_code(0'{). 285non_iri_code(0'}). 286non_iri_code(0'|). 287non_iri_code(0'^). 288non_iri_code(0'`). 289non_iri_code(0'\\). 290 291n3_iriref_prefix --> 292 ":", !, 293 {fail}. 294n3_iriref_prefix --> 295 blank, !, 296 {fail}. 297n3_iriref_prefix --> 298 nonblank, !, 299 n3_iriref_prefix. 300n3_iriref_prefix --> "". 301 302% TriG, Turtle lexical form with triple single quotes 303n3_lexical_form(trig) --> 304 "'''", !, 305 n3_lexical_form_codes([0'',0'',0'']). 306% TriG, Turtle lexical form with single single quotes 307n3_lexical_form(trig) --> 308 "'", !, 309 n3_lexical_form_codes([0'']). 310% TriG, Turtle lexical form with triple double quotes 311n3_lexical_form(trig) --> 312 "\"\"\"", !, 313 n3_lexical_form_codes([0'",0'",0'"]). 314% N-Quads, N-Triples, TriG, Turtle lexical form with single double 315% quotes 316n3_lexical_form(_) --> 317 "\"", 318 n3_lexical_form_codes([0'"]). 319 320% Escaped single quote. 321n3_lexical_form_codes(End) --> 322 "\\\'", !, 323 n3_lexical_form_codes(End). 324% Escaped double quote. 325n3_lexical_form_codes(End) --> 326 "\\\"", !, 327 n3_lexical_form_codes(End). 328% End of string. 329n3_lexical_form_codes(End) --> 330 , !. 331% Content. 332n3_lexical_form_codes(End) --> 333 [_], !, 334 n3_lexical_form_codes(End). 335% End of stream. 336n3_lexical_form_codes(_) --> "". 337 338% TriG, Turtle abbreviated form for XSD boolean, decimal, double, and 339% integer literals 340n3_literal(trig) --> 341 ("false" ; "true" ; "+" ; "-" ; "." ; digit(_)), !. 342n3_literal(Ext) --> 343 n3_lexical_form(Ext), 344 n3_blanks, 345 ( % Literal with an explicit datatype IRI (previously: typed 346 % literal). 347 "^^" 348 -> n3_blanks, 349 n3_iriref(Ext) 350 ; % Language-tagged string (datatype IRI `rdf:langString'). 351 "@" 352 -> n3_ltag 353 ; % Abbreviated notation for `xsd:string' (previously: simple 354 % literal). 355 "" 356 ). 357 358n3_ltag --> 359 nonblanks. 360 361n3_object(Ext) --> 362 n3_iriref(Ext), !. 363n3_object(_) --> 364 n3_bnode, !. 365n3_object(Ext) --> 366 n3_literal(Ext). 367 368n3_predicate(Ext) --> 369 n3_iriref(Ext), !. 370% TriG, Turtle abbreviation for `rdf:type' 371n3_predicate(trig) --> 372 "a". 373 374n3_subject(Ext) --> 375 n3_iriref(Ext), !. 376n3_subject(_) --> 377 n3_bnode. 378 379 380 381% SGML FAMILY %
If the toplevel element is detected as HTML we guess that the document contains RDFa.
397sgml_format(In, Ext) :-
398 sgml_doctype(In, Dialect, DocType, Attributes),
399 doc_content_type(Dialect, DocType, Attributes, Ext).
Note that it is not possible to define valid RDF/XML without
namespaces, while it is not possible to define a valid absolute
Turtle IRI (using <...>
-notation) with a valid xmlns declaration.
415sgml_doctype(In, Dialect, DocType, Attributes) :- 416 setup_call_cleanup( 417 make_parser(In, Parser, State), 418 catch( 419 sgml_parse( 420 Parser, 421 [ 422 call(begin, on_begin), 423 call(cdata, on_cdata), 424 call(decl, on_declaration), 425 max_errors(-1), 426 source(In), 427 syntax_errors(quiet) 428 ] 429 ), 430 E, 431 true 432 ), 433 clean_parser(In, Parser, State) 434 ), 435 nonvar(E), 436 E = tag(Dialect, DocType, Attributes). 437 438make_parser(In, Parser, state(Position)):- 439 stream_property(In, position(Position)), 440 new_sgml_parser(Parser, []). 441 442clean_parser(In, Parser, state(Position)):- 443 free_sgml_parser(Parser), 444 set_stream_position(In, Position). 445 446on_begin(Tag, Attributes, Parser) :- 447 get_sgml_parser(Parser, dialect(Dialect)), 448 throw(tag(Dialect, Tag, Attributes)). 449 450on_cdata(_, _) :- 451 throw(error(cdata)). 452 453on_declaration(Text, Parser) :- 454 atomic_list_concat(Components, ' ', Text), 455 Components = ['DOCTYPE',Doctype], 456 get_sgml_parser(Parser, dialect(Dialect)), 457 throw(tag(Dialect, Doctype, [])).
463doc_content_type(_, html, _, html) :- !. 464doc_content_type(html, _, _, html) :- !. 465doc_content_type(xhtml, _, _, xhtml) :- !. 466doc_content_type(html5, _, _, html) :- !. 467doc_content_type(xhtml5, _, _, xhtml) :- !. 468doc_content_type(xml, rss, _, rdf) :- !. 469doc_content_type(Dialect, Top, Attributes, rdf) :- 470 % Extract the namespace from the doctype. 471 dialect_local_name(Dialect, LocalName), 472 atomic_list_concat([NS,LocalName], :, Top), 473 474 % Look up the RDF namespace in the attributes list. 475 atomic_list_concat([xmlns,NS], :, Attribute), 476 memberchk(Attribute=RDFNS, Attributes), 477 478 % Ensure it is indeed the RDF namespace. 479 rdf_prefix(rdf, RDFNS). 480 481dialect_local_name(sgml, rdf). 482dialect_local_name(xml, 'RDF')
RDF guess
This module heuristically guesses the RDF serialization format that is stored in the input stream In.
RDF/XML can be distinguished from Turtle-family serializations, because it is not possible to define valid RDF/XML without declaring XML namespaces.
*/