[redland-dev] Entities in RDFa
Richard Smith
richard at ex-parrot.com
Thu Aug 15 16:25:00 EDT 2013
Last January I submitted a patch to raptor that
added a new parser option called 'loadDTD'. The patch
hasn't been applied and there's been no subsequent
discussion that I've seen. Perhaps it's my fault for
going about submitting the patch in the wrong way or the
wrong place, and if so, apologies.
However I still feel that the patch is of definite advantage
to raptor, in particular in its handling of entities in RDFa
where it's very common to see an example like this:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
version="XHTML+RDFa 1.0" xml:lang="en">
<head>
<title>Test</title>
</head>
<body>
<p>This page was written by
<span xmlns:dc="http://purl.org/dc/elements/1.1/"
property="dc:creator">Jos&eacute;</span>.</p>
</body>
</html>
Note the use of the HTML eacute entity (&eacute;) in the name José.
(This got mangled in the web archive last time round.)
When I submitted the patch, rapper 2.0.6 couldn't parse
this, and testing today with 2.0.9 it is still the case.
If Redland is to be of use with real-world RDFa, without an
otherwise unnecessary additional pre-processing stage, this
needs fixing.
The argument that W3 add a 30s delay in serving the DTDs is
largely irrelevant. With the patch, DTD loading
happens only if you specifically request it, and if you have
a suitable XML catalog, libxml2 won't fetch the DTD from the
W3 but from a local copy on your machine. (Under Debian,
the w3c-sgml-lib package installs such a catalog for you.)
And in any case, if you really need to parse the entity,
there will be cases when the 30s delay is quite acceptable.
Is it worth me reworking the patch so that it applies
cleanly against the current code base?
Richard
-------------- next part --------------
diff -ur raptor2-2.0.6/configure.ac raptor2-2.0.6+patch/configure.ac
--- raptor2-2.0.6/configure.ac 2011-11-24 07:15:15.000000000 +0000
+++ raptor2-2.0.6+patch/configure.ac 2012-01-06 01:51:56.815669830 +0000
@@ -700,6 +700,16 @@
AC_CHECK_FUNCS(xmlSAX2InternalSubset xmlCtxtUseOptions)
+ AC_MSG_CHECKING(if libxml has parser option XML_PARSE_DTDLOAD)
+ AC_TRY_LINK([
+#ifdef HAVE_LIBXML_PARSER_H
+#include <libxml/parser.h>
+#endif
+], [xmlParserOption foo; foo = XML_PARSE_DTDLOAD],
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(RAPTOR_LIBXML_XML_PARSE_DTDLOAD, 1, [does libxml have XML_PARSE_DTDLOAD]),
+ AC_MSG_RESULT(no))
+
AC_MSG_CHECKING(if libxml has parser option XML_PARSE_NONET)
AC_TRY_LINK([
#ifdef HAVE_LIBXML_PARSER_H
diff -ur raptor2-2.0.6/librdfa/rdfa.c raptor2-2.0.6+patch/librdfa/rdfa.c
--- raptor2-2.0.6/librdfa/rdfa.c 2011-08-22 07:05:56.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.c 2012-01-06 09:59:25.158089322 +0000
@@ -1218,6 +1218,18 @@
rdfa_init_context(context);
#ifdef LIBRDFA_IN_RAPTOR
+ /* Optionally forbid network requests in the XML parser */
+ raptor_sax2_set_option(context->sax2,
+ RAPTOR_OPTION_NO_NET, NULL,
+ RAPTOR_OPTIONS_GET_NUMERIC(context, RAPTOR_OPTION_NO_NET));
+
+ /* Optionally force DTD loads in the XML parser */
+ raptor_sax2_set_option(context->sax2,
+ RAPTOR_OPTION_LOAD_DTD, NULL,
+ RAPTOR_OPTIONS_GET_NUMERIC(context, RAPTOR_OPTION_LOAD_DTD));
+#endif
+
+#ifdef LIBRDFA_IN_RAPTOR
context->base_uri=raptor_new_uri(context->sax2->world, (const unsigned char*)context->base);
raptor_sax2_parse_start(context->sax2, context->base_uri);
#endif
diff -ur raptor2-2.0.6/librdfa/rdfa.h raptor2-2.0.6+patch/librdfa/rdfa.h
--- raptor2-2.0.6/librdfa/rdfa.h 2011-04-26 19:16:35.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.h 2012-01-06 10:03:37.046101513 +0000
@@ -233,6 +233,8 @@
raptor_sax2* sax2;
raptor_namespace_handler namespace_handler;
void* namespace_handler_user_data;
+ raptor_object_options options;
+
#else
XML_Parser parser;
#endif
diff -ur raptor2-2.0.6/src/raptor2.h.in raptor2-2.0.6+patch/src/raptor2.h.in
--- raptor2-2.0.6/src/raptor2.h.in 2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor2.h.in 2012-01-06 02:04:21.895705896 +0000
@@ -494,6 +494,7 @@
* @RAPTOR_OPTION_WRITER_XML_VERSION: Integer XML version XML 1.0 (10) or XML 1.1 (11)
* @RAPTOR_OPTION_WRITER_XML_DECLARATION: Write XML 1.0 or 1.1 declaration.
* @RAPTOR_OPTION_NO_NET: Deny network requests.
+ * @RAPTOR_OPTION_LOAD_DTD: Load document DTDs.
* @RAPTOR_OPTION_RESOURCE_BORDER: Border color of resource
* nodes for GraphViz DOT serializer.
* @RAPTOR_OPTION_LITERAL_BORDER: Border color of literal nodes
@@ -568,7 +569,8 @@
RAPTOR_OPTION_WWW_CERT_FILENAME,
RAPTOR_OPTION_WWW_CERT_TYPE,
RAPTOR_OPTION_WWW_CERT_PASSPHRASE,
- RAPTOR_OPTION_LAST = RAPTOR_OPTION_WWW_CERT_PASSPHRASE
+ RAPTOR_OPTION_LOAD_DTD,
+ RAPTOR_OPTION_LAST = RAPTOR_OPTION_LOAD_DTD
} raptor_option;
diff -ur raptor2-2.0.6/src/raptor_config.h.in raptor2-2.0.6+patch/src/raptor_config.h.in
--- raptor2-2.0.6/src/raptor_config.h.in 2011-11-24 07:15:46.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_config.h.in 2012-01-06 01:55:06.359679001 +0000
@@ -196,6 +196,9 @@
/* does libxml xmlSAXHandler have initialized field */
#undef RAPTOR_LIBXML_XMLSAXHANDLER_INITIALIZED
+/* does libxml have XML_PARSE_DTDLOAD */
+#undef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+
/* does libxml have XML_PARSE_NONET */
#undef RAPTOR_LIBXML_XML_PARSE_NONET
diff -ur raptor2-2.0.6/src/raptor_grddl.c raptor2-2.0.6+patch/src/raptor_grddl.c
--- raptor2-2.0.6/src/raptor_grddl.c 2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_grddl.c 2012-01-06 02:07:42.351715591 +0000
@@ -878,6 +878,10 @@
if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_NO_NET))
libxml_options |= XML_PARSE_NONET;
#endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+ if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+ libxml_options |= XML_PARSE_DTDLOAD;
+#endif
#ifdef HAVE_XMLCTXTUSEOPTIONS
xmlCtxtUseOptions(xc, libxml_options);
#endif
@@ -1439,6 +1443,10 @@
if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
libxml_options |= XML_PARSE_NONET;
#endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+ if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+ libxml_options |= XML_PARSE_DTDLOAD;
+#endif
#ifdef HAVE_XMLCTXTUSEOPTIONS
xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options);
#endif
@@ -1488,6 +1496,10 @@
if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
options |= HTML_PARSE_NONET;
#endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+ if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+ options |= XML_PARSE_DTDLOAD;
+#endif
htmlCtxtUseOptions(grddl_parser->html_ctxt, options);
diff -ur raptor2-2.0.6/src/raptor_librdfa.c raptor2-2.0.6+patch/src/raptor_librdfa.c
--- raptor2-2.0.6/src/raptor_librdfa.c 2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_librdfa.c 2012-01-06 10:05:44.150107663 +0000
@@ -267,6 +267,8 @@
/* returns RDFa Processing Graph error triples - not used by raptor */
rdfa_set_processor_graph_triple_handler(librdfa_parser->context, NULL);
+ librdfa_parser->context->options = rdf_parser->options;
+
rc = rdfa_parse_start(librdfa_parser->context);
if(rc != RDFA_PARSE_SUCCESS)
return 1;
diff -ur raptor2-2.0.6/src/raptor_option.c raptor2-2.0.6+patch/src/raptor_option.c
--- raptor2-2.0.6/src/raptor_option.c 2011-08-01 03:02:22.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_option.c 2012-01-06 09:40:28.342034303 +0000
@@ -277,6 +277,12 @@
RAPTOR_OPTION_VALUE_TYPE_STRING,
"wwwCertPassphrase",
"SSL client certificate passphrase"
+ },
+ { RAPTOR_OPTION_LOAD_DTD,
+ (raptor_option_area)(RAPTOR_OPTION_AREA_PARSER | RAPTOR_OPTION_AREA_SAX2),
+ RAPTOR_OPTION_VALUE_TYPE_BOOL,
+ "loadDTD",
+ "Parsers and SAX2 XML Parser should load DTDs."
}
};
diff -ur raptor2-2.0.6/src/raptor_rdfxml.c raptor2-2.0.6+patch/src/raptor_rdfxml.c
--- raptor2-2.0.6/src/raptor_rdfxml.c 2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rdfxml.c 2012-01-06 02:09:14.807720071 +0000
@@ -1001,6 +1001,11 @@
raptor_sax2_set_option(rdf_xml_parser->sax2,
RAPTOR_OPTION_NO_NET, NULL,
RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET));
+
+ /* Optionally force DTD loads in the XML parser */
+ raptor_sax2_set_option(rdf_xml_parser->sax2,
+ RAPTOR_OPTION_LOAD_DTD, NULL,
+ RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD));
raptor_sax2_parse_start(rdf_xml_parser->sax2, uri);
diff -ur raptor2-2.0.6/src/raptor_rss.c raptor2-2.0.6+patch/src/raptor_rss.c
--- raptor2-2.0.6/src/raptor_rss.c 2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rss.c 2012-01-06 02:11:18.495726048 +0000
@@ -249,6 +249,11 @@
raptor_sax2_set_option(rss_parser->sax2,
RAPTOR_OPTION_NO_NET, NULL,
RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET));
+
+ /* Optionally force DTD loads in the XML parser */
+ raptor_sax2_set_option(rss_parser->sax2,
+ RAPTOR_OPTION_LOAD_DTD, NULL,
+ RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD));
raptor_sax2_parse_start(rss_parser->sax2, uri);
diff -ur raptor2-2.0.6/src/raptor_sax2.c raptor2-2.0.6+patch/src/raptor_sax2.c
--- raptor2-2.0.6/src/raptor_sax2.c 2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_sax2.c 2012-01-06 10:06:33.994110079 +0000
@@ -518,6 +518,10 @@
if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_NO_NET))
libxml_options |= XML_PARSE_NONET;
#endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+ if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_LOAD_DTD))
+ libxml_options |= XML_PARSE_DTDLOAD;
+#endif
#ifdef HAVE_XMLCTXTUSEOPTIONS
xmlCtxtUseOptions(xc, libxml_options);
#endif
diff -ur raptor2-2.0.6/src/raptor_turtle_writer.c raptor2-2.0.6+patch/src/raptor_turtle_writer.c
--- raptor2-2.0.6/src/raptor_turtle_writer.c 2011-11-12 21:18:03.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_turtle_writer.c 2012-01-06 02:11:56.555727893 +0000
@@ -704,6 +704,7 @@
/* Shared */
case RAPTOR_OPTION_NO_NET:
+ case RAPTOR_OPTION_LOAD_DTD:
/* XML writer options */
case RAPTOR_OPTION_RELATIVE_URIS:
More information about the redland-dev
mailing list