[redland-dev] Entities in RDFa

Richard Smith richard at ex-parrot.com
Thu Aug 15 16:25:00 EDT 2013


Last January I submitted a patch to raptor that added a 
new parser option called 'loadDTD'.  The patch 
hasn't been applied and there's been no subsequent 
discussion that I've seen.  Perhaps it's my fault for 
going about submitting the patch in the wrong way or the 
wrong place, and if so, apologies.

However I still feel that the patch is of definite advantage 
to raptor, in particular in its handling of entities in RDFa 
where it's very common to see an example like this:

   <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"
       "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
   <html xmlns="http://www.w3.org/1999/xhtml"
         version="XHTML+RDFa 1.0" xml:lang="en">
     <head>
       <title>Test</title>
     </head>
     <body>
       <p>This page was written by
         <span xmlns:dc="http://purl.org/dc/elements/1.1/"
               property="dc:creator">Jos&eacute;</span>.</p>
     </body>
   </html>

Note the use of the HTML eacute entity in the name José. 
(This got mangled in the web archive last time round.) 
When I submitted the patch, rapper 2.0.6 couldn't parse 
this, and testing today with 2.0.9 it is still the case. 
If Redland is to be of use with real-world RDFa, without an 
otherwise unnecessary additional pre-processing stage, this 
needs fixing.

The argument that W3 add a 30s delay in serving the DTDs is 
largely irrelevant.  With the patch, DTD loading 
happens only if you specifically request it, and if you have 
a suitable XML catalog, libxml2 won't fetch the DTD from the 
W3 but from a local copy on your machine.  (Under Debian, 
the w3c-sgml-lib package installs such a catalog for you.) 
And in any case, if you really need to parse the entity, 
there will be cases when the 30s delay is quite acceptable.

Is it worth me reworking the patch so that it applies 
cleanly against the current code base?

Richard
-------------- next part --------------
diff -ur raptor2-2.0.6/configure.ac raptor2-2.0.6+patch/configure.ac
--- raptor2-2.0.6/configure.ac	2011-11-24 07:15:15.000000000 +0000
+++ raptor2-2.0.6+patch/configure.ac	2012-01-06 01:51:56.815669830 +0000
@@ -700,6 +700,16 @@
 
     AC_CHECK_FUNCS(xmlSAX2InternalSubset xmlCtxtUseOptions)
 
+    AC_MSG_CHECKING(if libxml has parser option XML_PARSE_DTDLOAD)
+    AC_TRY_LINK([
+#ifdef HAVE_LIBXML_PARSER_H
+#include <libxml/parser.h>
+#endif
+], [xmlParserOption foo; foo = XML_PARSE_DTDLOAD],
+                AC_MSG_RESULT(yes)
+		AC_DEFINE(RAPTOR_LIBXML_XML_PARSE_DTDLOAD, 1, [does libxml have XML_PARSE_DTDLOA]),
+		AC_MSG_RESULT(no))
+
     AC_MSG_CHECKING(if libxml has parser option XML_PARSE_NONET)
     AC_TRY_LINK([
 #ifdef HAVE_LIBXML_PARSER_H
diff -ur raptor2-2.0.6/librdfa/rdfa.c raptor2-2.0.6+patch/librdfa/rdfa.c
--- raptor2-2.0.6/librdfa/rdfa.c	2011-08-22 07:05:56.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.c	2012-01-06 09:59:25.158089322 +0000
@@ -1218,6 +1218,18 @@
    rdfa_init_context(context);
 
 #ifdef LIBRDFA_IN_RAPTOR
+  /* Optionally forbid network requests in the XML parser */
+  raptor_sax2_set_option(context->sax2, 
+                         RAPTOR_OPTION_NO_NET, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(context, RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(context->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(context, RAPTOR_OPTION_LOAD_DTD));
+#endif
+
+#ifdef LIBRDFA_IN_RAPTOR
    context->base_uri=raptor_new_uri(context->sax2->world, (const unsigned char*)context->base);
    raptor_sax2_parse_start(context->sax2, context->base_uri);
 #endif
diff -ur raptor2-2.0.6/librdfa/rdfa.h raptor2-2.0.6+patch/librdfa/rdfa.h
--- raptor2-2.0.6/librdfa/rdfa.h	2011-04-26 19:16:35.000000000 +0100
+++ raptor2-2.0.6+patch/librdfa/rdfa.h	2012-01-06 10:03:37.046101513 +0000
@@ -233,6 +233,8 @@
    raptor_sax2* sax2;
    raptor_namespace_handler namespace_handler;
    void* namespace_handler_user_data;
+  raptor_object_options options;
+   
 #else
    XML_Parser parser;
 #endif
diff -ur raptor2-2.0.6/src/raptor2.h.in raptor2-2.0.6+patch/src/raptor2.h.in
--- raptor2-2.0.6/src/raptor2.h.in	2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor2.h.in	2012-01-06 02:04:21.895705896 +0000
@@ -494,6 +494,7 @@
  * @RAPTOR_OPTION_WRITER_XML_VERSION: Integer XML version XML 1.0 (10) or XML 1.1 (11)
  * @RAPTOR_OPTION_WRITER_XML_DECLARATION: Write XML 1.0 or 1.1 declaration.
  * @RAPTOR_OPTION_NO_NET: Deny network requests.
+ * @RAPTOR_OPTION_LOAD_DTD: Load document DTDs.
  * @RAPTOR_OPTION_RESOURCE_BORDER: Border color of resource
  *   nodes for GraphViz DOT serializer.
  * @RAPTOR_OPTION_LITERAL_BORDER: Border color of literal nodes
@@ -568,7 +569,8 @@
   RAPTOR_OPTION_WWW_CERT_FILENAME,
   RAPTOR_OPTION_WWW_CERT_TYPE,
   RAPTOR_OPTION_WWW_CERT_PASSPHRASE,
-  RAPTOR_OPTION_LAST = RAPTOR_OPTION_WWW_CERT_PASSPHRASE
+  RAPTOR_OPTION_LOAD_DTD,
+  RAPTOR_OPTION_LAST = RAPTOR_OPTION_LOAD_DTD
 } raptor_option;
 
 
diff -ur raptor2-2.0.6/src/raptor_config.h.in raptor2-2.0.6+patch/src/raptor_config.h.in
--- raptor2-2.0.6/src/raptor_config.h.in	2011-11-24 07:15:46.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_config.h.in	2012-01-06 01:55:06.359679001 +0000
@@ -196,6 +196,9 @@
 /* does libxml xmlSAXHandler have initialized field */
 #undef RAPTOR_LIBXML_XMLSAXHANDLER_INITIALIZED
 
+/* does libxml have XML_PARSE_DTDLOAD */
+#undef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+
 /* does libxml have XML_PARSE_NONET */
 #undef RAPTOR_LIBXML_XML_PARSE_NONET
 
diff -ur raptor2-2.0.6/src/raptor_grddl.c raptor2-2.0.6+patch/src/raptor_grddl.c
--- raptor2-2.0.6/src/raptor_grddl.c	2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_grddl.c	2012-01-06 02:07:42.351715591 +0000
@@ -878,6 +878,10 @@
       if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_NO_NET))
         libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+      if(RAPTOR_OPTIONS_GET_NUMERIC(xpbc->rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+        libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
       xmlCtxtUseOptions(xc, libxml_options);
 #endif
@@ -1439,6 +1443,10 @@
       if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
         libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+      if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+        libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
       xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options);
 #endif
@@ -1488,6 +1496,10 @@
         if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET))
           options |= HTML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+        if(RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD))
+          options |= XML_PARSE_DTDLOAD;
+#endif
 
         htmlCtxtUseOptions(grddl_parser->html_ctxt, options);
  
diff -ur raptor2-2.0.6/src/raptor_librdfa.c raptor2-2.0.6+patch/src/raptor_librdfa.c
--- raptor2-2.0.6/src/raptor_librdfa.c	2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_librdfa.c	2012-01-06 10:05:44.150107663 +0000
@@ -267,6 +267,8 @@
   /* returns RDFa Processing Graph error triples - not used by raptor */
   rdfa_set_processor_graph_triple_handler(librdfa_parser->context, NULL);
 
+  librdfa_parser->context->options = rdf_parser->options;
+
   rc = rdfa_parse_start(librdfa_parser->context);
   if(rc != RDFA_PARSE_SUCCESS)
     return 1;
diff -ur raptor2-2.0.6/src/raptor_option.c raptor2-2.0.6+patch/src/raptor_option.c
--- raptor2-2.0.6/src/raptor_option.c	2011-08-01 03:02:22.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_option.c	2012-01-06 09:40:28.342034303 +0000
@@ -277,6 +277,12 @@
     RAPTOR_OPTION_VALUE_TYPE_STRING,
     "wwwCertPassphrase",
     "SSL client certificate passphrase"
+  },
+  { RAPTOR_OPTION_LOAD_DTD,
+    (raptor_option_area)(RAPTOR_OPTION_AREA_PARSER | RAPTOR_OPTION_AREA_SAX2),
+    RAPTOR_OPTION_VALUE_TYPE_BOOL,
+    "loadDTD",
+    "Parsers and SAX2 XML Parser should load DTDs."
   }
 };
 
diff -ur raptor2-2.0.6/src/raptor_rdfxml.c raptor2-2.0.6+patch/src/raptor_rdfxml.c
--- raptor2-2.0.6/src/raptor_rdfxml.c	2011-10-21 21:41:16.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rdfxml.c	2012-01-06 02:09:14.807720071 +0000
@@ -1001,6 +1001,11 @@
   raptor_sax2_set_option(rdf_xml_parser->sax2, 
                          RAPTOR_OPTION_NO_NET, NULL,
                          RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(rdf_xml_parser->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD));
   
   raptor_sax2_parse_start(rdf_xml_parser->sax2, uri);
 
diff -ur raptor2-2.0.6/src/raptor_rss.c raptor2-2.0.6+patch/src/raptor_rss.c
--- raptor2-2.0.6/src/raptor_rss.c	2011-08-31 20:53:24.000000000 +0100
+++ raptor2-2.0.6+patch/src/raptor_rss.c	2012-01-06 02:11:18.495726048 +0000
@@ -249,6 +249,11 @@
   raptor_sax2_set_option(rss_parser->sax2, 
                          RAPTOR_OPTION_NO_NET, NULL,
                          RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_NO_NET));
+
+  /* Optionally force DTD loads in the XML parser */
+  raptor_sax2_set_option(rss_parser->sax2, 
+                         RAPTOR_OPTION_LOAD_DTD, NULL,
+                         RAPTOR_OPTIONS_GET_NUMERIC(rdf_parser, RAPTOR_OPTION_LOAD_DTD));
   
   raptor_sax2_parse_start(rss_parser->sax2, uri);
 
diff -ur raptor2-2.0.6/src/raptor_sax2.c raptor2-2.0.6+patch/src/raptor_sax2.c
--- raptor2-2.0.6/src/raptor_sax2.c	2011-11-27 17:36:30.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_sax2.c	2012-01-06 10:06:33.994110079 +0000
@@ -518,6 +518,10 @@
     if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_NO_NET))
       libxml_options |= XML_PARSE_NONET;
 #endif
+#ifdef RAPTOR_LIBXML_XML_PARSE_DTDLOAD
+    if(RAPTOR_OPTIONS_GET_NUMERIC(sax2, RAPTOR_OPTION_LOAD_DTD))
+      libxml_options |= XML_PARSE_DTDLOAD;
+#endif
 #ifdef HAVE_XMLCTXTUSEOPTIONS
     xmlCtxtUseOptions(xc, libxml_options);
 #endif
diff -ur raptor2-2.0.6/src/raptor_turtle_writer.c raptor2-2.0.6+patch/src/raptor_turtle_writer.c
--- raptor2-2.0.6/src/raptor_turtle_writer.c	2011-11-12 21:18:03.000000000 +0000
+++ raptor2-2.0.6+patch/src/raptor_turtle_writer.c	2012-01-06 02:11:56.555727893 +0000
@@ -704,6 +704,7 @@
       
     /* Shared */
     case RAPTOR_OPTION_NO_NET:
+    case RAPTOR_OPTION_LOAD_DTD:
 
     /* XML writer options */
     case RAPTOR_OPTION_RELATIVE_URIS:


More information about the redland-dev mailing list