[redland-dev] Patches for PostgreSQL storage performance

William Waites ww at styx.org
Thu May 15 10:25:02 BST 2008


Hello,

I've noticed a couple of issues with the PostgreSQL back-end affecting
performance on insert operations.

The first is that when a new statement is added, there is a check to see
if the statement exists, but if it does not, any Resource, Literal or
Bnodes are just blindly inserted. The result is the database throwing
duplicate key errors if they already exist. I originally thought that
this was performance affecting as well as noisy, but the first patch
only improves performance marginally (about half a percent time-wise
according to my tests). I still think this patch is a good idea though,
since semi-purposefully causing database errors strikes me as wrong.

The second patch improves performance markedly, about 750%, by adding
indexes on the Statements table.

Patches are against Redland 1.0.7.

Cheers,
-w

[Performance measurements are based on adding 500 statements to a
database containing ~120000 statements already and ~5000 resources
and ~6000 literals. YMMV]
-------------- next part --------------
--- redland-1.0.7/librdf/rdf_storage_postgresql.c	2007-12-20 22:39:42.000000000 +0100
+++ redland-1.0.7-select/librdf/rdf_storage_postgresql.c	2008-05-15 10:28:19.000000000 +0200
@@ -106,6 +106,7 @@
                                                  librdf_statement* statement);
 static int librdf_storage_postgresql_contains_statement(librdf_storage* storage,
                                                    librdf_statement* statement);
+static int librdf_storage_postgresql_contains_value(librdf_storage* storage, char *table, u64 hash);
 
 
 librdf_stream* librdf_storage_postgresql_serialise(librdf_storage* storage);
@@ -865,6 +866,12 @@
       char create_resource[]="INSERT INTO Resources (ID,URI) VALUES (" UINT64_T_FMT ",'%s')";
       /* Escape URI for db query */
       char *escaped_uri;
+     
+      if(librdf_storage_postgresql_contains_value(storage, "Resources", hash)) {
+        librdf_storage_postgresql_release_handle(storage, handle);
+        return hash;
+      } 
+
       if(!(escaped_uri=(char*)LIBRDF_MALLOC(cstring, nodelen*2+1))) {
         librdf_storage_postgresql_release_handle(storage, handle);
         return 0;
@@ -928,6 +935,12 @@
       char create_literal[]="INSERT INTO Literals (ID,Value,Language,Datatype) VALUES (" UINT64_T_FMT ",'%s','%s','%s')";
       /* Escape value, lang and datatype for db query */
       char *escaped_value, *escaped_lang, *escaped_datatype;
+     
+      if(librdf_storage_postgresql_contains_value(storage, "Literals", hash)) {
+        librdf_storage_postgresql_release_handle(storage, handle);
+        return hash;
+      } 
+
       if(!(escaped_value=(char*)LIBRDF_MALLOC(cstring, valuelen*2+1)) ||
           !(escaped_lang=(char*)LIBRDF_MALLOC(cstring, langlen*2+1)) ||
           !(escaped_datatype=(char*)LIBRDF_MALLOC(cstring, datatypelen*2+1))) {
@@ -981,6 +994,12 @@
       char create_bnode[]="INSERT INTO Bnodes (ID,Name) VALUES (" UINT64_T_FMT ",'%s')";
       /* Escape name for db query */
       char *escaped_name;
+     
+      if(librdf_storage_postgresql_contains_value(storage, "Bnodes", hash)) {
+        librdf_storage_postgresql_release_handle(storage, handle);
+        return hash;
+      } 
+
       if(!(escaped_name=(char*)LIBRDF_MALLOC(cstring, nodelen*2+1))) {
         librdf_storage_postgresql_release_handle(storage, handle);
         return 0;
@@ -1183,6 +1202,57 @@
 
 
 /**
+ * librdf_storage_postgresql_contains_value - Test if a given value exists in the table.
+ * used to prevent bouncing postgres errors...
+ *
+ * @table: the table to check
+ * @hash: the hash of the value
+ *
+ * Return value: Non-zero if the model contains the statement.
+ **/
+static int
+librdf_storage_postgresql_contains_value(librdf_storage* storage, char *table, u64 hash)
+{
+  librdf_storage_postgresql_context* context=(librdf_storage_postgresql_context*)storage->context;
+  char find_value[]="SELECT 1 FROM %s WHERE ID=" UINT64_T_FMT " LIMIT 1;";
+  char *query;
+  PGresult *res;
+  PGconn *handle;
+
+  /* Get postgresql connection handle */
+  handle=librdf_storage_postgresql_get_handle(storage);
+  if(!handle)
+    return 0;
+
+  /* Check for statement */
+  if(!(query=(char*)LIBRDF_MALLOC(cstring, strlen(find_value) + strlen(table) + 81))) {
+    librdf_storage_postgresql_release_handle(storage, handle);
+    return 0;
+  }
+  sprintf(query, find_value, table, hash);
+  if(!(res=PQexec(handle, query)) ) {
+    librdf_log(storage->world, 0, LIBRDF_LOG_ERROR, LIBRDF_FROM_STORAGE, NULL,
+               "postgresql query for statement failed: %s",
+               PQerrorMessage(handle));
+    LIBRDF_FREE(cstring,query);
+    librdf_storage_postgresql_release_handle(storage, handle);
+    return 0;
+  }
+  LIBRDF_FREE(cstring, query);
+  if(!(PQntuples(res))) {
+    if(res)
+      PQclear(res);
+    librdf_storage_postgresql_release_handle(storage, handle);
+    return 0;
+  }
+  if(res)
+    PQclear(res);
+  librdf_storage_postgresql_release_handle(storage, handle);
+
+  return 1;
+}
+
+/**
  * librdf_storage_postgresql_contains_statement - Test if a given complete statement is present in the model
  * @storage: the storage
  * @statement: a complete statement
-------------- next part --------------
--- redland-1.0.7/librdf/rdf_storage_postgresql.c	2007-12-20 22:39:42.000000000 +0100
+++ redland-1.0.7-index/librdf/rdf_storage_postgresql.c	2008-05-15 10:58:50.000000000 +0200
@@ -106,6 +106,7 @@
                                                  librdf_statement* statement);
 static int librdf_storage_postgresql_contains_statement(librdf_storage* storage,
                                                    librdf_statement* statement);
 
 librdf_stream* librdf_storage_postgresql_serialise(librdf_storage* storage);
@@ -422,8 +423,14 @@
   Subject numeric(20) NOT NULL,\
   Predicate numeric(20) NOT NULL,\
   Object numeric(20) NOT NULL,\
-  Context numeric(20) NOT NULL\
-) ";
+  Context numeric(20) NOT NULL );\n\
+  CREATE INDEX Statements" UINT64_T_FMT "_Subject_idx\
+    ON Statements" UINT64_T_FMT "(Subject);\n\
+  CREATE INDEX Statements" UINT64_T_FMT "_Predicate_idx\
+    ON Statements" UINT64_T_FMT "(Predicate);\n\
+  CREATE INDEX Statements" UINT64_T_FMT "_Object_idx\
+    ON Statements" UINT64_T_FMT "(Object);";
+
   const char create_table_literals[]="\
   CREATE TABLE Literals (\
   ID numeric(20) NOT NULL,\
@@ -506,12 +513,14 @@
   /* Create tables, if new and not existing */
   if(!status && (librdf_hash_get_as_boolean(options, "new")>0)) 
   {
-    query=(char*)LIBRDF_MALLOC(cstring, strlen(create_table_statements)+20);
+    query=(char*)LIBRDF_MALLOC(cstring, strlen(create_table_statements)+100);
     if(! query)  
       status=1;
     else 
      {
-      sprintf(query, create_table_statements, context->model);
+      sprintf(query, create_table_statements, context->model,
+              context->model, context->model, context->model,
+	      context->model, context->model, context->model);
       if(! PQexec(handle, query) ||
          ! PQexec(handle, create_table_literals) ||
          ! PQexec(handle, create_table_resources) ||


More information about the redland-dev mailing list