[elephant-devel] UTF serializer/deserializer patch

Hiroyuki Komatsu kom at narihara-lab.jp
Mon Aug 3 20:14:28 UTC 2009


Sorry, I'm not fluent in English.

The BDB btree stores UTF-16/UTF-32 strings in an incorrect sort order.

There are two problems in the string serializer:
  The UTF serializers serialize in big-endian byte order
  The UTF-32 comparator in libberkeley-db.c does not work correctly

The attached patch fixes these problems.
-------------- next part --------------
diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c new-elephant/src/db-bdb/libberkeley-db.c
--- old-elephant/src/db-bdb/libberkeley-db.c	2009-08-04 04:34:01.000000000 +0900
+++ new-elephant/src/db-bdb/libberkeley-db.c	2009-08-04 04:34:01.000000000 +0900
@@ -1122,7 +1122,7 @@
     /*****
     printf("Doing a 32-bit compare\n");
     *****/
-    return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1), (wchar_t*)bd+5+offset, read_int32(bd+offset, 1)); 
+   return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset, 1), (wchar_t*)(bd+5+offset), read_int32(bd+offset, 1)); 
   default:
     /*****
     printf("Doing a lex compare\n");
@@ -1313,7 +1313,7 @@
   int min, sizediff, diff;
   sizediff = length1 - length2;
   min = sizediff > 0 ? length2 : length1;
-  diff = wcsncmp(a, b, min /4);
+  diff = wcsncmp(a, b, min);
   if (diff == 0) return sizediff;
   return diff;
 }
diff -rN -u old-elephant/src/elephant/unicode.lisp new-elephant/src/elephant/unicode.lisp
--- old-elephant/src/elephant/unicode.lisp	2009-08-04 04:34:01.000000000 +0900
+++ new-elephant/src/elephant/unicode.lisp	2009-08-04 04:34:01.000000000 +0900
@@ -145,10 +145,10 @@
             (loop for i fixnum from 0 below characters do
                   (let ((code (char-code (funcall char string i))))
                     (when (> code #xFFFF) (fail))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size))
+                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 1))
                           ;;			  (coerce (ldb (byte 8 8) code) '(signed 8)))
                           (ldb (byte 8 8) code))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 1))
+                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 0))
                           ;;			  (coerce (ldb (byte 8 0) code) '(signed 8))))))
                           (ldb (byte 8 0) code))))
             (incf size (* characters 2))
@@ -174,13 +174,13 @@
 	     (loop for i fixnum from 0 below characters do
 		  (let ((code (char-code (funcall char string i))))
 		    (when (> code #x10FFFF) (error "Invalid unicode code type"))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 0))
+		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 3))
 			  (ldb (byte 8 24) code))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 1))
-			  (ldb (byte 8 16) code))
 		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 2))
+			  (ldb (byte 8 16) code))
+		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 1))
 			  (ldb (byte 8 8) code))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 3))
+		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 0))
 			  (ldb (byte 8 0) code)))))
 	  (incf size (* characters 4))
 	  t)))
@@ -274,8 +274,8 @@
       (assert (subtypep (type-of string) 'simple-string))
       (assert (compatible-unicode-support-p :utf16le))
       (loop for i fixnum from 0 below length do
-	   (setf code (dpb (next-byte 0) (byte 8 8) 0))
-	   (setf code (dpb (next-byte 1) (byte 8 0) code))
+	   (setf code (dpb (next-byte 1) (byte 8 8) 0))
+	   (setf code (dpb (next-byte 0) (byte 8 0) code))
 	   (setf (schar string i) (code-char code)))
       (incf (elephant-memutil::buffer-stream-position bstream)
 	    (* length 2)))
@@ -294,10 +294,10 @@
     (assert (subtypep (type-of string) 'simple-string))
     (assert (compatible-unicode-support-p :utf32le))
     (loop for i fixnum from 0 below length do
-	 (setf code (dpb (next-byte 0) (byte 8 24) 0))
-	 (setf code (dpb (next-byte 1) (byte 8 16) code))
-	 (setf code (dpb (next-byte 2) (byte 8 8) code))
-	 (setf code (dpb (next-byte 3) (byte 8 0) code))
+	 (setf code (dpb (next-byte 3) (byte 8 24) 0))
+	 (setf code (dpb (next-byte 2) (byte 8 16) code))
+	 (setf code (dpb (next-byte 1) (byte 8 8) code))
+	 (setf code (dpb (next-byte 0) (byte 8 0) code))
 	 (setf (char string i) (code-char code)))
     (incf (elephant-memutil::buffer-stream-position bstream)
 	  (* length 4))



More information about the elephant-devel mailing list