[elephant-devel] revised UTF serializer/deserializer patch

Sat Aug 8 02:15:46 UTC 2009

This patch does the following:

o Big-endian machines are probably not affected by this patch, but I
  could not verify that because I do not have a big-endian machine.
 
o On little-endian machines:
  + UTF strings are serialized as UTF-16LE or UTF-32LE with a BOM.
  + The deserializers test for the presence of the BOM and choose whether
    to deserialize as big endian or little endian.
  + The comparators in libberkeley-db also test for the BOM, and
    create a temporary buffer when the string was serialized as big
    endian.

o Old store images are preserved.
o The sort order is corrected when an old store is migrated to a new store.

I did not test any other backing store.

-------------- next part --------------
diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c new-elephant/src/db-bdb/libberkeley-db.c

--- old-elephant/src/db-bdb/libberkeley-db.c	2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/db-bdb/libberkeley-db.c	2009-08-08 10:51:25.000000000 +0900
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
+#include <stdlib.h>
 
 /* Some utility stuff used to be here but has been placed in
    libmemutil.c  */
@@ -920,7 +921,7 @@
   case S1_UCS4_SYMBOL:
   case S1_UCS4_STRING:
   case S1_UCS4_PATHNAME:
-    return wcs_cmp((wchar_t*)ad+9, read_int(ad, 5), (wchar_t*)bd+9, read_int(bd, 5)); 
+  return wcs_cmp((wchar_t*)(ad+9), read_int(ad, 5), (wchar_t*)(bd+9), read_int(bd, 5)); 
   default:
     return lex_cmp(ad+5, (a->size)-5, bd+5, (b->size)-5);
   }
@@ -1130,7 +1131,7 @@
     /*****
     printf("Doing a 32-bit compare\n");
     *****/
-    return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1), (wchar_t*)bd+5+offset, read_int32(bd+offset, 1)); 
+    return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset, 1), (wchar_t*)(bd+5+offset), read_int32(bd+offset, 1)); 
   default:
     /*****
     printf("Doing a lex compare\n");
@@ -1306,6 +1307,18 @@
 #define strncasecmp _strnicmp
 typedef unsigned short uint16_t;
 #endif
+#define ENDIAN_BIG 0
+#define ENDIAN_LITTLE 1
+
+/* Report the host byte order as ENDIAN_BIG or ENDIAN_LITTLE. */
+int machine_endian()
+{
+	/* Store a known pattern and look at its lowest-addressed byte. */
+	const uint32_t probe = 0x01020304;
+	const uint8_t *first = (const uint8_t *)&probe;
+	/* Seeing the most-significant byte (0x01) first means big endian. */
+	return (*first == 0x01) ? ENDIAN_BIG : ENDIAN_LITTLE;
+}
 
 int case_cmp(const unsigned char *a, int32_t length1, const unsigned char *b, int32_t length2) {
   int min, sizediff, diff;
@@ -1316,12 +1329,72 @@
   return diff;
 }
 
+/* Decode the 4 bytes at c as one little-endian unit; bytes are widened before shifting to avoid signed-shift UB. */
+wchar_t utf32_char(const wchar_t *c) {
+	const uint8_t *cp = (const uint8_t *)c;
+	return (wchar_t)(((uint32_t)cp[3] << 24) | ((uint32_t)cp[2] << 16) | ((uint32_t)cp[1] << 8) | cp[0]);
+}
+
+/* Return a malloc'd copy of str (length 4-byte units) with the bytes of each unit reversed, or NULL; caller frees. */
+wchar_t *swap32_string(const wchar_t *str, int32_t length) {
+	size_t i, units = length > 0 ? (size_t)length : 0;
+	wchar_t *swap_buff = malloc(4 * units);
+	for (i = 0; swap_buff && i < units; ++i) {	/* skipped entirely if malloc failed */
+		const uint8_t *sp = (const uint8_t *)&str[i];	/* source: read only */
+		uint8_t *dp = (uint8_t *)&swap_buff[i];		/* destination: the new copy */
+		dp[0] = sp[3];
+		dp[1] = sp[2];
+		dp[2] = sp[1];
+		dp[3] = sp[0];
+	}
+	return swap_buff;
+}
+
+#if 0	/* debugging aid, compiled out */
+void dump_string(int size, uint8_t *str, int32_t length, char *prefix)	/* hex-dump length * size bytes of str after "prefix: " */
+{
+	int i;
+	printf("%s: ", prefix);
+	for (i = 0; i < length * size; i += 2)	/* two bytes per printed group */
+		printf("%02x%02x ", str[i], str[i + 1]);
+	printf("\n");
+}
+#endif
+
 int wcs_cmp(const wchar_t *a, int32_t length1, 
 	    const wchar_t *b, int32_t length2) {	/* orders two length-counted wchar_t strings; ties go to the shorter */
   int min, sizediff, diff;
+  wchar_t *swap_a = NULL, *swap_b = NULL;	/* heap copies made below, freed before returning */
+
+#if 0
+  dump_string(4, a, length1, "A");
+  dump_string(4, b, length2, "B");
+#endif
+  if (machine_endian() == ENDIAN_LITTLE) {	/* marker handling happens on little-endian hosts only */
+	  if (utf32_char(a) != 0xfffe) {/* BIG-ENDIAN */
+		  swap_a = swap32_string(a, length1);
+		  if (swap_a)	/* switch to the swapped copy only if one was returned */
+			  a = swap_a;
+	  } else {		/* LITTLE-ENDIAN */
+		  ++a;		/* step over the leading 0xfffe marker */
+		  --length1;
+	  }
+	  if (utf32_char(b) != 0xfffe) {/* BIG-ENDIAN */
+		  swap_b = swap32_string(b, length2);
+		  if (swap_b)	/* switch to the swapped copy only if one was returned */
+			  b = swap_b;
+	  } else {		/* LITTLE-ENDIAN */
+		  ++b;		/* step over the leading 0xfffe marker */
+		  --length2;
+	  }
+  }
   sizediff = length1 - length2;
   min = sizediff > 0 ? length2 : length1;	/* the shorter of the two lengths */
-  diff = wcsncmp(a, b, min /4);
+  diff = wcsncmp(a, b, min);	/* min is passed as a count of wide characters */
+  if (swap_a)
+	  free(swap_a);
+  if (swap_b)
+	  free(swap_b);
   if (diff == 0) return sizediff;	/* equal common prefix: order by length */
   return diff;
 }
@@ -1351,6 +1424,22 @@
 #define UTF_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
 #define UTF_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
 
+/* Read the two bytes at str as one little-endian 16-bit unit. */
+uint16_t utf16_char(const uint8_t *str) {
+	return (uint16_t)(str[0] | (str[1] << 8));
+}
+
+/* Return a malloc'd copy of src (length 2-byte units) with the bytes of each unit exchanged, or NULL; caller frees. */
+uint8_t *swap16_string(const uint8_t *src, int32_t length) {
+	size_t i, units = length > 0 ? (size_t)length : 0;
+	uint8_t *swap_buff = malloc(2 * units);
+	for (i = 0; swap_buff && i < units; ++i) {	/* skipped entirely if malloc failed */
+		swap_buff[2 * i] = src[2 * i + 1];
+		swap_buff[2 * i + 1] = src[2 * i];	/* take the first byte here; using the second again would not swap */
+	}
+	return swap_buff;
+}
+
 /* compare UTF-16 strings */
 /* memcmp/UnicodeString style, both length-specified */
 /* don't assume byte-aligned! */
@@ -1359,7 +1448,29 @@
   const unsigned char *start1, *start2, *limit1, *limit2;
   UChar c1, c2;
   int32_t lengthResult;
-
+  uint8_t *swap_s1 = NULL, *swap_s2 = NULL;
+#if 0
+  dump_string(2, s1, length1, "S1");
+  dump_string(2, s2, length2, "S2");
+#endif
+  if (machine_endian() == ENDIAN_LITTLE) {
+	  if (utf16_char(s1) != 0xfffe) {/* BIG-ENDIAN */
+		  swap_s1 = swap16_string(s1, length1);
+		  if (swap_s1)
+			  s1 = swap_s1;
+	  } else {		/* LITTLE-ENDIAN */
+		  s1 += 2;
+		  length1 -= 1;
+	  }
+	  if (utf16_char(s2) != 0xfffe) {/* BIG-ENDIAN */
+		  swap_s2 = swap16_string(s2, length2);
+		  if (swap_s2)
+			  s2 = swap_s2;
+	  } else {		/* LITTLE-ENDIAN */
+		  s2 += 2;
+		  length2 -= 1;
+	  }
+  }
   if(length1<length2) {
     lengthResult=-1;
     limit1=s1+2*length1;
@@ -1415,6 +1526,10 @@
       }*/
   }
 
+  if (swap_s1)
+	  free(swap_s1);
+  if (swap_s2)
+	  free(swap_s2);
   return (int32_t)c1-(int32_t)c2;
 }
 
diff -rN -u old-elephant/src/elephant/unicode.lisp new-elephant/src/elephant/unicode.lisp
--- old-elephant/src/elephant/unicode.lisp	2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/elephant/unicode.lisp	2009-08-08 10:51:25.000000000 +0900
@@ -41,7 +41,7 @@
 
 ;; #+allegro
 ;; (defun serialize-string (string bstream)
-;;   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
+;;   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
 ;; 					(size buffer-stream-size)
 ;; 					(allocated buffer-stream-length))
 ;;       bstream
@@ -59,20 +59,20 @@
   (declare (type buffer-stream bstream)
 	   (type string string))
   (cond ((and (not (equal "" string)) (> (char-code (char string 0)) #xFFFF))
-	 (serialize-to-utf32le string bstream))
+	 (serialize-to-utf32 string bstream))
 	;; Accelerate the common case where a character set is not Latin-1
 	((and (not (equal "" string)) (> (char-code (char string 0)) #xFF))
-	 (or (serialize-to-utf16le string bstream)
-	     (serialize-to-utf32le string bstream)))
+	 (or (serialize-to-utf16 string bstream)
+	     (serialize-to-utf32 string bstream)))
 	;; Actually code pages > 0 are rare; so we can pay an extra cost
 	(t (or (serialize-to-utf8 string bstream)
-	       (serialize-to-utf16le string bstream)
-	       (serialize-to-utf32le string bstream)))))
+	       (serialize-to-utf16 string bstream)
+	       (serialize-to-utf32 string bstream)))))
 
 (defun serialize-to-utf8 (string bstream)
   "Standard serialization"
   (declare (type buffer-stream bstream)
- 	   (type string string))
+ 	   (type simple-string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
 					(size buffer-stream-size)
 					(allocated buffer-stream-length))
@@ -117,73 +117,105 @@
 	    (setf (buffer-stream-size bstream) needed)
 	    (succeed))))))
 
-(defun serialize-to-utf16le (string bstream)
-  "Serialize to utf16le compliant format unless contains code pages > 0"
+(defvar *machine-endian*		; 'endian-little, 'endian-big or 'unknown, computed by the form below
+  (let* ((bstream (make-buffer-stream))
+	 (buffer (buffer-stream-buffer bstream))
+	 (size (buffer-stream-size bstream)))	; size as it is before the write below
+    (buffer-write-int32 #x01020304 bstream)	; known pattern whose byte layout is inspected
+    (let ((byte-image
+	   (loop for i from 0 to 3		; the four bytes starting at offset SIZE
+		 collect (uffi:deref-array buffer '(:array :unsigned-char)
+						   (the fixnum (+ size i))))))
+      (cond ((equal byte-image '(4 3 2 1)) 'endian-little)	; low-order byte stored first
+	    ((equal byte-image '(1 2 3 4)) 'endian-big)	; high-order byte stored first
+	    (t 'unknown)))))
+
+(defun machine-endian ()		; returns the value of *machine-endian*
+  *machine-endian*)
+
+(defun write-utf-char-to-buffer (char char-index char-size buffer base endian)
+  "Store the CHAR-SIZE low-order bytes of CHAR in BUFFER at BASE + CHAR-INDEX * CHAR-SIZE, ordered per ENDIAN."
+  (declare (type (signed-byte 31) char-index) (type (integer 1 4) char-size))
+  (dotimes (byte-index char-size)
+    (let ((slot (the (integer 0 3)	; low-order byte goes first only for 'endian-little
+		     (if (eq endian 'endian-little)
+			 byte-index
+			 (- char-size 1 byte-index)))))
+      (setf (uffi:deref-array buffer '(:array :unsigned-char)
+			      (+ (* char-index char-size) base slot))
+	    (ldb (byte 8 (* 8 byte-index)) char)))))
+  
+(defun serialize-to-utf16 (string bstream)
+  "Serialize to utf16 compliant format unless contains code pages > 0"
   (declare (type buffer-stream bstream)
  	   (type string string))
+  (progn
+    (format *debug-io* "LSIP-ENTER: ")
+    (loop for i from 0 below (length string)
+	  do (format *debug-io* "~4,'0X " (char-code (char string i))))
+    (format *debug-io* "~%"))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
 					(size buffer-stream-size)
 					(allocated buffer-stream-length))
       bstream
       (let* ((saved-size (buffer-stream-size bstream))
 	     (saved-pos (elephant-memutil::buffer-stream-position bstream))
-	     (characters (length string)))
+	     (characters (length string))
+	     (endian (machine-endian))
+	     (bom-length (if (eq endian 'endian-big) 0 1)))
 	(labels ((fail () 
 		   (setf (buffer-stream-size bstream) saved-size)
 		   (setf (elephant-memutil::buffer-stream-position bstream) saved-pos)
-		   (return-from serialize-to-utf16le nil))
+		   (return-from serialize-to-utf16 nil))
 		 (succeed ()
-		   (return-from serialize-to-utf16le t)))
+		   (return-from serialize-to-utf16 t)))
 	  (buffer-write-byte +utf16-string+ bstream)
-	  (buffer-write-int32 characters bstream)
-	  (let ((needed (+ size (* characters 2)))
-                (char (etypecase string
+	  (buffer-write-int32  (+ characters bom-length) bstream)
+	  (let ((needed (+ size (* (+ characters bom-length) 2)))
+		  (char (etypecase string
                         (simple-string #'schar)
                         (string #'char))))
             (when (> needed allocated)
               (resize-buffer-stream bstream needed))
-            (loop for i fixnum from 0 below characters do
-                  (let ((code (char-code (funcall char string i))))
-                    (when (> code #xFFFF) (fail))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size))
-                          ;;			  (coerce (ldb (byte 8 8) code) '(signed 8)))
-                          (ldb (byte 8 8) code))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 1))
-                          ;;			  (coerce (ldb (byte 8 0) code) '(signed 8))))))
-                          (ldb (byte 8 0) code))))
+	    (when (eq endian 'endian-little)
+	      (write-utf-char-to-buffer #xfffe 0 2 buffer size endian)
+	      (incf size 2))
+            (loop for i fixnum from 0 below characters
+		  do (let ((code (char-code (funcall char string i))))
+		       (when (> code #xFFFF) (fail))
+		       (write-utf-char-to-buffer code i 2 buffer size endian)))
             (incf size (* characters 2))
             (succeed))))))
 
-(defun serialize-to-utf32le (string bstream)
+(defun serialize-to-utf32 (string bstream)
   "Serialize to utf32 compliant format unless contains code pages > 0"
-   (declare (type buffer-stream bstream)
-	    (type string string))
+  (declare (type buffer-stream bstream)
+	   (type string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
 					(size buffer-stream-size)
 					(allocated buffer-stream-length))
       bstream
-      (let* ((characters (length string)))
-	  (buffer-write-byte +utf32-string+ bstream)
-	  (buffer-write-int32 characters bstream)
-	  (let ((needed (+ size (* 4 characters)))
-                (char (etypecase string
-                        (simple-string #'schar)
-                        (string #'char))))
-	    (when (> needed allocated)
-	      (resize-buffer-stream bstream needed))
-	     (loop for i fixnum from 0 below characters do
-		  (let ((code (char-code (funcall char string i))))
-		    (when (> code #x10FFFF) (error "Invalid unicode code type"))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 0))
-			  (ldb (byte 8 24) code))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 1))
-			  (ldb (byte 8 16) code))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 2))
-			  (ldb (byte 8 8) code))
-		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 3))
-			  (ldb (byte 8 0) code)))))
+      (let* ((characters (length string))
+	     (endian (machine-endian))
+	     (bom-length (if (eq endian 'endian-big) 0 1)))
+	(buffer-write-byte +utf32-string+ bstream)
+	(buffer-write-int32 (+ characters bom-length) bstream)
+	(let ((needed (+ size (* 4 (+ characters bom-length))))
+	      (char (etypecase string
+		      (simple-string #'schar)
+		      (string #'char))))
+	  (when (> needed allocated)
+	    (resize-buffer-stream bstream needed))
+	  (when (eq endian 'endian-little)
+	    (write-utf-char-to-buffer #xfffe 0 4 buffer size endian)
+	    (incf size 4))
+	  (loop for i fixnum from 0 below characters
+		do (let ((code (char-code (funcall char string i))))
+		     (when (> code #x10FFFF)
+		       (error "Invalid unicode code type"))
+		     (write-utf-char-to-buffer code i 4 buffer size endian)))
 	  (incf size (* characters 4))
-	  t)))
+	  t))))
 
 ;;
 ;; Deserialization of Strings 
@@ -260,50 +292,67 @@
 						    (+ pos i)))))))
 	string))))
 
+(defun read-utf-char-from-buffer (char-index char-size buffer position endian)
+  "Return the CHAR-SIZE-byte unit number CHAR-INDEX counted from byte POSITION in BUFFER, decoded per ENDIAN."
+  (declare (type (integer 1 4) char-size)
+	   (type (signed-byte 31) char-index) (type fixnum position))
+  (let ((code 0))
+    (macrolet ((next-byte (offset)
+		 ;; Stride by CHAR-SIZE rather than a fixed 2 so that 4-byte units are addressed correctly.
+		 `(uffi:deref-array buffer '(:array :unsigned-byte)
+				    (+ (* char-index char-size) position ,offset))))
+      (loop for i from 0 below char-size
+	    do (setf code (dpb (next-byte (if (eq endian 'endian-little)
+					      i (- char-size i 1)))
+			       (byte 8 (* i 8)) code)))
+      code)))
+
 (defmethod deserialize-string ((type (eql :utf16le)) bstream &optional temp-string)
   "All returned strings are simple-strings for, uh, simplicity"
   (declare (type buffer-stream bstream))
   (let* ((length (buffer-read-int32 bstream))
 	 (string (or temp-string (make-string length :element-type 'character)))
 	 (pos (elephant-memutil::buffer-stream-position bstream))
-	 (code 0))
-    (macrolet ((next-byte (offset)
-		 `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 2) pos ,offset))))
-      (declare (type simple-string string)
-	       (type fixnum length pos code))
-      (assert (subtypep (type-of string) 'simple-string))
-      (assert (compatible-unicode-support-p :utf16le))
-      (loop for i fixnum from 0 below length do
-	   (setf code (dpb (next-byte 0) (byte 8 8) 0))
-	   (setf code (dpb (next-byte 1) (byte 8 0) code))
-	   (setf (schar string i) (code-char code)))
-      (incf (elephant-memutil::buffer-stream-position bstream)
-	    (* length 2)))
-    (the simple-string string)))
+	 (code 0) (endian 'endian-big))
+    (declare (type simple-string string)
+	     (type fixnum length pos code))
+    (assert (subtypep (type-of string) 'simple-string))
+    (assert (compatible-unicode-support-p :utf16le))
+    (when (= (read-utf-char-from-buffer 0 2 (buffer-stream-buffer bstream)
+					pos (machine-endian)) #xfffe)
+      (setf endian 'endian-little)
+      (decf length)
+      (incf pos 2)
+      (incf (elephant-memutil::buffer-stream-position bstream) 2))
+    (loop for i fixnum from 0 below length
+	  do (setf code
+		   (read-utf-char-from-buffer i 2 (buffer-stream-buffer bstream)
+			pos endian))
+	     (setf (schar string i) (code-char code)))
+    (incf (elephant-memutil::buffer-stream-position bstream)
+	  (* length 2))
+    (the simple-string (subseq string 0 length))))
 
 (defmethod deserialize-string ((type (eql :utf32le)) bstream  &optional temp-string)
   (declare (type buffer-stream bstream))
-  (macrolet ((next-byte (offset)
-	       `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 4) pos ,offset))))
   (let* ((length (buffer-read-int32 bstream))
 	 (string (or temp-string (make-string length :element-type 'character)))
 	 (pos (elephant-memutil::buffer-stream-position bstream))
-	 (code 0))
+	 (code 0) (endian 'endian-big))
     (declare (type string string)
 	     (type fixnum length pos code))
     (assert (subtypep (type-of string) 'simple-string))
     (assert (compatible-unicode-support-p :utf32le))
+    (when (= (read-utf-char-from-buffer 0 4 (buffer-stream-buffer bstream)
+					pos (machine-endian)) #xfffe)
+	(setf endian 'endian-little)
+	(decf length)
+	(incf pos 4)
+	(incf (elephant-memutil::buffer-stream-position bstream) 4))
     (loop for i fixnum from 0 below length do
-	 (setf code (dpb (next-byte 0) (byte 8 24) 0))
-	 (setf code (dpb (next-byte 1) (byte 8 16) code))
-	 (setf code (dpb (next-byte 2) (byte 8 8) code))
-	 (setf code (dpb (next-byte 3) (byte 8 0) code))
-	 (setf (char string i) (code-char code)))
+      (setf code (read-utf-char-from-buffer i 4 (buffer-stream-buffer bstream)
+		      pos endian))
+      (setf (char string i) (code-char code)))
     (incf (elephant-memutil::buffer-stream-position bstream)
 	  (* length 4))
-    (the simple-string string))))
-
-
-  
-  
-
+    (the simple-string (subseq string 0 length))))