[elephant-devel] revised UTF serializer/deserializer patch

Sun Aug 16 03:50:46 UTC 2009

Thank you, this looks great.  I'll review it and promote it in the  
next week or so unless Leslie beats me to it.

Thank you,
Ian

On Aug 7, 2009, at 7:15 PM, Hiroyuki Komatsu wrote:

> This patch does the following:
>
> o Big-endian machines are probably not affected by this
>  patch, but I do not have a big-endian machine to test on.
>
> o On little-endian machines:
>  + UTF strings are serialized as UTF-16LE or UTF-32LE with a BOM.
>  + The deserializers test for the existence of a BOM and choose
>    whether to deserialize as big endian or little endian.
>  + The comparators in libberkeley-db also test for the BOM, and
>    create a temporary buffer when the string was serialized as big
>    endian.
>
> o The old store image is preserved.
> o The sort order is corrected when migrating an old store to a new store.
>
> I did not test any other backing store.
>
> diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c new-elephant/ 
> src/db-bdb/libberkeley-db.c
> --- old-elephant/src/db-bdb/libberkeley-db.c	2009-08-08  
> 10:51:25.000000000 +0900
> +++ new-elephant/src/db-bdb/libberkeley-db.c	2009-08-08  
> 10:51:25.000000000 +0900
> @@ -25,6 +25,7 @@
> #include <stdio.h>
> #include <string.h>
> #include <wchar.h>
> +#include <stdlib.h>
>
> /* Some utility stuff used to be here but has been placed in
>    libmemutil.c  */
> @@ -920,7 +921,7 @@
>   case S1_UCS4_SYMBOL:
>   case S1_UCS4_STRING:
>   case S1_UCS4_PATHNAME:
> -    return wcs_cmp((wchar_t*)ad+9, read_int(ad, 5), (wchar_t*)bd+9,  
> read_int(bd, 5));
> +  return wcs_cmp((wchar_t*)(ad+9), read_int(ad, 5), (wchar_t*)(bd 
> +9), read_int(bd, 5));
>   default:
>     return lex_cmp(ad+5, (a->size)-5, bd+5, (b->size)-5);
>   }
> @@ -1130,7 +1131,7 @@
>     /*****
>     printf("Doing a 32-bit compare\n");
>     *****/
> -    return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1),  
> (wchar_t*)bd+5+offset, read_int32(bd+offset, 1));
> +    return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset,  
> 1), (wchar_t*)(bd+5+offset), read_int32(bd+offset, 1));
>   default:
>     /*****
>     printf("Doing a lex compare\n");
> @@ -1306,6 +1307,18 @@
> #define strncasecmp _strnicmp
> typedef unsigned short uint16_t;
> #endif
> +#define ENDIAN_BIG 0
> +#define ENDIAN_LITTLE 1
> +
> +int machine_endian()
> +{
> +	uint32_t x = 0x01020304;
> +	uint8_t *xp = (uint8_t *)&x;
> +	if (*xp == 0x01)
> +		return ENDIAN_BIG;
> +	else
> +		return ENDIAN_LITTLE;
> +}
>
> int case_cmp(const unsigned char *a, int32_t length1, const unsigned  
> char *b, int32_t length2) {
>   int min, sizediff, diff;
> @@ -1316,12 +1329,72 @@
>   return diff;
> }
>
> +wchar_t utf32_char(const wchar_t *c)
> +{
> +	uint8_t *cp = (uint8_t *)c;
> +	return (cp[3] << 24) | (cp[2] << 16) | (cp[1] << 8) | cp[0];
> +}
> +
> +wchar_t *swap32_string(const wchar_t *str, int32_t length)
> +{
> +	int i;
> +	wchar_t *swap_buff = malloc(4 * length);
> +	for (i = 0; i < length; ++i) {
> +		uint8_t *sp = (uint8_t *)&str[i],
> +			*dp = (uint8_t *)&swap_buff[i];
> +		sp[0] = dp[3];
> +		sp[1] = dp[2];
> +		sp[2] = dp[1];
> +		sp[3] = dp[0];
> +	}
> +	return swap_buff;
> +}
> +
> +#if 0
> +void dump_string(int size, uint8_t *str, int32_t length, char  
> *prefix)
> +{
> +	int i;
> +	printf("%s: ", prefix);
> +	for (i = 0; i < length * size; i += 2)
> +		printf("%02x%02x ", str[i], str[i + 1]);
> +	printf("\n");
> +}
> +#endif
> +
> int wcs_cmp(const wchar_t *a, int32_t length1,
> 	    const wchar_t *b, int32_t length2) {
>   int min, sizediff, diff;
> +  wchar_t *swap_a = NULL, *swap_b = NULL;
> +
> +#if 0
> +  dump_string(4, a, length1, "A");
> +  dump_string(4, b, length2, "B");
> +#endif
> +  if (machine_endian() == ENDIAN_LITTLE) {
> +	  if (utf32_char(a) != 0xfffe) {/* BIG-ENDIAN */
> +		  swap_a = swap32_string(a, length1);
> +		  if (swap_a)
> +			  a = swap_a;
> +	  } else {		/* LITTLE-ENDIAN */
> +		  ++a;
> +		  --length1;
> +	  }
> +	  if (utf32_char(b) != 0xfffe) {/* BIG-ENDIAN */
> +		  swap_b = swap32_string(b, length2);
> +		  if (swap_b)
> +			  b = swap_b;
> +	  } else {		/* LITTLE-ENDIAN */
> +		  ++b;
> +		  --length2;
> +	  }
> +  }
>   sizediff = length1 - length2;
>   min = sizediff > 0 ? length2 : length1;
> -  diff = wcsncmp(a, b, min /4);
> +  diff = wcsncmp(a, b, min);
> +  if (swap_a)
> +	  free(swap_a);
> +  if (swap_b)
> +	  free(swap_b);
>   if (diff == 0) return sizediff;
>   return diff;
> }
> @@ -1351,6 +1424,22 @@
> #define UTF_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
> #define UTF_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
>
> +uint16_t utf16_char(const uint8_t *str)
> +{
> +	return (str[1] << 8) | str[0];
> +}
> +
> +uint8_t *swap16_string(const uint8_t *src, int32_t length)
> +{
> +	int i;
> +	uint8_t *swap_buff = malloc(2 * length);
> +	for (i = 0; i < length * 2; i += 2) {
> +		swap_buff[i + 0] = src[i + 1];
> +		swap_buff[i + 1] = src[i + 1];
> +	}
> +	return swap_buff;
> +}
> +
> /* compare UTF-16 strings */
> /* memcmp/UnicodeString style, both length-specified */
> /* don't assume byte-aligned! */
> @@ -1359,7 +1448,29 @@
>   const unsigned char *start1, *start2, *limit1, *limit2;
>   UChar c1, c2;
>   int32_t lengthResult;
> -
> +  uint8_t *swap_s1 = NULL, *swap_s2 = NULL;
> +#if 0
> +  dump_string(2, s1, length1, "S1");
> +  dump_string(2, s2, length2, "S2");
> +#endif
> +  if (machine_endian() == ENDIAN_LITTLE) {
> +	  if (utf16_char(s1) != 0xfffe) {/* BIG-ENDIAN */
> +		  swap_s1 = swap16_string(s1, length1);
> +		  if (swap_s1)
> +			  s1 = swap_s1;
> +	  } else {		/* LITTLE-ENDIAN */
> +		  s1 += 2;
> +		  length1 -= 1;
> +	  }
> +	  if (utf16_char(s2) != 0xfffe) {/* BIG-ENDIAN */
> +		  swap_s2 = swap16_string(s2, length2);
> +		  if (swap_s2)
> +			  s2 = swap_s2;
> +	  } else {		/* LITTLE-ENDIAN */
> +		  s2 += 2;
> +		  length2 -= 1;
> +	  }
> +  }
>   if(length1<length2) {
>     lengthResult=-1;
>     limit1=s1+2*length1;
> @@ -1415,6 +1526,10 @@
>       }*/
>   }
>
> +  if (swap_s1)
> +	  free(swap_s1);
> +  if (swap_s2)
> +	  free(swap_s2);
>   return (int32_t)c1-(int32_t)c2;
> }
>
> diff -rN -u old-elephant/src/elephant/unicode.lisp new-elephant/src/ 
> elephant/unicode.lisp
> --- old-elephant/src/elephant/unicode.lisp	2009-08-08  
> 10:51:25.000000000 +0900
> +++ new-elephant/src/elephant/unicode.lisp	2009-08-08  
> 10:51:25.000000000 +0900
> @@ -41,7 +41,7 @@
>
> ;; #+allegro
> ;; (defun serialize-string (string bstream)
> -;;   (elephant-memutil::with-struct-slots ((buffer buffer-stream- 
> buffer)
> +;;   (e(lephant-memutil::with-struct-slots ((buffer buffer-stream- 
> buffer)
> ;; 					(size buffer-stream-size)
> ;; 					(allocated buffer-stream-length))
> ;;       bstream
> @@ -59,20 +59,20 @@
>   (declare (type buffer-stream bstream)
> 	   (type string string))
>   (cond ((and (not (equal "" string)) (> (char-code (char string 0))  
> #xFFFF))
> -	 (serialize-to-utf32le string bstream))
> +	 (serialize-to-utf32 string bstream))
> 	;; Accelerate the common case where a character set is not Latin-1
> 	((and (not (equal "" string)) (> (char-code (char string 0)) #xFF))
> -	 (or (serialize-to-utf16le string bstream)
> -	     (serialize-to-utf32le string bstream)))
> +	 (or (serialize-to-utf16 string bstream)
> +	     (serialize-to-utf32 string bstream)))
> 	;; Actually code pages > 0 are rare; so we can pay an extra cost
> 	(t (or (serialize-to-utf8 string bstream)
> -	       (serialize-to-utf16le string bstream)
> -	       (serialize-to-utf32le string bstream)))))
> +	       (serialize-to-utf16 string bstream)
> +	       (serialize-to-utf32 string bstream)))))
>
> (defun serialize-to-utf8 (string bstream)
>   "Standard serialization"
>   (declare (type buffer-stream bstream)
> - 	   (type string string))
> + 	   (type simple-string string))
>   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
> 					(size buffer-stream-size)
> 					(allocated buffer-stream-length))
> @@ -117,73 +117,105 @@
> 	    (setf (buffer-stream-size bstream) needed)
> 	    (succeed))))))
>
> -(defun serialize-to-utf16le (string bstream)
> -  "Serialize to utf16le compliant format unless contains code pages  
> > 0"
> +(defvar *machine-endian*
> +  (let* ((bstream (make-buffer-stream))
> +	 (buffer (buffer-stream-buffer bstream))
> +	 (size (buffer-stream-size bstream)))
> +    (buffer-write-int32 #x01020304 bstream)
> +    (let ((byte-image
> +	   (loop for i from 0 to 3
> +		 collect (uffi:deref-array buffer '(:array :unsigned-char)
> +						   (the fixnum (+ size i))))))
> +      (cond ((equal byte-image '(4 3 2 1)) 'endian-little)
> +	    ((equal byte-image '(1 2 3 4)) 'endian-big)
> +	    (t 'unknown)))))
> +
> +(defun machine-endian ()
> +  *machine-endian*)
> +
> +(defun write-utf-char-to-buffer (char char-index char-size buffer  
> base endian)
> +  (declare (type (signed-byte 31) char-index)
> +	   (type (integer 1 4) char-size))
> +  (loop for i from 0 below char-size do
> +    (setf (uffi:deref-array buffer '(:array :unsigned-char)
> +				    (+ (* char-index char-size) base
> +				       (the (integer 0 3)
> +					 (if (eq endian 'endian-little)
> +					     i
> +					     (- char-size 1 i)))))
> +	  (ldb (byte 8 (* 8 i)) char))))
> +
> +(defun serialize-to-utf16 (string bstream)
> +  "Serialize to utf16 compliant format unless contains code pages >  
> 0"
>   (declare (type buffer-stream bstream)
>  	   (type string string))
> +  (progn
> +    (format *debug-io* "LSIP-ENTER: ")
> +    (loop for i from 0 below (length string)
> +	  do (format *debug-io* "~4,'0X " (char-code (char string i))))
> +    (format *debug-io* "~%"))
>   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
> 					(size buffer-stream-size)
> 					(allocated buffer-stream-length))
>       bstream
>       (let* ((saved-size (buffer-stream-size bstream))
> 	     (saved-pos (elephant-memutil::buffer-stream-position bstream))
> -	     (characters (length string)))
> +	     (characters (length string))
> +	     (endian (machine-endian))
> +	     (bom-length (if (eq endian 'endian-big) 0 1)))
> 	(labels ((fail ()
> 		   (setf (buffer-stream-size bstream) saved-size)
> 		   (setf (elephant-memutil::buffer-stream-position bstream) saved- 
> pos)
> -		   (return-from serialize-to-utf16le nil))
> +		   (return-from serialize-to-utf16 nil))
> 		 (succeed ()
> -		   (return-from serialize-to-utf16le t)))
> +		   (return-from serialize-to-utf16 t)))
> 	  (buffer-write-byte +utf16-string+ bstream)
> -	  (buffer-write-int32 characters bstream)
> -	  (let ((needed (+ size (* characters 2)))
> -                (char (etypecase string
> +	  (buffer-write-int32  (+ characters bom-length) bstream)
> +	  (let ((needed (+ size (* (+ characters bom-length) 2)))
> +		  (char (etypecase string
>                         (simple-string #'schar)
>                         (string #'char))))
>             (when (> needed allocated)
>               (resize-buffer-stream bstream needed))
> -            (loop for i fixnum from 0 below characters do
> -                  (let ((code (char-code (funcall char string i))))
> -                    (when (> code #xFFFF) (fail))
> -                    (setf (uffi:deref-array buffer  
> '(:array :unsigned-char) (+ (* i 2) size))
> -                          ;;			  (coerce (ldb (byte 8 8) code)  
> '(signed 8)))
> -                          (ldb (byte 8 8) code))
> -                    (setf (uffi:deref-array buffer  
> '(:array :unsigned-char) (+ (* i 2) size 1))
> -                          ;;			  (coerce (ldb (byte 8 0) code)  
> '(signed 8))))))
> -                          (ldb (byte 8 0) code))))
> +	    (when (eq endian 'endian-little)
> +	      (write-utf-char-to-buffer #xfffe 0 2 buffer size endian)
> +	      (incf size 2))
> +            (loop for i fixnum from 0 below characters
> +		  do (let ((code (char-code (funcall char string i))))
> +		       (when (> code #xFFFF) (fail))
> +		       (write-utf-char-to-buffer code i 2 buffer size endian)))
>             (incf size (* characters 2))
>             (succeed))))))
>
> -(defun serialize-to-utf32le (string bstream)
> +(defun serialize-to-utf32 (string bstream)
>   "Serialize to utf32 compliant format unless contains code pages > 0"
> -   (declare (type buffer-stream bstream)
> -	    (type string string))
> +  (declare (type buffer-stream bstream)
> +	   (type string string))
>   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
> 					(size buffer-stream-size)
> 					(allocated buffer-stream-length))
>       bstream
> -      (let* ((characters (length string)))
> -	  (buffer-write-byte +utf32-string+ bstream)
> -	  (buffer-write-int32 characters bstream)
> -	  (let ((needed (+ size (* 4 characters)))
> -                (char (etypecase string
> -                        (simple-string #'schar)
> -                        (string #'char))))
> -	    (when (> needed allocated)
> -	      (resize-buffer-stream bstream needed))
> -	     (loop for i fixnum from 0 below characters do
> -		  (let ((code (char-code (funcall char string i))))
> -		    (when (> code #x10FFFF) (error "Invalid unicode code type"))
> -		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (*  
> i 4) size 0))
> -			  (ldb (byte 8 24) code))
> -		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (*  
> i 4) size 1))
> -			  (ldb (byte 8 16) code))
> -		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (*  
> i 4) size 2))
> -			  (ldb (byte 8 8) code))
> -		    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (*  
> i 4) size 3))
> -			  (ldb (byte 8 0) code)))))
> +      (let* ((characters (length string))
> +	     (endian (machine-endian))
> +	     (bom-length (if (eq endian 'endian-big) 0 1)))
> +	(buffer-write-byte +utf32-string+ bstream)
> +	(buffer-write-int32 (+ characters bom-length) bstream)
> +	(let ((needed (+ size (* 4 (+ characters bom-length))))
> +	      (char (etypecase string
> +		      (simple-string #'schar)
> +		      (string #'char))))
> +	  (when (> needed allocated)
> +	    (resize-buffer-stream bstream needed))
> +	  (when (eq endian 'endian-little)
> +	    (write-utf-char-to-buffer #xfffe 0 4 buffer size endian)
> +	    (incf size 4))
> +	  (loop for i fixnum from 0 below characters
> +		do (let ((code (char-code (funcall char string i))))
> +		     (when (> code #x10FFFF)
> +		       (error "Invalid unicode code type"))
> +		     (write-utf-char-to-buffer code i 4 buffer size endian)))
> 	  (incf size (* characters 4))
> -	  t)))
> +	  t))))
>
> ;;
> ;; Deserialization of Strings
> @@ -260,50 +292,67 @@
> 						    (+ pos i)))))))
> 	string))))
>
> +(defun read-utf-char-from-buffer (char-index char-size buffer  
> position endian)
> +  (declare (type (integer 1 4) char-size)
> +	   (type (signed-byte 31) char-index)
> +	   (type fixnum position))
> +  (let ((code 0))
> +    (macrolet ((next-byte (offset)
> +		 `(uffi:deref-array buffer
> +				    '(:array :unsigned-byte)
> +				    (+ (* char-index 2) position ,offset))))
> +      (loop for i from 0 below char-size
> +	    do (setf code (dpb (next-byte (if (eq endian 'endian-little)
> +					      i (- char-size i 1)))
> +			       (byte 8 (* i 8)) code)))
> +      code)))
> +
> (defmethod deserialize-string ((type (eql :utf16le)) bstream  
> &optional temp-string)
>   "All returned strings are simple-strings for, uh, simplicity"
>   (declare (type buffer-stream bstream))
>   (let* ((length (buffer-read-int32 bstream))
> 	 (string (or temp-string (make-string length :element-type  
> 'character)))
> 	 (pos (elephant-memutil::buffer-stream-position bstream))
> -	 (code 0))
> -    (macrolet ((next-byte (offset)
> -		 `(uffi:deref-array (buffer-stream-buffer bstream)  
> '(:array :unsigned-byte) (+ (* i 2) pos ,offset))))
> -      (declare (type simple-string string)
> -	       (type fixnum length pos code))
> -      (assert (subtypep (type-of string) 'simple-string))
> -      (assert (compatible-unicode-support-p :utf16le))
> -      (loop for i fixnum from 0 below length do
> -	   (setf code (dpb (next-byte 0) (byte 8 8) 0))
> -	   (setf code (dpb (next-byte 1) (byte 8 0) code))
> -	   (setf (schar string i) (code-char code)))
> -      (incf (elephant-memutil::buffer-stream-position bstream)
> -	    (* length 2)))
> -    (the simple-string string)))
> +	 (code 0) (endian 'endian-big))
> +    (declare (type simple-string string)
> +	     (type fixnum length pos code))
> +    (assert (subtypep (type-of string) 'simple-string))
> +    (assert (compatible-unicode-support-p :utf16le))
> +    (when (= (read-utf-char-from-buffer 0 2 (buffer-stream-buffer  
> bstream)
> +					pos (machine-endian)) #xfffe)
> +      (setf endian 'endian-little)
> +      (decf length)
> +      (incf pos 2)
> +      (incf (elephant-memutil::buffer-stream-position bstream) 2))
> +    (loop for i fixnum from 0 below length
> +	  do (setf code
> +		   (read-utf-char-from-buffer i 2 (buffer-stream-buffer bstream)
> +			pos endian))
> +	     (setf (schar string i) (code-char code)))
> +    (incf (elephant-memutil::buffer-stream-position bstream)
> +	  (* length 2))
> +    (the simple-string (subseq string 0 length))))
>
> (defmethod deserialize-string ((type (eql :utf32le)) bstream   
> &optional temp-string)
>   (declare (type buffer-stream bstream))
> -  (macrolet ((next-byte (offset)
> -	       `(uffi:deref-array (buffer-stream-buffer bstream)  
> '(:array :unsigned-byte) (+ (* i 4) pos ,offset))))
>   (let* ((length (buffer-read-int32 bstream))
> 	 (string (or temp-string (make-string length :element-type  
> 'character)))
> 	 (pos (elephant-memutil::buffer-stream-position bstream))
> -	 (code 0))
> +	 (code 0) (endian 'endian-big))
>     (declare (type string string)
> 	     (type fixnum length pos code))
>     (assert (subtypep (type-of string) 'simple-string))
>     (assert (compatible-unicode-support-p :utf32le))
> +    (when (= (read-utf-char-from-buffer 0 4 (buffer-stream-buffer  
> bstream)
> +					pos (machine-endian)) #xfffe)
> +	(setf endian 'endian-little)
> +	(decf length)
> +	(incf pos 4)
> +	(incf (elephant-memutil::buffer-stream-position bstream) 4))
>     (loop for i fixnum from 0 below length do
> -	 (setf code (dpb (next-byte 0) (byte 8 24) 0))
> -	 (setf code (dpb (next-byte 1) (byte 8 16) code))
> -	 (setf code (dpb (next-byte 2) (byte 8 8) code))
> -	 (setf code (dpb (next-byte 3) (byte 8 0) code))
> -	 (setf (char string i) (code-char code)))
> +      (setf code (read-utf-char-from-buffer i 4 (buffer-stream- 
> buffer bstream)
> +		      pos endian))
> +      (setf (char string i) (code-char code)))
>     (incf (elephant-memutil::buffer-stream-position bstream)
> 	  (* length 4))
> -    (the simple-string string))))
> -
> -
> -
> -
> -
> +    (the simple-string (subseq string 0 length))))
>
> _______________________________________________
> elephant-devel site list
> elephant-devel at common-lisp.net
> http://common-lisp.net/mailman/listinfo/elephant-devel