[elephant-devel] revised UTF serializer/deserializer patch
Hiroyuki Komatsu
kom at narihara-lab.jp
Sat Aug 8 02:15:46 UTC 2009
This patch does the following:
o Big-endian machines are probably not affected by this patch;
I do not have a big-endian machine to test on.
o On little-endian machines:
+ UTF strings are serialized as UTF-16LE or UTF-32LE with a BOM.
+ The deserializers test for the existence of a BOM and choose whether to
deserialize as big-endian or little-endian.
+ The comparators in libberkeley-db also test for a BOM, and
create a temporary buffer when the string was serialized as big-endian.
o The old store image is preserved.
o The sort order is corrected when migrating an old store to a new store.
I have not tested any other backing store.
-------------- next part --------------
diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c new-elephant/src/db-bdb/libberkeley-db.c
--- old-elephant/src/db-bdb/libberkeley-db.c 2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/db-bdb/libberkeley-db.c 2009-08-08 10:51:25.000000000 +0900
@@ -25,6 +25,7 @@
#include <stdio.h>
#include <string.h>
#include <wchar.h>
+#include <stdlib.h>
/* Some utility stuff used to be here but has been placed in
libmemutil.c */
@@ -920,7 +921,7 @@
case S1_UCS4_SYMBOL:
case S1_UCS4_STRING:
case S1_UCS4_PATHNAME:
- return wcs_cmp((wchar_t*)ad+9, read_int(ad, 5), (wchar_t*)bd+9, read_int(bd, 5));
+ return wcs_cmp((wchar_t*)(ad+9), read_int(ad, 5), (wchar_t*)(bd+9), read_int(bd, 5));
default:
return lex_cmp(ad+5, (a->size)-5, bd+5, (b->size)-5);
}
@@ -1130,7 +1131,7 @@
/*****
printf("Doing a 32-bit compare\n");
*****/
- return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1), (wchar_t*)bd+5+offset, read_int32(bd+offset, 1));
+ return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset, 1), (wchar_t*)(bd+5+offset), read_int32(bd+offset, 1));
default:
/*****
printf("Doing a lex compare\n");
@@ -1306,6 +1307,18 @@
#define strncasecmp _strnicmp
typedef unsigned short uint16_t;
#endif
+#define ENDIAN_BIG 0
+#define ENDIAN_LITTLE 1
+
+/* Report the byte order of the host: ENDIAN_BIG when the most significant
+ * byte of a 32-bit value is stored first in memory, ENDIAN_LITTLE otherwise. */
+int machine_endian()
+{
+  const uint32_t probe = 0x01020304;
+  const uint8_t *first_byte = (const uint8_t *)&probe;
+  return (first_byte[0] == 0x01) ? ENDIAN_BIG : ENDIAN_LITTLE;
+}
int case_cmp(const unsigned char *a, int32_t length1, const unsigned char *b, int32_t length2) {
int min, sizediff, diff;
@@ -1316,12 +1329,72 @@
return diff;
}
+/* Read the four bytes at c as one little-endian 32-bit unit, independent of
+ * the host byte order. */
+wchar_t utf32_char(const wchar_t *c)
+{
+  const uint8_t *cp = (const uint8_t *)c;
+  /* Widen each byte before shifting: shifting a promoted int left by 24
+   * overflows into the sign bit when the byte is >= 0x80. */
+  return (wchar_t)(((uint32_t)cp[3] << 24) | ((uint32_t)cp[2] << 16) |
+                   ((uint32_t)cp[1] << 8) | (uint32_t)cp[0]);
+}
+
+/* Return a newly malloc'd copy of str (length units of 4 bytes each) with
+ * the bytes of every unit reversed, or NULL if the allocation fails.
+ * The caller owns the result and must free() it; str is not modified.
+ * NOTE(review): indexing str as wchar_t while allocating 4 * length bytes
+ * assumes a 4-byte wchar_t -- confirm for every supported platform. */
+wchar_t *swap32_string(const wchar_t *str, int32_t length)
+{
+  int i;
+  wchar_t *swap_buff = malloc(4 * length);
+  if (swap_buff == NULL)
+    return NULL;
+  for (i = 0; i < length; ++i) {
+    const uint8_t *sp = (const uint8_t *)&str[i];
+    uint8_t *dp = (uint8_t *)&swap_buff[i];
+    /* Read from the source unit, write into the copy. */
+    dp[0] = sp[3];
+    dp[1] = sp[2];
+    dp[2] = sp[1];
+    dp[3] = sp[0];
+  }
+  return swap_buff;
+}
+
+#if 0
+/* Debug helper: print prefix, then the length * size bytes of str in hex,
+ * two bytes per space-separated group. */
+void dump_string(int size, uint8_t *str, int32_t length, char *prefix)
+{
+  int pos = 0;
+  printf("%s: ", prefix);
+  while (pos < length * size) {
+    printf("%02x%02x ", str[pos], str[pos + 1]);
+    pos += 2;
+  }
+  printf("\n");
+}
+#endif
+
int wcs_cmp(const wchar_t *a, int32_t length1,
const wchar_t *b, int32_t length2) {
int min, sizediff, diff;
+ wchar_t *swap_a = NULL, *swap_b = NULL; /* byte-swapped copies of a/b; stay NULL when no copy is made */
+
+#if 0
+ dump_string(4, a, length1, "A");
+ dump_string(4, b, length2, "B");
+#endif
+ if (machine_endian() == ENDIAN_LITTLE) { /* on any other host both strings are compared exactly as stored */
+ if (utf32_char(a) != 0xfffe) {/* no 0xfffe marker unit: BIG-ENDIAN, compare a byte-swapped copy */
+ swap_a = swap32_string(a, length1);
+ if (swap_a) /* keep the unswapped input if the copy could not be allocated */
+ a = swap_a;
+ } else { /* LITTLE-ENDIAN: step over the marker unit */
+ ++a;
+ --length1;
+ }
+ if (utf32_char(b) != 0xfffe) {/* no 0xfffe marker unit: BIG-ENDIAN, compare a byte-swapped copy */
+ swap_b = swap32_string(b, length2);
+ if (swap_b) /* keep the unswapped input if the copy could not be allocated */
+ b = swap_b;
+ } else { /* LITTLE-ENDIAN: step over the marker unit */
+ ++b;
+ --length2;
+ }
+ }
sizediff = length1 - length2;
min = sizediff > 0 ? length2 : length1;
- diff = wcsncmp(a, b, min /4);
+ diff = wcsncmp(a, b, min); /* compares at most min wide characters (the shorter length) */
+ if (swap_a) /* release the temporaries before returning */
+ free(swap_a);
+ if (swap_b)
+ free(swap_b);
if (diff == 0) return sizediff;
return diff;
}
@@ -1351,6 +1424,22 @@
#define UTF_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
#define UTF_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+/* Read the 16-bit unit at str as little-endian bytes: str[0] is the low
+ * byte and str[1] the high byte. */
+uint16_t utf16_char(const uint8_t *str)
+{
+  const int low = str[0];
+  const int high = str[1];
+  return (high << 8) | low;
+}
+
+/* Return a newly malloc'd copy of src (length units of 2 bytes each) with
+ * the two bytes of every unit exchanged, or NULL if the allocation fails.
+ * The caller owns the result and must free() it. */
+uint8_t *swap16_string(const uint8_t *src, int32_t length)
+{
+  int i;
+  uint8_t *swap_buff = malloc(2 * length);
+  if (swap_buff == NULL)
+    return NULL;
+  for (i = 0; i < length * 2; i += 2) {
+    /* Exchange the pair: each output byte comes from the other input byte. */
+    swap_buff[i + 0] = src[i + 1];
+    swap_buff[i + 1] = src[i + 0];
+  }
+  return swap_buff;
+}
+
/* compare UTF-16 strings */
/* memcmp/UnicodeString style, both length-specified */
/* don't assume byte-aligned! */
@@ -1359,7 +1448,29 @@
const unsigned char *start1, *start2, *limit1, *limit2;
UChar c1, c2;
int32_t lengthResult;
-
+ uint8_t *swap_s1 = NULL, *swap_s2 = NULL;
+#if 0
+ dump_string(2, s1, length1, "S1");
+ dump_string(2, s2, length2, "S2");
+#endif
+ if (machine_endian() == ENDIAN_LITTLE) {
+ if (utf16_char(s1) != 0xfffe) {/* BIG-ENDIAN */
+ swap_s1 = swap16_string(s1, length1);
+ if (swap_s1)
+ s1 = swap_s1;
+ } else { /* LITTLE-ENDIAN */
+ s1 += 2;
+ length1 -= 1;
+ }
+ if (utf16_char(s2) != 0xfffe) {/* BIG-ENDIAN */
+ swap_s2 = swap16_string(s2, length2);
+ if (swap_s2)
+ s2 = swap_s2;
+ } else { /* LITTLE-ENDIAN */
+ s2 += 2;
+ length2 -= 1;
+ }
+ }
if(length1<length2) {
lengthResult=-1;
limit1=s1+2*length1;
@@ -1415,6 +1526,10 @@
}*/
}
+ if (swap_s1)
+ free(swap_s1);
+ if (swap_s2)
+ free(swap_s2);
return (int32_t)c1-(int32_t)c2;
}
diff -rN -u old-elephant/src/elephant/unicode.lisp new-elephant/src/elephant/unicode.lisp
--- old-elephant/src/elephant/unicode.lisp 2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/elephant/unicode.lisp 2009-08-08 10:51:25.000000000 +0900
@@ -41,7 +41,7 @@
;; #+allegro
;; (defun serialize-string (string bstream)
-;; (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
+;; (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
;; (size buffer-stream-size)
;; (allocated buffer-stream-length))
;; bstream
@@ -59,20 +59,20 @@
(declare (type buffer-stream bstream)
(type string string))
(cond ((and (not (equal "" string)) (> (char-code (char string 0)) #xFFFF))
- (serialize-to-utf32le string bstream))
+ (serialize-to-utf32 string bstream))
;; Accelerate the common case where a character set is not Latin-1
((and (not (equal "" string)) (> (char-code (char string 0)) #xFF))
- (or (serialize-to-utf16le string bstream)
- (serialize-to-utf32le string bstream)))
+ (or (serialize-to-utf16 string bstream)
+ (serialize-to-utf32 string bstream)))
;; Actually code pages > 0 are rare; so we can pay an extra cost
(t (or (serialize-to-utf8 string bstream)
- (serialize-to-utf16le string bstream)
- (serialize-to-utf32le string bstream)))))
+ (serialize-to-utf16 string bstream)
+ (serialize-to-utf32 string bstream)))))
(defun serialize-to-utf8 (string bstream)
"Standard serialization"
(declare (type buffer-stream bstream)
- (type string string))
+ (type simple-string string))
(elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
(size buffer-stream-size)
(allocated buffer-stream-length))
@@ -117,73 +117,105 @@
(setf (buffer-stream-size bstream) needed)
(succeed))))))
-(defun serialize-to-utf16le (string bstream)
- "Serialize to utf16le compliant format unless contains code pages > 0"
+(defvar *machine-endian*
+  (let* ((probe-stream (make-buffer-stream))
+         (bytes (buffer-stream-buffer probe-stream))
+         (start (buffer-stream-size probe-stream)))
+    ;; Write a known 32-bit pattern, then look at the order of its bytes.
+    (buffer-write-int32 #x01020304 probe-stream)
+    (let ((image (loop for offset from 0 below 4
+                       collect (uffi:deref-array bytes '(:array :unsigned-char)
+                                                 (the fixnum (+ start offset))))))
+      (cond ((equal image '(1 2 3 4)) 'endian-big)
+            ((equal image '(4 3 2 1)) 'endian-little)
+            (t 'unknown))))
+  "Byte order in which BUFFER-WRITE-INT32 stores an integer:
+ENDIAN-LITTLE, ENDIAN-BIG, or UNKNOWN for any other layout.")
+
+(defun machine-endian ()
+  "Return the current value of *MACHINE-ENDIAN*."
+  (values *machine-endian*))
+
+(defun write-utf-char-to-buffer (char char-index char-size buffer base endian)
+  "Store the CHAR-SIZE low-order bytes of the integer CHAR into BUFFER,
+starting at byte offset BASE + CHAR-INDEX * CHAR-SIZE.  The least significant
+byte comes first when ENDIAN is ENDIAN-LITTLE, the most significant otherwise."
+  (declare (type (signed-byte 31) char-index)
+           (type (integer 1 4) char-size))
+  (let ((little-p (eq endian 'endian-little)))
+    (dotimes (byte-no char-size)
+      ;; SLOT is where byte number BYTE-NO (counted from the low end) lands
+      ;; inside the unit.
+      (let ((slot (the (integer 0 3)
+                       (if little-p
+                           byte-no
+                           (- char-size 1 byte-no)))))
+        (setf (uffi:deref-array buffer '(:array :unsigned-char)
+                                (+ (* char-index char-size) base slot))
+              (ldb (byte 8 (* 8 byte-no)) char))))))
+
+(defun serialize-to-utf16 (string bstream)
+ "Serialize STRING to BSTREAM as UTF-16, preceded by a #xfffe marker unit on little-endian machines. Returns T on success; returns NIL, after restoring the stream's size and position, if any character code exceeds #xFFFF."
(declare (type buffer-stream bstream)
(type string string))
(elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
(size buffer-stream-size)
(allocated buffer-stream-length))
bstream
(let* ((saved-size (buffer-stream-size bstream))
(saved-pos (elephant-memutil::buffer-stream-position bstream))
- (characters (length string)))
+ (characters (length string))
+ (endian (machine-endian))
+ (bom-length (if (eq endian 'endian-big) 0 1)))
(labels ((fail ()
(setf (buffer-stream-size bstream) saved-size)
(setf (elephant-memutil::buffer-stream-position bstream) saved-pos)
- (return-from serialize-to-utf16le nil))
+ (return-from serialize-to-utf16 nil))
(succeed ()
- (return-from serialize-to-utf16le t)))
+ (return-from serialize-to-utf16 t)))
(buffer-write-byte +utf16-string+ bstream)
- (buffer-write-int32 characters bstream)
- (let ((needed (+ size (* characters 2)))
- (char (etypecase string
+ (buffer-write-int32 (+ characters bom-length) bstream)
+ (let ((needed (+ size (* (+ characters bom-length) 2)))
+ (char (etypecase string
(simple-string #'schar)
(string #'char))))
(when (> needed allocated)
(resize-buffer-stream bstream needed))
- (loop for i fixnum from 0 below characters do
- (let ((code (char-code (funcall char string i))))
- (when (> code #xFFFF) (fail))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size))
- ;; (coerce (ldb (byte 8 8) code) '(signed 8)))
- (ldb (byte 8 8) code))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 1))
- ;; (coerce (ldb (byte 8 0) code) '(signed 8))))))
- (ldb (byte 8 0) code))))
+ (when (eq endian 'endian-little)
+ (write-utf-char-to-buffer #xfffe 0 2 buffer size endian)
+ (incf size 2))
+ (loop for i fixnum from 0 below characters
+ do (let ((code (char-code (funcall char string i))))
+ (when (> code #xFFFF) (fail))
+ (write-utf-char-to-buffer code i 2 buffer size endian)))
(incf size (* characters 2))
(succeed))))))
-(defun serialize-to-utf32le (string bstream)
+(defun serialize-to-utf32 (string bstream)
"Serialize to utf32 compliant format unless contains code pages > 0"
- (declare (type buffer-stream bstream)
- (type string string))
+ (declare (type buffer-stream bstream)
+ (type string string))
(elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
(size buffer-stream-size)
(allocated buffer-stream-length))
bstream
- (let* ((characters (length string)))
- (buffer-write-byte +utf32-string+ bstream)
- (buffer-write-int32 characters bstream)
- (let ((needed (+ size (* 4 characters)))
- (char (etypecase string
- (simple-string #'schar)
- (string #'char))))
- (when (> needed allocated)
- (resize-buffer-stream bstream needed))
- (loop for i fixnum from 0 below characters do
- (let ((code (char-code (funcall char string i))))
- (when (> code #x10FFFF) (error "Invalid unicode code type"))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 0))
- (ldb (byte 8 24) code))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 1))
- (ldb (byte 8 16) code))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 2))
- (ldb (byte 8 8) code))
- (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 3))
- (ldb (byte 8 0) code)))))
+ (let* ((characters (length string))
+ (endian (machine-endian))
+ (bom-length (if (eq endian 'endian-big) 0 1)))
+ (buffer-write-byte +utf32-string+ bstream)
+ (buffer-write-int32 (+ characters bom-length) bstream)
+ (let ((needed (+ size (* 4 (+ characters bom-length))))
+ (char (etypecase string
+ (simple-string #'schar)
+ (string #'char))))
+ (when (> needed allocated)
+ (resize-buffer-stream bstream needed))
+ (when (eq endian 'endian-little)
+ (write-utf-char-to-buffer #xfffe 0 4 buffer size endian)
+ (incf size 4))
+ (loop for i fixnum from 0 below characters
+ do (let ((code (char-code (funcall char string i))))
+ (when (> code #x10FFFF)
+ (error "Invalid unicode code type"))
+ (write-utf-char-to-buffer code i 4 buffer size endian)))
(incf size (* characters 4))
- t)))
+ t))))
;;
;; Deserialization of Strings
@@ -260,50 +292,67 @@
(+ pos i)))))))
string))))
+(defun read-utf-char-from-buffer (char-index char-size buffer position endian)
+  "Return the integer code of unit number CHAR-INDEX in BUFFER, where units
+are CHAR-SIZE bytes wide and are counted from byte offset POSITION.  Bytes are
+taken least significant first when ENDIAN is ENDIAN-LITTLE and most
+significant first otherwise."
+  (declare (type (integer 1 4) char-size)
+           (type (signed-byte 31) char-index)
+           (type fixnum position))
+  (let ((code 0))
+    (macrolet ((next-byte (offset)
+                 ;; Stride by CHAR-SIZE rather than a fixed 2, so that
+                 ;; 4-byte units after the first are addressed correctly.
+                 `(uffi:deref-array buffer
+                                    '(:array :unsigned-byte)
+                                    (+ (* char-index char-size) position ,offset))))
+      (loop for i from 0 below char-size
+            do (setf code (dpb (next-byte (if (eq endian 'endian-little)
+                                              i (- char-size i 1)))
+                               (byte 8 (* i 8)) code)))
+      code)))
+
(defmethod deserialize-string ((type (eql :utf16le)) bstream &optional temp-string)
"All returned strings are simple-strings for, uh, simplicity"
(declare (type buffer-stream bstream))
(let* ((length (buffer-read-int32 bstream))
(string (or temp-string (make-string length :element-type 'character)))
(pos (elephant-memutil::buffer-stream-position bstream))
- (code 0))
- (macrolet ((next-byte (offset)
- `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 2) pos ,offset))))
- (declare (type simple-string string)
- (type fixnum length pos code))
- (assert (subtypep (type-of string) 'simple-string))
- (assert (compatible-unicode-support-p :utf16le))
- (loop for i fixnum from 0 below length do
- (setf code (dpb (next-byte 0) (byte 8 8) 0))
- (setf code (dpb (next-byte 1) (byte 8 0) code))
- (setf (schar string i) (code-char code)))
- (incf (elephant-memutil::buffer-stream-position bstream)
- (* length 2)))
- (the simple-string string)))
+ (code 0) (endian 'endian-big))
+ (declare (type simple-string string)
+ (type fixnum length pos code))
+ (assert (subtypep (type-of string) 'simple-string))
+ (assert (compatible-unicode-support-p :utf16le))
+ (when (= (read-utf-char-from-buffer 0 2 (buffer-stream-buffer bstream)
+ pos (machine-endian)) #xfffe)
+ (setf endian 'endian-little)
+ (decf length)
+ (incf pos 2)
+ (incf (elephant-memutil::buffer-stream-position bstream) 2))
+ (loop for i fixnum from 0 below length
+ do (setf code
+ (read-utf-char-from-buffer i 2 (buffer-stream-buffer bstream)
+ pos endian))
+ (setf (schar string i) (code-char code)))
+ (incf (elephant-memutil::buffer-stream-position bstream)
+ (* length 2))
+ (the simple-string (subseq string 0 length))))
(defmethod deserialize-string ((type (eql :utf32le)) bstream &optional temp-string)
(declare (type buffer-stream bstream))
- (macrolet ((next-byte (offset)
- `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 4) pos ,offset))))
(let* ((length (buffer-read-int32 bstream))
(string (or temp-string (make-string length :element-type 'character)))
(pos (elephant-memutil::buffer-stream-position bstream))
- (code 0))
+ (code 0) (endian 'endian-big))
(declare (type string string)
(type fixnum length pos code))
(assert (subtypep (type-of string) 'simple-string))
(assert (compatible-unicode-support-p :utf32le))
+ (when (= (read-utf-char-from-buffer 0 4 (buffer-stream-buffer bstream)
+ pos (machine-endian)) #xfffe)
+ (setf endian 'endian-little)
+ (decf length)
+ (incf pos 4)
+ (incf (elephant-memutil::buffer-stream-position bstream) 4))
(loop for i fixnum from 0 below length do
- (setf code (dpb (next-byte 0) (byte 8 24) 0))
- (setf code (dpb (next-byte 1) (byte 8 16) code))
- (setf code (dpb (next-byte 2) (byte 8 8) code))
- (setf code (dpb (next-byte 3) (byte 8 0) code))
- (setf (char string i) (code-char code)))
+ (setf code (read-utf-char-from-buffer i 4 (buffer-stream-buffer bstream)
+ pos endian))
+ (setf (char string i) (code-char code)))
(incf (elephant-memutil::buffer-stream-position bstream)
(* length 4))
- (the simple-string string))))
-
-
-
-
-
+ (the simple-string (subseq string 0 length))))
More information about the elephant-devel
mailing list