[bknr-cvs] r2421 - branches/bos/projects/bos/web
ksprotte at common-lisp.net
ksprotte at common-lisp.net
Wed Jan 30 10:50:39 UTC 2008
Author: ksprotte
Date: Wed Jan 30 05:50:38 2008
New Revision: 2421
Added:
branches/bos/projects/bos/web/utf-8.lisp
Modified:
branches/bos/projects/bos/web/bos.web.asd
Log:
added a custom utf-8 hack providing one function: utf-8-string-to-bytes
Modified: branches/bos/projects/bos/web/bos.web.asd
==============================================================================
--- branches/bos/projects/bos/web/bos.web.asd (original)
+++ branches/bos/projects/bos/web/bos.web.asd Wed Jan 30 05:50:38 2008
@@ -19,6 +19,7 @@
:depends-on (:bknr :bknr-modules :bos.m2 :cxml)
:components ((:file "packages")
+ (:file "utf-8" :depends-on ("packages"))
(:file "config" :depends-on ("packages"))
(:file "web-macros" :depends-on ("config"))
(:file "web-utils" :depends-on ("web-macros"))
Added: branches/bos/projects/bos/web/utf-8.lisp
==============================================================================
--- (empty file)
+++ branches/bos/projects/bos/web/utf-8.lisp Wed Jan 30 05:50:38 2008
@@ -0,0 +1,93 @@
+(in-package :bos.web)
+
+;; this code is heavily inspired from trivial-utf-8
+;; it only has one API function, which was not provided
+;; exactly as we need it by trivial-utf-8
+
+;; API
+;; utf-8-string-to-bytes STRING
+
+(eval-when (:compile-toplevel :load-toplevel :execute)
+ (defparameter *optimize*
+ '(optimize (speed 3) (safety 0) (space 0) (debug 1)
+ (compilation-speed 0))))
+
+(define-condition utf-8-decoding-error (simple-error)
+ ((message :initarg :message)
+ (byte :initarg :byte :initform nil))
+ (:report (lambda (err stream)
+ (format stream (slot-value err 'message)
+ (slot-value err 'byte)))))
+
+(declaim (inline utf-8-group-size))
+(defun utf-8-group-size (byte)
+ "Determine the amount of bytes that are part of the character
+starting with a given byte."
+ (declare (type fixnum byte)
+ #.*optimize*)
+ (cond ((zerop (logand byte #b10000000)) 1)
+ ((= (logand byte #b11100000) #b11000000) 2)
+ ((= (logand byte #b11110000) #b11100000) 3)
+ ((= (logand byte #b11111000) #b11110000) 4)
+ (t (error 'utf-8-decoding-error :byte byte
+ :message "Invalid byte at start of character: 0x~X"))))
+
+(defun utf-8-string-length (string)
+ "Calculate the length of the string encoded by the given bytes."
+ (declare (type simple-string string)
+ #.*optimize*)
+ (loop :with i = 0
+ :with string-length = 0
+ :with array-length = (length string)
+ :while (< i array-length)
+ :do (progn
+ (incf (the fixnum string-length) 1)
+ (incf i (utf-8-group-size (char-code (char string i)))))
+ :finally (return string-length)))
+
+(defun get-utf-8-character (string group-size &optional (start 0))
+ "Given an array of bytes and the amount of bytes to use,
+extract the character starting at the given start position."
+ (declare (type simple-string string)
+ (type fixnum group-size start)
+ #.*optimize*)
+ (labels ((next-byte ()
+ (prog1 (char-code (char string start))
+ (incf start)))
+ (six-bits (byte)
+ (unless (= (logand byte #b11000000) #b10000000)
+ (error 'utf-8-decoding-error :byte byte
+ :message "Invalid byte 0x~X inside a character."))
+ (ldb (byte 6 0) byte)))
+ (case group-size
+ (1 (next-byte))
+ (2 (logior (ash (ldb (byte 5 0) (next-byte)) 6)
+ (six-bits (next-byte))))
+ (3 (logior (ash (ldb (byte 4 0) (next-byte)) 12)
+ (ash (six-bits (next-byte)) 6)
+ (six-bits (next-byte))))
+ (4 (logior (ash (ldb (byte 3 0) (next-byte)) 18)
+ (ash (six-bits (next-byte)) 12)
+ (ash (six-bits (next-byte)) 6)
+ (six-bits (next-byte)))))))
+
+(defun utf-8-string-to-bytes (string)
+ (declare #.*optimize*)
+ (loop
+ with buffer = (make-array (utf-8-string-length string)
+ :element-type '(unsigned-byte 16))
+ with string-position = 0
+ with buffer-position = 0
+ with string-length = (length string)
+ while (< string-position string-length)
+ do (let* ((byte (char-code (char string string-position)))
+ (current-group (utf-8-group-size byte)))
+ (when (> (+ current-group string-position) string-length)
+ (error 'utf-8-decoding-error
+ :message "Unfinished character at end of byte array."))
+ (setf (aref buffer buffer-position)
+ (get-utf-8-character string current-group string-position))
+ (incf buffer-position 1)
+ (incf string-position current-group))
+ finally (return buffer)))
+
More information about the Bknr-cvs
mailing list