[cmucl-cvs] [git] CMU Common Lisp branch master updated. snapshot-2012-01-2-g63ecc76

Thu Jan 12 06:13:17 UTC 2012

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMU Common Lisp".

The branch, master has been updated
       via  63ecc76acc5a671c45af3a650a239ef59b825777 (commit)
      from  f364ebbe14fa351117a843905b06ae1f7fbff3ae (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 63ecc76acc5a671c45af3a650a239ef59b825777
Author: Raymond Toy <toy.raymond at gmail.com>
Date:   Wed Jan 11 22:08:48 2012 -0800

    Fix ticket:53 by adding UTF-8 to the core.
    
    code/extfmts.lisp:
    * Move the utf-8 implementation to this file
    * Update %find-external-format to return quickly for :utf-8.
    
    code/fd-stream-comp.lisp:
    * Precompile utf-8 functions.
    
    pcl/simple-streams/external-formats/utf-8.lisp:
    * Add note that this is only used for reference now because it's in
      the core.

diff --git a/src/code/extfmts.lisp b/src/code/extfmts.lisp
index 43adbcc..f7e684c 100644
--- a/src/code/extfmts.lisp
+++ b/src/code/extfmts.lisp
@@ -464,6 +464,9 @@
 	    (and (eq name :default) (eq *default-external-format* :iso8859-1)))
     (return-from %find-external-format
       (gethash :iso8859-1 *external-formats*)))
+  (when (eq name :utf-8)
+    (return-from %find-external-format
+      (gethash :utf-8 *external-formats*)))
 
   (when (zerop (hash-table-count *external-format-aliases*))
     (setf (gethash :latin1 *external-format-aliases*) :iso8859-1)
@@ -1158,3 +1161,119 @@ character and illegal outputs are replaced by a question mark.")
     `(setf (aref (ef-cache (find-external-format ,(ef-name ef))) ,slot)
 	 ,(subst (ef-name ef) ef
 		 (function-lambda-expression (aref (ef-cache ef) slot))))))
+
+;; A safe UTF-8 external format.  Any illegal UTF-8 sequences on input
+;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD), or
+;; signals an error as appropriate.
+;;
+;; See Table 3-7, Ch 3.9 in the Unicode book.
+
+(define-external-format :utf-8 (:min 1 :max 4 :documentation 
+"UTF-8 is a variable-length character encoding for Unicode.  By
+default, illegal input sequences are replaced by the Unicode
+replacement character.")
+
+  ()
+  (octets-to-code (state input unput error c i j n)
+    `(labels ((utf8 (,c ,i)
+		(declare (type (unsigned-byte 8) ,c)
+			 (type (integer 1 5) ,i))
+		(let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c)
+			       (* 6 ,i))))
+		  (declare (type (unsigned-byte 31) ,n))
+		  (dotimes (,j ,i (check ,n ,i))
+		    (let ((,c ,input))
+		      ;; Following bytes must all have the form
+		      ;; #b10xxxxxx.  If not, put back the octet we
+		      ;; just read and return the replacement character
+		      ;; for the bad sequence.
+		      (if (< (logxor ,c #x80) #x40)
+			  (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n)
+				(ldb (byte 6 0) ,c))
+			  (progn
+			    (,unput 1)
+			    (return
+			      (values
+			       (locally
+				   ;; No warnings about fdefinition
+				   (declare (optimize (ext:inhibit-warnings 3)))
+				 (if ,error
+				     (funcall ,error "Invalid utf8 octet #x~X at offset ~D"
+					      ,c (1+ ,j))
+				     +replacement-character-code+))
+			       (1+ ,j)))))))))
+	      (check (,n ,i)
+		(declare (type (unsigned-byte 31) ,n)
+			 (type (integer 1 5) ,i))
+		;; We check for overlong sequences (sequences that
+		;; encode to codepoints that don't need that long of a
+		;; sequence) and any surrogate values and any code
+		;; outside the 21-bit Unicode range.
+		(if (or (>= ,n lisp:codepoint-limit)
+			(<= ,n (the (member 127 2047 65535)
+				 (svref #(127 2047 65535) (1- ,i)))) ; overlong
+			(lisp::surrogatep ,n)) ; surrogate
+		    (progn
+		      ;; Replace the entire sequence with the
+		      ;; replacment character
+		      (values (if ,error
+				  (cond
+				    ((>= ,n lisp:codepoint-limit)
+				     (locally
+					 ;; No warnings about fdefinition
+					 (declare (optimize (ext:inhibit-warnings 3)))
+				       (funcall ,error "Invalid codepoint #x~X of ~D octets"
+						,n (1+ ,i))))
+				    ((lisp::surrogatep ,n)
+				     (locally
+					 ;; No warnings about fdefinition
+					 (declare (optimize (ext:inhibit-warnings 3)))
+				       (funcall ,error "Invalid surrogate code #x~X" ,n (1+ ,i))))
+				    (t
+				     (locally
+					 ;; No warnings about fdefinition
+					 (declare (optimize (ext:inhibit-warnings 3)))
+				       (funcall ,error "Overlong utf8 sequence of ~*~D octets" nil (1+ ,i)))))
+				  +replacement-character-code+)
+			      (1+ ,i)))
+		    (values ,n (1+ ,i)))))
+      (let ((,c ,input))
+	(declare (optimize (ext:inhibit-warnings 3)))
+	(cond ((null ,c) (values nil 0))
+	      ((< ,c #b10000000) (values ,c 1))
+	      ((< ,c #b11000010)
+	       (values
+		(locally
+		    ;; No warnings about fdefinition
+		    (declare (optimize (ext:inhibit-warnings 3)))
+		  (if ,error
+		      (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1)
+		      +replacement-character-code+))
+		       1))
+	      ((< ,c #b11100000) (utf8 ,c 1))
+	      ((< ,c #b11110000) (utf8 ,c 2))
+	      ((< ,c #b11111000) (utf8 ,c 3))
+	      (t
+	       (values
+		(locally
+		    ;; No warnings about fdefinition
+		    (declare (optimize (ext:inhibit-warnings 3)))
+		  (if ,error
+		      (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1)
+		      +replacement-character-code+))
+		1))))))
+  (code-to-octets (code state output error i j n p init)
+    `(flet ((utf8 (,n ,i)
+          (let* ((,j (- 6 ,i))
+             (,p (* 6 ,i))
+             (,init (logand #xFF (ash #b01111110 ,j))))
+        (,output (logior ,init (ldb (byte ,j ,p) ,n)))
+        (dotimes (,i ,i)
+          (decf ,p 6)
+          (,output (logior 128 (ldb (byte 6 ,p) ,n)))))))
+       (declare (optimize (ext:inhibit-warnings 3)))
+       (cond ((< ,code #x80) (,output ,code))
+         ((< ,code #x800) (utf8 ,code 1))
+         ((< ,code #x10000) (utf8 ,code 2))
+         ((< ,code #x110000) (utf8 ,code 3))
+         (t (error "How did this happen?  Codepoint U+~X is illegal" ,code))))))
diff --git a/src/code/fd-stream-comp.lisp b/src/code/fd-stream-comp.lisp
index a383499..e05dc1a 100644
--- a/src/code/fd-stream-comp.lisp
+++ b/src/code/fd-stream-comp.lisp
@@ -27,3 +27,11 @@
 (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-en+)
 (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-de+)
 
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-cin+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-cout+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-sout+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-os+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-so+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-en+)
+(stream::precompile-ef-slot :utf-8 #.stream::+ef-de+)
+
diff --git a/src/pcl/simple-streams/external-formats/utf-8.lisp b/src/pcl/simple-streams/external-formats/utf-8.lisp
index fe14f04..7d2084c 100644
--- a/src/pcl/simple-streams/external-formats/utf-8.lisp
+++ b/src/pcl/simple-streams/external-formats/utf-8.lisp
@@ -9,8 +9,13 @@
 (in-package "STREAM")
 (intl:textdomain "cmucl")
 
+;; This is actually implemented in the external-formats code
+;; It appears here only for reference, and will never get loaded
+
+
 ;; A safe UTF-8 external format.  Any illegal UTF-8 sequences on input
-;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD).
+;; are replaced with the Unicode REPLACEMENT CHARACTER (U+FFFD), or
+;; signals an error as appropriate.
 ;;
 ;; See Table 3-7, Ch 3.9 in the Unicode book.
 

-----------------------------------------------------------------------

Summary of changes:
 src/code/extfmts.lisp                              |  119 ++++++++++++++++++++
 src/code/fd-stream-comp.lisp                       |    8 ++
 src/pcl/simple-streams/external-formats/utf-8.lisp |    7 +-
 3 files changed, 133 insertions(+), 1 deletions(-)


hooks/post-receive
-- 
CMU Common Lisp