[git] CMU Common Lisp branch master updated. snapshot-2013-05-14-gb8c0807

Raymond Toy rtoy at common-lisp.net
Sun May 26 15:24:45 UTC 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "CMU Common Lisp".

The branch, master has been updated
       via  b8c080759c41e1a16d0468a4138799e4a3d02151 (commit)
       via  a3f78ebd73e13965fb0d609f639737a1428d6578 (commit)
       via  49f041ad84bf812b25d4fffc210da123400cb6f0 (commit)
       via  9d66b2585eb33ff8106511da512b4772a3887aab (commit)
      from  9f62dcdfab39ef03cf01969b6ea88b962073d09f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit b8c080759c41e1a16d0468a4138799e4a3d02151
Author: Raymond Toy <toy.raymond at gmail.com>
Date:   Sun May 26 08:24:37 2013 -0700

    Update.

diff --git a/src/general-info/release-20e.txt b/src/general-info/release-20e.txt
index 90df1d4..ee290d9 100644
--- a/src/general-info/release-20e.txt
+++ b/src/general-info/release-20e.txt
@@ -43,10 +43,15 @@ New in this release:
         Pentium.)
     * Update unicode to support Unicode 6.2.
     * Add MP:PROCESS-JOIN, as given in ticket #77.
+    * Added UNICODE package to hold Unicode-related functions.
+      Currently contains Unicode extended versions of STRING-UPCASE,
+      STRING-DOWNCASE, and STRING-CAPITALIZE.
 
   * ANSI compliance fixes:
     * Attempts to modify the standard readtable or the standard pprint
       dispatch table will now signal a continuable error.
+    * Remove the Unicode extensions from string-upcase and friends.
+      This functionality is moved to the new UNICODE package.
 
   * Bugfixes:
     * REPLACE and friends on strings were limited to strings less than

commit a3f78ebd73e13965fb0d609f639737a1428d6578
Author: Raymond Toy <toy.raymond at gmail.com>
Date:   Sun May 26 08:22:00 2013 -0700

    Import many unicode symbols from LISP into the UNICODE package.
    
     src/code/exports.lisp::
     * Import symbols
    
     src/code/unicode.lisp::
     * Remove LISP package prefix from all unicode-related symbols.

diff --git a/src/code/exports.lisp b/src/code/exports.lisp
index 249bc32..b2bdeab 100644
--- a/src/code/exports.lisp
+++ b/src/code/exports.lisp
@@ -2447,6 +2447,20 @@
   (:shadow "STRING-CAPITALIZE"
 	   "STRING-DOWNCASE"
 	   "STRING-UPCASE")
+  (:import-from "LISP"
+		"CODEPOINT"
+		"SURROGATES"
+		"UNICODE-FULL-CASE-LOWER"
+		"UNICODE-FULL-CASE-UPPER"
+		"UNICODE-FULL-CASE-TITLE"
+		"UNICODE-CATEGORY"
+		"+UNICODE-CATEGORY-LOWER+"
+		"+UNICODE-CATEGORY-OTHER+"
+		"+UNICODE-CATEGORY-GRAPHIC+"
+		"+UNICODE-CATEGORY-UPPER+"
+		"+UNICODE-CATEGORY-TITLE+"
+		"UNICODE-UPPER"
+		"UNICODE-WORD-BREAK")
   (:export "STRING-CAPITALIZE"
 	   "STRING-DOWNCASE"
 	   "STRING-UPCASE"
diff --git a/src/code/unicode.lisp b/src/code/unicode.lisp
index 98fe880..cbd7713 100644
--- a/src/code/unicode.lisp
+++ b/src/code/unicode.lisp
@@ -36,14 +36,14 @@
 	      ((= index (the fixnum end)))
 	    (declare (fixnum index))
 	    (multiple-value-bind (code wide)
-		(lisp:codepoint string index)
+		(codepoint string index)
 	      (when wide (incf index))
 	      ;; Handle ASCII specially because this is called early in
 	      ;; initialization, before unidata is available.
 	      (cond ((< 96 code 123)
 		     (write-char (code-char (decf code 32)) s))
 		    ((> code 127)
-		     (write-string (lisp:unicode-full-case-upper code) s))
+		     (write-string (unicode-full-case-upper code) s))
 		    (t
 		     (multiple-value-bind (hi lo)
 			 (surrogates code)
@@ -82,14 +82,14 @@
 	      ((= index (the fixnum end)))
 	    (declare (fixnum index))
 	    (multiple-value-bind (code wide)
-		(lisp:codepoint string index)
+		(codepoint string index)
 	      (when wide (incf index))
 	      ;; Handle ASCII specially because this is called early in
 	      ;; initialization, before unidata is available.
 	      (cond ((< 64 code 91)
 		     (write-char (code-char (incf code 32)) s))
 		    ((> code 127)
-		     (write-string (lisp:unicode-full-case-lower code) s))
+		     (write-string (unicode-full-case-lower code) s))
 		    (t
 		     ;; Handle codes below 64
 		     (multiple-value-bind (hi lo)
@@ -230,7 +230,7 @@
 	((char-word-break-category (c)
 	   ;; Map our unicode word break property into what this
 	   ;; algorithm wants.
-	   (let ((cat (lisp::unicode-word-break c)))
+	   (let ((cat (unicode-word-break c)))
 	     (case cat
 	       ((:lf :cr :newline)
 		:sep)
@@ -241,7 +241,7 @@
 	   ;; Given a valid index i into s, returns the left context
 	   ;; at i.
 	   (multiple-value-bind (c widep)
-	       (lisp:codepoint s i n)
+	       (codepoint s i n)
 	     (let* ((back
 		     ;; If we're at a regular character or a leading
 		     ;; surrogate, decrementing by 1 gets us the to
@@ -279,7 +279,7 @@
 
 	   (let* ((j1 (- j 1)))
 	     (multiple-value-bind (c widep)
-		 (lisp:codepoint s j1)
+		 (codepoint s j1)
 	       (when (eql widep -1)
 		 ;; Back up one more if we're at the trailing
 		 ;; surrogate.
@@ -302,7 +302,7 @@
 		    (if (< i j) j n)))
 		 (otherwise n))
 	       (multiple-value-bind (c widep)
-		   (lisp:codepoint s j)
+		   (codepoint s j)
 		 (let* ((next-j
 			 ;; The next character is either 1 or 2 code
 			 ;; units away.  For a leading surrogate, it's
@@ -394,7 +394,7 @@
 	     n)
 	    (t
 	     (multiple-value-bind (c widep)
-		 (lisp:codepoint s i)
+		 (codepoint s i)
 	       (declare (ignore c))
 	       (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
 
@@ -416,15 +416,15 @@
 			 (:simple
 			  #'(lambda (ch)
 			      (multiple-value-bind (hi lo)
-				  (lisp::surrogates (lisp::unicode-upper ch))
+				  (surrogates (unicode-upper ch))
 				(write-char hi result)
 				(when lo (write-char lo result)))))
 			 (:full
 			  #'(lambda (ch)
-			      (write-string (lisp::unicode-full-case-upper ch) result)))
+			      (write-string (unicode-full-case-upper ch) result)))
 			 (:title
 			  #'(lambda (ch)
-			      (write-string (lisp::unicode-full-case-title ch) result))))))
+			      (write-string (unicode-full-case-title ch) result))))))
 	    (do ((start start next)
 		 (next (string-next-word-break string start)
 		       (string-next-word-break string next)))
@@ -432,7 +432,7 @@
 		     (>= start end)))
 	      ;; Convert the first character of the word to upper
 	      ;; case, and then make the rest of the word lowercase.
-	      (funcall upper (lisp:codepoint string start))
+	      (funcall upper (codepoint string start))
 	      (write-string (string-downcase string :start (1+ start)
 						    :end next
 						    :casing casing)
@@ -459,29 +459,29 @@
 		   (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
 		       #+(and unicode (not unicode-bootstrap))
 		       (and (> m 127)
-			    (<= lisp::+unicode-category-letter+
-				(lisp::unicode-category m)
-				(+ lisp::+unicode-category-letter+ #x0F)))))
+			    (<= +unicode-category-letter+
+				(unicode-category m)
+				(+ +unicode-category-letter+ #x0F)))))
 		 (upper (ch)
 		   (ecase casing
 		     (:simple
 		      #'(lambda (ch)
 			  (multiple-value-bind (hi lo)
-			      (lisp::surrogates (lisp::unicode-upper ch))
+			      (surrogates (unicode-upper ch))
 			    (write-char hi s)
 			    (when lo (write-char lo s)))))
 		     (:full
 		      #'(lambda (ch)
-			  (write-string (lisp::unicode-full-case-upper ch) s)))
+			  (write-string (unicode-full-case-upper ch) s)))
 		     (:title
 		      #'(lambda (ch)
-			  (write-string (lisp::unicode-full-case-title ch) s))))))
+			  (write-string (unicode-full-case-title ch) s))))))
 	    (do ((index start (1+ index))
 		 (newword t))
 		((= index (the fixnum end)))
 	      (declare (fixnum index))
 	      (multiple-value-bind (code wide)
-		  (lisp:codepoint string index)
+		  (codepoint string index)
 		(when wide (incf index))
 		(cond ((not (alphanump code))
 		       (multiple-value-bind (hi lo)
@@ -495,7 +495,7 @@
 		       (setq newword ()))
 		      (t
 		       ;; char is case-modifiable, but not first
-		       (write-string (lisp:unicode-full-case-lower code) s))))))
+		       (write-string (unicode-full-case-lower code) s))))))
 	  (write-string string s :start end :end offset-slen))))))
 
 (defun string-capitalize (string &key (start 0) end

commit 49f041ad84bf812b25d4fffc210da123400cb6f0
Author: Raymond Toy <toy.raymond at gmail.com>
Date:   Sat May 25 14:38:36 2013 -0700

    Remove all the extensions to string-upcase and friends.  The extended
    functions now live in the new UNICODE package.
    
     src/code/exports.lisp::
     * Export some unicode functions and constants.
    
     src/code/string.lisp::
     * Removed the extended versions of string-upcase and friends.
     * Export surrogates function.
     * Make sure with-one-string is defined so the unicode package can use
       it.
    
     src/code/unicode.lisp:;
     * New file with extended versions of string-upcase and friends.
    
     src/code/unidata.lisp::
     * Export some unicode functions and constants.
    
     src/compiler/fndb.lisp::
     * Update defknowns for string-upcase and friends.
    
     src/tools/worldbuild.lisp::
     * Build unicode.lisp
    
     src/tools/worldcom.lisp::
     * Load unicode.lisp

diff --git a/src/code/exports.lisp b/src/code/exports.lisp
index 6c7bbed..249bc32 100644
--- a/src/code/exports.lisp
+++ b/src/code/exports.lisp
@@ -913,7 +913,17 @@
   (:export "STRING-TO-NFC" "STRING-TO-NFD"
 	   "STRING-TO-NFKC" "STRING-TO-NFKD"
 	   "UNICODE-COMPLETE" "UNICODE-COMPLETE-NAME"
-	   "LOAD-ALL-UNICODE-DATA"))
+	   "UNICODE-FULL-CASE-LOWER"
+	   "UNICODE-FULL-CASE-UPPER"
+	   "UNICODE-FULL-CASE-TITLE"
+	   "UNICODE-CATEGORY"
+	   "+UNICODE-CATEGORY-LOWER+"
+	   "+UNICODE-CATEGORY-OTHER+"
+	   "+UNICODE-CATEGORY-GRAPHIC+"
+	   "+UNICODE-CATEGORY-UPPER+"
+	   "+UNICODE-CATEGORY-TITLE+"
+	   "LOAD-ALL-UNICODE-DATA"
+	   "SURROGATES"))
 
 (defpackage "EVAL"
   (:export "*EVAL-STACK-TRACE*" "*INTERNAL-APPLY-NODE-TRACE*"
@@ -2432,4 +2442,14 @@
 	   "LAST-FWRAPPER"
 	   "DO-FWRAPPERS"))
 
+(defpackage "UNICODE"
+  (:use "COMMON-LISP")
+  (:shadow "STRING-CAPITALIZE"
+	   "STRING-DOWNCASE"
+	   "STRING-UPCASE")
+  (:export "STRING-CAPITALIZE"
+	   "STRING-DOWNCASE"
+	   "STRING-UPCASE"
+	   "STRING-NEXT-WORD-BREAK"))
+
 
diff --git a/src/code/string.lisp b/src/code/string.lisp
index 5176edf..8005cda 100644
--- a/src/code/string.lisp
+++ b/src/code/string.lisp
@@ -30,7 +30,7 @@
 	  nstring-capitalize))
 
 #+unicode
-(export '(string-to-nfd string-to-nfkd string-to-nfkc))
+(export '(string-to-nfd string-to-nfkd string-to-nfkc surrogates))
 
 (declaim (inline surrogatep surrogates-to-codepoint codepoint surrogates))
 
@@ -159,7 +159,7 @@
 ;;; With-One-String is used to set up some string hacking things.  The keywords
 ;;; are parsed, and the string is hacked into a simple-string.
 
-(eval-when (compile)
+(eval-when (compile load eval)
 
 (defmacro with-one-string (string start end cum-offset &rest forms)
   `(let ((,string (if (stringp ,string) ,string (string ,string))))
@@ -594,7 +594,9 @@
 	(setf (schar string i) fill-char))
       (make-string count)))
 
-(defun string-upcase-simple (string &key (start 0) end)
+(defun string-upcase (string &key (start 0) end)
+  _N"Given a string, returns a new string that is a copy of it with all
+  lower case alphabetic characters converted to uppercase."
   (declare (fixnum start))
   (let* ((string (if (stringp string) string (string string)))
 	 (slen (length string)))
@@ -634,54 +636,9 @@
 	  (setf (schar newstring new-index) (schar string index)))
 	newstring))))
 
-#+unicode
-(defun string-upcase-full (string &key (start 0) end)
-  (declare (fixnum start))
-  (let* ((string (if (stringp string) string (string string)))
-	 (slen (length string)))
-    (declare (fixnum slen))
-    (with-output-to-string (s)
-      (with-one-string string start end offset
-        (let ((offset-slen (+ slen offset)))
-	  (declare (fixnum offset-slen))
-	  (write-string string s :start offset :end start)
-	  (do ((index start (1+ index)))
-	      ((= index (the fixnum end)))
-	    (declare (fixnum index))
-	    (multiple-value-bind (code wide)
-		(codepoint string index)
-	      (when wide (incf index))
-	      ;; Handle ASCII specially because this is called early in
-	      ;; initialization, before unidata is available.
-	      (cond ((< 96 code 123)
-		     (write-char (code-char (decf code 32)) s))
-		    ((> code 127)
-		     (write-string (unicode-full-case-upper code) s))
-		    (t
-		     (multiple-value-bind (hi lo)
-			 (surrogates code)
-		       (write-char hi s)
-		       (when lo
-			 (write-char lo s)))))))
-	  (write-string string s :start end :end offset-slen))))))
-
-(defun string-upcase (string &key (start 0) end #+unicode (casing :simple))
-  #-unicode
+(defun string-downcase (string &key (start 0) end)
   _N"Given a string, returns a new string that is a copy of it with all
-  lower case alphabetic characters converted to uppercase."
-  #+unicode
-  _N"Given a string, returns a new string that is a copy of it with all
-  lower case alphabetic characters converted to uppercase.  Casing is
-  :simple or :full for simple or full case conversion, respectively."
-  (declare (fixnum start))
-  #-unicode
-  (string-upcase-simple string :start start :end end)
-  #+unicode
-  (if (eq casing :simple)
-      (string-upcase-simple string :start start :end end)
-      (string-upcase-full string :start start :end end)))
-
-(defun string-downcase-simple (string &key (start 0) end)
+  upper case alphabetic characters converted to lowercase."
   (declare (fixnum start))
   (let* ((string (if (stringp string) string (string string)))
 	 (slen (length string)))
@@ -720,54 +677,12 @@
 	  (setf (schar newstring new-index) (schar string index)))
 	newstring))))
 
-#+unicode
-(defun string-downcase-full (string &key (start 0) end)
-  (declare (fixnum start))
-  (let* ((string (if (stringp string) string (string string)))
-	 (slen (length string)))
-    (declare (fixnum slen))
-    (with-output-to-string (s)
-      (with-one-string string start end offset
-        (let ((offset-slen (+ slen offset)))
-	  (declare (fixnum offset-slen))
-	  (write-string string s :start offset :end start)
-	  (do ((index start (1+ index)))
-	      ((= index (the fixnum end)))
-	    (declare (fixnum index))
-	    (multiple-value-bind (code wide)
-		(codepoint string index)
-	      (when wide (incf index))
-	      ;; Handle ASCII specially because this is called early in
-	      ;; initialization, before unidata is available.
-	      (cond ((< 64 code 91)
-		     (write-char (code-char (incf code 32)) s))
-		    ((> code 127)
-		     (write-string (unicode-full-case-lower code) s))
-		    (t
-		     (multiple-value-bind (hi lo)
-			 (surrogates code)
-		       (write-char hi s)
-		       (when lo
-			 (write-char lo s)))))))
-	  (write-string string s :start end :end offset-slen))))))
-
-(defun string-downcase (string &key (start 0) end #+unicode (casing :simple))
-  #-unicode
-  _N"Given a string, returns a new string that is a copy of it with all
-  upper case alphabetic characters converted to lowercase."
-  #+unicode
-  _N"Given a string, returns a new string that is a copy of it with all
-  upper case alphabetic characters converted to lowercase.  Casing is
-  :simple or :full for simple or full case conversion, respectively."
-  (declare (fixnum start))
-  #-unicode
-  (string-downcase-simple string :start start :end end)
-  #+unicode
-  (if (eq casing :simple)
-      (string-downcase-simple string :start start :end end)
-      (string-downcase-full string :start start :end end)))
-
-(defun string-capitalize-simple (string &key (start 0) end)
+(defun string-capitalize (string &key (start 0) end)
+  _N"Given a string, returns a copy of the string with the first
+  character of each ``word'' converted to upper-case, and remaining
+  chars in the word converted to lower case. A ``word'' is defined
+  to be a string of case-modifiable characters delimited by
+  non-case-modifiable chars."
   (declare (fixnum start))
   (let* ((string (if (stringp string) string (string string)))
 	 (slen (length string)))
@@ -804,74 +719,6 @@
 	  (setf (schar newstring new-index) (schar string index)))
 	newstring))))
 
-#+unicode
-(defun string-capitalize-full (string &key (start 0) end)
-  (declare (fixnum start))
-  (let* ((string (if (stringp string) string (string string)))
-	 (slen (length string)))
-    (declare (fixnum slen))
-    (with-output-to-string (s)
-      (with-one-string string start end offset
-        (let ((offset-slen (+ slen offset)))
-	  (declare (fixnum offset-slen))
-	  (write-string string s :start offset :end start)
-	  (flet ((alphanump (m)
-		   (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
-		       #+(and unicode (not unicode-bootstrap))
-		       (and (> m 127)
-			    (<= +unicode-category-letter+ (unicode-category m)
-				(+ +unicode-category-letter+ #x0F))))))
-	    (do ((index start (1+ index))
-		 (newword t))
-		((= index (the fixnum end)))
-	      (declare (fixnum index))
-	      (multiple-value-bind (code wide)
-		  (codepoint string index)
-		(when wide (incf index))
-		(cond ((not (alphanump code))
-		       (multiple-value-bind (hi lo)
-			   (surrogates code)
-			 (write-char hi s)
-			 (when lo (write-char lo s)))
-		       (setq newword t))
-		      (newword
-		       ;;char is first case-modifiable after non-case-modifiable
-		       (write-string (unicode-full-case-title code) s)
-		       (setq newword ()))
-		      ;;char is case-modifiable, but not first
-		      (t
-		       (write-string (unicode-full-case-lower code) s))))))
-	  (write-string string s :start end :end offset-slen))))))
-
-(defun string-capitalize (string &key (start 0) end
-				 #+unicode (casing :simple)
-				 #+unicode unicode-word-break)
-  #-unicode
-  _N"Given a string, returns a copy of the string with the first
-  character of each ``word'' converted to upper-case, and remaining
-  chars in the word converted to lower case. A ``word'' is defined
-  to be a string of case-modifiable characters delimited by
-  non-case-modifiable chars."
-  #+unicode
-  _N"Given a string, returns a copy of the string with the first
-  character of each ``word'' converted to upper-case, and remaining
-  chars in the word converted to lower case. Casing is :simple or
-  :full for simple or full case conversion, respectively.  If
-  Unicode-Word-Break is non-Nil, then the Unicode word-breaking
-  algorithm is used to determine the word boundaries.  Otherwise, A
-  ``word'' is defined to be a string of case-modifiable characters
-  delimited by non-case-modifiable chars.  "
-  
-  (declare (fixnum start))
-  #-unicode
-  (string-capitalize-simple string :start start :end end)
-  #+unicode
-  (if unicode-word-break
-      (string-capitalize-unicode string :start start :end end :casing casing)
-      (if (eq casing :simple)
-	  (string-capitalize-simple string :start start :end end)
-	  (string-capitalize-full string :start start :end end))))
-
 (defun nstring-upcase (string &key (start 0) end)
   "Given a string, returns that string with all lower case alphabetic
   characters converted to uppercase."
@@ -1390,322 +1237,6 @@
   (if (simple-string-p string) string (coerce string 'simple-string)))
 
 
-;;;
-;;; This is a Lisp translation of the Scheme code from William
-;;; D. Clinger that implements the word-breaking algorithm.  This is
-;;; used with permission.
-;;;
-;;; This version is modified from the original at
-;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's
-;;; implementation of the word break properties.
-;;;
-;;;
-;;; Copyright statement and original comments:
-;;;
-;;;--------------------------------------------------------------------------------
-
-;; Copyright 2006 William D Clinger.
-;;
-;; Permission to copy this software, in whole or in part, to use this
-;; software for any lawful purpose, and to redistribute this software
-;; is granted subject to the restriction that all copies made of this
-;; software must include this copyright and permission notice in full.
-;;
-;; I also request that you send me a copy of any improvements that you
-;; make to this software so that they may be incorporated within it to
-;; the benefit of the Scheme community.
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;
-;; Word-breaking as defined by Unicode Standard Annex #29.
-;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Implementation notes.
-;;
-;; The string-foldcase, string-downcase, and string-titlecase
-;; procedures rely on the notion of a word, which is defined
-;; by Unicode Standard Annex 29.
-;;
-;; The string-foldcase and string-downcase procedures rely on
-;; word boundaries only when they encounter a Greek sigma, so
-;; their performance should not be greatly affected by the
-;; performance of the word-breaking algorithm.
-;;
-;; The string-titlecase procedure must find all word boundaries,
-;; but it is typically used on short strings (titles).
-;;
-;; Hence the performance of the word-breaking algorithm should
-;; not matter too much for this reference implementation.
-;; Word-breaking is more generally useful, however, so I tried
-;; to make this implementation reasonably efficient.
-;;
-;; Word boundaries are defined by 14 different rules in
-;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt
-;; and WordBreakProperty.txt.  See also WordBreakTest.html.
-;;
-;; My original implementation of those specifications failed
-;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it
-;; appeared to me that those tests were inconsistent with the
-;; word-breaking rules in UAX #29.  John Cowan forwarded my
-;; bug report to the Unicode experts, and Mark Davis responded
-;; on 29 May 2007:
-;;
-;;   Thanks for following up on this. I think you have found a problem in the
-;;   formulation of word break, not the test. The intention was to break after a
-;;   Sep character, as is done in Sentence break. So my previous suggestion was
-;;   incorrect; instead, what we need is a new rule:
-;; 
-;;   *Break after paragraph separators.*
-;;    WB3a. Sep �
-;;   I'll make a propose to the UTC for this.
-;;
-;; Here is Will's translation of those rules (including WB3a)
-;; into a finite state machine that searches forward within a
-;; string, looking for the next position at which a word break
-;; is allowed.  The current state consists of an index i into
-;; the string and a summary of the left context whose rightmost
-;; character is at index i.  The left context is usually
-;; determined by the character at index i, but there are three
-;; complications:
-;;
-;;     Extend and Format characters are ignored unless they
-;;         follow a separator or the beginning of the text.
-;;     ALetter followed by MidLetter is treated specially.
-;;     Numeric followed by MidNum is treated specially.
-;;
-;; In the implementation below, the left context ending at i
-;; is encoded by the following symbols:
-;;
-;;     CR
-;;     Sep (excluding CR)
-;;     ALetter
-;;     MidLetter
-;;     ALetterMidLetter (ALetter followed by MidLetter)
-;;     Numeric
-;;     MidNum
-;;     NumericMidNum (Numeric followed by MidNum)
-;;     Katakana
-;;     ExtendNumLet
-;;     other (none of the above)
-;;
-;; Given a string s and an exact integer i (which need not be
-;; a valid index into s), returns the index of the next character
-;; that is not part of the word containing the character at i,
-;; or the length of s if the word containing the character at i
-;; extends through the end of s.  If i is negative or a valid
-;; index into s, then the returned value will be greater than i.
-;;
-;;;--------------------------------------------------------------------------------
-
-(defun string-next-word-break (s i)
-  (let ((n (length s)))
-    (labels
-	((char-word-break-category (c)
-	   ;; Map our unicode word break property into what this
-	   ;; algorithm wants.
-	   (let ((cat (lisp::unicode-word-break c)))
-	     (case cat
-	       ((:lf :cr :newline)
-		:sep)
-	       ((:extend :format)
-		:extend-or-format)
-	       (otherwise cat))))
-	 (left-context (i)
-	   ;; Given a valid index i into s, returns the left context
-	   ;; at i.
-	   (multiple-value-bind (c widep)
-	       (lisp::codepoint s i n)
-	     (let* ((back
-		     ;; If we're at a regular character or a leading
-		     ;; surrogate, decrementing by 1 gets us the to
-		     ;; previous character.  But for a trailing
-		     ;; surrogate, we need to decrement by 2!
-		     (if (eql widep -1)
-			 2
-			 1))
-		    (cat (char-word-break-category c)))
-	       (case cat
-		 ((:sep)
-		  (if (= c (char-code #\return)) :cr cat))
-		 ((:midletter :midnumlet)
-		  (let ((i-1 (- i back)))
-		    (if (and (<= 0 i-1)
-			     (eq (left-context i-1) :aletter))
-			:aletter-midletter
-			cat)))
-		 ((:midnum :midnumlet)
-		  (let ((i-1 (- i back)))
-		    (if (and (<= 0 i-1)
-			     (eq (left-context i-1) :numeric))
-			:numeric-midnum
-			cat)))
-		 ((:extendorformat)
-		  (if (< 0 i)
-		      (left-context (- i back))
-		      :other))
-		 (otherwise cat)))))
-
-	 (index-of-previous-non-ignored (j)
-	   ;; Returns the index of the last non-Extend, non-Format
-	   ;; character within (substring s 0 j).  Should not be
-	   ;; called unless such a character exists.
-
-	   (let* ((j1 (- j 1)))
-	     (multiple-value-bind (c widep)
-		 (lisp::codepoint s j1)
-	       (when (eql widep -1)
-		 ;; Back up one more if we're at the trailing
-		 ;; surrogate.
-		 (decf j1))
-	       (let ((cat (char-word-break-category c)))
-		 (case cat
-		   ((:extend-or-format)
-		    (index-of-previous-non-ignored j1))
-		   (otherwise j1))))))
-
-	 (lookup (j context)
-	   ;; Given j and the context to the left of (not including) j,
-	   ;; returns the index at the start of the next word
-	   ;; (or before which a word break is permitted).
-
-	   (if (>= j n)
-	       (case context
-		 ((:aletter-midletter :numeric-midnum)
-		  (let ((j (index-of-previous-non-ignored n)))
-		    (if (< i j) j n)))
-		 (otherwise n))
-	       (multiple-value-bind (c widep)
-		   (lisp::codepoint s j)
-		 (let* ((next-j
-			 ;; The next character is either 1 or 2 code
-			 ;; units away.  For a leading surrogate, it's
-			 ;; 2; Otherwise just 1.
-			 (if (eql widep 1)
-			     2
-			     1))
-			(cat (char-word-break-category c)))
-		   (case cat
-		     ((:extend-or-format)
-		      (case context
-			((:cr :sep) j)
-			(otherwise (lookup (+ j next-j) context))))
-		     (otherwise
-		      (case context
-			((:cr)
-			 (if (= c (char-code #\linefeed))
-			     ;; Rule WB3:  Don't break CRLF, continue looking
-			     (lookup (+ j next-j) cat)
-			     j))
-			((:aletter)
-			 (case cat
-			   ((:aletter :numeric :extendnumlet)
-			    ;; Rules WB5, WB9, ?
-			    (lookup (+ j next-j) cat))
-			   ((:midletter :midnumlet)
-			    ;; Rule WB6, need to keep looking
-			    (lookup (+ j next-j) :aletter-midletter))
-			   (otherwise j)))
-			((:aletter-midletter)
-			 (case cat
-			   ((:aletter)
-			    ;; Rule WB7
-			    (lookup (+ j next-j) cat))
-			   (otherwise
-			    ;; Rule WB6 and WB7 were extended, but the
-			    ;; region didn't end with :aletter.  So
-			    ;; backup and break at that point.
-			    (let ((j2 (index-of-previous-non-ignored j)))
-			      (if (< i j2) j2 j)))))
-			((:numeric)
-			 (case cat
-			   ((:numeric :aletter :extendnumlet)
-			    ;; Rules WB8, WB10, ?
-			    (lookup (+ j next-j) cat))
-			   ((:midnum :midnumlet)
-			    ;; Rules WB11, need to keep looking
-			    (lookup (+ j next-j) :numeric-midnum))
-			   (otherwise j)))
-			((:numeric-midnum)
-			 (case cat
-			   ((:numeric)
-			    ;; Rule WB11, keep looking
-			    (lookup (+ j next-j) cat))
-			   (otherwise
-			    ;; Rule WB11, WB12 were extended, but the
-			    ;; region didn't end with :numeric, so
-			    ;; backup and break at that point.
-			    (let ((j2 (index-of-previous-non-ignored j)))
-			      (if (< i j2) j2 j)))))
-			((:midletter :midnum :midnumlet)
-			 ;; Rule WB14
-			 j)
-			((:katakana)
-			 (case cat
-			   ((:katakana :extendnumlet)
-			    ;; Rule WB13, WB13a
-			    (lookup (+ j next-j) cat))
-			   (otherwise j)))
-			((:extendnumlet)
-			 (case cat
-			   ((:extendnumlet :aletter :numeric :katakana)
-			    ;; Rule WB13a, WB13b
-			    (lookup (+ j next-j) cat))
-			   (otherwise j)))
-			((:regional_indicator)
-			 (case cat
-			   ((:regional_indicator)
-			    ;; Rule WB13c
-			    (lookup (+ j next-j) cat))
-			   (otherwise j)))
-			(otherwise j)))))))))
-      (declare (notinline lookup left-context))
-      (cond ((< i 0)
-	     ;; Rule WB1
-	     0)
-	    ((<= n i)
-	     ;; Rule WB2
-	     n)
-	    (t
-	     (multiple-value-bind (c widep)
-		 (lisp::codepoint s i)
-	       (declare (ignore c))
-	       (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
-
-(defun string-capitalize-unicode (string &key (start 0) end (casing :simple))
-  (declare (type (member :simple :full) casing))
-  (let* ((string (if (stringp string) string (string string)))
-	 (slen (length string)))
-    (declare (fixnum slen))
-    (with-output-to-string (result)
-      (lisp::with-one-string string start end offset
-        (let ((offset-slen (+ slen offset)))
-	  (declare (fixnum offset-slen))
-
-	  (write-string string result :start 0 :end start)
-	  (let ((upper (ecase casing
-			 (:simple
-			  #'(lambda (ch)
-			      (multiple-value-bind (hi lo)
-				  (lisp::surrogates (lisp::unicode-upper ch))
-				(write-char hi result)
-				(when lo (write-char lo result)))))
-			 (:full
-			  #'(lambda (ch)
-			      (write-string (lisp::unicode-full-case-title ch) result))))))
-	    (do ((start start next)
-		 (next (string-next-word-break string start)
-		       (string-next-word-break string next)))
-		((or (= start next)
-		     (>= start end)))
-	      ;; Convert the first character of the word to upper
-	      ;; case, and then make the rest of the word lowercase.
-	      (funcall upper (lisp::codepoint string start))
-	      (write-string (string-downcase string :start (1+ start) :end next :casing casing)
-			    result :start (1+ start) :end next)))
-	  (write-string string result :start end :end offset-slen))))))
-
-
 ;; Some utilities
 (defun codepoints-string (seq)
   "Convert a sequence of codepoints to a string.  Codepoints outside
diff --git a/src/code/unicode.lisp b/src/code/unicode.lisp
new file mode 100644
index 0000000..98fe880
--- /dev/null
+++ b/src/code/unicode.lisp
@@ -0,0 +1,519 @@
+;;; -*- Log: code.log; Package: Unicode -*-
+;;;
+;;; **********************************************************************
+;;; This code was written as part of the CMU Common Lisp project at
+;;; Carnegie Mellon University, and has been placed in the public domain.
+;;;
+(ext:file-comment
+  "$Header: src/code/unicode.lisp $")
+;;;
+;;; **********************************************************************
+;;;
+;;; Functions to process Unicode strings for CMU Common Lisp
+;;; Written by Paul Foley and Raymond Toy.
+;;;
+;;; ****************************************************************
+;;;
+(in-package "UNICODE")
+(intl:textdomain "cmucl")
+
+;; An example where this differs from cl:string-upcase differ:
+;; #\Latin_Small_Letter_Sharp_S
+(defun string-upcase-full (string &key (start 0) end)
+  _N"Given a string, returns a new string that is a copy of it with
+  all lower case alphabetic characters converted to uppercase using
+  full case conversion."
+  (declare (fixnum start)) (let* ((string (if
+  (stringp string) string (string string)))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (with-output-to-string (s)
+      (lisp::with-one-string string start end offset
+        (let ((offset-slen (+ slen offset)))
+	  (declare (fixnum offset-slen))
+	  (write-string string s :start offset :end start)
+	  (do ((index start (1+ index)))
+	      ((= index (the fixnum end)))
+	    (declare (fixnum index))
+	    (multiple-value-bind (code wide)
+		(lisp:codepoint string index)
+	      (when wide (incf index))
+	      ;; Handle ASCII specially because this is called early in
+	      ;; initialization, before unidata is available.
+	      (cond ((< 96 code 123)
+		     (write-char (code-char (decf code 32)) s))
+		    ((> code 127)
+		     (write-string (lisp:unicode-full-case-upper code) s))
+		    (t
+		     (multiple-value-bind (hi lo)
+			 (surrogates code)
+		       (write-char hi s)
+		       (when lo
+			 (write-char lo s)))))))
+	  (write-string string s :start end :end offset-slen))))))
+
+(defun string-upcase (string &key (start 0) end (casing :full))
+  _N"Given a string, returns a new string that is a copy of it with
+  all lower case alphabetic characters converted to uppercase.  Casing
+  is :simple or :full for simple or full case conversion,
+  respectively."
+  (declare (fixnum start))
+  (if (eq casing :simple)
+      (cl:string-upcase string :start start :end end)
+      (string-upcase-full string :start start :end end)))
+
+
+;; An example this differs from cl:string-downcase:
+;; #\Latin_Capital_Letter_I_With_Dot_Above.
+(defun string-downcase-full (string &key (start 0) end)
+  _N"Given a string, returns a new string that is a copy of it with
+  all uppercase alphabetic characters converted to lowercase using
+  full case conversion.."
+  (declare (fixnum start))
+  (let* ((string (if (stringp string) string (string string)))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (with-output-to-string (s)
+      (lisp::with-one-string string start end offset
+        (let ((offset-slen (+ slen offset)))
+	  (declare (fixnum offset-slen))
+	  (write-string string s :start offset :end start)
+	  (do ((index start (1+ index)))
+	      ((= index (the fixnum end)))
+	    (declare (fixnum index))
+	    (multiple-value-bind (code wide)
+		(lisp:codepoint string index)
+	      (when wide (incf index))
+	      ;; Handle ASCII specially because this is called early in
+	      ;; initialization, before unidata is available.
+	      (cond ((< 64 code 91)
+		     (write-char (code-char (incf code 32)) s))
+		    ((> code 127)
+		     (write-string (lisp:unicode-full-case-lower code) s))
+		    (t
+		     ;; Handle codes below 64
+		     (multiple-value-bind (hi lo)
+			 (surrogates code)
+		       (write-char hi s)
+		       (when lo
+			 (write-char lo s)))))))
+	  (write-string string s :start end :end offset-slen))))))
+
+(defun string-downcase (string &key (start 0) end (casing :full))
+  _N"Given a string, returns a new string that is a copy of it with all
+  uppercase alphabetic characters converted to lowercase.  Casing is
+  :simple or :full for simple or full case conversion, respectively."
+
+  (declare (fixnum start))
+  (if (eq casing :simple)
+      (cl:string-downcase string :start start :end end)
+      (string-downcase-full string :start start :end end)))
+
+
+;;;
+;;; This is a Lisp translation of the Scheme code from William
+;;; D. Clinger that implements the word-breaking algorithm.  This is
+;;; used with permission.
+;;;
+;;; This version is modified from the original at
+;;; http://www.ccs.neu.edu/home/will/R6RS/ to conform to CMUCL's
+;;; implementation of the word break properties.
+;;;
+;;;
+;;; Copyright statement and original comments:
+;;;
+;;;--------------------------------------------------------------------------------
+
+;; Copyright 2006 William D Clinger.
+;;
+;; Permission to copy this software, in whole or in part, to use this
+;; software for any lawful purpose, and to redistribute this software
+;; is granted subject to the restriction that all copies made of this
+;; software must include this copyright and permission notice in full.
+;;
+;; I also request that you send me a copy of any improvements that you
+;; make to this software so that they may be incorporated within it to
+;; the benefit of the Scheme community.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Word-breaking as defined by Unicode Standard Annex #29.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Implementation notes.
+;;
+;; The string-foldcase, string-downcase, and string-titlecase
+;; procedures rely on the notion of a word, which is defined
+;; by Unicode Standard Annex 29.
+;;
+;; The string-foldcase and string-downcase procedures rely on
+;; word boundaries only when they encounter a Greek sigma, so
+;; their performance should not be greatly affected by the
+;; performance of the word-breaking algorithm.
+;;
+;; The string-titlecase procedure must find all word boundaries,
+;; but it is typically used on short strings (titles).
+;;
+;; Hence the performance of the word-breaking algorithm should
+;; not matter too much for this reference implementation.
+;; Word-breaking is more generally useful, however, so I tried
+;; to make this implementation reasonably efficient.
+;;
+;; Word boundaries are defined by 14 different rules in
+;; Unicode Standard Annex #29, and by GraphemeBreakProperty.txt
+;; and WordBreakProperty.txt.  See also WordBreakTest.html.
+;;
+;; My original implementation of those specifications failed
+;; 6 of the 494 tests in auxiliary/WordBreakTest.txt, but it
+;; appeared to me that those tests were inconsistent with the
+;; word-breaking rules in UAX #29.  John Cowan forwarded my
+;; bug report to the Unicode experts, and Mark Davis responded
+;; on 29 May 2007:
+;;
+;;   Thanks for following up on this. I think you have found a problem in the
+;;   formulation of word break, not the test. The intention was to break after a
+;;   Sep character, as is done in Sentence break. So my previous suggestion was
+;;   incorrect; instead, what we need is a new rule:
+;; 
+;;   *Break after paragraph separators.*
+;;    WB3a. Sep �
+;;   I'll make a propose to the UTC for this.
+;;
+;; Here is Will's translation of those rules (including WB3a)
+;; into a finite state machine that searches forward within a
+;; string, looking for the next position at which a word break
+;; is allowed.  The current state consists of an index i into
+;; the string and a summary of the left context whose rightmost
+;; character is at index i.  The left context is usually
+;; determined by the character at index i, but there are three
+;; complications:
+;;
+;;     Extend and Format characters are ignored unless they
+;;         follow a separator or the beginning of the text.
+;;     ALetter followed by MidLetter is treated specially.
+;;     Numeric followed by MidNum is treated specially.
+;;
+;; In the implementation below, the left context ending at i
+;; is encoded by the following symbols:
+;;
+;;     CR
+;;     Sep (excluding CR)
+;;     ALetter
+;;     MidLetter
+;;     ALetterMidLetter (ALetter followed by MidLetter)
+;;     Numeric
+;;     MidNum
+;;     NumericMidNum (Numeric followed by MidNum)
+;;     Katakana
+;;     ExtendNumLet
+;;     other (none of the above)
+;;
+;; Given a string s and an exact integer i (which need not be
+;; a valid index into s), returns the index of the next character
+;; that is not part of the word containing the character at i,
+;; or the length of s if the word containing the character at i
+;; extends through the end of s.  If i is negative or a valid
+;; index into s, then the returned value will be greater than i.
+;;
+;;;--------------------------------------------------------------------------------
+
+(defun string-next-word-break (s i)
+  _N"Given a string, S, and a starting index, return the index of the
+  next character that is not part of the word containing the character
+  at the index, or the length of S if the word containing the
+  character extends to the end of S.  If the index is negative or
+  valid index into S, the returned value will be strictly greater than
+  the index."
+  (let ((n (length s)))
+    (labels
+	((char-word-break-category (c)
+	   ;; Map our unicode word break property into what this
+	   ;; algorithm wants.
+	   (let ((cat (lisp::unicode-word-break c)))
+	     (case cat
+	       ((:lf :cr :newline)
+		:sep)
+	       ((:extend :format)
+		:extend-or-format)
+	       (otherwise cat))))
+	 (left-context (i)
+	   ;; Given a valid index i into s, returns the left context
+	   ;; at i.
+	   (multiple-value-bind (c widep)
+	       (lisp:codepoint s i n)
+	     (let* ((back
+		     ;; If we're at a regular character or a leading
+		     ;; surrogate, decrementing by 1 gets us the to
+		     ;; previous character.  But for a trailing
+		     ;; surrogate, we need to decrement by 2!
+		     (if (eql widep -1)
+			 2
+			 1))
+		    (cat (char-word-break-category c)))
+	       (case cat
+		 ((:sep)
+		  (if (= c (char-code #\return)) :cr cat))
+		 ((:midletter :midnumlet)
+		  (let ((i-1 (- i back)))
+		    (if (and (<= 0 i-1)
+			     (eq (left-context i-1) :aletter))
+			:aletter-midletter
+			cat)))
+		 ((:midnum :midnumlet)
+		  (let ((i-1 (- i back)))
+		    (if (and (<= 0 i-1)
+			     (eq (left-context i-1) :numeric))
+			:numeric-midnum
+			cat)))
+		 ((:extendorformat)
+		  (if (< 0 i)
+		      (left-context (- i back))
+		      :other))
+		 (otherwise cat)))))
+
+	 (index-of-previous-non-ignored (j)
+	   ;; Returns the index of the last non-Extend, non-Format
+	   ;; character within (substring s 0 j).  Should not be
+	   ;; called unless such a character exists.
+
+	   (let* ((j1 (- j 1)))
+	     (multiple-value-bind (c widep)
+		 (lisp:codepoint s j1)
+	       (when (eql widep -1)
+		 ;; Back up one more if we're at the trailing
+		 ;; surrogate.
+		 (decf j1))
+	       (let ((cat (char-word-break-category c)))
+		 (case cat
+		   ((:extend-or-format)
+		    (index-of-previous-non-ignored j1))
+		   (otherwise j1))))))
+
+	 (lookup (j context)
+	   ;; Given j and the context to the left of (not including) j,
+	   ;; returns the index at the start of the next word
+	   ;; (or before which a word break is permitted).
+
+	   (if (>= j n)
+	       (case context
+		 ((:aletter-midletter :numeric-midnum)
+		  (let ((j (index-of-previous-non-ignored n)))
+		    (if (< i j) j n)))
+		 (otherwise n))
+	       (multiple-value-bind (c widep)
+		   (lisp:codepoint s j)
+		 (let* ((next-j
+			 ;; The next character is either 1 or 2 code
+			 ;; units away.  For a leading surrogate, it's
+			 ;; 2; Otherwise just 1.
+			 (if (eql widep 1)
+			     2
+			     1))
+			(cat (char-word-break-category c)))
+		   (case cat
+		     ((:extend-or-format)
+		      (case context
+			((:cr :sep) j)
+			(otherwise (lookup (+ j next-j) context))))
+		     (otherwise
+		      (case context
+			((:cr)
+			 (if (= c (char-code #\linefeed))
+			     ;; Rule WB3:  Don't break CRLF, continue looking
+			     (lookup (+ j next-j) cat)
+			     j))
+			((:aletter)
+			 (case cat
+			   ((:aletter :numeric :extendnumlet)
+			    ;; Rules WB5, WB9, ?
+			    (lookup (+ j next-j) cat))
+			   ((:midletter :midnumlet)
+			    ;; Rule WB6, need to keep looking
+			    (lookup (+ j next-j) :aletter-midletter))
+			   (otherwise j)))
+			((:aletter-midletter)
+			 (case cat
+			   ((:aletter)
+			    ;; Rule WB7
+			    (lookup (+ j next-j) cat))
+			   (otherwise
+			    ;; Rule WB6 and WB7 were extended, but the
+			    ;; region didn't end with :aletter.  So
+			    ;; backup and break at that point.
+			    (let ((j2 (index-of-previous-non-ignored j)))
+			      (if (< i j2) j2 j)))))
+			((:numeric)
+			 (case cat
+			   ((:numeric :aletter :extendnumlet)
+			    ;; Rules WB8, WB10, ?
+			    (lookup (+ j next-j) cat))
+			   ((:midnum :midnumlet)
+			    ;; Rules WB11, need to keep looking
+			    (lookup (+ j next-j) :numeric-midnum))
+			   (otherwise j)))
+			((:numeric-midnum)
+			 (case cat
+			   ((:numeric)
+			    ;; Rule WB11, keep looking
+			    (lookup (+ j next-j) cat))
+			   (otherwise
+			    ;; Rule WB11, WB12 were extended, but the
+			    ;; region didn't end with :numeric, so
+			    ;; backup and break at that point.
+			    (let ((j2 (index-of-previous-non-ignored j)))
+			      (if (< i j2) j2 j)))))
+			((:midletter :midnum :midnumlet)
+			 ;; Rule WB14
+			 j)
+			((:katakana)
+			 (case cat
+			   ((:katakana :extendnumlet)
+			    ;; Rule WB13, WB13a
+			    (lookup (+ j next-j) cat))
+			   (otherwise j)))
+			((:extendnumlet)
+			 (case cat
+			   ((:extendnumlet :aletter :numeric :katakana)
+			    ;; Rule WB13a, WB13b
+			    (lookup (+ j next-j) cat))
+			   (otherwise j)))
+			((:regional_indicator)
+			 (case cat
+			   ((:regional_indicator)
+			    ;; Rule WB13c
+			    (lookup (+ j next-j) cat))
+			   (otherwise j)))
+			(otherwise j)))))))))
+      (declare (notinline lookup left-context))
+      (cond ((< i 0)
+	     ;; Rule WB1
+	     0)
+	    ((<= n i)
+	     ;; Rule WB2
+	     n)
+	    (t
+	     (multiple-value-bind (c widep)
+		 (lisp:codepoint s i)
+	       (declare (ignore c))
+	       (lookup (+ i (if (eql widep 1) 2 1)) (left-context i))))))))
+
+(defun string-capitalize-unicode (string &key (start 0) end (casing :simple))
+  "Capitalize String using the Unicode word-break algorithm to find
+  the words in String.  The beginning is capitalized depending on the
+  value of Casing"
+  (declare (type (member :simple :full :title) casing))
+  (let* ((string (if (stringp string) string (string string)))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (with-output-to-string (result)
+      (lisp::with-one-string string start end offset
+        (let ((offset-slen (+ slen offset)))
+	  (declare (fixnum offset-slen))
+
+	  (write-string string result :start 0 :end start)
+	  (let ((upper (ecase casing
+			 (:simple
+			  #'(lambda (ch)
+			      (multiple-value-bind (hi lo)
+				  (lisp::surrogates (lisp::unicode-upper ch))
+				(write-char hi result)
+				(when lo (write-char lo result)))))
+			 (:full
+			  #'(lambda (ch)
+			      (write-string (lisp::unicode-full-case-upper ch) result)))
+			 (:title
+			  #'(lambda (ch)
+			      (write-string (lisp::unicode-full-case-title ch) result))))))
+	    (do ((start start next)
+		 (next (string-next-word-break string start)
+		       (string-next-word-break string next)))
+		((or (= start next)
+		     (>= start end)))
+	      ;; Convert the first character of the word to upper
+	      ;; case, and then make the rest of the word lowercase.
+	      (funcall upper (lisp:codepoint string start))
+	      (write-string (string-downcase string :start (1+ start)
+						    :end next
+						    :casing casing)
+			    result
+			    :start (1+ start)
+			    :end next)))
+	  (write-string string result :start end :end offset-slen))))))
+
+(defun string-capitalize-full (string &key (start 0) end (casing :full))
+  "Capitalize String using the Common Lisp word-break algorithm to find
+  the words in String.  The beginning is capitalized depending on the
+  value of Casing"
+  (declare (fixnum start)
+	   (type (member :simple :full :title) casing))
+  (let* ((string (if (stringp string) string (string string)))
+	 (slen (length string)))
+    (declare (fixnum slen))
+    (with-output-to-string (s)
+      (lisp::with-one-string string start end offset
+        (let ((offset-slen (+ slen offset)))
+	  (declare (fixnum offset-slen))
+	  (write-string string s :start offset :end start)
+	  (flet ((alphanump (m)
+		   (or (< 47 m 58) (< 64 m 91) (< 96 m 123)
+		       #+(and unicode (not unicode-bootstrap))
+		       (and (> m 127)
+			    (<= lisp::+unicode-category-letter+
+				(lisp::unicode-category m)
+				(+ lisp::+unicode-category-letter+ #x0F)))))
+		 (upper (ch)
+		   (ecase casing
+		     (:simple
+		      #'(lambda (ch)
+			  (multiple-value-bind (hi lo)
+			      (lisp::surrogates (lisp::unicode-upper ch))
+			    (write-char hi s)
+			    (when lo (write-char lo s)))))
+		     (:full
+		      #'(lambda (ch)
+			  (write-string (lisp::unicode-full-case-upper ch) s)))
+		     (:title
+		      #'(lambda (ch)
+			  (write-string (lisp::unicode-full-case-title ch) s))))))
+	    (do ((index start (1+ index))
+		 (newword t))
+		((= index (the fixnum end)))
+	      (declare (fixnum index))
+	      (multiple-value-bind (code wide)
+		  (lisp:codepoint string index)
+		(when wide (incf index))
+		(cond ((not (alphanump code))
+		       (multiple-value-bind (hi lo)
+			   (surrogates code)
+			 (write-char hi s)
+			 (when lo (write-char lo s)))
+		       (setq newword t))
+		      (newword
+		       ;; Char is first case-modifiable after non-case-modifiable
+		       (funcall upper code)
+		       (setq newword ()))
+		      (t
+		       ;; char is case-modifiable, but not first
+		       (write-string (lisp:unicode-full-case-lower code) s))))))
+	  (write-string string s :start end :end offset-slen))))))
+
+(defun string-capitalize (string &key (start 0) end
+				 (casing :title)
+				 (unicode-word-break t))
+  _N"Given a string, returns a copy of the string with the first
+  character of each ``word'' converted to upper-case, and remaining
+  chars in the word converted to lower case. Casing is :simple, :full
+  or :title for simple, full or title case conversion, respectively.  If
+  Unicode-Word-Break is non-Nil, then the Unicode word-breaking
+  algorithm is used to determine the word boundaries.  Otherwise, A
+  ``word'' is defined to be a string of case-modifiable characters
+  delimited by non-case-modifiable chars.  "
+
+  (declare (fixnum start)
+	   (type (member :simple :full :title) casing))
+  (if unicode-word-break
+      (string-capitalize-unicode string :start start :end end :casing casing)
+      (if (eq casing :simple)
+	  (cl:string-capitalize string :start start :end end)
+	  (string-capitalize-full string :start start :end end :casing casing))))
diff --git a/src/code/unidata.lisp b/src/code/unidata.lisp
index 55e3a28..3518100 100644
--- a/src/code/unidata.lisp
+++ b/src/code/unidata.lisp
@@ -18,6 +18,15 @@
 
 (export '(string-to-nfd string-to-nfkc string-to-nfkd string-to-nfc
 	  unicode-complete unicode-complete-name
+	  unicode-full-case-lower
+	  unicode-full-case-upper
+	  unicode-full-case-title
+	  unicode-category
+	  +unicode-category-lower+
+	  +unicode-category-other+
+	  +unicode-category-graphic+
+	  +unicode-category-upper+
+	  +unicode-category-title+
 	  load-all-unicode-data))
 
 (defvar *unidata-path* #p"ext-formats:unidata.bin")
diff --git a/src/compiler/fndb.lisp b/src/compiler/fndb.lisp
index 5229915..052b4c1 100644
--- a/src/compiler/fndb.lisp
+++ b/src/compiler/fndb.lisp
@@ -851,13 +851,11 @@
   (sequence stringable) simple-string (flushable))
 
 (defknown (string-upcase string-downcase)
-  (stringable &key (:start index) (:end sequence-end) #+unicode (:casing case-conversion-type))
+  (stringable &key (:start index) (:end sequence-end))
   simple-string (flushable))
 
 (defknown (string-capitalize)
-  (stringable &key (:start index) (:end sequence-end)
-	      #+unicode (:casing case-conversion-type)
-	      #+unicode (:unicode-word-break boolean))
+  (stringable &key (:start index) (:end sequence-end))
   simple-string (flushable))
 
 (defknown (nstring-upcase nstring-downcase nstring-capitalize)
diff --git a/src/tools/worldbuild.lisp b/src/tools/worldbuild.lisp
index 9d0dbb6..4b5f364 100644
--- a/src/tools/worldbuild.lisp
+++ b/src/tools/worldbuild.lisp
@@ -186,6 +186,8 @@
     ,@(when (c:backend-featurep :mp)
 	    '("target:code/multi-proc"))
     "target:code/intl-tramp"
+    ,@(when (c::backend-featurep :unicode)
+	'("target:code/unicode"))
     ))
 
 (setf *genesis-core-name* "target:lisp/kernel.core")
diff --git a/src/tools/worldcom.lisp b/src/tools/worldcom.lisp
index f8ac65a..150d33c 100644
--- a/src/tools/worldcom.lisp
+++ b/src/tools/worldcom.lisp
@@ -324,6 +324,8 @@
 (comf "target:code/intl-tramp")
 (comf "target:code/intl")
 
+(when (c:backend-featurep :unicode)
+  (comf "target:code/unicode"))
 ); let *byte-compile-top-level*
 
 ); with-compiler-log-file

commit 9d66b2585eb33ff8106511da512b4772a3887aab
Author: Raymond Toy <toy.raymond at gmail.com>
Date:   Fri May 24 19:18:49 2013 -0700

    Fix typo.

diff --git a/src/code/multi-proc.lisp b/src/code/multi-proc.lisp
index e478a95..4e2b1bc 100644
--- a/src/code/multi-proc.lisp
+++ b/src/code/multi-proc.lisp
@@ -1977,6 +1977,6 @@
 		   (setf (lock-process ,lock) nil)))))))
 
 (defun process-join (process)
-  (mp:process-wait (format nil "Waiting for thread ~A to complete" thread)
-                   (lambda () (not (mp:process-alive-p thread))))
+  (mp:process-wait (format nil "Waiting for thread ~A to complete" process)
+                   (lambda () (not (mp:process-alive-p process))))
   (values-list (process-%return-values process)))

-----------------------------------------------------------------------

Summary of changes:
 src/code/exports.lisp            |   36 ++-
 src/code/multi-proc.lisp         |    4 +-
 src/code/string.lisp             |  495 +-----------------------------------
 src/code/unicode.lisp            |  519 ++++++++++++++++++++++++++++++++++++++
 src/code/unidata.lisp            |    9 +
 src/compiler/fndb.lisp           |    6 +-
 src/general-info/release-20e.txt |    5 +
 src/tools/worldbuild.lisp        |    2 +
 src/tools/worldcom.lisp          |    2 +
 9 files changed, 589 insertions(+), 489 deletions(-)
 create mode 100644 src/code/unicode.lisp


hooks/post-receive
-- 
CMU Common Lisp



More information about the cmucl-cvs mailing list