[flexi-streams-cvs] r54 - in branches/edi: . doc

eweitz at common-lisp.net eweitz at common-lisp.net
Sun May 25 01:43:57 UTC 2008


Author: eweitz
Date: Sat May 24 21:43:56 2008
New Revision: 54

Modified:
   branches/edi/decode.lisp
   branches/edi/doc/index.html
   branches/edi/packages.lisp
   branches/edi/specials.lisp
   branches/edi/strings.lisp
   branches/edi/util.lisp
Log:
Compute decoding length


Modified: branches/edi/decode.lisp
==============================================================================
--- branches/edi/decode.lisp	(original)
+++ branches/edi/decode.lisp	Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.16 2008/05/20 23:01:50 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.18 2008/05/25 01:42:50 edi Exp $
 
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
 
@@ -29,6 +29,234 @@
 
 (in-package :flexi-streams)
 
+(defgeneric compute-number-of-chars (format sequence start end)
+  (declare #.*standard-optimize-settings*)
+  (:documentation "Computes the exact number of characters required to
+decode the sequence of octets in SEQUENCE from START to END using the
+external format FORMAT."))
+
+(defmethod compute-number-of-chars :around (format (list list) start end)
+  (declare #.*standard-optimize-settings*)
+  (call-next-method format (coerce list 'vector) start end))
+
+(defmethod compute-number-of-chars ((format flexi-8-bit-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (declare (ignore sequence))  
+  (- end start))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-8-bit-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((i start)
+        (length (- end start)))
+    (declare (fixnum i length))
+    (loop
+     (when (>= i end)
+       (return))
+     (let ((position (search #.(vector +cr+ +lf+) sequence :start2 i :end2 end :test #'=)))
+       (unless position
+         (return))
+       (setq i (1+ position))
+       (decf length)))
+    length))
+
+(defun check-end (format start end i)  
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end i))
+  (unless (= i end)
+    (signal-encoding-error format "These ~A octet~:P can't be ~
+decoded using ~A as the sequence is too short.  ~A octet~:P ~
+missing at then end."
+                           (- end start)
+                           (external-format-name format)
+                           (- i end))))
+
+(defmethod compute-number-of-chars ((format flexi-utf-8-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start))
+    (declare (fixnum i sum))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((octet (aref sequence i))
+            (length (cond ((not (logbitp 7 octet)) 1)
+                          ((= #b11000000 (logand octet #b11100000)) 2)
+                          ((= #b11100000 (logand octet #b11110000)) 3)
+                          (t 4))))
+       (declare (fixnum length) (octet octet))
+       (incf sum)
+       (incf i length)))
+    (check-end format start end i)
+    sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-8-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start)
+        (last-octet 0))
+    (declare (fixnum i sum) (octet last-octet))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((octet (aref sequence i))
+            (length (cond ((not (logbitp 7 octet)) 1)
+                          ((= #b11000000 (logand octet #b11100000)) 2)
+                          ((= #b11100000 (logand octet #b11110000)) 3)
+                          (t 4))))
+       (declare (fixnum length) (octet octet))
+       (unless (and (= octet +lf+) (= last-octet +cr+))
+         (incf sum))
+       (incf i length)
+       (setq last-octet octet)))
+    (check-end format start end i)
+    sum))
+
+(defmethod compute-number-of-chars :before ((format flexi-utf-16-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (unless (evenp (- end start))
+    (signal-encoding-error format "~A octet~:P cannot be decoded using ~
+UTF-16 as ~:*~A is not even."
+                           (- end start))))  
+  
+(defmethod compute-number-of-chars ((format flexi-utf-16-le-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start))
+    (declare (fixnum i sum))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((high-octet (aref sequence (1+ i)))
+            (length (cond ((<= #xd8 high-octet #xdf) 4)
+                          (t 2))))
+       (declare (fixnum length) (octet high-octet))
+       (incf sum)
+       (incf i length)))
+    (check-end format start end i)
+    sum))
+
+(defmethod compute-number-of-chars ((format flexi-utf-16-be-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start))
+    (declare (fixnum i sum))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((high-octet (aref sequence i))
+            (length (cond ((<= #xd8 high-octet #xdf) 4)
+                          (t 2))))
+       (declare (fixnum length) (octet high-octet))
+       (incf sum)
+       (incf i length)))
+    (check-end format start end i)
+    sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-le-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start)
+        (last-octet 0))
+    (declare (fixnum i sum) (octet last-octet))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((high-octet (aref sequence (1+ i)))
+            (length (cond ((<= #xd8 high-octet #xdf) 4)
+                          (t 2))))
+       (declare (fixnum length) (octet high-octet))
+       (unless (and (zerop high-octet)
+                    (= (the octet (aref sequence i)) +lf+)
+                    (= last-octet +cr+))
+         (incf sum))
+       (incf i length)
+       (setq last-octet (if (zerop high-octet)
+                          (aref sequence i)
+                          0))))
+    (check-end format start end i)
+    sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-be-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((sum 0)
+        (i start)
+        (last-octet 0))
+    (declare (fixnum i sum) (octet last-octet))
+    (loop
+     (when (>= i end)
+       (return))
+     (let* ((high-octet (aref sequence i))
+            (length (cond ((<= #xd8 high-octet #xdf) 4)
+                          (t 2))))
+       (declare (fixnum length) (octet high-octet))
+       (unless (and (zerop high-octet)
+                    (= (the octet (aref sequence (1+ i))) +lf+)
+                    (= last-octet +cr+))
+         (incf sum))
+       (incf i length)
+       (setq last-octet (if (zerop high-octet)
+                          (aref sequence (1+ i))
+                          0))))
+    (check-end format start end i)
+    sum))
+(defmethod compute-number-of-chars :before ((format flexi-utf-32-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((length (- end start)))
+    (unless (zerop (mod length 4))
+      (signal-encoding-error format "~A octet~:P cannot be decoded using ~
+UTF-32 as ~:*~A is not a multiple-value of four."
+                             length))))  
+
+(defmethod compute-number-of-chars ((format flexi-utf-32-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (declare (ignore sequence))
+  (/ (- end start) 4))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-le-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((i start)
+        (length (/ (- end start) 4)))
+    (decf end 8)
+    (loop
+     (when (> i end)
+       (return))
+     (cond ((loop for j of-type fixnum from i
+                  for octet across #.(vector +cr+ 0 0 0 +lf+ 0 0 0)
+                  always (= octet (aref sequence j)))
+            (decf length)
+            (incf i 8))
+           (t (incf i 4))))
+    length))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-be-format) sequence start end)
+  (declare #.*fixnum-optimize-settings*)
+  (declare (fixnum start end))
+  (let ((i start)
+        (length (/ (- end start) 4)))
+    (decf end 8)
+    (loop
+     (when (> i end)
+       (return))
+     (cond ((loop for j of-type fixnum from i
+                  for octet across #.(vector 0 0 0 +cr+ 0 0 0 +lf+)
+                  always (= octet (aref sequence j)))
+            (decf length)
+            (incf i 8))
+           (t (incf i 4))))
+    length))
+
 (defun recover-from-encoding-error (external-format format-control &rest format-args)
   "Helper function used by OCTETS-TO-CHAR-CODE below to deal with
 encoding errors.  Checks if *SUBSTITUTION-CHAR* is not NIL and returns
@@ -242,7 +470,7 @@
   (declare (ignore reader))
   (let ((char-code (call-next-method)))
     (case char-code
-      (#.(char-code #\Return) #.(char-code #\Newline))
+      (#.+cr+ #.(char-code #\Newline))
       (otherwise char-code))))
 
 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
@@ -251,13 +479,13 @@
   (declare (ignore reader))
   (let ((char-code (call-next-method)))
     (case char-code
-      (#.(char-code #\Return)
+      (#.+cr+
        (let ((next-char-code (call-next-method)))
          (case next-char-code
-           (#.(char-code #\Linefeed) #.(char-code #\Newline))
+           (#.+lf+ #.(char-code #\Newline))
            ;; we saw a CR but no LF afterwards, but then the data
            ;; ended, so we just return #\Return
-           ((nil) #.(char-code #\Return))
+           ((nil) +cr+)
            ;; if the character we peeked at wasn't a
            ;; linefeed character we unread its constituents
            (otherwise (funcall *current-unreader* (code-char next-char-code))

Modified: branches/edi/doc/index.html
==============================================================================
--- branches/edi/doc/index.html	(original)
+++ branches/edi/doc/index.html	Sat May 24 21:43:56 2008
@@ -116,6 +116,7 @@
       <li><a href="#string-to-octets"><code>string-to-octets</code></a>
       <li><a href="#octets-to-string"><code>octets-to-string</code></a>
       <li><a href="#octet-length"><code>octet-length</code></a>
+      <li><a href="#char-length"><code>char-length</code></a>
       </ol>
   </ol>
   <li><a href="#position">File positions</a>
@@ -1005,16 +1006,30 @@
 </blockquote>
 
 <p><br>[Function]
-<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length-or-nil</i></a>
+<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length</i></a>
 
 <blockquote><br>
 
 Returns the length of the substring of <code><i>string</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in
 <a href="#octet">octets</a> if encoded using
 the <a href="#external-formats">external format</a> designated
-by <code><i>external-format</i></code>.  Might return <code>NIL</code>
-if there's no efficient way to compute the length without iterating
-through the whole string.
+by <code><i>external-format</i></code>.
+The defaults for
+<code><i>start</i></code> and <code><i>end</i></code>
+are <code>0</code> and the length of the string.  The default
+for <code><i>external-format</i></code> is <code>:LATIN1</code>.
+</blockquote>
+
+<p><br>[Function]
+<br><a class=none name="char-length"><b>char-length</b> <i>sequence <tt>&key</tt> external-format start end</i> => <i>length</i></a>
+
+<blockquote><br>
+
+Kind of the inverse of <a href="#octet-length"><code>OCTET-LENGTH</code></a>.
+Returns the length of the subsequence (of <a href="#octet">octets</a>) of <code><i>sequence</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in
+characters if decoded using
+the <a href="#external-formats">external format</a> designated
+by <code><i>external-format</i></code>.
 The defaults for
 <code><i>start</i></code> and <code><i>end</i></code>
 are <code>0</code> and the length of the sequence.  The default
@@ -1060,7 +1075,7 @@
 his work on making FLEXI-STREAMS faster.
 
 <p>
-$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.111 2008/05/23 14:56:47 edi Exp $
+$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.112 2008/05/25 01:41:25 edi Exp $
 <p><a href="http://weitz.de/index.html">BACK TO MY HOMEPAGE</a>
 
 </body>

Modified: branches/edi/packages.lisp
==============================================================================
--- branches/edi/packages.lisp	(original)
+++ branches/edi/packages.lisp	Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.35 2008/05/21 01:43:43 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.36 2008/05/25 01:40:54 edi Exp $
 
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
 
@@ -40,6 +40,7 @@
   (:export :*default-eol-style*
            :*default-little-endian*
            :*substitution-char*
+           :char-length
            :external-format-eol-style
            :external-format-error
            :external-format-error-external-format

Modified: branches/edi/specials.lisp
==============================================================================
--- branches/edi/specials.lisp	(original)
+++ branches/edi/specials.lisp	Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.32 2008/05/20 23:01:51 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.33 2008/05/25 01:40:54 edi Exp $
 
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
 
@@ -49,6 +49,10 @@
   "Like *STANDARD-OPTIMIZE-SETTINGS*, but \(on LispWorks) with all
 arithmetic being fixnum arithmetic.")
 
+(defconstant +lf+ (char-code #\Linefeed))
+
+(defconstant +cr+ (char-code #\Return))
+
 (defvar *current-unreader* nil
   "A unary function which might be called to `unread' a character
 \(i.e. the sequence of octets it represents).

Modified: branches/edi/strings.lisp
==============================================================================
--- branches/edi/strings.lisp	(original)
+++ branches/edi/strings.lisp	Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.24 2008/05/24 23:15:25 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.26 2008/05/25 01:41:32 edi Exp $
 
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
 
@@ -116,12 +116,17 @@
 
 (defun octet-length (string &key (external-format :latin1) (start 0) (end (length string)))
   "Returns the length of the substring of STRING from START to END in
-octets if encoded using the external format EXTERNAL-FORMAT.  Might
-return NIL if there's no efficient way to compute the length without
-iterating through the whole string."
+octets if encoded using the external format EXTERNAL-FORMAT."
   (declare #.*standard-optimize-settings*)
   (declare (fixnum start end) (string string))
   (setq external-format (maybe-convert-external-format external-format))
-  (let ((factor (encoding-factor external-format)))
-    (typecase factor
-      (fixnum (* factor (- end start))))))
+  (compute-number-of-octets external-format string start end))
+
+(defun char-length (sequence &key (external-format :latin1) (start 0) (end (length sequence)))
+  "Kind of the inverse of OCTET-LENGTH.  Returns the length of the
+subsequence \(of octets) of SEQUENCE from START to END in characters
+if decoded using the external format EXTERNAL-FORMAT."
+  (declare #.*standard-optimize-settings*)
+  (declare (fixnum start end) (string string))
+  (setq external-format (maybe-convert-external-format external-format))
+  (compute-number-of-chars external-format sequence start end))

Modified: branches/edi/util.lisp
==============================================================================
--- branches/edi/util.lisp	(original)
+++ branches/edi/util.lisp	Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.21 2008/05/20 23:44:45 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.22 2008/05/25 01:40:54 edi Exp $
 
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
 
@@ -115,7 +115,7 @@
     (unless (find real-name +name-map+
                   :test #'eq
                   :key #'cdr)
-      (error 'external-format-error
+      (error 'external-format-simple-error
              :format-control "~S is not known to be a name for an external format."
              :format-arguments (list name)))
     real-name))



More information about the Flexi-streams-cvs mailing list