[flexi-streams-cvs] r54 - in branches/edi: . doc
eweitz at common-lisp.net
eweitz at common-lisp.net
Sun May 25 01:43:57 UTC 2008
Author: eweitz
Date: Sat May 24 21:43:56 2008
New Revision: 54
Modified:
branches/edi/decode.lisp
branches/edi/doc/index.html
branches/edi/packages.lisp
branches/edi/specials.lisp
branches/edi/strings.lisp
branches/edi/util.lisp
Log:
Compute decoding length
Modified: branches/edi/decode.lisp
==============================================================================
--- branches/edi/decode.lisp (original)
+++ branches/edi/decode.lisp Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.16 2008/05/20 23:01:50 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.18 2008/05/25 01:42:50 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -29,6 +29,234 @@
(in-package :flexi-streams)
+(defgeneric compute-number-of-chars (format sequence start end)
+ (declare #.*standard-optimize-settings*)
+ (:documentation "Computes the exact number of characters required to
+decode the sequence of octets in SEQUENCE from START to END using the
+external format FORMAT."))
+
+(defmethod compute-number-of-chars :around (format (list list) start end)
+ (declare #.*standard-optimize-settings*)
+ (call-next-method format (coerce list 'vector) start end))
+
+(defmethod compute-number-of-chars ((format flexi-8-bit-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (declare (ignore sequence))
+ (- end start))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-8-bit-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((i start)
+ (length (- end start)))
+ (declare (fixnum i length))
+ (loop
+ (when (>= i end)
+ (return))
+ (let ((position (search #.(vector +cr+ +lf+) sequence :start2 i :end2 end :test #'=)))
+ (unless position
+ (return))
+ (setq i (1+ position))
+ (decf length)))
+ length))
+
+(defun check-end (format start end i)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end i))
+ (unless (= i end)
+ (signal-encoding-error format "These ~A octet~:P can't be ~
+decoded using ~A as the sequence is too short. ~A octet~:P ~
+missing at then end."
+ (- end start)
+ (external-format-name format)
+ (- i end))))
+
+(defmethod compute-number-of-chars ((format flexi-utf-8-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start))
+ (declare (fixnum i sum))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((octet (aref sequence i))
+ (length (cond ((not (logbitp 7 octet)) 1)
+ ((= #b11000000 (logand octet #b11100000)) 2)
+ ((= #b11100000 (logand octet #b11110000)) 3)
+ (t 4))))
+ (declare (fixnum length) (octet octet))
+ (incf sum)
+ (incf i length)))
+ (check-end format start end i)
+ sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-8-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start)
+ (last-octet 0))
+ (declare (fixnum i sum) (octet last-octet))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((octet (aref sequence i))
+ (length (cond ((not (logbitp 7 octet)) 1)
+ ((= #b11000000 (logand octet #b11100000)) 2)
+ ((= #b11100000 (logand octet #b11110000)) 3)
+ (t 4))))
+ (declare (fixnum length) (octet octet))
+ (unless (and (= octet +lf+) (= last-octet +cr+))
+ (incf sum))
+ (incf i length)
+ (setq last-octet octet)))
+ (check-end format start end i)
+ sum))
+
+(defmethod compute-number-of-chars :before ((format flexi-utf-16-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (unless (evenp (- end start))
+ (signal-encoding-error format "~A octet~:P cannot be decoded using ~
+UTF-16 as ~:*~A is not even."
+ (- end start))))
+
+(defmethod compute-number-of-chars ((format flexi-utf-16-le-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start))
+ (declare (fixnum i sum))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((high-octet (aref sequence (1+ i)))
+ (length (cond ((<= #xd8 high-octet #xdf) 4)
+ (t 2))))
+ (declare (fixnum length) (octet high-octet))
+ (incf sum)
+ (incf i length)))
+ (check-end format start end i)
+ sum))
+
+(defmethod compute-number-of-chars ((format flexi-utf-16-be-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start))
+ (declare (fixnum i sum))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((high-octet (aref sequence i))
+ (length (cond ((<= #xd8 high-octet #xdf) 4)
+ (t 2))))
+ (declare (fixnum length) (octet high-octet))
+ (incf sum)
+ (incf i length)))
+ (check-end format start end i)
+ sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-le-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start)
+ (last-octet 0))
+ (declare (fixnum i sum) (octet last-octet))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((high-octet (aref sequence (1+ i)))
+ (length (cond ((<= #xd8 high-octet #xdf) 4)
+ (t 2))))
+ (declare (fixnum length) (octet high-octet))
+ (unless (and (zerop high-octet)
+ (= (the octet (aref sequence i)) +lf+)
+ (= last-octet +cr+))
+ (incf sum))
+ (incf i length)
+ (setq last-octet (if (zerop high-octet)
+ (aref sequence i)
+ 0))))
+ (check-end format start end i)
+ sum))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-16-be-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((sum 0)
+ (i start)
+ (last-octet 0))
+ (declare (fixnum i sum) (octet last-octet))
+ (loop
+ (when (>= i end)
+ (return))
+ (let* ((high-octet (aref sequence i))
+ (length (cond ((<= #xd8 high-octet #xdf) 4)
+ (t 2))))
+ (declare (fixnum length) (octet high-octet))
+ (unless (and (zerop high-octet)
+ (= (the octet (aref sequence (1+ i))) +lf+)
+ (= last-octet +cr+))
+ (incf sum))
+ (incf i length)
+ (setq last-octet (if (zerop high-octet)
+ (aref sequence (1+ i))
+ 0))))
+ (check-end format start end i)
+ sum))
+(defmethod compute-number-of-chars :before ((format flexi-utf-32-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((length (- end start)))
+ (unless (zerop (mod length 4))
+ (signal-encoding-error format "~A octet~:P cannot be decoded using ~
+UTF-32 as ~:*~A is not a multiple-value of four."
+ length))))
+
+(defmethod compute-number-of-chars ((format flexi-utf-32-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (declare (ignore sequence))
+ (/ (- end start) 4))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-le-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((i start)
+ (length (/ (- end start) 4)))
+ (decf end 8)
+ (loop
+ (when (> i end)
+ (return))
+ (cond ((loop for j of-type fixnum from i
+ for octet across #.(vector +cr+ 0 0 0 +lf+ 0 0 0)
+ always (= octet (aref sequence j)))
+ (decf length)
+ (incf i 8))
+ (t (incf i 4))))
+ length))
+
+(defmethod compute-number-of-chars ((format flexi-crlf-utf-32-be-format) sequence start end)
+ (declare #.*fixnum-optimize-settings*)
+ (declare (fixnum start end))
+ (let ((i start)
+ (length (/ (- end start) 4)))
+ (decf end 8)
+ (loop
+ (when (> i end)
+ (return))
+ (cond ((loop for j of-type fixnum from i
+ for octet across #.(vector 0 0 0 +cr+ 0 0 0 +lf+)
+ always (= octet (aref sequence j)))
+ (decf length)
+ (incf i 8))
+ (t (incf i 4))))
+ length))
+
(defun recover-from-encoding-error (external-format format-control &rest format-args)
"Helper function used by OCTETS-TO-CHAR-CODE below to deal with
encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns
@@ -242,7 +470,7 @@
(declare (ignore reader))
(let ((char-code (call-next-method)))
(case char-code
- (#.(char-code #\Return) #.(char-code #\Newline))
+ (#.+cr+ #.(char-code #\Newline))
(otherwise char-code))))
(defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
@@ -251,13 +479,13 @@
(declare (ignore reader))
(let ((char-code (call-next-method)))
(case char-code
- (#.(char-code #\Return)
+ (#.+cr+
(let ((next-char-code (call-next-method)))
(case next-char-code
- (#.(char-code #\Linefeed) #.(char-code #\Newline))
+ (#.+lf+ #.(char-code #\Newline))
;; we saw a CR but no LF afterwards, but then the data
;; ended, so we just return #\Return
- ((nil) #.(char-code #\Return))
+ ((nil) +cr+)
;; if the character we peeked at wasn't a
;; linefeed character we unread its constituents
(otherwise (funcall *current-unreader* (code-char next-char-code))
Modified: branches/edi/doc/index.html
==============================================================================
--- branches/edi/doc/index.html (original)
+++ branches/edi/doc/index.html Sat May 24 21:43:56 2008
@@ -116,6 +116,7 @@
<li><a href="#string-to-octets"><code>string-to-octets</code></a>
<li><a href="#octets-to-string"><code>octets-to-string</code></a>
<li><a href="#octet-length"><code>octet-length</code></a>
+ <li><a href="#char-length"><code>char-length</code></a>
</ol>
</ol>
<li><a href="#position">File positions</a>
@@ -1005,16 +1006,30 @@
</blockquote>
<p><br>[Function]
-<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length-or-nil</i></a>
+<br><a class=none name="octet-length"><b>octet-length</b> <i>string <tt>&key</tt> external-format start end</i> => <i>length</i></a>
<blockquote><br>
Returns the length of the substring of <code><i>string</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in
<a href="#octet">octets</a> if encoded using
the <a href="#external-formats">external format</a> designated
-by <code><i>external-format</i></code>. Might return <code>NIL</code>
-if there's no efficient way to compute the length without iterating
-through the whole string.
+by <code><i>external-format</i></code>.
+The defaults for
+<code><i>start</i></code> and <code><i>end</i></code>
+are <code>0</code> and the length of the string. The default
+for <code><i>external-format</i></code> is <code>:LATIN1</code>.
+</blockquote>
+
+<p><br>[Function]
+<br><a class=none name="char-length"><b>char-length</b> <i>sequence <tt>&key</tt> external-format start end</i> => <i>length</i></a>
+
+<blockquote><br>
+
+Kind of the inverse of <a href="#octet-length"><code>OCTET-LENGTH</code></a>.
+Returns the length of the subsequence (of <a href="#octet">octets</a>) of <code><i>sequence</i></code> from <code><i>start</i></code> to <code><i>end</i></code> in
+characters if decoded using
+the <a href="#external-formats">external format</a> designated
+by <code><i>external-format</i></code>.
The defaults for
<code><i>start</i></code> and <code><i>end</i></code>
are <code>0</code> and the length of the sequence. The default
@@ -1060,7 +1075,7 @@
his work on making FLEXI-STREAMS faster.
<p>
-$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.111 2008/05/23 14:56:47 edi Exp $
+$Header: /usr/local/cvsrep/flexi-streams/doc/index.html,v 1.112 2008/05/25 01:41:25 edi Exp $
<p><a href="http://weitz.de/index.html">BACK TO MY HOMEPAGE</a>
</body>
Modified: branches/edi/packages.lisp
==============================================================================
--- branches/edi/packages.lisp (original)
+++ branches/edi/packages.lisp Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-USER; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.35 2008/05/21 01:43:43 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/packages.lisp,v 1.36 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -40,6 +40,7 @@
(:export :*default-eol-style*
:*default-little-endian*
:*substitution-char*
+ :char-length
:external-format-eol-style
:external-format-error
:external-format-error-external-format
Modified: branches/edi/specials.lisp
==============================================================================
--- branches/edi/specials.lisp (original)
+++ branches/edi/specials.lisp Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.32 2008/05/20 23:01:51 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/specials.lisp,v 1.33 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -49,6 +49,10 @@
"Like *STANDARD-OPTIMIZE-SETTINGS*, but \(on LispWorks) with all
arithmetic being fixnum arithmetic.")
+(defconstant +lf+ (char-code #\Linefeed))
+
+(defconstant +cr+ (char-code #\Return))
+
(defvar *current-unreader* nil
"A unary function which might be called to `unread' a character
\(i.e. the sequence of octets it represents).
Modified: branches/edi/strings.lisp
==============================================================================
--- branches/edi/strings.lisp (original)
+++ branches/edi/strings.lisp Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.24 2008/05/24 23:15:25 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/strings.lisp,v 1.26 2008/05/25 01:41:32 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -116,12 +116,17 @@
(defun octet-length (string &key (external-format :latin1) (start 0) (end (length string)))
"Returns the length of the substring of STRING from START to END in
-octets if encoded using the external format EXTERNAL-FORMAT. Might
-return NIL if there's no efficient way to compute the length without
-iterating through the whole string."
+octets if encoded using the external format EXTERNAL-FORMAT."
(declare #.*standard-optimize-settings*)
(declare (fixnum start end) (string string))
(setq external-format (maybe-convert-external-format external-format))
- (let ((factor (encoding-factor external-format)))
- (typecase factor
- (fixnum (* factor (- end start))))))
+ (compute-number-of-octets external-format string start end))
+
+(defun char-length (sequence &key (external-format :latin1) (start 0) (end (length sequence)))
+ "Kind of the inverse of OCTET-LENGTH. Returns the length of the
+subsequence \(of octets) of SEQUENCE from START to END in characters
+if decoded using the external format EXTERNAL-FORMAT."
+ (declare #.*standard-optimize-settings*)
+ (declare (fixnum start end) (string string))
+ (setq external-format (maybe-convert-external-format external-format))
+ (compute-number-of-chars external-format sequence start end))
Modified: branches/edi/util.lisp
==============================================================================
--- branches/edi/util.lisp (original)
+++ branches/edi/util.lisp Sat May 24 21:43:56 2008
@@ -1,5 +1,5 @@
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
-;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.21 2008/05/20 23:44:45 edi Exp $
+;;; $Header: /usr/local/cvsrep/flexi-streams/util.lisp,v 1.22 2008/05/25 01:40:54 edi Exp $
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
@@ -115,7 +115,7 @@
(unless (find real-name +name-map+
:test #'eq
:key #'cdr)
- (error 'external-format-error
+ (error 'external-format-simple-error
:format-control "~S is not known to be a name for an external format."
:format-arguments (list name)))
real-name))
More information about the Flexi-streams-cvs
mailing list