[armedbear-cvs] r12902 - in trunk/abcl: . src/org/armedbear/lisp/util test/lisp/abcl

Ville Voutilainen vvoutilainen at common-lisp.net
Sat Aug 28 11:09:16 UTC 2010


Author: vvoutilainen
Date: Sat Aug 28 07:09:13 2010
New Revision: 12902

Log:
Fix reading of data containing Scandinavian Latin-1 characters,
and add a simple test for it. The UTF-8 test is just a sanity
check that umlauts encoded as UTF-8 aren't broken; the Latin-1
test fails without this patch (as expected) and passes with
this patch.


Added:
   trunk/abcl/test/lisp/abcl/latin1-tests.lisp   (contents, props changed)
   trunk/abcl/test/lisp/abcl/latin1-umlauts.txt
   trunk/abcl/test/lisp/abcl/utf8-umlauts.txt
Modified:
   trunk/abcl/abcl.asd
   trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java
   trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java

Modified: trunk/abcl/abcl.asd
==============================================================================
--- trunk/abcl/abcl.asd	(original)
+++ trunk/abcl/abcl.asd	Sat Aug 28 07:09:13 2010
@@ -45,6 +45,7 @@
                       (:file "url-pathname")
                       (:file "math-tests")
                       (:file "misc-tests")
+                      (:file "latin1-tests")
                       (:file "bugs" :depends-on ("file-system-tests"))
                       (:file "pathname-tests")))))
 

Modified: trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java
==============================================================================
--- trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java	(original)
+++ trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java	Sat Aug 28 07:09:13 2010
@@ -45,6 +45,7 @@
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
 
 import org.armedbear.lisp.Debug;
 
@@ -79,6 +80,8 @@
           // we need to be able to unread the byte buffer
         this.stream = new PushbackInputStream(stream, size);
         this.cd = cs.newDecoder();
+        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
         this.ce = cs.newEncoder();
         bbuf = ByteBuffer.allocate(size);
         bbuf.flip();  // mark the buffer as 'needs refill'
@@ -89,6 +92,8 @@
      */
     public final void setCharset(Charset cs) {
         this.cd = cs.newDecoder();
+        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
         this.ce = cs.newEncoder();
     }
 
@@ -257,18 +262,23 @@
 
 
         while (cb.remaining() > 0 && notEof) {
+            int oldRemaining = cb.remaining();
             notEof = ensureBbuf(forceRead);
             CoderResult r = cd.decode(bbuf, cb, ! notEof);
-            forceRead = (CoderResult.UNDERFLOW == r);
-
-            if (r.isMalformed()) {
-                throw new RACFMalformedInputException(bbuf.position(),
-                                                      (char)bbuf.get(bbuf.position()),
-                                                      cd.charset().name());
-            } else if (r.isUnmappable()) {
-                // a situation exactly like this is in DecodingReader too
-                Debug.assertTrue(false);
+            if (oldRemaining == cb.remaining()
+                && CoderResult.OVERFLOW == r) {
+                // if this happens, the decoding failed
+                // but the bufs didn't advance. Advance
+                // them manually and do manual replacing,
+                // otherwise we loop endlessly. This occurs
+                // at least when parsing latin1 files with
+                // lowercase o-umlauts in them.
+                // Note that this is at the moment copy-paste
+                // with RandomAccessCharacterFile.read()
+                cb.put('?');
+                bbuf.get();
             }
+            forceRead = (CoderResult.UNDERFLOW == r);
         }
         if (cb.remaining() == len)
             return -1;

Modified: trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java
==============================================================================
--- trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java	(original)
+++ trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java	Sat Aug 28 07:09:13 2010
@@ -370,23 +370,23 @@
         boolean decodeWasUnderflow = false;
         boolean atEof = false;
         while ((cbuf.remaining() > 0) && ! atEof) {
-
+            int oldRemaining = cbuf.remaining();
             atEof = ! ensureReadBbuf(decodeWasUnderflow);
             CoderResult r = cdec.decode(bbuf, cbuf, atEof );
+            if (oldRemaining == cbuf.remaining()
+                && CoderResult.OVERFLOW == r) {
+                // if this happens, the decoding failed
+                // but the bufs didn't advance. Advance
+                // them manually and do manual replacing,
+                // otherwise we loop endlessly. This occurs
+                // at least when parsing latin1 files with
+                // lowercase o-umlauts in them
+                // Note that this is at the moment copy-paste
+                // with DecodingReader.read()
+                cbuf.put('?');
+                bbuf.get();
+            }
             decodeWasUnderflow = (CoderResult.UNDERFLOW == r);
-            if (r.isMalformed())
-                // When reading encoded Unicode, we'd expect to require
-                // catching MalformedInput
-                throw new RACFMalformedInputException(bbuf.position(),
-                                                      (char)bbuf.get(bbuf.position()),
-                                                      cset.name());
-            if (r.isUnmappable())
-                // Since we're mapping TO unicode, we'd expect to be able
-                // to map all characters
-                Debug.assertTrue(false);
-            // OVERFLOW is a normal condition:
-            //  it's equal to cbuf.remaining() == 0
-            // ### EHU: really??? EXACTLY equal??
         }
         if (cbuf.remaining() == len) {
             return -1;

Added: trunk/abcl/test/lisp/abcl/latin1-tests.lisp
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/latin1-tests.lisp	Sat Aug 28 07:09:13 2010
@@ -0,0 +1,30 @@
+;;; latin1-tests.lisp
+;;;
+;;; Copyright (C) 2010 Ville Voutilainen
+;;; $Id$
+;;;
+;;; This program is free software; you can redistribute it and/or
+;;; modify it under the terms of the GNU General Public License
+;;; as published by the Free Software Foundation; either version 2
+;;; of the License, or (at your option) any later version.
+;;;
+;;; This program is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with this program; if not, write to the Free Software
+;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+(in-package #:abcl.test.lisp)
+
+(deftest normal-utf8.1
+  (load "utf8-umlauts.txt")
+  t)
+
+(deftest latin1.1
+  (load "latin1-umlauts.txt")
+  t)
+
+

Added: trunk/abcl/test/lisp/abcl/latin1-umlauts.txt
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/latin1-umlauts.txt	Sat Aug 28 07:09:13 2010
@@ -0,0 +1,3 @@
+;; some umlauts: ÄÄÄÄääääÖÖÖÖööööÅÅÅÅåååå
+(defun not-so-hard ()
+  (format t "just a debug print~%"))

Added: trunk/abcl/test/lisp/abcl/utf8-umlauts.txt
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/utf8-umlauts.txt	Sat Aug 28 07:09:13 2010
@@ -0,0 +1,3 @@
+;; some umlauts: ÄÄÄÄääääÖÖÖÖööööÅÅÅÅåååå
+(defun not-so-hard ()
+  (format t "just a debug print~%"))




More information about the armedbear-cvs mailing list