[armedbear-cvs] r12902 - in trunk/abcl: . src/org/armedbear/lisp/util test/lisp/abcl
Ville Voutilainen
vvoutilainen at common-lisp.net
Sat Aug 28 11:09:16 UTC 2010
Author: vvoutilainen
Date: Sat Aug 28 07:09:13 2010
New Revision: 12902
Log:
Fix reading of data containing Scandinavian Latin-1 characters,
and add a simple test for it. The UTF-8 test is just a sanity
check that umlauts encoded as UTF-8 aren't broken; the Latin-1
test properly fails without this patch and passes
with this patch.
Added:
trunk/abcl/test/lisp/abcl/latin1-tests.lisp (contents, props changed)
trunk/abcl/test/lisp/abcl/latin1-umlauts.txt
trunk/abcl/test/lisp/abcl/utf8-umlauts.txt
Modified:
trunk/abcl/abcl.asd
trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java
trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java
Modified: trunk/abcl/abcl.asd
==============================================================================
--- trunk/abcl/abcl.asd (original)
+++ trunk/abcl/abcl.asd Sat Aug 28 07:09:13 2010
@@ -45,6 +45,7 @@
(:file "url-pathname")
(:file "math-tests")
(:file "misc-tests")
+ (:file "latin1-tests")
(:file "bugs" :depends-on ("file-system-tests"))
(:file "pathname-tests")))))
Modified: trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java
==============================================================================
--- trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java (original)
+++ trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java Sat Aug 28 07:09:13 2010
@@ -45,6 +45,7 @@
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
import org.armedbear.lisp.Debug;
@@ -79,6 +80,8 @@
// we need to be able to unread the byte buffer
this.stream = new PushbackInputStream(stream, size);
this.cd = cs.newDecoder();
+ this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ this.cd.onMalformedInput(CodingErrorAction.REPLACE);
this.ce = cs.newEncoder();
bbuf = ByteBuffer.allocate(size);
bbuf.flip(); // mark the buffer as 'needs refill'
@@ -89,6 +92,8 @@
*/
public final void setCharset(Charset cs) {
this.cd = cs.newDecoder();
+ this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ this.cd.onMalformedInput(CodingErrorAction.REPLACE);
this.ce = cs.newEncoder();
}
@@ -257,18 +262,23 @@
while (cb.remaining() > 0 && notEof) {
+ int oldRemaining = cb.remaining();
notEof = ensureBbuf(forceRead);
CoderResult r = cd.decode(bbuf, cb, ! notEof);
- forceRead = (CoderResult.UNDERFLOW == r);
-
- if (r.isMalformed()) {
- throw new RACFMalformedInputException(bbuf.position(),
- (char)bbuf.get(bbuf.position()),
- cd.charset().name());
- } else if (r.isUnmappable()) {
- // a situation exactly like this is in DecodingReader too
- Debug.assertTrue(false);
+ if (oldRemaining == cb.remaining()
+ && CoderResult.OVERFLOW == r) {
+ // if this happens, the decoding failed
+ // but the bufs didn't advance. Advance
+ // them manually and do manual replacing,
+ // otherwise we loop endlessly. This occurs
+ // at least when parsing latin1 files with
+ // lowercase o-umlauts in them.
+ // Note that this is at the moment copy-paste
+ // with RandomAccessCharacterFile.read()
+ cb.put('?');
+ bbuf.get();
}
+ forceRead = (CoderResult.UNDERFLOW == r);
}
if (cb.remaining() == len)
return -1;
Modified: trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java
==============================================================================
--- trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java (original)
+++ trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java Sat Aug 28 07:09:13 2010
@@ -370,23 +370,23 @@
boolean decodeWasUnderflow = false;
boolean atEof = false;
while ((cbuf.remaining() > 0) && ! atEof) {
-
+ int oldRemaining = cbuf.remaining();
atEof = ! ensureReadBbuf(decodeWasUnderflow);
CoderResult r = cdec.decode(bbuf, cbuf, atEof );
+ if (oldRemaining == cbuf.remaining()
+ && CoderResult.OVERFLOW == r) {
+ // if this happens, the decoding failed
+ // but the bufs didn't advance. Advance
+ // them manually and do manual replacing,
+ // otherwise we loop endlessly. This occurs
+ // at least when parsing latin1 files with
+ // lowercase o-umlauts in them
+ // Note that this is at the moment copy-paste
+ // with DecodingReader.read()
+ cbuf.put('?');
+ bbuf.get();
+ }
decodeWasUnderflow = (CoderResult.UNDERFLOW == r);
- if (r.isMalformed())
- // When reading encoded Unicode, we'd expect to require
- // catching MalformedInput
- throw new RACFMalformedInputException(bbuf.position(),
- (char)bbuf.get(bbuf.position()),
- cset.name());
- if (r.isUnmappable())
- // Since we're mapping TO unicode, we'd expect to be able
- // to map all characters
- Debug.assertTrue(false);
- // OVERFLOW is a normal condition:
- // it's equal to cbuf.remaining() == 0
- // ### EHU: really??? EXACTLY equal??
}
if (cbuf.remaining() == len) {
return -1;
Added: trunk/abcl/test/lisp/abcl/latin1-tests.lisp
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/latin1-tests.lisp Sat Aug 28 07:09:13 2010
@@ -0,0 +1,30 @@
+;;; latin1-tests.lisp
+;;;
+;;; Copyright (C) 2010 Ville Voutilainen
+;;; $Id$
+;;;
+;;; This program is free software; you can redistribute it and/or
+;;; modify it under the terms of the GNU General Public License
+;;; as published by the Free Software Foundation; either version 2
+;;; of the License, or (at your option) any later version.
+;;;
+;;; This program is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with this program; if not, write to the Free Software
+;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+(in-package #:abcl.test.lisp)
+
+(deftest normal-utf8.1
+ (load "utf8-umlauts.txt")
+ t)
+
+(deftest latin1.1
+ (load "latin1-umlauts.txt")
+ t)
+
+
Added: trunk/abcl/test/lisp/abcl/latin1-umlauts.txt
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/latin1-umlauts.txt Sat Aug 28 07:09:13 2010
@@ -0,0 +1,3 @@
+;; some umlauts: ÄÄÄÄääääÖÖÖÖööööÅÅÅÅåååå
+(defun not-so-hard ()
+ (format t "just a debug print~%"))
Added: trunk/abcl/test/lisp/abcl/utf8-umlauts.txt
==============================================================================
--- (empty file)
+++ trunk/abcl/test/lisp/abcl/utf8-umlauts.txt Sat Aug 28 07:09:13 2010
@@ -0,0 +1,3 @@
+;; some umlauts: ÄÄÄÄääääÖÖÖÖööööÅÅÅÅåååå
+(defun not-so-hard ()
+ (format t "just a debug print~%"))
More information about the armedbear-cvs
mailing list