diff --git a/src/c/file.d b/src/c/file.d index 20f079e..b6bb9e8 100755 --- a/src/c/file.d +++ b/src/c/file.d @@ -1048,6 +1048,8 @@ user_multistate_encoder(cl_object stream, unsigned char *buffer, ecl_character c static ecl_character utf_8_decoder(cl_object stream) { + int utf8b = (stream->stream.format == @':UTF-8B' ? 1 : 0); + /* In understanding this code: * 0x8 = 1000, 0xC = 1100, 0xE = 1110, 0xF = 1111 * 0x1 = 0001, 0x3 = 0011, 0x7 = 0111, 0xF = 1111 @@ -1060,8 +1062,11 @@ utf_8_decoder(cl_object stream) if ((buffer[0] & 0x80) == 0) { return buffer[0]; } - unlikely_if ((buffer[0] & 0x40) == 0) + unlikely_if ((buffer[0] & 0x40) == 0) { + if (utf8b) + return (0xdc00 | buffer[0]); return decoding_error(stream, buffer, 1); + } if ((buffer[0] & 0x20) == 0) { cum = buffer[0] & 0x1F; nbytes = 1; @@ -1099,6 +1104,8 @@ static int utf_8_encoder(cl_object stream, unsigned char *buffer, ecl_character c) { int nbytes; + int utf8b = (stream->stream.format == @':UTF-8B' ? 1 : 0); + if (c < 0) { return 0; } else if (c <= 0x7F) { @@ -1109,6 +1116,18 @@ utf_8_encoder(cl_object stream, unsigned char *buffer, ecl_character c) buffer[0] = c | 0xC0; /*printf("\n; %04x ;: %04x :: %04x :\n", c_orig, buffer[0], buffer[1]);*/ nbytes = 2; + } else if (c <= 0xdcff && c >= 0xdc80) { + /* Special UTF-16 surrogate range used to implement UTF-8B */ + if (utf8b) { + /* Literal octet */ + buffer[0] = c; + nbytes = 1; + } else { + /* Treat octet like LATIN-1 */ + buffer[1] = c - 0x20; + buffer[0] = 0xc3; + nbytes = 2; + } } else if (c <= 0xFFFF) { buffer[2] = (c & 0x3f) | 0x80; c >>= 6; buffer[1] = (c & 0x3f) | 0x80; c >>= 6; @@ -2936,6 +2955,9 @@ parse_external_format(cl_object stream, cl_object format, int flags) if (format == @':UTF-8') { return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UTF_8; } + if (format == @':UTF-8B') { + return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UTF_8B; + } if (format == @':UCS-2') { return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UCS_2; } @@ -3019,6 +3041,13 
@@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags, stream->stream.encoder = utf_8_encoder; stream->stream.decoder = utf_8_decoder; break; + case ECL_STREAM_UTF_8B: + IO_STREAM_ELT_TYPE(stream) = @'character'; + byte_size = 8; + stream->stream.format = @':utf-8b'; + stream->stream.encoder = utf_8_encoder; + stream->stream.decoder = utf_8_decoder; + break; case ECL_STREAM_UCS_2: IO_STREAM_ELT_TYPE(stream) = @'character'; byte_size = 8*2; diff --git a/src/c/symbols_list.h b/src/c/symbols_list.h index e93452c..0c3330d 100755 --- a/src/c/symbols_list.h +++ b/src/c/symbols_list.h @@ -1824,6 +1824,7 @@ cl_symbols[] = { {KEY_ "LATIN-1", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "ISO-8859-1", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "UTF-8", KEYWORD, NULL, -1, OBJNULL}, +{KEY_ "UTF-8B", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "UCS-2", KEYWORD, NULL, -1, OBJNULL}, {KEY_ "UCS-4", KEYWORD, NULL, -1, OBJNULL}, diff --git a/src/c/symbols_list2.h b/src/c/symbols_list2.h index 90db2e8..9a25151 100644 --- a/src/c/symbols_list2.h +++ b/src/c/symbols_list2.h @@ -1824,6 +1824,7 @@ cl_symbols[] = { {KEY_ "LATIN-1",NULL}, {KEY_ "ISO-8859-1",NULL}, {KEY_ "UTF-8",NULL}, +{KEY_ "UTF-8B",NULL}, {KEY_ "UCS-2",NULL}, {KEY_ "UCS-4",NULL}, diff --git a/src/h/object.h b/src/h/object.h index 6de2792..ffc853d 100644 --- a/src/h/object.h +++ b/src/h/object.h @@ -595,6 +595,7 @@ enum { ECL_STREAM_ISO_8859_1 = 1, ECL_STREAM_LATIN_1 = 1, ECL_STREAM_UTF_8 = 2, + ECL_STREAM_UTF_8B = 2048, ECL_STREAM_UCS_2 = 3, ECL_STREAM_UCS_2LE = 5 + 128, ECL_STREAM_UCS_2BE = 5, diff --git a/src/lsp/iolib.lsp b/src/lsp/iolib.lsp index ebe5fda..84f8417 100644 --- a/src/lsp/iolib.lsp +++ b/src/lsp/iolib.lsp @@ -268,7 +268,7 @@ the one used internally by ECL compiled files." (let* ((basic-encodings #+unicode - '(:UTF-8 :UCS-2 :UCS-2BE :UCS-2LE :UCS-4 :UCS-4BE + '(:UTF-8 :UTF-8B :UCS-2 :UCS-2BE :UCS-2LE :UCS-4 :UCS-4BE :ISO-8859-1 :LATIN-1 :US-ASCII :DEFAULT) #-unicode '(:DEFAULT))