CVS update by aidan xemacs/src, unicode.c
...
xemacs-cvs at xemacs.org
xemacs-cvs at xemacs.org
Sat Aug 4 16:00:36 EDT 2007
User: aidan
Date: 07/08/04 22:00:36
Modified: xemacs/src ChangeLog charset.h lisp.h lread.c mule-coding.c
unicode.c
Log:
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
Revision Changes Path
1.822 +10 -0 XEmacs/xemacs/lisp/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/ChangeLog,v
retrieving revision 1.821
retrieving revision 1.822
diff -u -p -r1.821 -r1.822
--- ChangeLog 2007/08/02 06:37:48 1.821
+++ ChangeLog 2007/08/04 20:00:10 1.822
@@ -1,3 +1,13 @@
+2007-08-04 Aidan Kehoe <kehoea at parhasard.net>
+
+ * unicode.el:
+ * unicode.el (utf-32):
+ * unicode.el (utf-32-little-endian):
+ Add UTF-32 coding systems.
+
+ * unicode.el (decode-char):
+ Only accept valid Unicode in this function.
+
2007-08-02 Mike Sperber <mike at xemacs.org>
* startup.el (startup-setup-paths): Fix typo in init expression
1.22 +24 -0 XEmacs/xemacs/lisp/unicode.el
Index: unicode.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/unicode.el,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -p -r1.21 -r1.22
--- unicode.el 2007/07/28 08:02:16 1.21
+++ unicode.el 2007/08/04 20:00:13 1.22
@@ -233,6 +233,26 @@ Standard encoding for representing Unico
little-endian t))
(make-coding-system
+ ;; UTF-32 with no little-endian property set. The docstring previously
+ ;; said characters "less than" #x10FFFF were unsupported, which inverted
+ ;; the meaning: it is code points beyond #x10FFFF that UTF-32 excludes.
+ 'utf-32 'unicode
+ "UTF-32"
+ '(mnemonic "UTF32"
+ documentation
+ "UTF-32 Unicode encoding -- fixed-width four-byte encoding,
+characters greater than #x10FFFF are not supported. "
+ unicode-type utf-32))
+
+(make-coding-system
+ ;; Little-endian UTF-32. The unicode-type must be utf-32, not ucs-4:
+ ;; the decoder only rejects values above #x10FFFF for the utf-32 type,
+ ;; which is the behaviour the docstring promises.
+ 'utf-32-little-endian 'unicode
+ "UTF-32 Little Endian"
+ '(mnemonic "UTF32-LE"
+ documentation
+ "Little-endian version of UTF-32 Unicode encoding.
+
+A fixed-width four-byte encoding, characters greater than #x10FFFF are not
+supported. "
+ unicode-type utf-32 little-endian t))
+
+(make-coding-system
'utf-8 'unicode
"UTF-8"
'(mnemonic "UTF8"
@@ -274,6 +294,10 @@ Standard encoding for representing UTF-8
(defun decode-char (quote-ucs code &optional restriction)
"FSF compatibility--return Mule character with Unicode codepoint CODE.
The first argument, QUOTE-UCS, must be 'ucs; the third argument,
RESTRICTION, is ignored. CODE is range-checked against #x0 and #x10FFFF. "
+ ;; We're prepared to accept invalid Unicode in unicode-to-char, but not in
+ ;; this function, which is the API that should actually be used, since
+ ;; it's available in GNU and in Mule-UCS.
+ (check-argument-range code #x0 #x10FFFF)
(assert (eq quote-ucs 'ucs) t
"Sorry, decode-char doesn't yet support anything but the UCS. ")
(unicode-to-char code))
1.1078 +47 -0 XEmacs/xemacs/src/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/ChangeLog,v
retrieving revision 1.1077
retrieving revision 1.1078
diff -u -p -r1.1077 -r1.1078
--- ChangeLog 2007/07/26 11:15:04 1.1077
+++ ChangeLog 2007/08/04 20:00:20 1.1078
@@ -1,3 +1,50 @@
+2007-08-04 Aidan Kehoe <kehoea at parhasard.net>
+
+ * charset.h:
+ * charset.h (enum unicode_type):
+ Add UNICODE_UTF_32.
+ * lisp.h:
+ Add Qutf_32.
+ * lread.c (read_unicode_escape):
+ Error on an invalid Unicode escape; error on no mapping, as GNU does.
+
+ * mule-coding.c:
+ * mule-coding.c (dynarr_add_2022_one_dimension):
+ * mule-coding.c (dynarr_add_2022_two_dimensions):
+ * mule-coding.c (struct iso2022_coding_stream):
+ * mule-coding.c (decode_unicode_char):
+ * mule-coding.c (indicate_invalid_utf_8):
+ * mule-coding.c (iso2022_decode):
+ * unicode.c:
+ * unicode.c (struct unicode_coding_stream):
+ * unicode.c (decode_unicode_char):
+ * unicode.c (DECODE_ERROR_OCTET):
+ * unicode.c (indicate_invalid_utf_8):
+ * unicode.c (encode_unicode_char_1):
+ * unicode.c (encode_unicode_char):
+ * unicode.c (unicode_convert):
+ * unicode.c (unicode_putprop):
+ * unicode.c (unicode_getprop):
+ * unicode.c (syms_of_unicode):
+ Make UTF-8 and UTF-16 handling more robust; indicate error
+ sequences when decoding, passing the octets as distinct from the
+ corresponding ISO8859-1 characters, and (by default) writing them
+ to disk on encoding. Don't accept over-long UTF-8 sequences, codes
+ >= #x110000, or UTF-16 surrogates on reading in the utf-8 coding
+ system; represent them as error sequences.
+
+ Do accept code points above #x110000 in the ISO IR 196 handling,
+ since we decode Unicode error sequences to "Unicode" code points
+ starting at 0x200000, and will need to save them as such in
+ escape-quoted. Do not accept over-long UTF-8 sequences or UTF-16
+ surrogates in escape-quoted.
+
+ This change means that when a non-UTF-8 file is opened as UTF-8,
+ one change is made, and the file is immediately saved, the non-ASCII
+ characters are not corrupted. In Europe, this is a distinct win.
+
+ Add UCS-4, UTF-32 as coding systems.
+
2007-07-26 Aidan Kehoe <kehoea at parhasard.net>
* mule-ccl.c (ccl_driver):
1.17 +10 -2 XEmacs/xemacs/src/charset.h
Index: charset.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/charset.h,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -p -r1.16 -r1.17
--- charset.h 2006/11/12 13:40:07 1.16
+++ charset.h 2007/08/04 20:00:23 1.17
@@ -567,12 +567,20 @@ enum unicode_type
UNICODE_UTF_16,
UNICODE_UTF_8,
UNICODE_UTF_7,
- UNICODE_UCS_4
+ UNICODE_UCS_4,
+ UNICODE_UTF_32
};
void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
int USED_IF_MULE (l), unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian);
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such);
+
+/* Undecodable octets are represented as code points starting at this
+ value: an error octet N becomes UNICODE_ERROR_OCTET_RANGE_START + N. */
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
+
+/* Surrogate tests for a single UTF-16 code unit. Each macro masks CH and
+ compares, so bits 16 and above of CH are ignored: pass a 16-bit value.
+ first: CH & 0xFFFF in 0xD800-0xDBFF; last: 0xDC00-0xDFFF;
+ surrogate: either, i.e. 0xD800-0xDFFF. */
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
void set_charset_registries(Lisp_Object charset, Lisp_Object registries);
1.146 +1 -1 XEmacs/xemacs/src/lisp.h
Index: lisp.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lisp.h,v
retrieving revision 1.145
retrieving revision 1.146
diff -u -p -r1.145 -r1.146
--- lisp.h 2007/05/26 18:28:23 1.145
+++ lisp.h 2007/08/04 20:00:23 1.146
@@ -5488,7 +5488,7 @@ void init_charset_unicode_tables (Lisp_O
void free_charset_unicode_tables (Lisp_Object charset);
void recalculate_unicode_precedence (void);
extern Lisp_Object Qunicode;
-extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
+extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
#ifdef MEMORY_USAGE_STATS
Bytecount compute_from_unicode_table_size (Lisp_Object charset,
struct overhead_stats *stats);
1.82 +15 -13 XEmacs/xemacs/src/lread.c
Index: lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.81
retrieving revision 1.82
diff -u -p -r1.81 -r1.82
--- lread.c 2006/11/07 15:58:24 1.81
+++ lread.c 2007/08/04 20:00:24 1.82
@@ -1694,24 +1694,26 @@ read_unicode_escape (Lisp_Object readcha
}
}
+ /* The Unicode code space ends at U+10FFFF, so #x110000 itself is
+ already out of range; reject it along with anything larger or
+ negative. */
+ if (i > 0x10FFFF || i < 0)
+ {
+ syntax_error ("Not a Unicode code point", make_int(i));
+ }
+
lisp_char = Funicode_to_char(make_int(i), Qnil);
if (EQ(Qnil, lisp_char))
{
- /* This is ugly and horrible and trashes the user's data, but
- it's what unicode.c does. In the future, unicode-to-char
- should not return nil. */
-#ifdef MULE
- i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
- i = '~';
-#endif
- return i;
- }
- else
- {
- return XCHAR(lisp_char);
+ /* Will happen on non-Mule. Silent corruption is what happens
+ elsewhere, and we used to do that to be consistent, but GNU error,
+ so people writing portable code need to be able to handle that, and
+ given a choice I prefer that behaviour.
+
+ An undesirable aspect to this error is that the code point is shown
+ as a decimal integer, which is mostly unreadable. */
+ syntax_error ("Unsupported Unicode code point", make_int(i));
}
+
+ return XCHAR(lisp_char);
}
1.40 +155 -70 XEmacs/xemacs/src/mule-coding.c
Index: mule-coding.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-coding.c,v
retrieving revision 1.39
retrieving revision 1.40
diff -u -p -r1.39 -r1.40
--- mule-coding.c 2006/11/23 13:43:19 1.39
+++ mule-coding.c 2007/08/04 20:00:24 1.40
@@ -104,7 +104,7 @@ dynarr_add_2022_one_dimension (Lisp_Obje
if (XCHARSET_ENCODE_AS_UTF_8 (charset))
{
encode_unicode_char (charset, c & charmask, 0,
- dst, UNICODE_UTF_8, 0);
+ dst, UNICODE_UTF_8, 0, 0);
}
else
{
@@ -123,7 +123,7 @@ dynarr_add_2022_two_dimensions (Lisp_Obj
encode_unicode_char (charset,
ch & charmask,
c & charmask, dst,
- UNICODE_UTF_8, 0);
+ UNICODE_UTF_8, 0, 0);
}
else
{
@@ -969,6 +969,7 @@ struct iso2022_coding_stream
/* Used for handling UTF-8. */
unsigned char counter;
+ unsigned char indicated_length;
};
static const struct memory_description ccs_description_1[] =
@@ -1804,6 +1805,39 @@ ensure_correct_direction (int direction,
}
}
+/* Note that this name conflicts with a function in unicode.c. */
+/* Convert the code point UCS to a character with Funicode_to_char and
+ append that character's internal text representation to DST.
+ Asserts that Funicode_to_char did not return nil. */
+static void
+decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
+{
+ Ibyte work[MAX_ICHAR_LEN];
+ int len;
+ Lisp_Object chr;
+
+ chr = Funicode_to_char(make_int(ucs), Qnil);
+ assert (!NILP(chr));
+ /* set_itext_ichar fills WORK and returns the number of bytes used. */
+ len = set_itext_ichar (work, XCHAR(chr));
+ Dynarr_add_many (dst, work, len);
+}
+
+/* Append to DST the character standing for the undecodable byte OCTET,
+ i.e. the code point OCTET + UNICODE_ERROR_OCTET_RANGE_START. */
+#define DECODE_ERROR_OCTET(octet, dst) \
+ decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
+
+/* Re-emit, as error octets, the bytes of a UTF-8 sequence that turned out
+ to be invalid or incomplete.
+
+ INDICATED_LENGTH is the sequence length announced by the lead byte,
+ COUNTER the number of bytes still outstanding, and CH the payload bits
+ accumulated so far (six per continuation byte). The original bytes are
+ rebuilt from CH, most significant first, and passed to
+ DECODE_ERROR_OCTET one at a time. */
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+ unsigned char counter,
+ int ch, unsigned_char_dynarr *dst)
+{
+ /* Number of bytes of the sequence that were actually consumed. */
+ Binbyte stored = indicated_length - counter;
+ /* Prefix bits of the lead byte, indexed by the announced length. */
+ Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+ while (stored > 0)
+ {
+ DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+ dst);
+ /* Every byte after the first is a continuation byte: 10xxxxxx. */
+ mask = 0x80, stored--;
+ }
+}
+
/* Convert ISO2022-format data to internal format. */
static Bytecount
@@ -1907,9 +1941,7 @@ iso2022_decode (struct coding_stream *st
else if (flags & ISO_STATE_UTF_8)
{
unsigned char counter = data->counter;
- Ibyte work[MAX_ICHAR_LEN];
- int len;
- Lisp_Object chr;
+ unsigned char indicated_length = data->indicated_length;
if (ISO_CODE_ESC == c)
{
@@ -1919,74 +1951,127 @@ iso2022_decode (struct coding_stream *st
data->esc_bytes_index = 1;
continue;
}
-
- switch (counter)
- {
- case 0:
- if (c >= 0xfc)
- {
- ch = c & 0x01;
- counter = 5;
- }
- else if (c >= 0xf8)
- {
- ch = c & 0x03;
- counter = 4;
- }
- else if (c >= 0xf0)
- {
- ch = c & 0x07;
- counter = 3;
- }
- else if (c >= 0xe0)
- {
- ch = c & 0x0f;
- counter = 2;
- }
- else if (c >= 0xc0)
- {
- ch = c & 0x1f;
- counter = 1;
- }
- else
- /* ASCII, or the lower control characters.
-
- Perhaps we should signal an error if the character is in
- the range 0x80-0xc0; this is illegal UTF-8. */
- Dynarr_add (dst, (c & 0x7f));
-
- break;
- case 1:
- ch = (ch << 6) | (c & 0x3f);
- chr = Funicode_to_char(make_int(ch), Qnil);
-
- if (!NILP (chr))
- {
- assert(CHARP(chr));
- len = set_itext_ichar (work, XCHAR(chr));
- Dynarr_add_many (dst, work, len);
- }
- else
- {
- /* Shouldn't happen, this code should only be enabled in
- XEmacsen with support for all of Unicode. */
- Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
- Dynarr_add (dst, 34 + 128);
- Dynarr_add (dst, 46 + 128);
- }
-
- ch = 0;
- counter = 0;
- break;
- default:
- ch = (ch << 6) | (c & 0x3f);
- counter--;
- }
- if (str->eof)
- DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ if (0 == counter)
+ {
+ if (0 == (c & 0x80))
+ {
+ /* ASCII. */
+ decode_unicode_char (c, dst);
+ }
+ else if (0 == (c & 0x40))
+ {
+ /* Highest bit set, second highest not--there's
+ something wrong. */
+ DECODE_ERROR_OCTET (c, dst);
+ }
+ else if (0 == (c & 0x20))
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ indicated_length = 2;
+ }
+ else if (0 == (c & 0x10))
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ indicated_length = 3;
+ }
+ else if (0 == (c & 0x08))
+ {
+ ch = c & 0x0f;
+ counter = 3;
+ indicated_length = 4;
+ }
+ /* We support lengths longer than 4 here, since we want to
+ represent UTF-8 error chars as distinct from the
+ corresponding ISO 8859-1 characters in escape-quoted.
+
+ However, we can't differentiate UTF-8 error chars as
+ written to disk, and UTF-8 errors in escape-quoted. This
+ is not a big problem;
+ non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
+ deployed, in practice, so if such a sequence of octets
+ occurs, XEmacs generated it. */
+ else if (0 == (c & 0x04))
+ {
+ ch = c & 0x03;
+ counter = 4;
+ indicated_length = 5;
+ }
+ else if (0 == (c & 0x02))
+ {
+ ch = c & 0x01;
+ counter = 5;
+ indicated_length = 6;
+ }
+ else
+ {
+ /* #xFF is not a valid leading byte in any form of
+ UTF-8. */
+ DECODE_ERROR_OCTET (c, dst);
+
+ }
+ }
+ else
+ {
+ /* counter != 0 */
+ if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst);
+ if (c & 0x80)
+ {
+ DECODE_ERROR_OCTET (c, dst);
+ }
+ else
+ {
+ /* The character just read is ASCII. Treat it as
+ such. */
+ decode_unicode_char (c, dst);
+ }
+ ch = 0;
+ counter = 0;
+ }
+ else
+ {
+ ch = (ch << 6) | (c & 0x3f);
+ counter--;
+
+ /* Just processed the final byte. Emit the character. */
+ if (!counter)
+ {
+ /* Don't accept over-long sequences, or surrogates.
+
+ The surrogate test is an explicit range check rather
+ than valid_utf_16_surrogate(): that macro masks with
+ 0xF800 and so ignores bits 16 and above, which would
+ wrongly reject valid code points such as U+1D800. */
+ if ((ch < 0x80) ||
+ ((ch < 0x800) && indicated_length > 2) ||
+ ((ch < 0x10000) && indicated_length > 3) ||
+ /* We accept values above #x110000 in
+ escape-quoted, though not in UTF-8. */
+ (ch >= 0xD800 && ch <= 0xDFFF))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst);
+ }
+ else
+ {
+ decode_unicode_char (ch, dst);
+ }
+ ch = 0;
+ }
+ }
+ }
+
+ if (str->eof && ch)
+ {
+ DECODE_ERROR_OCTET (ch, dst);
+ ch = 0;
+ }
data->counter = counter;
+ data->indicated_length = indicated_length;
}
else if (byte_c0_p (c) || byte_c1_p (c))
{ /* Control characters */
1.38 +356 -131 XEmacs/xemacs/src/unicode.c
Index: unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.37
retrieving revision 1.38
diff -u -p -r1.37 -r1.38
--- unicode.c 2007/05/13 11:11:30 1.37
+++ unicode.c 2007/08/04 20:00:24 1.38
@@ -146,13 +146,6 @@ Boston, MA 02111-1307, USA. */
(1) User-defined charsets: It would be inconvenient to require all
dumped user-defined charsets to be reloaded at init time.
- (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time,
- we don't load the tables until after we've parsed the current
- directories, and we run into a real bootstrapping problem, if the
- directories themselves are non-ISO-8859-1. This is potentially fixable
- once we switch to using Unicode internally, so we don't have to do any
- conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
-
NB With run-time loading, we load in init-mule-at-startup, in
mule-cmds.el. This is called from startup.el, which is quite late in
the initialization process -- but data-directory isn't set until then.
@@ -192,7 +185,7 @@ Boston, MA 02111-1307, USA. */
convert them back.) */
Lisp_Object Qunicode;
-Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
+Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
Lisp_Object Qneed_bom;
Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
@@ -218,10 +211,6 @@ Lisp_Object Qutf_8_bom;
trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
} while (0)
-#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
-#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
-#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
-
#ifdef MULE
/* Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1703,6 +1692,7 @@ struct unicode_coding_stream
{
/* decode */
unsigned char counter;
+ unsigned char indicated_length;
int seen_char;
/* encode */
Lisp_Object current_charset;
@@ -1716,11 +1706,6 @@ static const struct memory_description u
DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
-/* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use
- <GETA MARK> (U+3013) of JIS X 0208, which means correct character
- is not found, instead.
- #### do something more appropriate (use blob?)
- Danger, Will Robinson! Data loss. Should we signal user? */
static void
decode_unicode_char (int ch, unsigned_char_dynarr *dst,
struct unicode_coding_stream *data,
@@ -1755,9 +1740,32 @@ decode_unicode_char (int ch, unsigned_ch
data->seen_char = 1;
}
+/* Decode the undecodable byte OCTET as the code point
+ OCTET + UNICODE_ERROR_OCTET_RANGE_START, passing DST, DATA and
+ IGNORE_BOM through to decode_unicode_char unchanged. */
+#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
+ decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
+ dst, data, ignore_bom)
+
+/* Re-emit, as error octets, the bytes of a UTF-8 sequence that turned out
+ to be invalid or incomplete.
+
+ INDICATED_LENGTH is the sequence length announced by the lead byte,
+ COUNTER the number of bytes still outstanding, and CH the payload bits
+ accumulated so far (six per continuation byte). The original bytes are
+ rebuilt from CH, most significant first, and passed to
+ DECODE_ERROR_OCTET one at a time; DST, DATA and IGNORE_BOM are handed
+ through unchanged. */
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+ unsigned char counter,
+ int ch, unsigned_char_dynarr *dst,
+ struct unicode_coding_stream *data,
+ unsigned int ignore_bom)
+{
+ /* Number of bytes of the sequence that were actually consumed. */
+ Binbyte stored = indicated_length - counter;
+ /* Prefix bits of the lead byte, indexed by the announced length. */
+ Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+ while (stored > 0)
+ {
+ DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+ dst, data, ignore_bom);
+ /* Every byte after the first is a continuation byte: 10xxxxxx. */
+ mask = 0x80, stored--;
+ }
+}
+
static void
encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian)
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such)
{
switch (type)
{
@@ -1767,53 +1775,105 @@ encode_unicode_char_1 (int code, unsigne
if (code < 0x10000) {
Dynarr_add (dst, (unsigned char) (code & 255));
Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- } else {
- /* Little endian; least significant byte first. */
- int first, second;
-
- CODE_TO_UTF_16_SURROGATES(code, first, second);
-
- Dynarr_add (dst, (unsigned char) (first & 255));
- Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-
- Dynarr_add (dst, (unsigned char) (second & 255));
- Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
- }
+ } else if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else if (code < 0x110000)
+ {
+ /* Little endian; least significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) (first & 255));
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ }
+ else
+ {
+ /* Not valid Unicode. Pass U+FFFD, least significant byte
+ first. */
+ Dynarr_add (dst, (unsigned char) 0xFD);
+ Dynarr_add (dst, (unsigned char) 0xFF);
+ }
}
else
{
if (code < 0x10000) {
Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
Dynarr_add (dst, (unsigned char) (code & 255));
- } else {
- /* Big endian; most significant byte first. */
- int first, second;
-
- CODE_TO_UTF_16_SURROGATES(code, first, second);
-
- Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (first & 255));
-
- Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (second & 255));
- }
+ } else if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else if (code < 0x110000)
+ {
+ /* Big endian; most significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (first & 255));
+
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ }
+ else
+ {
+ /* Not valid Unicode. Pass U+FFFD, most significant byte
+ first. */
+ Dynarr_add (dst, (unsigned char) 0xFF);
+ Dynarr_add (dst, (unsigned char) 0xFD);
+ }
}
break;
case UNICODE_UCS_4:
+ case UNICODE_UTF_32:
if (little_endian)
{
- Dynarr_add (dst, (unsigned char) (code & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) (code >> 24));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ /* We generate and accept incorrect sequences here, which is
+ okay, in the interest of preservation of the user's
+ data. */
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) (code >> 24));
+ }
}
else
{
- Dynarr_add (dst, (unsigned char) (code >> 24));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (code & 255));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ /* We generate and accept incorrect sequences here, which is okay,
+ in the interest of preservation of the user's data. */
+ Dynarr_add (dst, (unsigned char) (code >> 24));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ }
}
break;
@@ -1842,11 +1902,25 @@ encode_unicode_char_1 (int code, unsigne
}
else if (code <= 0x3ffffff)
{
- Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
- Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
+
+#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
+ && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
+#error "This code needs to be rewritten. "
+#endif
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
+ Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
+ Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
+ Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
+ Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
+ }
}
else
{
@@ -1870,7 +1944,8 @@ encode_unicode_char_1 (int code, unsigne
void
encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
int USED_IF_MULE (l), unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian)
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such)
{
#ifdef MULE
int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
@@ -1896,7 +1971,8 @@ encode_unicode_char (Lisp_Object USED_IF
int code = h;
#endif /* MULE */
- encode_unicode_char_1 (code, dst, type, little_endian);
+ encode_unicode_char_1 (code, dst, type, little_endian,
+ write_error_characters_as_such);
}
static Bytecount
@@ -1915,6 +1991,8 @@ unicode_convert (struct coding_stream *s
if (str->direction == CODING_DECODE)
{
unsigned char counter = data->counter;
+ unsigned char indicated_length
+ = data->indicated_length;
while (n--)
{
@@ -1923,46 +2001,92 @@ unicode_convert (struct coding_stream *s
switch (type)
{
case UNICODE_UTF_8:
- switch (counter)
- {
- case 0:
- if (c >= 0xfc)
- {
- ch = c & 0x01;
- counter = 5;
- }
- else if (c >= 0xf8)
- {
- ch = c & 0x03;
- counter = 4;
- }
- else if (c >= 0xf0)
- {
- ch = c & 0x07;
- counter = 3;
- }
- else if (c >= 0xe0)
- {
- ch = c & 0x0f;
- counter = 2;
- }
- else if (c >= 0xc0)
- {
- ch = c & 0x1f;
- counter = 1;
- }
- else
- decode_unicode_char (c, dst, data, ignore_bom);
- break;
- case 1:
- ch = (ch << 6) | (c & 0x3f);
- decode_unicode_char (ch, dst, data, ignore_bom);
- ch = 0;
- counter = 0;
- break;
- default:
- ch = (ch << 6) | (c & 0x3f);
- counter--;
+ if (0 == counter)
+ {
+ if (0 == (c & 0x80))
+ {
+ /* ASCII. */
+ decode_unicode_char (c, dst, data, ignore_bom);
+ }
+ else if (0 == (c & 0x40))
+ {
+ /* Highest bit set, second highest not--there's
+ something wrong. */
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ }
+ else if (0 == (c & 0x20))
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ indicated_length = 2;
+ }
+ else if (0 == (c & 0x10))
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ indicated_length = 3;
+ }
+ else if (0 == (c & 0x08))
+ {
+ ch = c & 0x0f;
+ counter = 3;
+ indicated_length = 4;
+ }
+ else
+ {
+ /* We don't supports lengths longer than 4 in
+ external-format data. */
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+
+ }
+ }
+ else
+ {
+ /* counter != 0 */
+ if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst, data, ignore_bom);
+ if (c & 0x80)
+ {
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ }
+ else
+ {
+ /* The character just read is ASCII. Treat it as
+ such. */
+ decode_unicode_char (c, dst, data, ignore_bom);
+ }
+ ch = 0;
+ counter = 0;
+ }
+ else
+ {
+ ch = (ch << 6) | (c & 0x3f);
+ counter--;
+ /* Just processed the final byte. Emit the character. */
+ if (!counter)
+ {
+ /* Don't accept over-long sequences, surrogates,
+ or codes above #x10FFFF.
+
+ The surrogate test is an explicit range check rather
+ than valid_utf_16_surrogate(): that macro masks with
+ 0xF800 and so ignores bits 16 and above, which would
+ wrongly reject valid code points such as U+1D800.
+ The upper bound is #x10FFFF; #x110000 is itself
+ already outside Unicode. */
+ if ((ch < 0x80) ||
+ ((ch < 0x800) && indicated_length > 2) ||
+ ((ch < 0x10000) && indicated_length > 3) ||
+ (ch >= 0xD800 && ch <= 0xDFFF) || (ch > 0x10FFFF))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ decode_unicode_char (ch, dst, data, ignore_bom);
+ }
+ ch = 0;
+ }
+ }
}
break;
@@ -1972,39 +2096,51 @@ unicode_convert (struct coding_stream *s
ch = (c << counter) | ch;
else
ch = (ch << 8) | c;
- counter += 8;
- if (counter == 16 && valid_utf_16_first_surrogate(ch))
- break;
+ counter += 8;
- if (counter == 16)
- {
+ if (16 == counter)
+ {
int tempch = ch;
+
+ if (valid_utf_16_first_surrogate(ch))
+ {
+ break;
+ }
ch = 0;
counter = 0;
decode_unicode_char (tempch, dst, data, ignore_bom);
}
- if (counter == 32)
+ else if (32 == counter)
{
int tempch;
- /* #### Signalling an error may be a bit extreme. Should
- we try and read it in anyway? */
- if (!valid_utf_16_first_surrogate(ch >> 16)
- || !valid_utf_16_last_surrogate(ch & 0xFFFF))
+
+ if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
{
- signal_error(Qtext_conversion_error,
- "Invalid UTF-16 surrogate sequence",
- Qunbound);
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
}
- tempch = utf_16_surrogates_to_code((ch >> 16),
- (ch & 0xffff));
+ else
+ {
+ tempch = utf_16_surrogates_to_code((ch >> 16),
+ (ch & 0xffff));
+ decode_unicode_char(tempch, dst, data, ignore_bom);
+ }
ch = 0;
counter = 0;
- decode_unicode_char(tempch, dst, data, ignore_bom);
- }
+ }
+ else
+ assert(8 == counter || 24 == counter);
break;
case UNICODE_UCS_4:
+ case UNICODE_UTF_32:
if (little_endian)
ch = (c << counter) | ch;
else
@@ -2012,15 +2148,43 @@ unicode_convert (struct coding_stream *s
counter += 8;
if (counter == 32)
{
- int tempch = ch;
- ch = 0;
- counter = 0;
- if (tempch < 0)
+ if (ch > 0x10ffff)
{
- /* !!#### indicate an error */
- tempch = '~';
+ /* ch is not a legal Unicode character. We're fine
+ with that in UCS-4, though not in UTF-32. */
+ if (UNICODE_UCS_4 == type && ch < 0x80000000)
+ {
+ decode_unicode_char (ch, dst, data, ignore_bom);
+ }
+ else if (little_endian)
+ {
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
+ }
}
- decode_unicode_char (tempch, dst, data, ignore_bom);
+ else
+ {
+ decode_unicode_char (ch, dst, data, ignore_bom);
+ }
+ ch = 0;
+ counter = 0;
}
break;
@@ -2032,10 +2196,67 @@ unicode_convert (struct coding_stream *s
}
}
- if (str->eof)
- DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+
+ /* End of input with a partially accumulated character: emit each
+ pending octet as an error octet, in the order it was read.
+
+ In the little-endian case the decoder accumulates with
+ ch = (c << counter) | ch, so the first octet read sits in the low
+ eight bits; in the big-endian case it sits in the high bits. */
+ if (str->eof && ch)
+ {
+ switch (type)
+ {
+ case UNICODE_UTF_8:
+ indicate_invalid_utf_8(indicated_length,
+ counter, ch, dst, data,
+ ignore_bom);
+ break;
+
+ case UNICODE_UTF_16:
+ case UNICODE_UCS_4:
+ case UNICODE_UTF_32:
+ if (8 == counter)
+ {
+ DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
+ }
+ else if (16 == counter)
+ {
+ if (little_endian)
+ {
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+ }
+ }
+ else if (24 == counter)
+ {
+ if (little_endian)
+ {
+ /* Lowest byte was read first; previously this branch
+ emitted the third octet before the first two. */
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
+ }
+ }
+ else assert(0);
+ break;
+ }
+ ch = 0;
+ }
data->counter = counter;
+ data->indicated_length = indicated_length;
}
else
{
@@ -2054,7 +2275,7 @@ unicode_convert (struct coding_stream *s
if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
{
- encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
+ encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
data->wrote_bom = 1;
}
@@ -2068,7 +2289,7 @@ unicode_convert (struct coding_stream *s
{ /* Processing ASCII character */
ch = 0;
encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
char_boundary = 1;
}
@@ -2092,20 +2313,20 @@ unicode_convert (struct coding_stream *s
for the rationale behind subtracting #xa0 from the
character's code. */
encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
- type, little_endian);
+ type, little_endian, 1);
else
{
switch (XCHARSET_REP_BYTES (charset))
{
case 2:
encode_unicode_char (charset, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
break;
case 3:
if (XCHARSET_PRIVATE_P (charset))
{
encode_unicode_char (charset, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else if (ch)
@@ -2119,7 +2340,7 @@ unicode_convert (struct coding_stream *s
handle this yet. */
encode_unicode_char (Vcharset_ascii, '~', 0,
dst, type,
- little_endian);
+ little_endian, 1);
}
else
{
@@ -2138,7 +2359,7 @@ unicode_convert (struct coding_stream *s
else
#endif /* ENABLE_COMPOSITE_CHARS */
encode_unicode_char (charset, ch, c, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else
@@ -2151,7 +2372,7 @@ unicode_convert (struct coding_stream *s
if (ch)
{
encode_unicode_char (charset, ch, c, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else
@@ -2521,6 +2742,8 @@ unicode_putprop (Lisp_Object codesys, Li
type = UNICODE_UTF_7;
else if (EQ (value, Qucs_4))
type = UNICODE_UCS_4;
+ else if (EQ (value, Qutf_32))
+ type = UNICODE_UTF_32;
else
invalid_constant ("Invalid Unicode type", key);
@@ -2546,6 +2769,7 @@ unicode_getprop (Lisp_Object coding_syst
case UNICODE_UTF_8: return Qutf_8;
case UNICODE_UTF_7: return Qutf_7;
case UNICODE_UCS_4: return Qucs_4;
+ case UNICODE_UTF_32: return Qutf_32;
default: ABORT ();
}
}
@@ -2620,6 +2844,7 @@ syms_of_unicode (void)
DEFSYMBOL (Qunicode);
DEFSYMBOL (Qucs_4);
DEFSYMBOL (Qutf_16);
+ DEFSYMBOL (Qutf_32);
DEFSYMBOL (Qutf_8);
DEFSYMBOL (Qutf_7);
More information about the XEmacs-CVS
mailing list