carbon2-commit: Correct little-endian UTF-16 surrogate handling.

Aidan Kehoe aidan-guest at alioth.debian.org
Sun Feb 1 10:32:23 EST 2009


changeset:   4622:2669b1b7e33b70dad0f42a94629ab3afeffa9b55
user:        Aidan Kehoe <kehoea at parhasard.net>
date:        Sat Jan 31 13:06:37 2009 +0000
files:       src/ChangeLog src/unicode.c tests/ChangeLog tests/automated/mule-tests.el
description:
Correct little-endian UTF-16 surrogate handling.

src/ChangeLog addition:

2009-01-31  Aidan Kehoe  <kehoea at parhasard.net>

	* unicode.c (unicode_convert):
	Correct little-endian UTF-16 surrogate handling.

tests/ChangeLog addition:

2009-01-31  Aidan Kehoe  <kehoea at parhasard.net>

	* automated/mule-tests.el:
	Test little-endian Unicode surrogates too.


diff -r 00ed9903a988de9a983c074bf0abdc667639eeaf -r 2669b1b7e33b70dad0f42a94629ab3afeffa9b55 src/ChangeLog
--- a/src/ChangeLog	Sun Jan 18 12:56:51 2009 +0000
+++ b/src/ChangeLog	Sat Jan 31 13:06:37 2009 +0000
@@ -1,3 +1,8 @@ 2009-01-16  Aidan Kehoe  <kehoea at parhasa
+2009-01-31  Aidan Kehoe  <kehoea at parhasard.net>
+
+	* unicode.c (unicode_convert): 
+	Correct little-endian UTF-16 surrogate handling. 
+
 2009-01-16  Aidan Kehoe  <kehoea at parhasard.net>
 
 	* chartab.c (print_table_entry): 
diff -r 00ed9903a988de9a983c074bf0abdc667639eeaf -r 2669b1b7e33b70dad0f42a94629ab3afeffa9b55 src/unicode.c
--- a/src/unicode.c	Sun Jan 18 12:56:51 2009 +0000
+++ b/src/unicode.c	Sat Jan 31 13:06:37 2009 +0000
@@ -2115,23 +2115,47 @@ unicode_convert (struct coding_stream *s
 		{
 		  int tempch;
 
-		  if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
-		    {
-                      DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
-                                        ignore_bom);
-                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
-                                        ignore_bom);
-                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
-                                        ignore_bom);
-                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
-                                        ignore_bom);
-		    }
-                  else 
+                  if (little_endian)
                     {
-                      tempch = utf_16_surrogates_to_code((ch >> 16), 
-                                                         (ch & 0xffff));
-                      decode_unicode_char(tempch, dst, data, ignore_bom);
+                      if (!valid_utf_16_last_surrogate(ch >> 16))
+                        {
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                              ignore_bom);
+                        }
+                      else
+                        {
+                          tempch = utf_16_surrogates_to_code((ch & 0xffff),
+                                                             (ch >> 16));
+                          decode_unicode_char(tempch, dst, data, ignore_bom); 
+                        }
                     }
+                  else
+                    {
+                      if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
+                        {
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                              ignore_bom);
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                              ignore_bom);
+                        }
+                      else 
+                        {
+                          tempch = utf_16_surrogates_to_code((ch >> 16), 
+                                                             (ch & 0xffff));
+                          decode_unicode_char(tempch, dst, data, ignore_bom); 
+                        }
+                    }
+
 		  ch = 0;
 		  counter = 0;
                 }
diff -r 00ed9903a988de9a983c074bf0abdc667639eeaf -r 2669b1b7e33b70dad0f42a94629ab3afeffa9b55 tests/ChangeLog
--- a/tests/ChangeLog	Sun Jan 18 12:56:51 2009 +0000
+++ b/tests/ChangeLog	Sat Jan 31 13:06:37 2009 +0000
@@ -1,3 +1,8 @@ 2009-01-18  Aidan Kehoe  <kehoea at parhasa
+2009-01-31  Aidan Kehoe  <kehoea at parhasard.net>
+
+	* automated/mule-tests.el: 
+	Test little-endian Unicode surrogates too. 
+
 2009-01-18  Aidan Kehoe  <kehoea at parhasard.net>
 	
 	* automated/lisp-tests.el: (char-table-with-string): 
diff -r 00ed9903a988de9a983c074bf0abdc667639eeaf -r 2669b1b7e33b70dad0f42a94629ab3afeffa9b55 tests/automated/mule-tests.el
--- a/tests/automated/mule-tests.el	Sun Jan 18 12:56:51 2009 +0000
+++ b/tests/automated/mule-tests.el	Sat Jan 31 13:06:37 2009 +0000
@@ -446,12 +446,17 @@ This is a naive implementation in Lisp. 
 		       (encode-coding-string xemacs-character 'ctext))))))
 
   (loop
-    for (code-point encoded) 
-    in '((#x10000 "\xd8\x00\xdc\x00")
-         (#x10FFFD "\xdb\xff\xdf\xfd"))
-    do (Assert (equal (encode-coding-string 
-                       (decode-char 'ucs code-point) 'utf-16)
-                      encoded)))
+    for (code-point utf-16-big-endian utf-16-little-endian) 
+    in '((#x10000 "\xd8\x00\xdc\x00" "\x00\xd8\x00\xdc")
+         (#x10FFFD "\xdb\xff\xdf\xfd" "\xff\xdb\xfd\xdf"))
+    do
+    (Assert (equal (encode-coding-string 
+                    (decode-char 'ucs code-point) 'utf-16)
+                   utf-16-big-endian))
+    (Assert (equal (encode-coding-string 
+                    (decode-char 'ucs code-point) 'utf-16-le)
+                   utf-16-little-endian)))
+
          
   ;;---------------------------------------------------------------
   ;; Regression test for a couple of CCL-related bugs. 





More information about the XEmacs-Patches mailing list