[PATCH] Support non-BMP UTF-16.

Aidan Kehoe kehoea at parhasard.net
Mon Apr 30 09:53:04 EDT 2007




src/ChangeLog addition:

2007-04-30  Aidan Kehoe  <kehoea at parhasard.net>

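	* unicode.c: Add macros for converting between code points and
	UTF-16 surrogate pairs, and for validating surrogates.
	* unicode.c (encode_unicode_char_1):
	* unicode.c (unicode_convert):
	Support non-BMP characters in UTF-16.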


tests/ChangeLog addition:

2007-04-30  Aidan Kehoe  <kehoea at parhasard.net>

	* automated/mule-tests.el (featurep):
	Minimal tests of the non-BMP UTF-16 support. 


XEmacs Trunk source patch:
Diff command:   cvs -q diff -Nu
Files affected: src/unicode.c tests/automated/mule-tests.el

Index: src/unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.36
diff -u -u -r1.36 unicode.c
--- src/unicode.c	2006/12/29 18:09:51	1.36
+++ src/unicode.c	2007/04/30 13:51:00
@@ -200,6 +200,28 @@
 
 Lisp_Object Qutf_8_bom;
 
+/* UTF-16 surrogate-pair arithmetic; see the Unicode FAQ,
+   http://www.unicode.org/faq/utf_bom.html#35 for this algorithm.
+   utf_16_surrogates_to_code combines a lead/trail pair into a code point;
+   CODE_TO_UTF_16_SURROGATES splits a code point >= 0x10000 into such a
+   pair.  (The FAQ's other, really verbose, algorithm is not used.) */
+ 
+#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
+#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
+
+#define utf_16_surrogates_to_code(lead, trail) \
+  (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
+
+#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {	\
+    int __ctu16s_code = (codepoint);				\
+    lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10);		\
+    trail = 0xDC00 + (__ctu16s_code & 0x3FF);			\
+} while (0)
+
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
+
 #ifdef MULE 
 
 /* Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1742,13 +1764,39 @@
     case UNICODE_UTF_16:
       if (little_endian)
 	{
-	  Dynarr_add (dst, (unsigned char) (code & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+	  if (code < 0x10000) {
+	    Dynarr_add (dst, (unsigned char) (code & 255));
+	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+	  } else {
+	    /* Non-BMP: emit lead then trail surrogate, low byte first. */
+	    int first, second;
+
+	    CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+	    Dynarr_add (dst, (unsigned char) (first & 255));
+	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+	    Dynarr_add (dst, (unsigned char) (second & 255));
+	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+	  }
 	}
       else
 	{
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  Dynarr_add (dst, (unsigned char) (code & 255));
+	  if (code < 0x10000) {
+	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+	    Dynarr_add (dst, (unsigned char) (code & 255));
+	  } else {
+	    /* Non-BMP: emit lead then trail surrogate, high byte first. */
+	    int first, second;
+
+	    CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+	    Dynarr_add (dst, (unsigned char) (first & 255));
+
+	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+	    Dynarr_add (dst, (unsigned char) (second & 255));
+	  }
 	}
       break;
 
@@ -1919,17 +1967,40 @@
 	      break;
 
 	    case UNICODE_UTF_16:
+
 	      if (little_endian)
 		ch = (c << counter) | ch;
 	      else
 		ch = (ch << 8) | c;
 	      counter += 8;
+
+	      if (counter == 16 && valid_utf_16_first_surrogate(ch))
+		break;
+
 	      if (counter == 16)
 		{
 		  int tempch = ch;
 		  ch = 0;
 		  counter = 0;
 		  decode_unicode_char (tempch, dst, data, ignore_bom);
+		}
+	      if (counter == 32)
+		{
+		  /* In little-endian input the lead surrogate, read first,
+		     is the low half of ch.  #### Is signalling too extreme? */
+		  int lead = 0xFFFF & (little_endian ? ch : ch >> 16);
+		  int trail = 0xFFFF & (little_endian ? ch >> 16 : ch);
+		  if (!valid_utf_16_first_surrogate(lead)
+		      || !valid_utf_16_last_surrogate(trail))
+		    {
+		      signal_error(Qtext_conversion_error,
+				   "Invalid UTF-16 surrogate sequence",
+				   Qunbound);
+		    }
+		  ch = 0;
+		  counter = 0;
+		  decode_unicode_char(utf_16_surrogates_to_code(lead, trail),
+				      dst, data, ignore_bom);
 		}
 	      break;
 
Index: tests/automated/mule-tests.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/tests/automated/mule-tests.el,v
retrieving revision 1.14
diff -u -u -r1.14 mule-tests.el
--- tests/automated/mule-tests.el	2007/04/29 13:20:00	1.14
+++ tests/automated/mule-tests.el	2007/04/30 13:51:00
@@ -339,9 +339,9 @@
 	      'utf-8
 	    'iso-8859-2))
 	 )
-    ;; This is how you suppress output from `message', called by `write-region'
     (Assert (not (equal name1 name2)))
     (Assert (not (file-exists-p name1)))
+    ;; This is how you suppress output from `message', called by `write-region'
     (Silence-Message
      (write-region (point-min) (point-max) name1))
     (Assert (file-exists-p name1))
@@ -399,6 +399,14 @@
 	(Assert (equal (concat "\033%G" utf-8-char)
 		       (encode-coding-string xemacs-character 'ctext))))))
 
+  (loop
+    for (code-point encoded) 
+    in '((#x10000 "\xd8\x00\xdc\x00")
+         (#x10FFFD "\xdb\xff\xdf\xfd"))
+    do (Assert (equal (encode-coding-string 
+                       (decode-char 'ucs code-point) 'utf-16)
+                      encoded)))
+         
   ;;---------------------------------------------------------------
   ;; Regression test for a couple of CCL-related bugs. 
   ;;---------------------------------------------------------------

-- 
On the quay of the little Black Sea port, where the rescued pair came once
more into contact with civilization, Dobrinton was bitten by a dog which was
assumed to be mad, though it may only have been indiscriminating. (Saki)



More information about the XEmacs-Patches mailing list