CVS update by aidan xemacs/src ...
xemacs-cvs at xemacs.org
xemacs-cvs at xemacs.org
Wed Nov 14 14:41:13 EST 2007
User: aidan
Date: 07/11/14 20:41:13
Modified: xemacs/lisp ChangeLog unicode.el
          xemacs/src ChangeLog lread.c unicode.c
Log:
Correct the dumped information for the Unicode JIT infrastructure.
Revision Changes Path
1.859 +9 -0 XEmacs/xemacs/lisp/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/ChangeLog,v
retrieving revision 1.858
retrieving revision 1.859
diff -u -p -r1.858 -r1.859
--- ChangeLog 2007/11/14 19:25:39 1.858
+++ ChangeLog 2007/11/14 19:41:04 1.859
@@ -1,5 +1,14 @@
2007-11-14 Aidan Kehoe <kehoea at parhasard.net>
+ * unicode.el (unicode-error-default-translation-table):
+ * unicode.el (unicode-error-sequence-regexp-range):
+ * unicode.el (frob-unicode-errors-region):
+ Make these variables and the single function available to
+ make-docfile, by moving them to the start of the line. This
+ conflicts with normal indentation of Lisp, unfortunately.
+
+2007-11-14 Aidan Kehoe <kehoea at parhasard.net>
+
* subr.el (string-to-sequence):
* subr.el (string-to-list):
* subr.el (string-to-vector):
1.27 +35 -31 XEmacs/xemacs/lisp/unicode.el
Index: unicode.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/unicode.el,v
retrieving revision 1.26
retrieving revision 1.27
diff -u -p -r1.26 -r1.27
--- unicode.el 2007/10/13 14:08:30 1.26
+++ unicode.el 2007/11/14 19:41:05 1.27
@@ -494,36 +494,40 @@ The second argument must be 'ucs, the th
(char-syntax ascii-or-latin-1))
syntax-table))
- ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
- ;; characters starting at U+200000 (which isn't a valid Unicode code
- ;; point). Make them available to user code.
- (defvar unicode-error-default-translation-table
- (loop
- with char-table = (make-char-table 'char)
- for i from ?\x00 to ?\xFF
- do
- (put-char-table (aref
- ;; #xd800 is the first leading surrogate;
- ;; trailing surrogates must be in the range
- ;; #xdc00-#xdfff. These examples are not, so we
- ;; intentionally provoke an error sequence.
- (decode-coding-string (format "\xd8\x00\x00%c" i)
- 'utf-16-be)
- 3)
- i
- char-table)
- finally return char-table)
- "Translation table mapping Unicode error sequences to Latin-1 chars.
+;; *Sigh*, declarations need to be at the start of the line to be picked up
+;; by make-docfile. Not so much an issue with ccl-encode-to-ucs-2, which we
+;; don't necessarily want to advertise, but the following are important.
+
+;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
+;; characters starting at U+200000 (which isn't a valid Unicode code
+;; point). Make them available to user code.
+(defvar unicode-error-default-translation-table
+ (loop
+ with char-table = (make-char-table 'char)
+ for i from ?\x00 to ?\xFF
+ do
+ (put-char-table (aref
+ ;; #xd800 is the first leading surrogate;
+ ;; trailing surrogates must be in the range
+ ;; #xdc00-#xdfff. These examples are not, so we
+ ;; intentionally provoke an error sequence.
+ (decode-coding-string (format "\xd8\x00\x00%c" i)
+ 'utf-16-be)
+ 3)
+ i
+ char-table)
+ finally return char-table)
+ "Translation table mapping Unicode error sequences to Latin-1 chars.
To transform XEmacs Unicode error sequences to the Latin-1 characters that
correspond to the octets on disk, you can use this variable. ")
- (defvar unicode-error-sequence-regexp-range
- (format "%c%c-%c"
- (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
- (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
- (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
- "Regular expression range to match Unicode error sequences in XEmacs.
+(defvar unicode-error-sequence-regexp-range
+ (format "%c%c-%c"
+ (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
+ (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
+ (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
+ "Regular expression range to match Unicode error sequences in XEmacs.
Invalid Unicode sequences on input are represented as XEmacs
characters with values stored as the keys in
@@ -559,14 +563,14 @@ invalid octet. You can use this variabl
nil
(format "Could not find char ?\\x%x in buffer" i))))
- (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
- "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
+(defun frob-unicode-errors-region (frob-function begin end &optional buffer)
+ "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
Optional argument BUFFER specifies the buffer that should be examined for
such sequences. "
- (check-argument-type #'functionp frob-function)
- (check-argument-range begin (point-min buffer) (point-max buffer))
- (check-argument-range end (point-min buffer) (point-max buffer))
+ (check-argument-type #'functionp frob-function)
+ (check-argument-range begin (point-min buffer) (point-max buffer))
+ (check-argument-range end (point-min buffer) (point-max buffer))
(save-excursion
(save-restriction
(if buffer (set-buffer buffer))
1.1106 +12 -0 XEmacs/xemacs/src/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/ChangeLog,v
retrieving revision 1.1105
retrieving revision 1.1106
diff -u -p -r1.1105 -r1.1106
--- ChangeLog 2007/11/14 18:51:20 1.1105
+++ ChangeLog 2007/11/14 19:41:08 1.1106
@@ -1,5 +1,17 @@
2007-11-14 Aidan Kehoe <kehoea at parhasard.net>
+ * lread.c (read_unicode_escape):
+ Correct the range check for Unicode characters specified with
+ source-level escapes.
+ * unicode.c:
+ * unicode.c (unicode_to_ichar):
+ * unicode.c (coding_system_type_create_unicode):
+ Correct the dump behaviour for just-in-time Unicode code
+ points. Update the docstring for #'unicode-to-char to indicate
+ that code points will run out after around 400,000 in a session.
+
+2007-11-14 Aidan Kehoe <kehoea at parhasard.net>
+
* editfns.c (vars_of_editfns):
Correct the docstring of user-full-name.
* fileio.c:
1.83 +1 -1 XEmacs/xemacs/src/lread.c
Index: lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.82
retrieving revision 1.83
diff -u -p -r1.82 -r1.83
--- lread.c 2007/08/04 20:00:24 1.82
+++ lread.c 2007/11/14 19:41:09 1.83
@@ -1694,7 +1694,7 @@ read_unicode_escape (Lisp_Object readcha
}
}
- if (i > 0x110000 || i < 0)
+ if (i >= 0x110000 || i < 0)
{
syntax_error ("Not a Unicode code point", make_int(i));
}
1.39 +34 -26 XEmacs/xemacs/src/unicode.c
Index: unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.38
retrieving revision 1.39
diff -u -p -r1.38 -r1.39
--- unicode.c 2007/08/04 20:00:24 1.38
+++ unicode.c 2007/11/14 19:41:09 1.39
@@ -336,6 +336,11 @@ Lisp_Object Vcurrent_jit_charset;
Lisp_Object Qlast_allocated_character;
Lisp_Object Qccl_encode_to_ucs_2;
+Lisp_Object Vnumber_of_jit_charsets;
+Lisp_Object Vlast_jit_charset_final;
+Lisp_Object Vcharset_descr;
+
+
/************************************************************************/
/* Unicode implementation */
@@ -1080,8 +1085,6 @@ unicode_to_ichar (int code, Lisp_Object_
int code_levels;
int i;
int n = Dynarr_length (charsets);
- static int number_of_jit_charsets;
- static Ascbyte last_jit_charset_final;
type_checking_assert (code >= 0);
/* This shortcut depends on the representation of an Ichar, see text.c.
@@ -1124,33 +1127,21 @@ unicode_to_ichar (int code, Lisp_Object_
(-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
{
Ibyte setname[32];
- Lisp_Object charset_descr = build_string
- ("Mule charset for otherwise unknown Unicode code points.");
-
- struct gcpro gcpro1;
+ int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets);
+ Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final);
- if ('\0' == last_jit_charset_final)
- {
- /* This final byte shit is, umm, not that cool. */
- last_jit_charset_final = 0x30;
- }
+ /* This final byte shit is, umm, not that cool. */
+ assert (last_jit_charset_final >= 0x30);
/* Assertion added partly because our Win32 layer doesn't
support snprintf; with this, we're sure it won't overflow
the buffer. */
assert(100 > number_of_jit_charsets);
-
- qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++);
- /* Aside: GCPROing here would be overkill according to the FSF's
- philosophy. make-charset cannot currently GC, but is intended
- to be called from Lisp, with its arguments protected by the
- Lisp reader. We GCPRO in case it GCs in the future and no-one
- checks all the C callers. */
+ qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets);
- GCPRO1 (charset_descr);
Vcurrent_jit_charset = Fmake_charset
- (intern((const CIbyte *)setname), charset_descr,
+ (intern((const CIbyte *)setname), Vcharset_descr,
/* Set encode-as-utf-8 to t, to have this character set written
using UTF-8 escapes in escape-quoted and ctext. This
sidesteps the fact that our internal character -> Unicode
@@ -1159,12 +1150,16 @@ unicode_to_ichar (int code, Lisp_Object_
nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
Qdimension, make_int(2)),
list6(Qregistries, Qunicode_registries,
- Qfinal, make_char(last_jit_charset_final++),
+ Qfinal, make_char(last_jit_charset_final),
/* This CCL program is initialised in
unicode.el. */
Qccl_program, Qccl_encode_to_ucs_2))));
- UNGCPRO;
+ /* Record for the Unicode infrastructure that we've created
+ this character set. */
+ Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1);
+ Vlast_jit_charset_final = make_char (last_jit_charset_final + 1);
+
i = get_free_codepoint(Vcurrent_jit_charset);
}
@@ -1421,10 +1416,15 @@ argument.
If the CODE would not otherwise be converted to an XEmacs character, and the
list of character sets to be consulted is nil or the default, a new XEmacs
character will be created for it in one of the `jit-ucs-charset' Mule
-character sets, and that character will be returned. There is scope for
-tens of thousands of separate Unicode code points in every session using
-this technique, so despite XEmacs' internal encoding not being based on
-Unicode, your data won't be trashed.
+character sets, and that character will be returned.
+
+This is limited to around 400,000 characters per XEmacs session, though, so
+while normal usage will not be problematic, things like:
+
+\(dotimes (i #x110000) (decode-char 'ucs i))
+
+will eventually error. The long-term solution to this is Unicode as an
+internal encoding.
*/
(code, USED_IF_MULE (charsets)))
{
@@ -2862,6 +2862,14 @@ syms_of_unicode (void)
void
coding_system_type_create_unicode (void)
{
+ staticpro (&Vnumber_of_jit_charsets);
+ Vnumber_of_jit_charsets = make_int (0);
+ staticpro (&Vlast_jit_charset_final);
+ Vlast_jit_charset_final = make_char (0x30);
+ staticpro (&Vcharset_descr);
+ Vcharset_descr
+ = build_string ("Mule charset for otherwise unknown Unicode code points.");
+
INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
CODING_SYSTEM_HAS_METHOD (unicode, print);
CODING_SYSTEM_HAS_METHOD (unicode, convert);
More information about the XEmacs-CVS
mailing list