Attachment "utf-8-nullbyte.patch" to
ticket [949905ffff]
added by
rmax
2004-05-08 02:49:17.
Index: generic/tclEncoding.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclEncoding.c,v
retrieving revision 1.21
diff -u -r1.21 tclEncoding.c
--- generic/tclEncoding.c 5 May 2004 18:28:39 -0000 1.21
+++ generic/tclEncoding.c 7 May 2004 19:47:02 -0000
@@ -225,6 +225,16 @@
CONST char *src, int srcLen, int flags,
Tcl_EncodingState *statePtr, char *dst, int dstLen,
int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr, int pureNullMode));
+static int UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
+ CONST char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
+ int *dstCharsPtr));
+static int UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
+ CONST char *src, int srcLen, int flags,
+ Tcl_EncodingState *statePtr, char *dst, int dstLen,
+ int *srcReadPtr, int *dstWrotePtr,
int *dstCharsPtr));
static int TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
@@ -272,8 +282,8 @@
systemEncoding = Tcl_GetEncoding(NULL, type.encodingName);
type.encodingName = "utf-8";
- type.toUtfProc = UtfToUtfProc;
- type.fromUtfProc = UtfToUtfProc;
+ type.toUtfProc = UtfExtToUtfIntProc;
+ type.fromUtfProc = UtfIntToUtfExtProc;
type.freeProc = NULL;
type.nullSize = 1;
type.clientData = NULL;
@@ -1775,6 +1785,105 @@
return result;
}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfExtToUtfIntProc --
+ *
+ * Convert from UTF-8 to UTF-8. While converting null-bytes from
+ * the Tcl's internal representation (0xc0, 0x80) to the official
+ * representation (0x00). See UtfToUtfProc for details.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr)
+ ClientData clientData; /* Not used. */
+ CONST char *src; /* Source string in UTF-8. */
+ int srcLen; /* Source string length in bytes. */
+ int flags; /* Conversion control flags. */
+ Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+ * state information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst; /* Output buffer in which converted string
+ * is stored. */
+ int dstLen; /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr; /* Filled with the number of bytes from the
+ * source string that were converted. This
+ * may be less than the original source length
+ * if there was a problem converting some
+ * source characters. */
+ int *dstWrotePtr; /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr; /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfExtToUtfIntProc --
+ *
+ * Convert from UTF-8 to UTF-8 while converting null-bytes from
+ * the official representation (0x00) to Tcl's internal
+ * representation (0xc0, 0x80). See UtfToUtfProc for details.
+ *
+ * Results:
+ * Returns TCL_OK if conversion was successful.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr)
+ ClientData clientData; /* Not used. */
+ CONST char *src; /* Source string in UTF-8. */
+ int srcLen; /* Source string length in bytes. */
+ int flags; /* Conversion control flags. */
+ Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+ * state information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
+ char *dst; /* Output buffer in which converted string
+ * is stored. */
+ int dstLen; /* The maximum length of output buffer in
+ * bytes. */
+ int *srcReadPtr; /* Filled with the number of bytes from the
+ * source string that were converted. This
+ * may be less than the original source length
+ * if there was a problem converting some
+ * source characters. */
+ int *dstWrotePtr; /* Filled with the number of bytes that were
+ * stored in the output buffer as a result of
+ * the conversion. */
+ int *dstCharsPtr; /* Filled with the number of characters that
+ * correspond to the bytes stored in the
+ * output buffer. */
+{
+ return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+ srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
+}
+
/*
*-------------------------------------------------------------------------
*
@@ -1795,7 +1904,7 @@
static int
UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
- srcReadPtr, dstWrotePtr, dstCharsPtr)
+ srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
ClientData clientData; /* Not used. */
CONST char *src; /* Source string in UTF-8. */
int srcLen; /* Source string length in bytes. */
@@ -1820,6 +1929,10 @@
int *dstCharsPtr; /* Filled with the number of characters that
* correspond to the bytes stored in the
* output buffer. */
+ int pureNullMode; /* Convert embedded nulls from
+ * internal representation to real
+ * null-bytes or vice versa */
+
{
CONST char *srcStart, *srcEnd, *srcClose;
char *dstStart, *dstEnd;
@@ -1852,8 +1965,21 @@
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80) {
+ if (UCHAR(*src) < 0x80 &&
+ !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+ /*
+ * Copy 7bit chatacters, but skip null-bytes when we are
+ * in input mode, so that they get converted to 0xc080.
+ */
*dst++ = *src++;
+ } else if (pureNullMode == 1 &&
+ UCHAR(*src) == 0xc0 &&
+ UCHAR(*(src+1)) == 0x80) {
+ /*
+ * Convert 0xc080 to real nulls when we are in output mode.
+ */
+ *dst++ = 0;
+ src += 2;
} else {
src += Tcl_UtfToUniChar(src, &ch);
dst += Tcl_UniCharToUtf(ch, dst);
Index: tests/encoding.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/encoding.test,v
retrieving revision 1.19
diff -u -r1.19 encoding.test
--- tests/encoding.test 14 Nov 2003 20:44:46 -0000 1.19
+++ tests/encoding.test 7 May 2004 19:47:02 -0000
@@ -291,6 +291,21 @@
encoding convertto utf-8 \xa3
} "\xc2\xa3"
+test encoding-15.2 {UtfToUtfProc null character output} {
+ set x \u0000
+ set y [encoding convertto utf-8 \u0000]
+ set y [encoding convertfrom identity $y]
+ binary scan $y H* z
+ list [string bytelength $x] [string bytelength $y] $z
+} {2 1 00}
+
+test encoding-15.3 {UtfToUtfProc null character input} {
+ set x [encoding convertfrom identity \x00]
+ set y [encoding convertfrom utf-8 $x]
+ binary scan [encoding convertto identity $y] H* z
+ list [string bytelength $x] [string bytelength $y] $z
+} {1 2 c080}
+
test encoding-16.1 {UnicodeToUtfProc} {
encoding convertfrom unicode NN
} "\u4e4e"