Tcl Source Code

Artifact [7fc927918b]
Login

Artifact 7fc927918b62796256cd14cdacd849823b5e0226:

Attachment "utf-8-nullbyte.patch" to ticket [949905ffff] added by rmax 2004-05-08 02:49:17.
Index: generic/tclEncoding.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclEncoding.c,v
retrieving revision 1.21
diff -u -r1.21 tclEncoding.c
--- generic/tclEncoding.c	5 May 2004 18:28:39 -0000	1.21
+++ generic/tclEncoding.c	7 May 2004 19:47:02 -0000
@@ -225,6 +225,16 @@
 			    CONST char *src, int srcLen, int flags,
 			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
 			    int *srcReadPtr, int *dstWrotePtr,
+			    int *dstCharsPtr, int pureNullMode));
+static int		UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
+			    CONST char *src, int srcLen, int flags,
+			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
+			    int *srcReadPtr, int *dstWrotePtr,
+			    int *dstCharsPtr));
+static int		UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
+			    CONST char *src, int srcLen, int flags,
+			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
+			    int *srcReadPtr, int *dstWrotePtr,
 			    int *dstCharsPtr));
 static int		TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
 
@@ -272,8 +282,8 @@
     systemEncoding	= Tcl_GetEncoding(NULL, type.encodingName);
 
     type.encodingName	= "utf-8";
-    type.toUtfProc	= UtfToUtfProc;
-    type.fromUtfProc    = UtfToUtfProc;
+    type.toUtfProc	= UtfExtToUtfIntProc;
+    type.fromUtfProc	= UtfIntToUtfExtProc;
     type.freeProc	= NULL;
     type.nullSize	= 1;
     type.clientData	= NULL;
@@ -1775,6 +1785,105 @@
     return result;
 }
 
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfIntToUtfExtProc --
+ *
+ *	Convert from UTF-8 to UTF-8 while converting null-bytes from
+ *	Tcl's internal representation (0xc0, 0x80) to the official
+ *	representation (0x00). See UtfToUtfProc for details.
+ *
+ * Results:
+ *	Returns the result of UtfToUtfProc; TCL_OK if conversion succeeded.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int 
+UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+	     srcReadPtr, dstWrotePtr, dstCharsPtr)
+    ClientData clientData;	/* Not used; passed on to UtfToUtfProc. */
+    CONST char *src;		/* Source string in UTF-8. */
+    int srcLen;			/* Source string length in bytes. */
+    int flags;			/* Conversion control flags. */
+    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+				 * state information used during a piecewise
+				 * conversion.  Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
+    char *dst;			/* Output buffer in which converted string
+				 * is stored. */
+    int dstLen;			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr;		/* Filled with the number of bytes from the
+				 * source string that were converted.  This
+				 * may be less than the original source length
+				 * if there was a problem converting some
+				 * source characters. */
+    int *dstWrotePtr;		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr;		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+			srcReadPtr, dstWrotePtr, dstCharsPtr, 1); /* pureNullMode = 1 */
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * UtfExtToUtfIntProc --
+ *
+ *	Convert from UTF-8 to UTF-8 while converting null-bytes from
+ *	the official representation (0x00) to Tcl's internal
+ *	representation (0xc0, 0x80). See UtfToUtfProc for details.
+ *
+ * Results:
+ *	Returns the result of UtfToUtfProc; TCL_OK if conversion succeeded.
+ *
+ * Side effects:
+ *	None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int 
+UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+	     srcReadPtr, dstWrotePtr, dstCharsPtr)
+    ClientData clientData;	/* Not used; passed on to UtfToUtfProc. */
+    CONST char *src;		/* Source string in UTF-8. */
+    int srcLen;			/* Source string length in bytes. */
+    int flags;			/* Conversion control flags. */
+    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
+				 * state information used during a piecewise
+				 * conversion.  Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
+    char *dst;			/* Output buffer in which converted string
+				 * is stored. */
+    int dstLen;			/* The maximum length of output buffer in
+				 * bytes. */
+    int *srcReadPtr;		/* Filled with the number of bytes from the
+				 * source string that were converted.  This
+				 * may be less than the original source length
+				 * if there was a problem converting some
+				 * source characters. */
+    int *dstWrotePtr;		/* Filled with the number of bytes that were
+				 * stored in the output buffer as a result of
+				 * the conversion. */
+    int *dstCharsPtr;		/* Filled with the number of characters that
+				 * correspond to the bytes stored in the
+				 * output buffer. */
+{
+    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
+			srcReadPtr, dstWrotePtr, dstCharsPtr, 0); /* pureNullMode = 0 */
+}
+
 /*
  *-------------------------------------------------------------------------
  *
@@ -1795,7 +1904,7 @@
 
 static int 
 UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
-	srcReadPtr, dstWrotePtr, dstCharsPtr)
+	     srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
     ClientData clientData;	/* Not used. */
     CONST char *src;		/* Source string in UTF-8. */
     int srcLen;			/* Source string length in bytes. */
@@ -1820,6 +1929,10 @@
     int *dstCharsPtr;		/* Filled with the number of characters that
 				 * correspond to the bytes stored in the
 				 * output buffer. */
+    int pureNullMode;		/* Convert embedded nulls from
+				 * internal representation to real
+				 * null-bytes (1) or vice versa (0). */
+
 {
     CONST char *srcStart, *srcEnd, *srcClose;
     char *dstStart, *dstEnd;
@@ -1852,8 +1965,21 @@
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	if (UCHAR(*src) < 0x80) {
+	if (UCHAR(*src) < 0x80 &&
+	    !(UCHAR(*src) == 0 && pureNullMode == 0)) {
+	    /*
+	     * Copy 7-bit characters, but skip null-bytes when we are
+	     * in input mode, so that they get converted to 0xc080.
+	     */
 	    *dst++ = *src++;
+	} else if (pureNullMode == 1 &&
+		   UCHAR(*src) == 0xc0 &&
+		   UCHAR(*(src+1)) == 0x80) {
+	    /* 
+	     * Convert 0xc080 to real nulls when we are in output mode.
+	     */
+	    *dst++ = 0;
+	    src += 2;
 	} else {
 	    src += Tcl_UtfToUniChar(src, &ch);
 	    dst += Tcl_UniCharToUtf(ch, dst);
Index: tests/encoding.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/encoding.test,v
retrieving revision 1.19
diff -u -r1.19 encoding.test
--- tests/encoding.test	14 Nov 2003 20:44:46 -0000	1.19
+++ tests/encoding.test	7 May 2004 19:47:02 -0000
@@ -291,6 +291,21 @@
     encoding convertto utf-8 \xa3
 } "\xc2\xa3"
 
+test encoding-15.2 {UtfToUtfProc null character output} {
+    set x \u0000
+    set y [encoding convertto utf-8 \u0000]
+    set y [encoding convertfrom identity $y]
+    binary scan $y H* z
+    list [string bytelength $x] [string bytelength $y] $z
+} {2 1 00}
+
+test encoding-15.3 {UtfToUtfProc null character input} {
+    set x [encoding convertfrom identity \x00]
+    set y [encoding convertfrom utf-8 $x]
+    binary scan [encoding convertto identity $y] H* z
+    list [string bytelength $x] [string bytelength $y] $z
+} {1 2 c080}
+
 test encoding-16.1 {UnicodeToUtfProc} {
     encoding convertfrom unicode NN
 } "\u4e4e"