Tcl Source Code

Artifact [629f264bd8]
Login

Artifact 629f264bd811519aac67bd0e771bd53c91fc7089:

Attachment "219210.diff.5" to ticket [219210ffff] added by andreas_kupries 2001-04-19 03:23:20.
? generic/foo
? unix/httpd
Index: ChangeLog
===================================================================
RCS file: /cvsroot/tcl/tcl/ChangeLog,v
retrieving revision 1.419
diff -u -r1.419 ChangeLog
--- ChangeLog	2001/04/10 18:32:39	1.419
+++ ChangeLog	2001/04/18 20:21:07
@@ -1,3 +1,19 @@
+2001-04-18  Andreas Kupries <[email protected]>
+
+	* doc/fcopy.n: Updated to reflect the extended behaviour of 'fcopy'.
+
+	* tests/io.test: Added tests 'io-52.9', 'io-52.10' and 'io-52.11'
+	  to test the handling of encodings by 'fcopy' / 'TclCopychannel'
+	  [Bug #209210].
+
+	* generic/tclIO.c: Split of both 'Tcl_ReadChars' and
+	  'Tcl_WriteChars' into a public error checking and an internal
+	  working part. The public functions now use the new internal
+	  ones. The new functions are 'DoReadChars' and 'DoWriteChars'.
+	  Extended 'CopyData' to use the new functions 'DoXChars' when
+	  required by the encodings on the input and output channels
+	  [Bug #209210].
+
 2001-04-10  Kevin B. Kenny    <[email protected]>
 	* unix/tclUnixTime.c: Altered code to use memcpy instead of
 	structure assigments in an effort to achieve better K&R
@@ -7,7 +23,7 @@
 
 	* unix/tclUnixTime.c: Fixed silly typo in calls to 'gmtime' and
 	'localtime' that broke the Linux build.
-	
+
 2001-04-09  Kevin B. Kenny    <[email protected]>
 
 	* unix/tclLoadShl.c: Added DYNAMIC_PATH to the load flags so that
Index: doc/fcopy.n
===================================================================
RCS file: /cvsroot/tcl/tcl/doc/fcopy.n,v
retrieving revision 1.2
diff -u -r1.2 fcopy.n
--- doc/fcopy.n	1998/09/14 18:39:52	1.2
+++ doc/fcopy.n	2001/04/18 20:21:08
@@ -71,6 +71,19 @@
 Only the number of bytes written to \fIoutchan\fR is reported,
 either as the return value of a synchronous \fBfcopy\fP or
 as the argument to the callback for an asynchronous \fBfcopy\fP.
+.PP
+\fBFcopy\fR obeys the encodings configured for the channels. This
+means that the incoming characters are converted internally first
+UTF-8 and then into the encoding of the channel \fBfcopy\fR writes
+to. See the manual entry for \fBfconfigure\fR for details on the
+\fB\-encoding\fR option. No conversion is done if both channels are
+set to encoding "binary". If only the output channel is set to
+encoding "binary" the system will write the internal UTF-8
+representation of the incoming characters. If only the input channel
+is set to encoding "binary" the system will assume that the incoming
+bytes are valid UTF-8 characters and convert them according to the
+output encoding. The behaviour of the system for bytes which are not
+valid UTF-8 characters is undefined in this case.
 
 .SH EXAMPLE
 .PP
Index: generic/tclIO.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclIO.c,v
retrieving revision 1.29
diff -u -r1.29 tclIO.c
--- generic/tclIO.c	2001/03/30 23:06:39	1.29
+++ generic/tclIO.c	2001/04/18 20:21:27
@@ -10,7 +10,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclIO.c,v 1.29 2001/03/30 23:06:39 andreas_kupries Exp $
+ * RCS: @(#) $Id: tclIO.c,v 1.28 2001/01/30 17:32:06 dgp Exp $
  */
 
 #include "tclInt.h"
@@ -112,6 +112,10 @@
 				int slen));
 static int		DoWrite _ANSI_ARGS_((Channel *chanPtr, char *src,
 				int srcLen));
+static int		DoReadChars _ANSI_ARGS_ ((Channel* chan,
+				Tcl_Obj* objPtr, int toRead, int appendFlag));
+static int		DoWriteChars _ANSI_ARGS_ ((Channel* chan,
+				CONST char* src, int len));
 static int		FilterInputBytes _ANSI_ARGS_((Channel *chanPtr,
 				GetsState *statePtr));
 static int		FlushChannel _ANSI_ARGS_((Tcl_Interp *interp,
@@ -2508,7 +2512,10 @@
  *	Puts a sequence of bytes into an output buffer, may queue the
  *	buffer for output if it gets full, and also remembers whether the
  *	current buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
+ *
+ *	No encoding conversions are applied to the bytes being read.
  *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
@@ -2555,8 +2562,11 @@
  *	Puts a sequence of bytes into an output buffer, may queue the
  *	buffer for output if it gets full, and also remembers whether the
  *	current buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Writes directly to the driver of the channel,
+ *	does not compensate for stacking.
  *
+ *	No encoding conversions are applied to the bytes being read.
+ *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
  *	Tcl_GetErrno will return the error code.
@@ -2611,7 +2621,8 @@
  *	using the channel's current encoding, may queue the buffer for
  *	output if it gets full, and also remembers whether the current
  *	buffer is ready e.g. if it contains a newline and we are in
- *	line buffering mode.
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
  *
  * Results:
  *	The number of bytes written or -1 in case of error. If -1,
@@ -2631,18 +2642,55 @@
     int len;			/* Length of string in bytes, or < 0 for 
 				 * strlen(). */
 {
-    /*
-     * Always use the topmost channel of the stack
-     */
-    Channel *chanPtr;
     ChannelState *statePtr;	/* state info for channel */
 
     statePtr = ((Channel *) chan)->state;
-    chanPtr  = statePtr->topChanPtr;
 
     if (CheckChannelErrors(statePtr, TCL_WRITABLE) != 0) {
 	return -1;
     }
+
+    return DoWriteChars ((Channel*) chan, src, len);
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoWriteChars --
+ *
+ *	Takes a sequence of UTF-8 characters and converts them for output
+ *	using the channel's current encoding, may queue the buffer for
+ *	output if it gets full, and also remembers whether the current
+ *	buffer is ready e.g. if it contains a newline and we are in
+ *	line buffering mode. Compensates stacking, i.e. will redirect the
+ *	data from the specified channel to the topmost channel in a stack.
+ *
+ * Results:
+ *	The number of bytes written or -1 in case of error. If -1,
+ *	Tcl_GetErrno will return the error code.
+ *
+ * Side effects:
+ *	May buffer up output and may cause output to be produced on the
+ *	channel.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+DoWriteChars(chanPtr, src, len)
+    Channel* chanPtr;		/* The channel to buffer output for. */
+    CONST char *src;		/* UTF-8 characters to queue in output buffer. */
+    int len;			/* Length of string in bytes, or < 0 for 
+				 * strlen(). */
+{
+    /*
+     * Always use the topmost channel of the stack
+     */
+    ChannelState *statePtr;	/* state info for channel */
+
+    statePtr = chanPtr->state;
+    chanPtr  = statePtr->topChanPtr;
+
     if (len < 0) {
         len = strlen(src);
     }
@@ -4037,12 +4085,8 @@
 				 * of the object. */
 
 {
-    Channel *chanPtr = (Channel *) chan;
-    ChannelState *statePtr = chanPtr->state;	/* state info for channel */
-    ChannelBuffer *bufPtr;
-    int offset, factor, copied, copiedNow, result;
-    Tcl_Encoding encoding;
-#define UTF_EXPANSION_FACTOR	1024
+    Channel*      chanPtr  = (Channel *) chan;
+    ChannelState* statePtr = chanPtr->state;	/* state info for channel */
     
     /*
      * This operation should occur at the top of a channel stack.
@@ -4051,12 +4095,64 @@
     chanPtr = statePtr->topChanPtr;
 
     if (CheckChannelErrors(statePtr, TCL_READABLE) != 0) {
-	copied = -1;
-	goto done;
+        /*
+	 * Update the notifier state so we don't block while there is still
+	 * data in the buffers.
+	 */
+        UpdateInterest(chanPtr);
+	return -1;
     }
 
+    return DoReadChars (chanPtr, objPtr, toRead, appendFlag);
+}
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoReadChars --
+ *
+ *	Reads from the channel until the requested number of characters
+ *	have been seen, EOF is seen, or the channel would block.  EOL
+ *	and EOF translation is done.  If reading binary data, the raw
+ *	bytes are wrapped in a Tcl byte array object.  Otherwise, the raw
+ *	bytes are converted to UTF-8 using the channel's current encoding
+ *	and stored in a Tcl string object.
+ *
+ * Results:
+ *	The number of characters read, or -1 on error. Use Tcl_GetErrno()
+ *	to retrieve the error code for the error that occurred.
+ *
+ * Side effects:
+ *	May cause input to be buffered.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+int
+DoReadChars(chanPtr, objPtr, toRead, appendFlag)
+    Channel* chanPtr;		/* The channel to read. */
+    Tcl_Obj *objPtr;		/* Input data is stored in this object. */
+    int toRead;			/* Maximum number of characters to store,
+				 * or -1 to read all available data (up to EOF
+				 * or when channel blocks). */
+    int appendFlag;		/* If non-zero, data read from the channel
+				 * will be appended to the object.  Otherwise,
+				 * the data will replace the existing contents
+				 * of the object. */
+
+{
+    ChannelState *statePtr = chanPtr->state;	/* state info for channel */
+    ChannelBuffer *bufPtr;
+    int offset, factor, copied, copiedNow, result;
+    Tcl_Encoding encoding;
+#define UTF_EXPANSION_FACTOR	1024
+    
+    /*
+     * This operation should occur at the top of a channel stack.
+     */
+
+    chanPtr  = statePtr->topChanPtr;
     encoding = statePtr->encoding;
-    factor = UTF_EXPANSION_FACTOR;
+    factor   = UTF_EXPANSION_FACTOR;
 
     if (appendFlag == 0) {
 	if (encoding == NULL) {
@@ -7037,7 +7133,12 @@
     int result = TCL_OK;
     int size;
     int total;
+    int sizeb;
+    Tcl_Obj* bufObj = NULL;
+    char* buffer;
 
+    int inBinary, outBinary, sameEncoding; /* Encoding control */
+
     inChan	= (Tcl_Channel) csPtr->readPtr;
     outChan	= (Tcl_Channel) csPtr->writePtr;
     inStatePtr	= csPtr->readPtr->state;
@@ -7053,8 +7154,16 @@
      * thus gets the bottom of the stack.
      */
 
-    while (csPtr->toRead != 0) {
+    inBinary     = (inStatePtr->encoding  == NULL);
+    outBinary    = (outStatePtr->encoding == NULL);
+    sameEncoding = (inStatePtr->encoding  == outStatePtr->encoding);
+
+    if (!(inBinary || sameEncoding)) {
+        bufObj = Tcl_NewObj ();
+	Tcl_IncrRefCount (bufObj);
+    }
 
+    while (csPtr->toRead != 0) {
 	/*
 	 * Check for unreported background errors.
 	 */
@@ -7079,8 +7188,13 @@
 	} else {
 	    size = csPtr->toRead;
 	}
-	size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
 
+	if (inBinary || sameEncoding) {
+	    size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
+	} else {
+	    size = DoReadChars(inStatePtr->topChanPtr, bufObj, size, 0 /* No append */);
+	}
+
 	if (size < 0) {
 	    readError:
 	    errObj = Tcl_NewObj();
@@ -7105,6 +7219,10 @@
 		Tcl_CreateChannelHandler(inChan, TCL_READABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
 
@@ -7112,8 +7230,25 @@
 	 * Now write the buffer out.
 	 */
 
-	size = DoWrite(outStatePtr->topChanPtr, csPtr->buffer, size);
-	if (size < 0) {
+	if (inBinary || sameEncoding) {
+	    buffer = csPtr->buffer;
+	    sizeb = size;
+	} else {
+	    buffer = Tcl_GetStringFromObj (bufObj, &sizeb);
+	}
+
+	if (outBinary || sameEncoding) {
+	    sizeb = DoWrite(outStatePtr->topChanPtr, buffer, sizeb);
+	} else {
+	    sizeb = DoWriteChars(outStatePtr->topChanPtr, buffer, sizeb);
+	}
+
+	if (inBinary || sameEncoding) {
+	    /* Both read and write counted bytes */
+	    size = sizeb;
+	} /* else : Read counted characters, write counted bytes, i.e. size != sizeb */
+
+	if (sizeb < 0) {
 	    writeError:
 	    errObj = Tcl_NewObj();
 	    Tcl_AppendStringsToObj(errObj, "error writing \"",
@@ -7148,6 +7283,10 @@
 		Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
 
@@ -7166,8 +7305,17 @@
 		Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
 			CopyEventProc, (ClientData) csPtr);
 	    }
+	    if (bufObj != (Tcl_Obj*) NULL) {
+	        Tcl_DecrRefCount (bufObj);
+		bufObj = (Tcl_Obj*) NULL;
+	    }
 	    return TCL_OK;
 	}
+    } /* while */
+
+    if (bufObj != (Tcl_Obj*) NULL) {
+        Tcl_DecrRefCount (bufObj);
+	bufObj = (Tcl_Obj*) NULL;
     }
 
     /*
@@ -7217,6 +7365,8 @@
  * DoRead --
  *
  *	Reads a given number of bytes from a channel.
+ *
+ *	No encoding conversions are applied to the bytes being read.
  *
  * Results:
  *	The number of characters read, or -1 on error. Use Tcl_GetErrno()
Index: tests/io.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/io.test,v
retrieving revision 1.16
diff -u -r1.16 io.test
--- tests/io.test	2001/04/04 17:35:25	1.16
+++ tests/io.test	2001/04/18 20:21:35
@@ -6369,6 +6369,84 @@
     list $s0 [file size test1]
 } {40 40}
 
+# Empty files, to register them with the test facility
+makeFile {} kyrillic.txt
+makeFile {} utf8-fcopy.txt
+makeFile {} utf8-rp.txt
+
+# Create kyrillic file
+set out [open kyrillic.txt w]
+fconfigure $out -encoding koi8-r
+puts       $out "\u0410\u0410"
+close      $out
+
+test io-52.9 {TclCopyChannel & encodings} {
+    # Copy kyrillic to UTF-8, using fcopy.
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-fcopy.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding utf-8
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    # Do the same again, but differently (read/puts).
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-rp.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding utf-8
+
+    puts -nonewline $out [read $in]
+
+    close $in
+    close $out
+
+    list \
+	    [file size kyrillic.txt]   \
+	    [file size utf8-fcopy.txt] \
+	    [file size utf8-rp.txt]
+} {3 5 5}
+
+test io-52.10 {TclCopyChannel & encodings} {
+    # encoding to binary (=> implies that the
+    # internal utf-8 is written)
+
+    set in  [open kyrillic.txt r]
+    set out [open utf8-fcopy.txt w]
+
+    fconfigure $in  -encoding koi8-r
+    fconfigure $out -encoding binary
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    file size utf8-fcopy.txt
+} 5
+
+test io-52.11 {TclCopyChannel & encodings} {
+    # binary to encoding => the input has to be
+    # in utf-8 to make sense to the encoder
+
+    set in  [open utf8-fcopy.txt r]
+    set out [open kyrillic.txt w]
+
+    fconfigure $in  -encoding binary
+    fconfigure $out -encoding koi8-r
+
+    fcopy $in $out
+    close $in
+    close $out
+
+    file size kyrillic.txt
+} 3
+
+
 test io-53.1 {CopyData} {
     removeFile test1
     set f1 [open $thisScript]