Attachment "219210.diff.3" to
ticket [219210ffff]
added by
andreas_kupries
2001-04-06 22:20:34.
? generic/foo
? unix/httpd
Index: ChangeLog
===================================================================
RCS file: /cvsroot/tcl/tcl/ChangeLog,v
retrieving revision 1.409
diff -u -r1.409 ChangeLog
--- ChangeLog 2001/04/06 14:27:51 1.409
+++ ChangeLog 2001/04/06 15:09:13
@@ -1,3 +1,19 @@
+2001-04-05 Andreas Kupries <[email protected]>
+
+ * doc/fcopy.n: Updated to reflect the extended behaviour of 'fcopy'.
+
+ * tests/io.test: Added tests 'io-52.9', 'io-52.10' and 'io-52.11'
+ to test the handling of encodings by 'fcopy' / 'TclCopychannel'
+ [Bug #209210].
+
+ * generic/tclIO.c: Split of both 'Tcl_ReadChars' and
+ 'Tcl_WriteChars' into a public error checking and an internal
+ working part. The public functions now use the new internal
+ ones. The new functions are 'DoReadChars' and 'DoWriteChars'.
+ Extended 'CopyData' to use the new functions 'DoXChars' when
+ required by the encodings on the input and output channels
+ [Bug #209210].
+
2001-04-06 Donal K. Fellows <[email protected]>
* doc/read.n: Added section on "USE WITH SERIAL PORTS" to resolve
Index: doc/fcopy.n
===================================================================
RCS file: /cvsroot/tcl/tcl/doc/fcopy.n,v
retrieving revision 1.2
diff -u -r1.2 fcopy.n
--- doc/fcopy.n 1998/09/14 18:39:52 1.2
+++ doc/fcopy.n 2001/04/06 15:09:14
@@ -71,6 +71,19 @@
Only the number of bytes written to \fIoutchan\fR is reported,
either as the return value of a synchronous \fBfcopy\fP or
as the argument to the callback for an asynchronous \fBfcopy\fP.
+.PP
+\fBFcopy\fR obeys the encodings configured for the channels. This
+means that the incoming characters are converted internally first
+UTF-8 and then into the encoding of the channel \fBfcopy\fR writes
+to. See the manual entry for \fBfconfigure\fR for details on the
+\fB\-encoding\fR option. No conversion is done if both channels are
+set to encoding "binary". If only the output channel is set to
+encoding "binary" the system will write the internal UTF-8
+representation of the incoming characters. If only the input channel
+is set to encoding "binary" the system will assume that the incoming
+bytes are valid UTF-8 characters and convert them according to the
+output encoding. The behaviour of the system for bytes which are not
+valid UTF-8 characters is undefined in this case.
.SH EXAMPLE
.PP
Index: generic/tclIO.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclIO.c,v
retrieving revision 1.29
diff -u -r1.29 tclIO.c
--- generic/tclIO.c 2001/03/30 23:06:39 1.29
+++ generic/tclIO.c 2001/04/06 15:09:32
@@ -10,7 +10,7 @@
* See the file "license.terms" for information on usage and redistribution
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
*
- * RCS: @(#) $Id: tclIO.c,v 1.29 2001/03/30 23:06:39 andreas_kupries Exp $
+ * RCS: @(#) $Id: tclIO.c,v 1.28 2001/01/30 17:32:06 dgp Exp $
*/
#include "tclInt.h"
@@ -112,6 +112,10 @@
int slen));
static int DoWrite _ANSI_ARGS_((Channel *chanPtr, char *src,
int srcLen));
+static int DoReadChars _ANSI_ARGS_ ((Channel* chan,
+ Tcl_Obj* objPtr, int toRead, int appendFlag));
+static int DoWriteChars _ANSI_ARGS_ ((Channel* chan,
+ CONST char* src, int len));
static int FilterInputBytes _ANSI_ARGS_((Channel *chanPtr,
GetsState *statePtr));
static int FlushChannel _ANSI_ARGS_((Tcl_Interp *interp,
@@ -2508,7 +2512,10 @@
* Puts a sequence of bytes into an output buffer, may queue the
* buffer for output if it gets full, and also remembers whether the
* current buffer is ready e.g. if it contains a newline and we are in
- * line buffering mode.
+ * line buffering mode. Compensates stacking, i.e. will redirect the
+ * data from the specified channel to the topmost channel in a stack.
+ *
+ * No encoding conversions are applied to the bytes being read.
*
* Results:
* The number of bytes written or -1 in case of error. If -1,
@@ -2555,8 +2562,11 @@
* Puts a sequence of bytes into an output buffer, may queue the
* buffer for output if it gets full, and also remembers whether the
* current buffer is ready e.g. if it contains a newline and we are in
- * line buffering mode.
+ * line buffering mode. Writes directly to the driver of the channel,
+ * does not compensate for stacking.
*
+ * No encoding conversions are applied to the bytes being read.
+ *
* Results:
* The number of bytes written or -1 in case of error. If -1,
* Tcl_GetErrno will return the error code.
@@ -2611,7 +2621,8 @@
* using the channel's current encoding, may queue the buffer for
* output if it gets full, and also remembers whether the current
* buffer is ready e.g. if it contains a newline and we are in
- * line buffering mode.
+ * line buffering mode. Compensates stacking, i.e. will redirect the
+ * data from the specified channel to the topmost channel in a stack.
*
* Results:
* The number of bytes written or -1 in case of error. If -1,
@@ -2631,18 +2642,55 @@
int len; /* Length of string in bytes, or < 0 for
* strlen(). */
{
- /*
- * Always use the topmost channel of the stack
- */
- Channel *chanPtr;
ChannelState *statePtr; /* state info for channel */
statePtr = ((Channel *) chan)->state;
- chanPtr = statePtr->topChanPtr;
if (CheckChannelErrors(statePtr, TCL_WRITABLE) != 0) {
return -1;
}
+
+ return DoWriteChars ((Channel*) chan, src, len);
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoWriteChars --
+ *
+ * Takes a sequence of UTF-8 characters and converts them for output
+ * using the channel's current encoding, may queue the buffer for
+ * output if it gets full, and also remembers whether the current
+ * buffer is ready e.g. if it contains a newline and we are in
+ * line buffering mode. Compensates stacking, i.e. will redirect the
+ * data from the specified channel to the topmost channel in a stack.
+ *
+ * Results:
+ * The number of bytes written or -1 in case of error. If -1,
+ * Tcl_GetErrno will return the error code.
+ *
+ * Side effects:
+ * May buffer up output and may cause output to be produced on the
+ * channel.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+DoWriteChars(chanPtr, src, len)
+ Channel* chanPtr; /* The channel to buffer output for. */
+ CONST char *src; /* UTF-8 characters to queue in output buffer. */
+ int len; /* Length of string in bytes, or < 0 for
+ * strlen(). */
+{
+ /*
+ * Always use the topmost channel of the stack
+ */
+ ChannelState *statePtr; /* state info for channel */
+
+ statePtr = chanPtr->state;
+ chanPtr = statePtr->topChanPtr;
+
if (len < 0) {
len = strlen(src);
}
@@ -4037,12 +4085,8 @@
* of the object. */
{
- Channel *chanPtr = (Channel *) chan;
- ChannelState *statePtr = chanPtr->state; /* state info for channel */
- ChannelBuffer *bufPtr;
- int offset, factor, copied, copiedNow, result;
- Tcl_Encoding encoding;
-#define UTF_EXPANSION_FACTOR 1024
+ Channel* chanPtr = (Channel *) chan;
+ ChannelState* statePtr = chanPtr->state; /* state info for channel */
/*
* This operation should occur at the top of a channel stack.
@@ -4051,12 +4095,64 @@
chanPtr = statePtr->topChanPtr;
if (CheckChannelErrors(statePtr, TCL_READABLE) != 0) {
- copied = -1;
- goto done;
+ /*
+ * Update the notifier state so we don't block while there is still
+ * data in the buffers.
+ */
+ UpdateInterest(chanPtr);
+ return -1;
}
+ return DoReadChars (chanPtr, objPtr, toRead, appendFlag);
+}
+/*
+ *---------------------------------------------------------------------------
+ *
+ * DoReadChars --
+ *
+ * Reads from the channel until the requested number of characters
+ * have been seen, EOF is seen, or the channel would block. EOL
+ * and EOF translation is done. If reading binary data, the raw
+ * bytes are wrapped in a Tcl byte array object. Otherwise, the raw
+ * bytes are converted to UTF-8 using the channel's current encoding
+ * and stored in a Tcl string object.
+ *
+ * Results:
+ * The number of characters read, or -1 on error. Use Tcl_GetErrno()
+ * to retrieve the error code for the error that occurred.
+ *
+ * Side effects:
+ * May cause input to be buffered.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+DoReadChars(chanPtr, objPtr, toRead, appendFlag)
+ Channel* chanPtr; /* The channel to read. */
+ Tcl_Obj *objPtr; /* Input data is stored in this object. */
+ int toRead; /* Maximum number of characters to store,
+ * or -1 to read all available data (up to EOF
+ * or when channel blocks). */
+ int appendFlag; /* If non-zero, data read from the channel
+ * will be appended to the object. Otherwise,
+ * the data will replace the existing contents
+ * of the object. */
+
+{
+ ChannelState *statePtr = chanPtr->state; /* state info for channel */
+ ChannelBuffer *bufPtr;
+ int offset, factor, copied, copiedNow, result;
+ Tcl_Encoding encoding;
+#define UTF_EXPANSION_FACTOR 1024
+
+ /*
+ * This operation should occur at the top of a channel stack.
+ */
+
+ chanPtr = statePtr->topChanPtr;
encoding = statePtr->encoding;
- factor = UTF_EXPANSION_FACTOR;
+ factor = UTF_EXPANSION_FACTOR;
if (appendFlag == 0) {
if (encoding == NULL) {
@@ -7037,7 +7133,12 @@
int result = TCL_OK;
int size;
int total;
+ int sizeb;
+ Tcl_Obj* bufObj = NULL;
+ char* buffer;
+ int inBinary, outBinary, sameEncoding; /* Encoding control */
+
inChan = (Tcl_Channel) csPtr->readPtr;
outChan = (Tcl_Channel) csPtr->writePtr;
inStatePtr = csPtr->readPtr->state;
@@ -7053,6 +7154,15 @@
* thus gets the bottom of the stack.
*/
+ inBinary = (inStatePtr->encoding == NULL);
+ outBinary = (outStatePtr->encoding == NULL);
+ sameEncoding = (inStatePtr->encoding == outStatePtr->encoding);
+
+ if (!(inBinary || sameEncoding)) {
+ bufObj = Tcl_NewObj ();
+ Tcl_IncrRefCount (bufObj);
+ }
+
while (csPtr->toRead != 0) {
/*
@@ -7078,8 +7188,13 @@
size = csPtr->bufSize;
} else {
size = csPtr->toRead;
+ }
+
+ if (inBinary || sameEncoding) {
+ size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
+ } else {
+ size = DoReadChars(inStatePtr->topChanPtr, bufObj, size, 0 /* No append */);
}
- size = DoRead(inStatePtr->topChanPtr, csPtr->buffer, size);
if (size < 0) {
readError:
@@ -7105,6 +7220,10 @@
Tcl_CreateChannelHandler(inChan, TCL_READABLE,
CopyEventProc, (ClientData) csPtr);
}
+ if (bufObj != (Tcl_Obj*) NULL) {
+ Tcl_DecrRefCount (bufObj);
+ bufObj = (Tcl_Obj*) NULL;
+ }
return TCL_OK;
}
@@ -7112,8 +7231,25 @@
* Now write the buffer out.
*/
- size = DoWrite(outStatePtr->topChanPtr, csPtr->buffer, size);
- if (size < 0) {
+ if (inBinary || sameEncoding) {
+ buffer = csPtr->buffer;
+ sizeb = size;
+ } else {
+ buffer = Tcl_GetStringFromObj (bufObj, &sizeb);
+ }
+
+ if (outBinary) {
+ sizeb = DoWrite(outStatePtr->topChanPtr, buffer, sizeb);
+ } else {
+ sizeb = DoWriteChars(outStatePtr->topChanPtr, buffer, sizeb);
+ }
+
+ if (inBinary || sameEncoding) {
+ /* Both read and write counted bytes */
+ size = sizeb;
+ } /* else : Read counted characters, write counted bytes, i.e. size != sizeb */
+
+ if (sizeb < 0) {
writeError:
errObj = Tcl_NewObj();
Tcl_AppendStringsToObj(errObj, "error writing \"",
@@ -7148,6 +7284,10 @@
Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
CopyEventProc, (ClientData) csPtr);
}
+ if (bufObj != (Tcl_Obj*) NULL) {
+ Tcl_DecrRefCount (bufObj);
+ bufObj = (Tcl_Obj*) NULL;
+ }
return TCL_OK;
}
@@ -7166,8 +7306,17 @@
Tcl_CreateChannelHandler(outChan, TCL_WRITABLE,
CopyEventProc, (ClientData) csPtr);
}
+ if (bufObj != (Tcl_Obj*) NULL) {
+ Tcl_DecrRefCount (bufObj);
+ bufObj = (Tcl_Obj*) NULL;
+ }
return TCL_OK;
}
+ } /* while */
+
+ if (bufObj != (Tcl_Obj*) NULL) {
+ Tcl_DecrRefCount (bufObj);
+ bufObj = (Tcl_Obj*) NULL;
}
/*
@@ -7217,6 +7366,8 @@
* DoRead --
*
* Reads a given number of bytes from a channel.
+ *
+ * No encoding conversions are applied to the bytes being read.
*
* Results:
* The number of characters read, or -1 on error. Use Tcl_GetErrno()
Index: tests/io.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/io.test,v
retrieving revision 1.16
diff -u -r1.16 io.test
--- tests/io.test 2001/04/04 17:35:25 1.16
+++ tests/io.test 2001/04/06 15:09:41
@@ -6369,6 +6369,84 @@
list $s0 [file size test1]
} {40 40}
+# Empty files, to register them with the test facility
+makeFile {} kyrillic.txt
+makeFile {} utf8-fcopy.txt
+makeFile {} utf8-rp.txt
+
+# Create kyrillic file
+set out [open kyrillic.txt w]
+fconfigure $out -encoding koi8-r
+puts $out "\u0410\u0410"
+close $out
+
+test io-52.9 {TclCopyChannel & encodings} {
+ # Copy kyrillic to UTF-8, using fcopy.
+
+ set in [open kyrillic.txt r]
+ set out [open utf8-fcopy.txt w]
+
+ fconfigure $in -encoding koi8-r
+ fconfigure $out -encoding utf-8
+
+ fcopy $in $out
+ close $in
+ close $out
+
+ # Do the same again, but differently (read/puts).
+
+ set in [open kyrillic.txt r]
+ set out [open utf8-rp.txt w]
+
+ fconfigure $in -encoding koi8-r
+ fconfigure $out -encoding utf-8
+
+ puts -nonewline $out [read $in]
+
+ close $in
+ close $out
+
+ list \
+ [file size kyrillic.txt] \
+ [file size utf8-fcopy.txt] \
+ [file size utf8-rp.txt]
+} {3 5 5}
+
+test io-52.10 {TclCopyChannel & encodings} {
+ # encoding to binary (=> implies that the
+ # internal utf-8 is written)
+
+ set in [open kyrillic.txt r]
+ set out [open utf8-fcopy.txt w]
+
+ fconfigure $in -encoding koi8-r
+ fconfigure $out -encoding binary
+
+ fcopy $in $out
+ close $in
+ close $out
+
+ file size utf8-fcopy.txt
+} 5
+
+test io-52.11 {TclCopyChannel & encodings} {
+ # binary to encoding => the input has to be
+ # in utf-8 to make sense to the encoder
+
+ set in [open utf8-fcopy.txt r]
+ set out [open kyrillic.txt w]
+
+ fconfigure $in -encoding binary
+ fconfigure $out -encoding koi8-r
+
+ fcopy $in $out
+ close $in
+ close $out
+
+ file size kyrillic.txt
+} 3
+
+
test io-53.1 {CopyData} {
removeFile test1
set f1 [open $thisScript]