Tcl Source Code

Check-in [110d7c7820]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:experiment: Handle BOM by switching to utf-8 when the system encoding is cp1250-cp1258 or identity (and the encoding is not otherwise specified)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | bug-3466099
Files: files | file ages | folders
SHA1: 110d7c7820197d5679ecb19e2c5d08b53865c02f
User & Date: jan.nijtmans 2012-03-01 22:48:48
Context
2012-03-06
14:18
another solution with auto recognition (without fixed cpBomTable) Closed-Leaf check-in: 8da0451f94 user: sebres tags: bug-3466099
2012-03-01
22:48
experiment: Handle BOM by switching to utf-8 when the system encoding is cp1250-cp1258 or identity (... check-in: 110d7c7820 user: jan.nijtmans tags: bug-3466099
2012-02-29
22:38
oops, that's no utf-8 BOM ;-( check-in: d2c359a194 user: jan.nijtmans tags: core-8-5-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclIOUtil.c.

1727
1728
1729
1730
1731
1732
1733















1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746


1747
1748
1749
1750
1751
1752
1753
 * Side effects:
 *	Depends on the commands in the file. During the evaluation of the
 *	contents of the file, iPtr->scriptFile is made to point to pathPtr
 *	(the old value is cached and replaced when this function returns).
 *
 *----------------------------------------------------------------------
 */
















int
Tcl_FSEvalFileEx(
    Tcl_Interp *interp,		/* Interpreter in which to process file. */
    Tcl_Obj *pathPtr,		/* Path of file to process. Tilde-substitution
				 * will be performed on this name. */
    const char *encodingName)	/* If non-NULL, then use this encoding for the
				 * file. NULL means use the system encoding. */
{
    int length, result = TCL_ERROR;
    Tcl_StatBuf statBuf;
    Tcl_Obj *oldScriptFile;
    Interp *iPtr;


    char *string;
    Tcl_Channel chan;
    Tcl_Obj *objPtr;

    if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) {
	return result;
    }







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>













>
>







1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
 * Side effects:
 *	Depends on the commands in the file. During the evaluation of the
 *	contents of the file, iPtr->scriptFile is made to point to pathPtr
 *	(the old value is cached and replaced when this function returns).
 *
 *----------------------------------------------------------------------
 */

/* Table containing how the BOM looks like when decoded by
 * the encodings cp1250 up to cp1258. When we encounter this
 * as first bytes read from the file, the real encoding is utf-8 */
static const char *cpBomTable[] = {
    "\xc4\x8f\xc2\xbb\xc5\xbc",
    "\xd0\xbf\xc2\xbb\xd1\x97",
    "\xc3\xaf\xc2\xbb\xc2\xbf",
    "\xce\xbf\xc2\xbb\xce\x8f",
    "\xc3\xaf\xc2\xbb\xc2\xbf",
    "\xd7\x9f\xc2\xbb\xc2\xbf",
    "\xc3\xaf\xc2\xbb\xd8\x9f",
    "\xc4\xbc\xc2\xbb\xc3\xa6",
    "\xc3\xaf\xc2\xbb\xc2\xbf"
};

int
Tcl_FSEvalFileEx(
    Tcl_Interp *interp,		/* Interpreter in which to process file. */
    Tcl_Obj *pathPtr,		/* Path of file to process. Tilde-substitution
				 * will be performed on this name. */
    const char *encodingName)	/* If non-NULL, then use this encoding for the
				 * file. NULL means use the system encoding. */
{
    int length, result = TCL_ERROR;
    Tcl_StatBuf statBuf;
    Tcl_Obj *oldScriptFile;
    Interp *iPtr;
    const char *bom = "\xef\xbb\xbf";
    int bomsize = 1;
    char *string;
    Tcl_Channel chan;
    Tcl_Obj *objPtr;

    if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) {
	return result;
    }
1780
1781
1782
1783
1784
1785
1786













1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804








1805
1806
1807
1808
1809
1810
1811
1812
1813

    if (encodingName != NULL) {
	if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName)
		!= TCL_OK) {
	    Tcl_Close(interp,chan);
	    return result;
	}













    }

    objPtr = Tcl_NewObj();
    Tcl_IncrRefCount(objPtr);
    /* Try to read first character of stream, so we can
     * check for utf-8 BOM to be handled especially.
     */
    if (Tcl_ReadChars(chan, objPtr, 1, 0) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }
    string = Tcl_GetString(objPtr);
    /*
     * If first character is not a BOM, append the remaining characters,
     * otherwise replace them [Bug 3466099].
     */








    if (Tcl_ReadChars(chan, objPtr, -1,
	    memcmp(string, "\xef\xbb\xbf", 3)) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }

    if (Tcl_Close(interp, chan) != TCL_OK) {







>
>
>
>
>
>
>
>
>
>
>
>
>




|
|

|





<

|
|

>
>
>
>
>
>
>
>
|
<







1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829

1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842

1843
1844
1845
1846
1847
1848
1849

    if (encodingName != NULL) {
	if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName)
		!= TCL_OK) {
	    Tcl_Close(interp,chan);
	    return result;
	}
    } else {
	const char *systemencoding = Tcl_GetEncodingName(NULL);
	if (!strcmp(systemencoding, "identity")) {
	    /* identity reads the BOM as 3 characters */
	    bomsize = 3;
	} else if (!memcmp(systemencoding, "cp125", 5) &&
		(systemencoding[5] >= '0') && (systemencoding[5] <= '8') &&
		!systemencoding[6]) {
	    /* cp125[0-8] reads the BOM as 3 characters as well, but
	     * take the real form from the table above */
	    bom = cpBomTable[systemencoding[5] - '0'];
	    bomsize = 3;
	}
    }

    objPtr = Tcl_NewObj();
    Tcl_IncrRefCount(objPtr);
    /* Try to read first characters of stream, so we can
     * check for BOM to be handled especially.
     */
    if (Tcl_ReadChars(chan, objPtr, bomsize, 0) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }

    /*
     * If first character is a BOM, throw it away and switch
     * encoding to utf-8 [Bug 3466099].
     */
    if (!strcmp(Tcl_GetString(objPtr), bom)) {
	Tcl_SetObjLength(objPtr, 0);
	if (Tcl_SetChannelOption(interp, chan, "-encoding", "utf-8")
		!= TCL_OK) {
	    Tcl_Close(interp,chan);
	    return result;
	}
    }
    if (Tcl_ReadChars(chan, objPtr, -1, 1) < 0) {

	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }

    if (Tcl_Close(interp, chan) != TCL_OK) {

Changes to tests/source.test.

116
117
118
119
120
121
122


















123
124
125
126
127
128
129
    close $out
    set y old-y
    source -encoding utf-8 $sourcefile
    return $y
} -cleanup {
    removeFile $sourcefile
} -result {new-y}



















test source-3.1 {return in middle of source file} -setup {
    set sourcefile [makeFile {
	set x new-x
	return allDone
	set y new-y
    } source.file]







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
    close $out
    set y old-y
    source -encoding utf-8 $sourcefile
    return $y
} -cleanup {
    removeFile $sourcefile
} -result {new-y}
foreach {n e} {8 identity 9 cp1250 10 cp1251 11 cp1252 12 cp1253 13 cp1254 14 cp1255 15 cp1256 16 cp1257 17 cp1258} {
test source-2.$n [list $e with BOM] -setup {
    set sourcefile [makeFile {} source.file]
    set saveencoding [encoding system]
} -body {
    encoding system $e
    set out [open $sourcefile w]
    fconfigure $out -encoding utf-8
    puts $out "\ufeffset y new-y\xe9"
    close $out
    set y old-y
    source $sourcefile
    return $y
} -cleanup {
    encoding system $saveencoding
    removeFile $sourcefile
} -result "new-y\xe9"
}

test source-3.1 {return in middle of source file} -setup {
    set sourcefile [makeFile {
	set x new-x
	return allDone
	set y new-y
    } source.file]