Tcl Source Code

Check-in [8da0451f94]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:another solution with auto recognition (without fixed cpBomTable)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | bug-3466099
Files: files | file ages | folders
SHA1: 8da0451f94965ca03f713ebfc5f997560fbc5956
User & Date: sebres 2012-03-06 14:18:48
Context
2012-03-06
14:18
another solution with auto recognition (without fixed cpBomTable) Closed-Leaf check-in: 8da0451f94 user: sebres tags: bug-3466099
2012-03-01
22:48
experiment: Handle BOM by switching to utf-8 when the system encoding is cp1250-cp1258 or identity (... check-in: 110d7c7820 user: jan.nijtmans tags: bug-3466099
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclIOUtil.c.

1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764

1765
1766
1767
1768
1769
1770
1771
 *	Depends on the commands in the file. During the evaluation of the
 *	contents of the file, iPtr->scriptFile is made to point to pathPtr
 *	(the old value is cached and replaced when this function returns).
 *
 *----------------------------------------------------------------------
 */

/* Table containing how the BOM looks like when decoded by
 * the encodings cp1250 up to cp1258. When we encounter this
 * as first bytes read from the file, the real encoding is utf-8 */
static const char *cpBomTable[] = {
    "\xc4\x8f\xc2\xbb\xc5\xbc",
    "\xd0\xbf\xc2\xbb\xd1\x97",
    "\xc3\xaf\xc2\xbb\xc2\xbf",
    "\xce\xbf\xc2\xbb\xce\x8f",
    "\xc3\xaf\xc2\xbb\xc2\xbf",
    "\xd7\x9f\xc2\xbb\xc2\xbf",
    "\xc3\xaf\xc2\xbb\xd8\x9f",
    "\xc4\xbc\xc2\xbb\xc3\xa6",
    "\xc3\xaf\xc2\xbb\xc2\xbf"
};

int
Tcl_FSEvalFileEx(
    Tcl_Interp *interp,		/* Interpreter in which to process file. */
    Tcl_Obj *pathPtr,		/* Path of file to process. Tilde-substitution
				 * will be performed on this name. */
    const char *encodingName)	/* If non-NULL, then use this encoding for the
				 * file. NULL means use the system encoding. */
{
    int length, result = TCL_ERROR;
    Tcl_StatBuf statBuf;
    Tcl_Obj *oldScriptFile;
    Interp *iPtr;
    const char *bom = "\xef\xbb\xbf";
    int bomsize = 1;
    char *string;

    Tcl_Channel chan;
    Tcl_Obj *objPtr;

    if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) {
	return result;
    }








<
<
<
<
<
<
<
<
<
<
<
<
<
<
<












|
<

>







1728
1729
1730
1731
1732
1733
1734















1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747

1748
1749
1750
1751
1752
1753
1754
1755
1756
 *	Depends on the commands in the file. During the evaluation of the
 *	contents of the file, iPtr->scriptFile is made to point to pathPtr
 *	(the old value is cached and replaced when this function returns).
 *
 *----------------------------------------------------------------------
 */
















int
Tcl_FSEvalFileEx(
    Tcl_Interp *interp,		/* Interpreter in which to process file. */
    Tcl_Obj *pathPtr,		/* Path of file to process. Tilde-substitution
				 * will be performed on this name. */
    const char *encodingName)	/* If non-NULL, then use this encoding for the
				 * file. NULL means use the system encoding. */
{
    int length, result = TCL_ERROR;
    Tcl_StatBuf statBuf;
    Tcl_Obj *oldScriptFile;
    Interp *iPtr;
    Tcl_Encoding encoding = NULL;

    char *string;
    int len;
    Tcl_Channel chan;
    Tcl_Obj *objPtr;

    if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) {
	return result;
    }

1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829

1830
1831
1832
1833

1834
1835









1836
1837
1838


1839









1840


1841


















1842
1843
1844
1845
1846
1847
1848
     * The eofchar is \32 (^Z). This is the usual on Windows, but we effect
     * this cross-platform to allow for scripted documents. [Bug: 2040]
     */

    Tcl_SetChannelOption(interp, chan, "-eofchar", "\32");

    /*
     * If the encoding is specified, set it for the channel. Else don't touch
     * it (and use the system encoding) Report error on unknown encoding.
     */

    if (encodingName != NULL) {
	if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName)
		!= TCL_OK) {
	    Tcl_Close(interp,chan);
	    return result;
	}
    } else {
	const char *systemencoding = Tcl_GetEncodingName(NULL);
	if (!strcmp(systemencoding, "identity")) {
	    /* identity reads the BOM as 3 characters */
	    bomsize = 3;
	} else if (!memcmp(systemencoding, "cp125", 5) &&
		(systemencoding[5] >= '0') && (systemencoding[5] <= '8') &&
		!systemencoding[6]) {
	    /* cp125[0-8] reads the BOM as 3 characters as well, but
	     * take the real form from the table above */
	    bom = cpBomTable[systemencoding[5] - '0'];
	    bomsize = 3;
	}
    }

    objPtr = Tcl_NewObj();
    Tcl_IncrRefCount(objPtr);
    /* Try to read first characters of stream, so we can
     * check for BOM to be handled especially.
     */
    if (Tcl_ReadChars(chan, objPtr, bomsize, 0) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }

    /*
     * If first character is a BOM, throw it away and switch
     * encoding to utf-8 [Bug 3466099].
     */

    if (!strcmp(Tcl_GetString(objPtr), bom)) {
	Tcl_SetObjLength(objPtr, 0);









	if (Tcl_SetChannelOption(interp, chan, "-encoding", "utf-8")
		!= TCL_OK) {
	    Tcl_Close(interp,chan);


	    return result;









	}


    }


















    if (Tcl_ReadChars(chan, objPtr, -1, 1) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }








|
<

<
<
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<




|
|

|





>

|
<

>
|
|
>
>
>
>
>
>
>
>
>
|
|
|
>
>
|
>
>
>
>
>
>
>
>
>
|
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1772
1773
1774
1775
1776
1777
1778
1779

1780


1781
1782
1783
1784














1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800

1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
     * The eofchar is \32 (^Z). This is the usual on Windows, but we effect
     * this cross-platform to allow for scripted documents. [Bug: 2040]
     */

    Tcl_SetChannelOption(interp, chan, "-eofchar", "\32");

    /*
     * Set channel encoding to utf-8 to read first characters (BOM char?).

     */


    if (Tcl_SetChannelOption(interp, chan, "-encoding", "utf-8")
	    != TCL_OK) {
	Tcl_Close(interp,chan);
	return result;














    }

    objPtr = Tcl_NewObj();
    Tcl_IncrRefCount(objPtr);
    /* Try to read first 4 characters of stream in utf-8, so we can
     * check for utf-8 BOM to be handled especially. [Bug 3466099]
     */
    if (Tcl_ReadChars(chan, objPtr, 4, 0) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }
    string = TclGetStringFromObj(objPtr, &len);
    /*
     * If first characters is not a BOM, convert it to given or system encoding.

     */
    if (len < 3 || memcmp(string, "\xef\xbb\xbf", 3) != 0) {
	Tcl_DString ds;
	
	/*
	 * If the encoding is not specified, set it to the system encoding,
	 * hereafter, bypass utf-8, otherwise set it to the channel.
	 * Report error on unknown encoding.
	 */
	if (encodingName == NULL || *encodingName == 0) {
	    encodingName = Tcl_GetEncodingName(NULL);
	}
	if (strcmp(encodingName, "utf-8") != 0) {
	    if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName)
		    != TCL_OK) {
		Tcl_Close(interp, chan);
		goto end;
	    }

	    /* 
	     * Convert first char from utf-8 (as it were binary) to 
	     * encoding 'encodingName'.
	     */
	    encoding = Tcl_GetEncoding(interp, encodingName);
	    if (encoding == NULL) {
		Tcl_Close(interp, chan);
		goto end;
	    };

	    string = (char *) Tcl_GetByteArrayFromObj(objPtr, &len);
	    Tcl_ExternalToUtfDString(encoding, string, len, &ds);

	    Tcl_DecrRefCount(objPtr);
	    objPtr = Tcl_NewStringObj(
		    Tcl_DStringValue(&ds), Tcl_DStringLength(&ds));
	    Tcl_IncrRefCount(objPtr);
	    Tcl_DStringFree(&ds);
	};
    } else if (len >= 1) {
	/*
	 * Remove BOM characters.
	 */
	Tcl_Obj * newPtr = Tcl_NewStringObj(string+3, len-3);
	Tcl_DecrRefCount(objPtr);
	objPtr = newPtr;
	Tcl_IncrRefCount(objPtr);
    }
    /*
     * Append the remaining characters read from channel.
     */
    if (Tcl_ReadChars(chan, objPtr, -1, 1) < 0) {
	Tcl_Close(interp, chan);
	Tcl_AppendResult(interp, "couldn't read file \"",
		Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL);
	goto end;
    }

1886
1887
1888
1889
1890
1891
1892



1893
1894
1895
1896
1897
1898
1899
		"\n    (file \"%.*s%s\" line %d)",
		(overflow ? limit : length), pathString,
		(overflow ? "..." : ""), interp->errorLine));
    }

  end:
    Tcl_DecrRefCount(objPtr);



    return result;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetErrno --







>
>
>







1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
		"\n    (file \"%.*s%s\" line %d)",
		(overflow ? limit : length), pathString,
		(overflow ? "..." : ""), interp->errorLine));
    }

  end:
    Tcl_DecrRefCount(objPtr);
    if (encoding != NULL)
	Tcl_FreeEncoding(encoding);

    return result;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetErrno --