Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | experiment: Handle BOM by switching to utf-8 when the system encoding is cp1250-cp1258 or identity (and the encoding is not otherwise specified) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | bug-3466099 |
Files: | files | file ages | folders |
SHA1: |
110d7c7820197d5679ecb19e2c5d08b5 |
User & Date: | jan.nijtmans 2012-03-01 22:48:48 |
Context
2012-03-06
| ||
14:18 | another solution with auto recognition (without fixed cpBomTable) Closed-Leaf check-in: 8da0451f94 user: sebres tags: bug-3466099 | |
2012-03-01
| ||
22:48 | experiment: Handle BOM by switching to utf-8 when the system encoding is cp1250-cp1258 or identity (... check-in: 110d7c7820 user: jan.nijtmans tags: bug-3466099 | |
2012-02-29
| ||
22:38 | oops, that's no utf-8 BOM ;-( check-in: d2c359a194 user: jan.nijtmans tags: core-8-5-branch | |
Changes
Changes to generic/tclIOUtil.c.
︙ | ︙ | |||
1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 | * Side effects: * Depends on the commands in the file. During the evaluation of the * contents of the file, iPtr->scriptFile is made to point to pathPtr * (the old value is cached and replaced when this function returns). * *---------------------------------------------------------------------- */ int Tcl_FSEvalFileEx( Tcl_Interp *interp, /* Interpreter in which to process file. */ Tcl_Obj *pathPtr, /* Path of file to process. Tilde-substitution * will be performed on this name. */ const char *encodingName) /* If non-NULL, then use this encoding for the * file. NULL means use the system encoding. */ { int length, result = TCL_ERROR; Tcl_StatBuf statBuf; Tcl_Obj *oldScriptFile; Interp *iPtr; char *string; Tcl_Channel chan; Tcl_Obj *objPtr; if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) { return result; } | > > > > > > > > > > > > > > > > > | 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 | * Side effects: * Depends on the commands in the file. During the evaluation of the * contents of the file, iPtr->scriptFile is made to point to pathPtr * (the old value is cached and replaced when this function returns). * *---------------------------------------------------------------------- */ /* Table containing how the BOM looks like when decoded by * the encodings cp1250 up to cp1258. When we encounter this * as first bytes read from the file, the real encoding is utf-8 */ static const char *cpBomTable[] = { "\xc4\x8f\xc2\xbb\xc5\xbc", "\xd0\xbf\xc2\xbb\xd1\x97", "\xc3\xaf\xc2\xbb\xc2\xbf", "\xce\xbf\xc2\xbb\xce\x8f", "\xc3\xaf\xc2\xbb\xc2\xbf", "\xd7\x9f\xc2\xbb\xc2\xbf", "\xc3\xaf\xc2\xbb\xd8\x9f", "\xc4\xbc\xc2\xbb\xc3\xa6", "\xc3\xaf\xc2\xbb\xc2\xbf" }; int Tcl_FSEvalFileEx( Tcl_Interp *interp, /* Interpreter in which to process file. */ Tcl_Obj *pathPtr, /* Path of file to process. Tilde-substitution * will be performed on this name. */ const char *encodingName) /* If non-NULL, then use this encoding for the * file. NULL means use the system encoding. */ { int length, result = TCL_ERROR; Tcl_StatBuf statBuf; Tcl_Obj *oldScriptFile; Interp *iPtr; const char *bom = "\xef\xbb\xbf"; int bomsize = 1; char *string; Tcl_Channel chan; Tcl_Obj *objPtr; if (Tcl_FSGetNormalizedPath(interp, pathPtr) == NULL) { return result; } |
︙ | ︙ | |||
1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 | if (encodingName != NULL) { if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName) != TCL_OK) { Tcl_Close(interp,chan); return result; } } objPtr = Tcl_NewObj(); Tcl_IncrRefCount(objPtr); | > > > > > > > > > > > > > | | | < | | > > > > > > > > | < | 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 | if (encodingName != NULL) { if (Tcl_SetChannelOption(interp, chan, "-encoding", encodingName) != TCL_OK) { Tcl_Close(interp,chan); return result; } } else { const char *systemencoding = Tcl_GetEncodingName(NULL); if (!strcmp(systemencoding, "identity")) { /* identity reads the BOM as 3 characters */ bomsize = 3; } else if (!memcmp(systemencoding, "cp125", 5) && (systemencoding[5] >= '0') && (systemencoding[5] <= '8') && !systemencoding[6]) { /* cp125[0-8] reads the BOM as 3 characters as well, but * take the real form from the table above */ bom = cpBomTable[systemencoding[5] - '0']; bomsize = 3; } } objPtr = Tcl_NewObj(); Tcl_IncrRefCount(objPtr); /* Try to read first characters of stream, so we can * check for BOM to be handled especially. */ if (Tcl_ReadChars(chan, objPtr, bomsize, 0) < 0) { Tcl_Close(interp, chan); Tcl_AppendResult(interp, "couldn't read file \"", Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL); goto end; } /* * If first character is a BOM, throw it away and switch * encoding to utf-8 [Bug 3466099]. */ if (!strcmp(Tcl_GetString(objPtr), bom)) { Tcl_SetObjLength(objPtr, 0); if (Tcl_SetChannelOption(interp, chan, "-encoding", "utf-8") != TCL_OK) { Tcl_Close(interp,chan); return result; } } if (Tcl_ReadChars(chan, objPtr, -1, 1) < 0) { Tcl_Close(interp, chan); Tcl_AppendResult(interp, "couldn't read file \"", Tcl_GetString(pathPtr), "\": ", Tcl_PosixError(interp), NULL); goto end; } if (Tcl_Close(interp, chan) != TCL_OK) { |
︙ | ︙ |
Changes to tests/source.test.
︙ | ︙ | |||
116 117 118 119 120 121 122 123 124 125 126 127 128 129 | close $out set y old-y source -encoding utf-8 $sourcefile return $y } -cleanup { removeFile $sourcefile } -result {new-y} test source-3.1 {return in middle of source file} -setup { set sourcefile [makeFile { set x new-x return allDone set y new-y } source.file] | > > > > > > > > > > > > > > > > > > | 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | close $out set y old-y source -encoding utf-8 $sourcefile return $y } -cleanup { removeFile $sourcefile } -result {new-y} foreach {n e} {8 identity 9 cp1250 10 cp1251 11 cp1252 12 cp1253 13 cp1254 14 cp1255 15 cp1256 16 cp1257 17 cp1258} { test source-2.$n [list $e with BOM] -setup { set sourcefile [makeFile {} source.file] set saveencoding [encoding system] } -body { encoding system $e set out [open $sourcefile w] fconfigure $out -encoding utf-8 puts $out "\ufeffset y new-y\xe9" close $out set y old-y source $sourcefile return $y } -cleanup { encoding system $saveencoding removeFile $sourcefile } -result "new-y\xe9" } test source-3.1 {return in middle of source file} -setup { set sourcefile [makeFile { set x new-x return allDone set y new-y } source.file] |
︙ | ︙ |