-
十月 14
【原文转至http://dreamisx.blog.163.com/blog/static/11500483920122134947776/】
一点闲话:
2009年的时候,写了几篇博客文章,此外也将以前写的一些文章抄到博客上,完了之后发现没啥好写的了?! 晕啊。为什么别人就能坚持不懈地写那么多文章,而我却想不出来写什么呢?平日里,我可是挺喜欢写些代码,来实现一些自己想要的功能,但我发觉,似乎写代码还算好,写文章还真不容易。
最近总算又想出点东西来了,得,写下来吧,要不然感觉对不起这好不容易下定决心才开的博客啊,哈哈。
近日整理自己写的文本编辑器时,觉得不能自动识别文本编码用起来很不方便,于是研究了一下文本文件的编码方式,发现如果文件中有BOM(Byte Order Mark)头,那识别起来简单;如果没有BOM头,就没那么好处理了。
用Google搜索了老半天,自己也苦思冥想,发现大致可以划分为以下几种情况(这里只考虑中英文):
1、有BOM头的文件
这个处理起来比较简单,只要识别出各自的编码方式,然后出加以转换就可以了。
2、没有BOM头的UCS2(UTF-16)文件
这种类型的文件如果其中含有Ascii码字符,那也可以用比较简单的方式加以判断:看看文件中是否有为0的字符,如果有,基本上可以认为是UCS2了。当然,文件中有错,或没有Ascii码字符,那就没法判断了,因为UCS2(UTF-16)用的编码是0x0000-0xFFFF,所以难以从字符特征上来判断(可以通过字频、词频统计以及词语特征分析等方法识别,不过本文主要讨论GB/Big5的识别,所以这里不作探讨)。
3、没有BOM头的UTF-8编码的文件
由于UTF-8编码有一定的特征, 如下面的说明:
UTF-8
Ascii Chars:
00-7F // 1 Bytes = 0xxxxxxx
Multi Bytes:
C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx
E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
所以可以从编码特征上来考虑。
4、没有BOM头的ANSI文件
此类文件里可能有Ascii字符,也可能有GB2312/GBK/GB18030/Big5的中文字符。单纯从文件上来看,它与没有BOM头的UTF-8编码文件类似,所以必须考虑如何与UTF-8文件区分开来。
此外,还有不常用的UCS4(UTF-32),这里就不考虑了。GB18030是与GBK兼容的,这里也只考虑它的2字节部分,4字节部分较少用到,不作考虑。
通过以上分析,可以看出,现在的主要问题是如何区分没有BOM头的UTF-8文件和ANSI文件。进一步,还有如何判断ANSI文件是简体还是繁体的问题。
以下代码是我想出来的一个猜测文本文件编码的方法,这可是原创哦(代码是Delphi的):
type TCharEncoding = (ceAnsi, ceBinary, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5); function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding; var len: Longint; function Maybe3BytesUtf8(Index: Integer): Boolean; begin Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and (buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]); end; var idx: Longint; iUtf8: Longint; maybeGB: Integer; //GB2312/GBK/GB18030 mayBig5: Integer; //Big5 mayUtf8: Integer; //Utf-8 maybeLE: Integer; //Unicode 16 (UCS2) , Little Endian maybeBE: Integer; //Unicode 16 (UCS2) , Big Endian mayBins: Integer; //Binary File Chars ? serZero: Integer; ratio: Integer; chsCount: Integer; utf8Count: Integer; gbkNulls: Integer; big5Nulls: Integer; begin Result := ceAnsi; maybeGB := 0; mayBig5 := 0; mayUtf8 := 0; maybeLE := 0; maybeBE := 0; mayBins := 0; serZero := 0; chsCount := 0; utf8Count := 0; gbkNulls := 0; big5Nulls := 0; len := Length(buf); idx := 1; while idx <= len do begin if idx < len then begin if (buf[idx] in [#$A1..#$A7]) and (buf[idx +1] in [#$40..#$A0]) or (buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx +1] in [#$A1..#$FE]) then begin Inc(gbkNulls); end; if (buf[idx +1] in [#$7F..#$A0]) or (buf[idx] in [#$C7, #$C8]) and (buf[idx +1] in [#$40..#$FE]) or (buf[idx] = #$C6) and (buf[idx +1] in [#$A1..#$FE]) then begin Inc(big5Nulls); end; end; case buf[idx] of #0: begin Inc(mayBins); if (idx < len) and (buf[idx +1] = #0) then begin Inc(serZero); end; if (idx mod 2) = 0 then begin Inc(maybeLE); end else begin Inc(maybeBE); end; end; #1..#8, #11, #12, #14..#31: begin Inc(mayBins); end; #$80: begin iUtf8 := idx; Inc(iUtf8); if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; end; #$81..#$BF: begin if buf[idx] in [#$81..#$A0] then begin Inc(maybeGB, 8); end else begin Inc(maybeGB, 8); Inc(mayBig5, 8); end; Inc(chsCount); Inc(idx); iUtf8 := idx; if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; end; #$C0..#$DF: begin if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin Inc(mayUtf8); Inc(utf8Count); Inc(chsCount); if (buf[idx +1] in [#$A1..#$BF]) then begin Inc(maybeGB); Inc(mayBig5); end else begin Inc(maybeGB, 4); end; end else begin Inc(maybeGB); Inc(mayBig5); Inc(chsCount); end; Inc(idx); end; #$E0..#$EF: begin if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; Inc(maybeGB); Inc(mayBig5); Inc(chsCount); Inc(idx); end; #$F0..#$FE: begin if buf[idx] in [#$FA..#$FE] then begin Inc(maybeGB, 8); end; Inc(maybeGB, 8); Inc(mayBig5, 8); Inc(chsCount); Inc(idx); end; end; Inc(idx); end; // set encoding if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin if (mayBins > maybeLE *2) and (mayBins > maybeBE *2) or (serZero > 10) or (serZero * 8 > Max(maybeLE, maybeBE)) then begin Result := ceBinary; end else if maybeLE >= maybeBE then begin Result := ceUcs2_LE; end else begin Result := ceUcs2_BE; end; end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count *2) then begin ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5)); if Abs(ratio) <= 5 then begin if gbkNulls > big5Nulls then begin Result := ceBig5; end else if gbkNulls < big5Nulls then begin Result := ceGB; end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len)); end; end else begin if ratio > 0 then begin Result := ceGB; end else begin Result := ceBig5; end; end; end else if mayUtf8 > 0 then begin Result := ceUtf_8; end; end; function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; var buf: string; begin SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position)); AStream.Read(buf[1], Length(buf)); Result := GuessCharEncoding(buf, SeeGBBig5); end;
请仔细看这段代码,并将它与下面关于编码的说明对比:
{---------------------------- Character Encoding -----------------------------
1. UTF-8
Ascii Chars: 00-7F // 1 Bytes = 0xxxxxxx
Multi Bytes: C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx
E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]
2 Bytes Characters: 0000-FFFF
3. UTF-16
0000-D7FF = 0000-D7FF // 2 Bytes
E000-FFFF = E000-FFFF // 2 Bytes
10000-10FFFF = D800-D8FF + DC00-DCFF // 4 Bytes
D800-DFFF // Surrogate
4. UTF-32 = 32 bit unsigned integer of character
5. GB2312-80
Ascii Chars: 00-7F
Simplified Chinese Chars: A1-F7 + A1-FE
6. Big5
Ascii Chars: 00-7F
Traditional Chinses Chars: A1-F9 + 40-7E
A1-F9 + A1-FE
7. GBK
Ascii Chars: 00-7F
Chinses Chars: 81-FE + 40-7E
81-FE + 80-FE
8. GB18030-2000
Ascii Chars: 00-7F
Chinese Chars (2 Bytes): 81-FE + 40-7E
81-FE + 80-FE
Chinese Chars (4 Bytes): 81-FE + 30-39 + 81-FE + 30-39
看出来了吗?这个函数利用UTF-8的编码特征和GB/Big5的编码范围,再加上权值,来判断是UTF-8还是ANSI文档(UCS2(UTF-16)的判断只是附带的,可能也不是很准确)。
以我的测试来看,用这种方法可以比较准确地区分UTF-8和ANSI编码的文件。
附:完整单元代码,只支持D5~D2007
unit SmartLoadFile; {******************************************************************************* * 智能识别文本编码 * * * * 功能: 本单元用于智能识别文本文件编码,包括简繁体文本的识别,支持D5-D2007 * * 实现: 2012.3.10 ~ 2012.3.15 完成Ansi(GB/Big5)/Unicode的识别功能 * * 2012.3.18 ~ 2012.3.19 完成UTF-16与二进制文件的区分功能 * * 2012.6.2 ~ 2012.6.3 改进简繁体的识别方法,提高了识别率 * * Blog: dreamisx.blog.163.com * * EMail: iamdream%yeah.net (% -> @) * *******************************************************************************} interface {$IFDEF UNICODE}Sorry, not Support Unicode Version!{$ENDIF} {$IFDEF VER150} //消除D7警告 {$WARN UNIT_PLATFORM OFF} {$WARN UNSAFE_CODE OFF} {$WARN UNSAFE_TYPE OFF} {$WARN UNSAFE_CAST OFF} {$ENDIF} uses Windows, Messages, SysUtils, Classes, Math, GB2BigEx {$IFDEF VER130}, Unicode{$ENDIF}; type TCharEncoding = (ceAnsi, ceBinary, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5); TCharConvertProc = function (const S: string): string; function SysIsSimplifiedChinese: Boolean; function SysIsTraditionalChinese: Boolean; function TryToDistinguishGBOrBig5(const S: string): TCharEncoding; function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding; overload; function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; overload; procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc; AExactCompare: Boolean = False; ASamplingSize: Integer = 4096); function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream; TryToDecode: Boolean = True; SeeGBBig5: Boolean = True): TCharEncoding; implementation {---------------------------- Character Encoding ----------------------------- 1. UTF-8 Ascii Chars: 00-7F // 1 Bytes = 0xxxxxxx Multi Bytes: C0-DF + 80-BF // 2 Bytes = 110xxxxx 10xxxxxx E0-EF + 80-BF + 80-BF // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 2. Unicode 16 (UCS2) [as UTF-16 for 96.9%] 2 Bytes Characters: 0000-FFFF 3. UTF-16 0000-D7FF = 0000-D7FF // 2 Bytes E000-FFFF = E000-FFFF // 2 Bytes 10000-10FFFF = D800-D8FF + DC00-DCFF // 4 Bytes D800-DFFF // Surrogate 4. UTF-32 = 32 bit unsigned integer of character 5. GB2312-80 Ascii Chars: 00-7F Simplified Chinese Chars: A1-F7 + A1-FE Note: A9 + A4-EF ==> Tabs in Chinese Chars 6. Big5 Ascii Chars: 00-7F Traditional Chinses Chars: A1-F9 + 40-7E A1-F9 + A1-FE 7. GBK Ascii Chars: 00-7F Chinses Chars: 81-FE + 40-7E 81-FE + 80-FE 8. GB18030-2000 Ascii Chars: 00-7F Chinese Chars (2 Bytes): 81-FE + 40-7E 81-FE + 80-FE Chinese Chars (4 Bytes): 81-FE + 30-39 + 81-FE + 30-39 null GBK: A1.40-A0 ... A7.40-A0 AA.A1-FE ... AF.A1-FE F8.A1-FE ... FE.A1-FE null Big5: C6.A1-FE C7.40-FE C8.40-FE XX.7F-A0 -----------------------------------------------------------------------------} var ExactCompare: Boolean = False; // for exact convert GB / Big5 SamplingSize: Integer = 4096; // for guessing GB / Big5 GB2Big5Proc: TCharConvertProc; Big52GBProc: TCharConvertProc; { Internal functions of transfering text to ANSI } procedure DoUtf8ToAnsi(Source, Target: TStream); var buf: string; ret: string; begin SetLength(buf, Source.Size - Source.Position); Source.Read(PChar(buf)^, Length(buf)); ret := Utf8ToAnsi(buf); Target.Write(PChar(ret)^, Length(ret)); end; procedure DoLittleEndianUnicode16ToAnsi(Source, Target: TStream); const //UCS2 ? (Cannot process 4 Bytes' Chars!!) cBufSize = 4096; var buf: array[1..cBufSize] of Char; pIn: PWideChar; readSize: Longint; sOut: string; begin repeat readSize := Source.Read(buf, cBufSize); if readSize > 0 then begin pIn := PWideChar(@buf[1]); sOut := WideCharLenToString(pIn, readSize shr 1); Target.Write(Pointer(sOut)^, Length(sOut)); end; until readSize < cBufSize; end; procedure DoBigEndianUnicode16ToAnsi(Source, Target: TStream); const //UCS2 ? (Cannot process 4 Bytes' Chars!!) cBufSize = 4096; var buf: array[1..cBufSize] of Char; pIn: PWideChar; readSize: Longint; sOut: string; c: Char; i: Longint; begin repeat readSize := Source.Read(buf, cBufSize); if readSize > 0 then begin pIn := PWideChar(@buf[1]); // adjust byte order i := 1; while i <= readSize do begin c := buf[i]; buf[i] := buf[i +1]; buf[i +1] := c; Inc(i, 2); end; sOut := WideCharLenToString(pIn, readSize shr 1); Target.Write(Pointer(sOut)^, Length(sOut)); end; until readSize < cBufSize; end; procedure DoGBToAnsi(Source, Target: TStream); var S: string; begin //If System is Traditional Chinese, Convert it. if SysIsTraditionalChinese() then begin SetLength(S, Source.Size); Source.Read(PChar(S)^, Length(S)); S := Gb2Big5Proc(S); Target.Write(PChar(S)^, Length(S)); end else begin Target.CopyFrom(Source, Source.Size); end; end; procedure DoBig5ToAnsi(Source, Target: TStream); var S: string; begin //If System is Simplified Chinese, Convert it. if SysIsSimplifiedChinese() then begin SetLength(S, Source.Size); Source.Read(PChar(S)^, Length(S)); S := Big52GbProc(S); Target.Write(PChar(S)^, Length(S)); end else begin Target.CopyFrom(Source, Source.Size); end; end; { Count Chinese Characters } function CountChineseChars(const S: string): Integer; var i: Integer; begin Result := 0; i := 1; while i <= Length(S) do begin if S[i] > #$80 then begin Inc(Result, 2); Inc(i, 2); end else begin Inc(i); end; end; end; //....................... functions of GB / Big5 ............................. function SysIsSimplifiedChinese: Boolean; begin Result := (SysLocale.PriLangID = LANG_CHINESE) and (SysLocale.SubLangID in [SUBLANG_CHINESE_SIMPLIFIED, SUBLANG_CHINESE_SINGAPORE]); end; function SysIsTraditionalChinese: Boolean; begin Result := (SysLocale.PriLangID = LANG_CHINESE) and (SysLocale.SubLangID in [SUBLANG_CHINESE_TRADITIONAL, SUBLANG_CHINESE_HONGKONG]); end; function TryToDistinguishGBOrBig5(const S: string): TCharEncoding; function MyCompareChineseStr(const s1, s2: string): Boolean; var difCount: Integer; i, k: Integer; cmpLen: Integer; begin difCount := 0; i := 1; k := 1; while (i <= Length(s1)) and (k <= Length(s2)) do begin if s1[i] <> s2[k] then begin if (i +2 <= Length(s1)) and (k +2 <= Length(s2)) then begin //比较原理:两次转换后,某些字可能会转戌一个'?',以下处理这种情况 if (s1[i +1] = s2[k]) and (s1[i +2] = s2[k +1]) then begin Inc(i); end else if (s1[i] = s2[k +1]) and (s1[i +1] = s2[k +2]) then begin Inc(k); end else begin Inc(difCount); end; end else begin Inc(difCount); end; end; Inc(i); Inc(k); end; if ExactCompare then begin cmpLen := CountChineseChars(S); end else begin cmpLen := Length(S); end; Result := difCount * 100 div Max(1, cmpLen) <= 6; // different <= 6% end; begin Result := ceAnsi; if SysLocale.PriLangID = LANG_CHINESE then begin case SysLocale.SubLangID of SUBLANG_CHINESE_SIMPLIFIED, SUBLANG_CHINESE_SINGAPORE: begin if MyCompareChineseStr(S, Big52GBProc(GB2Big5Proc(S))) then begin Result := ceGB; end else begin Result := ceBig5; end; end; SUBLANG_CHINESE_TRADITIONAL, SUBLANG_CHINESE_HONGKONG: begin if MyCompareChineseStr(S, GB2Big5Proc(Big52GBProc(S))) then begin Result := ceBig5; end else begin Result := ceGB; end; end; end; end; end; //............................................................................ // It can distinguish encoding is UTF8 or Ansi, // but may not distinguish encoding is GB or Big5, // so it need to process GB / Big5 specially function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding; var len: Longint; function Maybe3BytesUtf8(Index: Integer): Boolean; begin Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and (buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]); end; var idx: Longint; iUtf8: Longint; maybeGB: Integer; //GB2312/GBK/GB18030 mayBig5: Integer; //Big5 mayUtf8: Integer; //Utf-8 maybeLE: Integer; //Unicode 16 (UCS2) , Little Endian maybeBE: Integer; //Unicode 16 (UCS2) , Big Endian mayBins: Integer; //Binary File Chars ? serZero: Integer; ratio: Integer; chsCount: Integer; utf8Count: Integer; gbkNulls: Integer; big5Nulls: Integer; begin Result := ceAnsi; maybeGB := 0; mayBig5 := 0; mayUtf8 := 0; maybeLE := 0; maybeBE := 0; mayBins := 0; serZero := 0; chsCount := 0; utf8Count := 0; gbkNulls := 0; big5Nulls := 0; len := Length(buf); idx := 1; while idx <= len do begin if idx < len then begin if (buf[idx] in [#$A1..#$A7]) and (buf[idx +1] in [#$40..#$A0]) or (buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx +1] in [#$A1..#$FE]) then begin Inc(gbkNulls); end; if (buf[idx +1] in [#$7F..#$A0]) or (buf[idx] in [#$C7, #$C8]) and (buf[idx +1] in [#$40..#$FE]) or (buf[idx] = #$C6) and (buf[idx +1] in [#$A1..#$FE]) then begin Inc(big5Nulls); end; end; case buf[idx] of #0: begin Inc(mayBins); if (idx < len) and (buf[idx +1] = #0) then begin Inc(serZero); end; if (idx mod 2) = 0 then begin Inc(maybeLE); end else begin Inc(maybeBE); end; end; #1..#8, #11, #12, #14..#31: begin Inc(mayBins); end; #$80: begin iUtf8 := idx; Inc(iUtf8); if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; end; #$81..#$BF: begin if buf[idx] in [#$81..#$A0] then begin Inc(maybeGB, 8); end else begin Inc(maybeGB, 8); Inc(mayBig5, 8); end; Inc(chsCount); Inc(idx); iUtf8 := idx; if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; end; #$C0..#$DF: begin if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin Inc(mayUtf8); Inc(utf8Count); Inc(chsCount); if (buf[idx +1] in [#$A1..#$BF]) then begin Inc(maybeGB); Inc(mayBig5); end else begin Inc(maybeGB, 4); end; end else begin Inc(maybeGB); Inc(mayBig5); Inc(chsCount); end; Inc(idx); end; #$E0..#$EF: begin if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin Inc(mayUtf8, 32); Inc(utf8Count); end; Inc(maybeGB); Inc(mayBig5); Inc(chsCount); Inc(idx); end; #$F0..#$FE: begin if buf[idx] in [#$FA..#$FE] then begin Inc(maybeGB, 8); end; Inc(maybeGB, 8); Inc(mayBig5, 8); Inc(chsCount); Inc(idx); end; end; Inc(idx); end; // set encoding if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin if (mayBins > maybeLE *2) and (mayBins > maybeBE *2) or (serZero > 10) or (serZero * 8 > Max(maybeLE, maybeBE)) then begin Result := ceBinary; end else if maybeLE >= maybeBE then begin Result := ceUcs2_LE; end else begin Result := ceUcs2_BE; end; end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count *2) then begin ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5)); if Abs(ratio) <= 5 then begin if gbkNulls > big5Nulls then begin Result := ceBig5; end else if gbkNulls < big5Nulls then begin Result := ceGB; end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len)); end; end else begin if ratio > 0 then begin Result := ceGB; end else begin Result := ceBig5; end; end; end else if mayUtf8 > 0 then begin Result := ceUtf_8; end; end; function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; var buf: string; begin SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position)); AStream.Read(buf[1], Length(buf)); Result := GuessCharEncoding(buf, SeeGBBig5); end; function DoTryToDecode(Source, Target: TStream; SeeGBBig5: Boolean): TCharEncoding; begin // test character encoding Result := GuessCharEncoding(Source, SeeGBBig5); // transfer encoding Source.Seek(0, soFromBeginning); case Result of ceUtf_8: DoUtf8ToAnsi(Source, Target); ceUcs2_LE: DoLittleEndianUnicode16ToAnsi(Source, Target); ceUcs2_BE: DoBigEndianUnicode16ToAnsi(Source, Target); ceUtf_32: raise Exception.Create('UTF-32 not support yet.'); ceGB: DoGBToAnsi(Source, Target); ceBig5: DoBig5ToAnsi(Source, Target); else //ceAnsi, ceBinary Target.CopyFrom(Source, Source.Size); end; end; {------------------------- Intialize Char Convert ---------------------------} procedure InitCharConvert(AGB2Big5, ABig52GB: TCharConvertProc; AExactCompare: Boolean; ASamplingSize: Integer); begin ExactCompare := AExactCompare; SamplingSize := ASamplingSize; if Assigned(AGB2Big5) then begin GB2Big5Proc := AGB2Big5; end else begin GB2Big5Proc := GB2BigEx.Gb2Big5; end; if Assigned(ABig52GB) then begin Big52GBProc := ABig52GB; end else begin Big52GBProc := GB2BigEx.Big52GB; end; end; {--------------------------- Smart load text file ---------------------------} function SmartLoadTextFileToStream(const AFileName: string; AStream: TStream; TryToDecode, SeeGBBig5: Boolean): TCharEncoding; var fs: TFileStream; bom: array[1..4] of Char; len: Longint; begin Result := ceAnsi; fs := TFileStream.Create(AFileName, fmOpenRead or fmShareDenyNone); with fs do try if Size = 0 then Exit; len := Read(bom, SizeOf(bom)); if (len >= 3) and (bom[1] = #$EF) and (bom[2] = #$BB) and (bom[3] = #$BF) then begin //UTF8 Result := ceUtf_8; Seek(3, soFromBeginning); DoUtf8ToAnsi(fs, AStream); end else if (len >= 2) and (bom[1] = #$FF) and (bom[2] = #$FE) then begin //UTF-16/UCS-2, little endian Result := ceUcs2_LE; Seek(2, soFromBeginning); DoLittleEndianUnicode16ToAnsi(fs, AStream); end else if (len >= 2) and (bom[1] = #$FE) and (bom[2] = #$FF) then begin //UTF-16/UCS-2, big endian Result := ceUcs2_BE; Seek(2, soFromBeginning); DoBigEndianUnicode16ToAnsi(fs, AStream); end else begin Seek(0, soFromBeginning); if TryToDecode then begin Result := DoTryToDecode(fs, AStream, SeeGBBig5); end else begin AStream.CopyFrom(fs, fs.Size); end; end; AStream.Position := 0; finally fs.Free; end; end; initialization GB2Big5Proc := GB2BigEx.Gb2Big5; Big52GBProc := GB2BigEx.Big52GB; end.
//****************************************************************************** // GB/Big5相互转换代码(直接调用API实现,也可以用其他类似函数代替) unit GB2BigEX; interface {$IFDEF VER150} //D7 {$WARN UNSAFE_CODE OFF} {$WARN UNSAFE_TYPE OFF} {$WARN UNSAFE_CAST OFF} {$ENDIF} uses Windows; function Gb2Big5(const S: string): string; function Big52Gb(const S: string): string; implementation function UnicodeEncode(const S: string; CodePage: Integer): WideString; var Len: Integer; begin Len := Length(S) + 1; SetLength(Result, Len); Len := MultiByteToWideChar(CodePage, 0, PChar(S), -1, PWideChar(Result), Len); SetLength(Result, Len - 1); //end is #0 end; function UnicodeDecode(const S: WideString; CodePage: Integer): string; var Len: Integer; begin Len := Length(S) * 2 + 1; //one for #0 SetLength(Result, Len); Len := WideCharToMultiByte(CodePage, 0, PWideChar(S), -1, PChar(Result), Len, nil, nil); SetLength(Result, Len - 1); end; function Gb2Big5(const S: string): string; begin SetLength(Result, Length(S)); LCMapString(GetUserDefaultLCID, LCMAP_TRADITIONAL_CHINESE, PChar(S), Length(S), PChar(Result), Length(Result)); Result := UnicodeDecode(UnicodeEncode(Result, 936), 950); //改成Result := UnicodeEncode(Result, 936);则可在GBK中显示繁体字 end; function Big52Gb(const S: string): string; var tmp: string; begin tmp := UnicodeDecode(UnicodeEncode(S, 950), 936); SetLength(Result, Length(tmp)); LCMapString(GetUserDefaultLCID, LCMAP_SIMPLIFIED_CHINESE, PChar(tmp), Length(tmp), PChar(Result), Length(Result)); end; end.
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。
1楼 小焱