From 2790fbbf74a5b53a45874a9268758e89eb2e4373 Mon Sep 17 00:00:00 2001 From: fireattack Date: Wed, 12 Aug 2015 17:57:54 -0500 Subject: [PATCH] Added UTF8 (no BOM) detection. --- FixCue/Form1.cs | 134 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 32 deletions(-) diff --git a/FixCue/Form1.cs b/FixCue/Form1.cs index a60e2c2..52313f6 100644 --- a/FixCue/Form1.cs +++ b/FixCue/Form1.cs @@ -50,6 +50,12 @@ private string OpenFile(string original_path) { switch (getcodetype(original_path)) { + case "CODETYPE_UTF8NOBOM": + { + var utf8WithoutBom = new UTF8Encoding(false); + readText = File.ReadAllText(original_path, utf8WithoutBom); + break; + } case "CODETYPE_SHIFTJIS": { readText = File.ReadAllText(original_path, Encoding.GetEncoding(932)); @@ -222,60 +228,124 @@ private bool IsUserAdministrator() private string getcodetype(string path) { - string strCodeType = "CODETYPE_SHIFTJIS"; + string strCodeType = "CODETYPE_UTF8NOBOM"; Byte[] MyByte = File.ReadAllBytes(path); int high, low, chr, i; int JP1 = 0, JP2 = 0; bool FakeJP = false; - for (i = 0; i < MyByte.Length; ) + i = 0; + bool isUTF8 = true; + while (i < MyByte.Length) { - high = MyByte[i]; //读取第一个byte - i++; - if (high <= 0x7F) //ASCII码区 - { - low = high; - high = 0; - } - else if ((high >= 0xA1) && (high <= 0xDF)) //半角片假名区 + if ((0x80 & MyByte[i]) == 0) // ASCII { - low = high; - high = 0; - JP1++; + i++; + continue; } - else //双字节区 + else if ((0xE0 & MyByte[i]) == 0xC0) // 110xxxxx { - low = MyByte[i]; //读取低位 - i++; - JP2++; + if (i+1>MyByte.Length) + { + isUTF8 = false; + break; + } + if ((0xC0 & MyByte[i + 1]) == 0x80) // 10xxxxxx + { + i += 2; + continue; + } + else + { + isUTF8 = false; + break; + } } - chr = low + high * 256; - - if (chr < 0x80) // ASCII - { } - else if (chr < 0xA1) // 0x80 - 0xA0 未定义空间 + else if ((0xF0 & MyByte[i]) == 0xE0) // 1110xxxx { - strCodeType = "CODETYPE_DEFAULT"; // 未知编码 - break; + if (i + 1 > MyByte.Length) + { + isUTF8 = false; + break; + } + if (i + 1 > MyByte.Length) + { + isUTF8 = false; + break; + } + if (((0xC0 & MyByte[i + 1]) == 0x80) && ((0xC0 & MyByte[i + 2]) == 0x80)) // 10xxxxxx 10xxxxxx + { + i += 3; + continue; + } + else + { + isUTF8 = false; + break; + } } - else if (chr < (0xA1 + 63)) // 0xA1 - 0xDF 半角假名区 - { } - else if (chr < 0x8140) // 0xE0 - 0x813F 未定义空间 + else // 不是UTF-8字符串 { - strCodeType = "CODETYPE_DEFAULT"; // 未知编码 + isUTF8 = false; break; } - else // 0x8140 - 0xFFFF + } + + if (isUTF8 == false) + strCodeType = "CODETYPE_SHIFTJIS"; + + if (strCodeType == "CODETYPE_SHIFTJIS") + { + for (i = 0; i < MyByte.Length; ) { - char a = JISMapBuffer[chr - 0x8140 + 63]; - if (a == '\uFFFD') + high = MyByte[i]; //读取第一个byte + i++; + if (high <= 0x7F) //ASCII码区 + { + low = high; + high = 0; + } + else if ((high >= 0xA1) && (high <= 0xDF)) //半角片假名区 + { + low = high; + high = 0; + JP1++; + } + else //双字节区 { - strCodeType = "CODETYPE_GBK"; + low = MyByte[i]; //读取低位 + i++; + JP2++; + } + chr = low + high * 256; + + if (chr < 0x80) // ASCII + { } + else if (chr < 0xA1) // 0x80 - 0xA0 未定义空间 + { + strCodeType = "CODETYPE_DEFAULT"; // 未知编码 + break; + } + else if (chr < (0xA1 + 63)) // 0xA1 - 0xDF 半角假名区 + { } + else if (chr < 0x8140) // 0xE0 - 0x813F 未定义空间 + { + strCodeType = "CODETYPE_DEFAULT"; // 未知编码 break; } + else // 0x8140 - 0xFFFF + { + char a = JISMapBuffer[chr - 0x8140 + 63]; + if (a == '\uFFFD') + { + strCodeType = "CODETYPE_GBK"; + break; + } + } } } + if ((strCodeType == "CODETYPE_SHIFTJIS")&&((float)JP1/JP2 >= 1.8)) {