java动态获取上传⽂件的编码类型
package com.sjfl.main;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.URL;
/**
* <Detect encoding .>
* Copyright (C) <2009> <Fluck,ACC /dev>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* EncodingDetect.java<br>
* ⾃动获取⽂件的编码
* 使⽤⽰例
String filePath="D:/";
//获得⽂件编码
String JavaEncode(filePath);
//根据⽂件编码获得⽂件内容
String adFileToString(new File(filePath),fileEncode)
* @author Billows.Van
* @since Create on 2010-01-27 11:19:00
* @version 1.0
*/
public class EncodingDetect {
public static void main(String[] args) {
String file = "E:/PWCP_ANM_ARQC_S99_EAIR_ANM_L88_l";
String encode=getJavaEncode(file);
System.out.println(encode);
readFile(file,encode);
}
/**
* 得到⽂件的编码
* @param filePath ⽂件路径
* @return⽂件的编码
*/
public static String getJavaEncode(String filePath){
BytesEncodingDetect s = new BytesEncodingDetect();
String fileCode = BytesEncodingDetect.javaname[s.detectEncoding(new File(filePath))];
return fileCode;
}
public static void readFile(String file, String code) {
BufferedReader fr;
try {
String myCode = code!=null&&!"".equals(code) ? code : "UTF8";
InputStreamReader read = new InputStreamReader(new FileInputStream( file), myCode);
fr = new BufferedReader(read);
String line = null;
int flag=1;
// 读取每⼀⾏,如果结束了,line会为空
while ((line = fr.readLine()) != null && im().length() > 0) {
if(flag==1) {
line=line.substring(1);//去掉⽂件头
flag++;
}
// 每⼀⾏创建⼀个Student对象,并存⼊数组中
System.out.println(line);
}
fr.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
class BytesEncodingDetect extends Encoding {
// Frequency tables to hold the GB, Big5, and EUC-TW character
// frequencies
int GBFreq[][];
int GBKFreq[][];
int Big5Freq[][];
int Big5PFreq[][];
int EUC_TWFreq[][];
int KRFreq[][];
int JPFreq[][];
// int UnicodeFreq[94][128];
// public static String[] nicename;
// public static String[] codings;
public boolean debug;
public BytesEncodingDetect() {
super();
debug = false;
GBFreq = new int[94][94];
GBKFreq = new int[126][191];
Big5Freq = new int[94][158];
Big5PFreq = new int[126][191];
EUC_TWFreq = new int[94][94];
KRFreq = new int[94][94];
JPFreq = new int[94][94];
// Initialize the Frequency Table for GB, GBK, Big5, EUC-TW, KR, JP
initialize_frequencies();
initialize_frequencies();
}
public static void main(String argc[]) {
BytesEncodingDetect sinodetector;
int result = OTHER;
int i;
sinodetector = new BytesEncodingDetect();
for (i = 0; i < argc.length; i++) {
if (argc[i].startsWith("") == true) {
try {
result = sinodetector.detectEncoding(new URL(argc[i]));
} catch (Exception e) {
}
} else if (argc[i].equals("-d")) {
sinodetector.debug = true;
continue;
} else {
result = sinodetector.detectEncoding(new File(argc[i]));
}
System.out.println(nicename[result]);
}
}
/**
* Function : detectEncoding Aruguments: URL Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, BIG5, * EUC_TW, ASCII, or OTHER) Description: This function looks at the URL contents and assigns it a probability score for each
* encoding type. The encoding type with the highest probability is returned.
*/
public int detectEncoding(URL testurl) {
byte[] rawtext = new byte[10000];
int bytesread = 0, byteoffset = 0;
int guess = OTHER;
InputStream chinesestream;
try {
chinesestream = testurl.openStream();
while ((bytesread = ad(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {
byteoffset += bytesread;
}
;
chinesestream.close();
guess = detectEncoding(rawtext);
} catch (Exception e) {
guess = -1;
}
return guess;
}
/**
* Function : detectEncoding Aruguments: File Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, BIG5, * EUC_TW, ASCII, or OTHER) Description: This function looks at the file and assigns it a probability score for each encoding
* type. The encoding type with the highest probability is returned.
*/
public int detectEncoding(File testfile) {
FileInputStream chinesefile;
byte[] rawtext;
rawtext = new byte[(int) testfile.length()];
try {
chinesefile = new FileInputStream(testfile);
chinesefile.close();
} catch (Exception e) {
}
}
return detectEncoding(rawtext);
}
/**
* Function : detectEncoding Aruguments: byte array Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, * BIG5, EUC_TW, ASCII, or OTHER) Description: This function looks at the byte array and assigns it a probability score for
* each encoding type. The encoding type with the highest probability is returned.
*/
public int detectEncoding(byte[] rawtext) {
int[] scores;
int index, maxscore = 0;
int encoding_guess = OTHER;
scores = new int[TOTALTYPES];
// Assign Scores
scores[GB2312] = gb2312_probability(rawtext);
scores[GBK] = gbk_probability(rawtext);
scores[GB18030] = gb18030_probability(rawtext);
scores[HZ] = hz_probability(rawtext);
scores[BIG5] = big5_probability(rawtext);
scores[CNS11643] = euc_tw_probability(rawtext);
scores[ISO2022CN] = iso_2022_cn_probability(rawtext);
scores[UTF8] = utf8_probability(rawtext);
scores[UNICODE] = utf16_probability(rawtext);
scores[EUC_KR] = euc_kr_probability(rawtext);
scores[CP949] = cp949_probability(rawtext);
scores[JOHAB] = 0;
unicode文件格式scores[ISO2022KR] = iso_2022_kr_probability(rawtext);
scores[ASCII] = ascii_probability(rawtext);
scores[SJIS] = sjis_probability(rawtext);
scores[EUC_JP] = euc_jp_probability(rawtext);
scores[ISO2022JP] = iso_2022_jp_probability(rawtext);
scores[UNICODET] = 0;
scores[UNICODES] = 0;
scores[ISO2022CN_GB] = 0;
scores[ISO2022CN_CNS] = 0;
scores[OTHER] = 0;
// Tabulate Scores
for (index = 0; index < TOTALTYPES; index++) {
if (debug)
if (scores[index] > maxscore) {
encoding_guess = index;
maxscore = scores[index];
}
}
// Return OTHER if nothing scored above 50
if (maxscore <= 50) {
encoding_guess = OTHER;
}
return encoding_guess;
}
/
*
* Function: gb2312_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text
* in array uses GB-2312 encoding
*/
int gb2312_probability(byte[] rawtext) {
int i, rawtextlen = 0;
int dbchars = 1, gbchars = 1;
long gbfreq = 0, totalfreq = 1;
float rangeval = 0, freqval = 0;
int row, column;
// Stage 1: Check to see if characters fit into acceptable ranges
rawtextlen = rawtext.length;
for (i = 0; i < rawtextlen - 1; i++) {
for (i = 0; i < rawtextlen - 1; i++) {
// println(rawtext[i]);
if (rawtext[i] >= 0) {
// asciichars++;
} else {
dbchars++;
if ((byte) 0xA1 <= rawtext[i] && rawtext[i] <= (byte) 0xF7 && (byte) 0xA1 <= rawtext[i + 1]
&& rawtext[i + 1] <= (byte) 0xFE) {
gbchars++;
totalfreq += 500;
row = rawtext[i] + 256 - 0xA1;
column = rawtext[i + 1] + 256 - 0xA1;
if (GBFreq[row][column] != 0) {
gbfreq += GBFreq[row][column];
} else if (15 <= row && row < 55) {
// In GB high-freq character range
gbfreq += 200;
}
}
i++;
}
}
rangeval = 50 * ((float) gbchars / (float) dbchars);
freqval = 50 * ((float) gbfreq / (float) totalfreq);
return (int) (rangeval + freqval);
}
/*
* Function: gbk_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text in
* array uses GBK encoding
*/
int gbk_probability(byte[] rawtext) {
int i, rawtextlen = 0;
int dbchars = 1, gbchars = 1;
long gbfreq = 0, totalfreq = 1;
float rangeval = 0, freqval = 0;
int row, column;
// Stage 1: Check to see if characters fit into acceptable ranges
rawtextlen = rawtext.length;
for (i = 0; i < rawtextlen - 1; i++) {
// println(rawtext[i]);
if (rawtext[i] >= 0) {
// asciichars++;
} else {
dbchars++;
if ((byte) 0xA1 <= rawtext[i] && rawtext[i] <= (byte) 0xF7 && // Original GB range
(byte) 0xA1 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE) {
gbchars++;
totalfreq += 500;
row = rawtext[i] + 256 - 0xA1;
column = rawtext[i + 1] + 256 - 0xA1;
// System.out.println("original row " + row + " column " + column);
if (GBFreq[row][column] != 0) {
gbfreq += GBFreq[row][column];
} else if (15 <= row && row < 55) {
gbfreq += 200;
}
} else if ((byte) 0x81 <= rawtext[i]
&& rawtext[i] <= (byte) 0xFE
&& // Extended GB range
(((byte) 0x80 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE) || ((byte) 0x40 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0x7E))) { gbchars++;
totalfreq += 500;
row = rawtext[i] + 256 - 0x81;
if (0x40 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {
column = rawtext[i + 1] - 0x40;
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论