java动态获取上传文件的编码类型--688IT编程网

java动态获取上传⽂件的编码类型

package com.sjfl.main;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;

/**

* <Detect encoding .>

* Copyright (C) <2009> <Fluck,ACC /dev>

* This program is free software: you can redistribute it and/or modify

* it under the terms of the GNU General Public License as published by

* the Free Software Foundation, either version 3 of the License, or

* (at your option) any later version.

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

* EncodingDetect.java<br>

* ⾃动获取⽂件的编码

* 使⽤⽰例

String filePath="D:/";

//获得⽂件编码

String JavaEncode(filePath);

//根据⽂件编码获得⽂件内容

String adFileToString(new File(filePath),fileEncode)

* @author Billows.Van

* @since Create on 2010-01-27 11:19:00

* @version 1.0

public class EncodingDetect {

public static void main(String[] args) {

String file = "E:/PWCP_ANM_ARQC_S99_EAIR_ANM_L88_l";

String encode=getJavaEncode(file);

System.out.println(encode);

readFile(file,encode);

}

/**

* 得到⽂件的编码

* @param filePath ⽂件路径

* @return⽂件的编码

public static String getJavaEncode(String filePath){

BytesEncodingDetect s = new BytesEncodingDetect();

String fileCode = BytesEncodingDetect.javaname[s.detectEncoding(new File(filePath))];

return fileCode;

}

public static void readFile(String file, String code) {

BufferedReader fr;

try {

String myCode = code!=null&&!"".equals(code) ? code : "UTF8";

InputStreamReader read = new InputStreamReader(new FileInputStream( file), myCode);

fr = new BufferedReader(read);

String line = null;

int flag=1;

// 读取每⼀⾏，如果结束了，line会为空

while ((line = fr.readLine()) != null && im().length() > 0) {

if(flag==1) {

line=line.substring(1);//去掉⽂件头

flag++;

}

// 每⼀⾏创建⼀个Student对象，并存⼊数组中

System.out.println(line);

}

fr.close();

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

class BytesEncodingDetect extends Encoding {

// Frequency tables to hold the GB, Big5, and EUC-TW character

// frequencies

int GBFreq[][];

int GBKFreq[][];

int Big5Freq[][];

int Big5PFreq[][];

int EUC_TWFreq[][];

int KRFreq[][];

int JPFreq[][];

// int UnicodeFreq[94][128];

// public static String[] nicename;

// public static String[] codings;

public boolean debug;

/**
 * Creates a detector with debugging off, allocates every frequency table and
 * then populates them through {@code initialize_frequencies()}.
 */
public BytesEncodingDetect() {
    super();
    this.debug = false;

    // 94 x 94 tables.
    this.GBFreq = new int[94][94];
    this.EUC_TWFreq = new int[94][94];
    this.KRFreq = new int[94][94];
    this.JPFreq = new int[94][94];

    // 126 x 191 tables.
    this.GBKFreq = new int[126][191];
    this.Big5PFreq = new int[126][191];

    // 94 x 158 table.
    this.Big5Freq = new int[94][158];

    // Fill the GB, GBK, Big5, EUC-TW, KR and JP frequency tables.
    initialize_frequencies();
}

public static void main(String argc[]) {

BytesEncodingDetect sinodetector;

int result = OTHER;

int i;

sinodetector = new BytesEncodingDetect();

for (i = 0; i < argc.length; i++) {

if (argc[i].startsWith("") == true) {

try {

result = sinodetector.detectEncoding(new URL(argc[i]));

} catch (Exception e) {

}

} else if (argc[i].equals("-d")) {

sinodetector.debug = true;

continue;

} else {

result = sinodetector.detectEncoding(new File(argc[i]));

}

System.out.println(nicename[result]);

}

/**

* Function : detectEncoding Aruguments: URL Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, BIG5, * EUC_TW, ASCII, or OTHER) Description: This function looks at the URL contents and assigns it a probability score for each

* encoding type. The encoding type with the highest probability is returned.

public int detectEncoding(URL testurl) {

byte[] rawtext = new byte[10000];

int bytesread = 0, byteoffset = 0;

int guess = OTHER;

InputStream chinesestream;

try {

chinesestream = testurl.openStream();

while ((bytesread = ad(rawtext, byteoffset, rawtext.length - byteoffset)) > 0) {

byteoffset += bytesread;

}

;

chinesestream.close();

guess = detectEncoding(rawtext);

} catch (Exception e) {

guess = -1;

}

return guess;

}

/**

* Function : detectEncoding Aruguments: File Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, BIG5, * EUC_TW, ASCII, or OTHER) Description: This function looks at the file and assigns it a probability score for each encoding

* type. The encoding type with the highest probability is returned.

public int detectEncoding(File testfile) {

FileInputStream chinesefile;

byte[] rawtext;

rawtext = new byte[(int) testfile.length()];

try {

chinesefile = new FileInputStream(testfile);

chinesefile.close();

} catch (Exception e) {

}

return detectEncoding(rawtext);

}

/**

* Function : detectEncoding Aruguments: byte array Returns : One of the encodings from the Encoding enumeration (GB2312, HZ, * BIG5, EUC_TW, ASCII, or OTHER) Description: This function looks at the byte array and assigns it a probability score for

* each encoding type. The encoding type with the highest probability is returned.

public int detectEncoding(byte[] rawtext) {

int[] scores;

int index, maxscore = 0;

int encoding_guess = OTHER;

scores = new int[TOTALTYPES];

// Assign Scores

scores[GB2312] = gb2312_probability(rawtext);

scores[GBK] = gbk_probability(rawtext);

scores[GB18030] = gb18030_probability(rawtext);

scores[HZ] = hz_probability(rawtext);

scores[BIG5] = big5_probability(rawtext);

scores[CNS11643] = euc_tw_probability(rawtext);

scores[ISO2022CN] = iso_2022_cn_probability(rawtext);

scores[UTF8] = utf8_probability(rawtext);

scores[UNICODE] = utf16_probability(rawtext);

scores[EUC_KR] = euc_kr_probability(rawtext);

scores[CP949] = cp949_probability(rawtext);

scores[JOHAB] = 0;

unicode文件格式scores[ISO2022KR] = iso_2022_kr_probability(rawtext);

scores[ASCII] = ascii_probability(rawtext);

scores[SJIS] = sjis_probability(rawtext);

scores[EUC_JP] = euc_jp_probability(rawtext);

scores[ISO2022JP] = iso_2022_jp_probability(rawtext);

scores[UNICODET] = 0;

scores[UNICODES] = 0;

scores[ISO2022CN_GB] = 0;

scores[ISO2022CN_CNS] = 0;

scores[OTHER] = 0;

// Tabulate Scores

for (index = 0; index < TOTALTYPES; index++) {

if (debug)

if (scores[index] > maxscore) {

encoding_guess = index;

maxscore = scores[index];

}

// Return OTHER if nothing scored above 50

if (maxscore <= 50) {

encoding_guess = OTHER;

}

return encoding_guess;

}

* Function: gb2312_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text

* in array uses GB-2312 encoding

int gb2312_probability(byte[] rawtext) {

int i, rawtextlen = 0;

int dbchars = 1, gbchars = 1;

long gbfreq = 0, totalfreq = 1;

float rangeval = 0, freqval = 0;

int row, column;

// Stage 1: Check to see if characters fit into acceptable ranges

rawtextlen = rawtext.length;

for (i = 0; i < rawtextlen - 1; i++) {

// println(rawtext[i]);

if (rawtext[i] >= 0) {

// asciichars++;

} else {

dbchars++;

if ((byte) 0xA1 <= rawtext[i] && rawtext[i] <= (byte) 0xF7 && (byte) 0xA1 <= rawtext[i + 1]

&& rawtext[i + 1] <= (byte) 0xFE) {

gbchars++;

totalfreq += 500;

row = rawtext[i] + 256 - 0xA1;

column = rawtext[i + 1] + 256 - 0xA1;

if (GBFreq[row][column] != 0) {

gbfreq += GBFreq[row][column];

} else if (15 <= row && row < 55) {

// In GB high-freq character range

gbfreq += 200;

}

i++;

}

rangeval = 50 * ((float) gbchars / (float) dbchars);

freqval = 50 * ((float) gbfreq / (float) totalfreq);

return (int) (rangeval + freqval);

}

* Function: gbk_probability Argument: pointer to byte array Returns : number from 0 to 100 representing probability text in

* array uses GBK encoding

int gbk_probability(byte[] rawtext) {

int i, rawtextlen = 0;

int dbchars = 1, gbchars = 1;

long gbfreq = 0, totalfreq = 1;

float rangeval = 0, freqval = 0;

int row, column;

// Stage 1: Check to see if characters fit into acceptable ranges

rawtextlen = rawtext.length;

for (i = 0; i < rawtextlen - 1; i++) {

// println(rawtext[i]);

if (rawtext[i] >= 0) {

// asciichars++;

} else {

dbchars++;

if ((byte) 0xA1 <= rawtext[i] && rawtext[i] <= (byte) 0xF7 && // Original GB range

(byte) 0xA1 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE) {

gbchars++;

totalfreq += 500;

row = rawtext[i] + 256 - 0xA1;

column = rawtext[i + 1] + 256 - 0xA1;

// System.out.println("original row " + row + " column " + column);

if (GBFreq[row][column] != 0) {

gbfreq += GBFreq[row][column];

} else if (15 <= row && row < 55) {

gbfreq += 200;

}

} else if ((byte) 0x81 <= rawtext[i]

&& rawtext[i] <= (byte) 0xFE

&& // Extended GB range

(((byte) 0x80 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE) || ((byte) 0x40 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0x7E))) { gbchars++;

totalfreq += 500;

row = rawtext[i] + 256 - 0x81;

if (0x40 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {

column = rawtext[i + 1] - 0x40;

688IT编程网

java动态获取上传文件的编码类型

发表评论

推荐文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

热门文章

随机森林特征选择原理

自动驾驶系统中的随机森林算法解析

随机森林算法及其在生物信息学中的应用

监督学习中的随机森林算法解析(六)

随机森林算法在数据分析中的应用

机器学习——随机森林,RandomForestClassifier参数含义详解

随机森林的算法

随机森林算法作用

监督学习中的随机森林算法解析(十)

随机森林算法案例

随机森林案例

二分类问题常用的模型

绘制ssd框架训练流程

一种基于信息熵和DTW的多维时间序列相似性度量算法

SVM训练过程范文

如何使用支持向量机进行股票预测与交易分析

二分类交叉熵损失函数binary

tinybert_训练中文文本分类模型_概述说明

基于门控可形变卷积和分层Transformer的图像修复模型及其应用

人工智能开发技术的测试和评估方法

最新文章

基于随机森林的数据分类算法改进

人工智能中的智能识别与分类技术

基于人工智能技术的随机森林算法在医疗数据挖掘中的应用

随机森林回归模型的建模步骤

r语言随机森林预测模型校准曲线

《2024年随机森林算法优化研究》范文

标签列表

688IT编程网

java动态获取上传文件的编码类型

发表评论

推荐文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

热门文章

随机森林特征选择原理

自动驾驶系统中的随机森林算法解析

随机森林算法及其在生物信息学中的应用

监督学习中的随机森林算法解析(六)

随机森林算法在数据分析中的应用

机器学习——随机森林,RandomForestClassifier参数含义详解

随机森林 的算法

随机森林算法作用

监督学习中的随机森林算法解析(十)

随机森林算法案例

随机森林案例

二分类问题常用的模型

绘制ssd框架训练流程

一种基于信息熵和DTW的多维时间序列相似性度量算法

SVM训练过程范文

如何使用支持向量机进行股票预测与交易分析

二分类交叉熵损失函数binary

tinybert_训练中文文本分类模型_概述说明

基于门控可形变卷积和分层Transformer的图像修复模型及其应用

人工智能开发技术的测试和评估方法

最新文章

基于随机森林的数据分类算法改进

人工智能中的智能识别与分类技术

基于人工智能技术的随机森林算法在医疗数据挖掘中的应用

随机森林回归模型的建模步骤

r语言随机森林预测模型校准曲线

《2024年随机森林算法优化研究》范文

标签列表

随机森林的算法