中文文本校对源码java_浅谈中文文本自动纠错在影视剧搜索中的应用与Java实现
1.背景:
这周由于项目需要,要对搜索框中输入的错误影片名进行校正处理,以提升搜索命中率和用户体验,因此研究了一下中文文本自动纠错(专业点讲是校对,proofreading),并初步实现了该功能,特此记录。
2.简介:
中文输入错误的校对与更正,是指在输入不常见或者错误的文字时,系统提示文字有误;最简单的例子就是在Word里打字时出现的红色下划线提示。实现该功能目前主要有两大思路:
(1) 基于大量字典的分词法:主要是将待分析的汉字串与一个很大的“机器词典”中的词条进行匹配,若在词典中找到则匹配成功;该方法易于实现,比较适用于输入的汉字串属于某个或某几个领域的名词或名称的情况;
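(2) 基于统计信息的分词法:常用的是N-Gram语言模型,其实就是N-1阶Markov(马尔科夫)模型;在此简介一下该模型:
$$P(X_1X_2\cdots X_m)=P(X_1)\,P(X_2\mid X_1)\,P(X_3\mid X_1X_2)\cdots P(X_m\mid X_1X_2\cdots X_{m-1})$$
上式是Bayes公式(条件概率的链式法则),表明字符串X1X2……Xm出现的概率是每个字在其前面的字已出现的条件下的条件概率之积;为了简化计算,假设字Xi的出现仅与前面紧挨着的N-1个字符有关,则上面的公式变为:
$$P(X_1X_2\cdots X_m)\approx\prod_{i=1}^{m}P(X_i\mid X_{i-N+1}\cdots X_{i-1})$$
这就是N-1阶Markov(马尔科夫)模型,计算出概率后与一个阈值对比,若小于该阈值则提示该字符串拼写有误。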
3.实现:
由于本人项目针对的输入汉字串基本上是影视剧名称以及综艺、动漫节目的名字,语料库的范围相对稳定,所以这里采用2-Gram(即二元语言模型)与字典分词相结合的方法;
先说下思路:
对语料库进行分词处理 —> 计算二元词条出现概率(在语料库的样本下,用词条出现的频率代替)—> 对待分析的汉字串分词并找出最大连续字符串和第二大连续字符串 —> 利用最大和第二大连续字符串与语料库的影片名称匹配 —> 部分匹配则显示拼写有误并返回更正的字符串(所以字典很重要)
备注:分词这里用的是ICTCLAS Java API
上代码:
创建类ChineseWordProofread
3.1 初始化分词包并对影片语料库进行分词处理
1 publicICTCLAS2011 initWordSegmentation(){2
3 ICTCLAS2011 wordSeg = newICTCLAS2011();
4 try{
5 String argu = "F:\\Java\\workspace\\wordProofread"; //set your project path
6 System.out.println("ICTCLAS_Init");
7 if (ICTCLAS2011.ICTCLAS_Bytes("GB2312"),0) == false)
8 {9
System.out.println("Init Fail!");10 //return null;
11 }12
13 /*
14 * 设置词性标注集15 ID 代表词性集16 1 计算所⼀级标注集17 0 计算所⼆级标注集18 2 北⼤⼆级标注集19 3 北⼤⼀级标注集20 */
21 wordSeg.ICTCLAS_SetPOSmap(2);22
23 }catch(Exception ex){24 System.out.println("words segmentation initialization failed");it(-1);26 }27 returnwordSeg;28 }29
30 public booleanwordSegmentate(String argu1,String argu2){31 boolean ictclasFileProcess = false;32 try{33 //⽂件分词
34 ictclasFileProcess = wordSeg.ICTCLAS_Bytes("GB2312"), Bytes("GB2312"), 0);35
36 //ICTCLAS2011.ICTCLAS_Exit();
37
38 }catch(Exception ex){39 System.out.println("file process segmentation failed");it(-1);41 }42 returnictclasFileProcess;43 }
3.2 计算词条(tokens)出现的频率
1 public MapcalculateTokenCount(String afterWordSegFile){
2 Map wordCountMap = new HashMap();
3 File movieInfoFile = newFile(afterWordSegFile);
4 BufferedReader movieBR = null;
5 try{
6 movieBR = new
BufferedReader(newFileReader(movieInfoFile));7 } catch(FileNotFoundException e) {8 System.out.println(" file not found");9 e.printStackTrace();10 }11
12 String wordsline = null;13 try{14 while ((adLine()) != null){15 String[] words =
21 if (words.length > 1 && i < words.length-1){22 StringBuffer wordStrBuf = newStringBuffer();23
wordStrBuf.append(words[i]).append(words[i+1]);24 int wordStrCount = (String())==null ? (String());25 wordCountMap.String(), wordStrCount+1);26 totalTokensCount += 1;27 }28动漫网站设计源代码
29 }30 }31 } catch(IOException e) {32 System.out.println("read file failed");33 e.printStackTrace();34 }35
36 returnwordCountMap;37 }
3.3 找出待分析字符串中的正确tokens
1 public MapcalculateTokenCount(String afterWordSegFile){
2 Map wordCountMap = new HashMap();
3 File movieInfoFile = newFile(afterWordSegFile);
4 BufferedReader movieBR = null;
5 try{
6 movieBR = new
BufferedReader(newFileReader(movieInfoFile));7 } catch(FileNotFoundException e) {8 System.out.println(" file not found");9 e.printStackTrace();10 }11
12 String wordsline = null;13 try{14 while ((adLine()) != null){15 String[] words =
21 if (words.length > 1 && i < words.length-1){22 StringBuffer wordStrBuf = newStringBuffer();23
wordStrBuf.append(words[i]).append(words[i+1]);24 int wordStrCount = (String())==null ? (String());25 wordCountMap.String(), wordStrCount+1);26 totalTokensCount += 1;27 }28
29 }30 }31 } catch(IOException e) {32 System.out.println("read file failed");33 e.printStackTrace();34 }35
36 returnwordCountMap;37 }
3.4 得到最大连续和第二大连续字符串(也可能为单个字符)
// Picks the longest and the second-longest "correct" token of the input.
//
// NOTE(review): this listing is extraction-damaged and does not compile as
// shown. Display line numbers are fused into the code, several call prefixes
// are missing (e.g. "(0)" and "(i)" are presumably correctTokens.get(...) -
// confirm), both for-loop headers are cut off, and listing lines 54-56 and
// 72-90 are absent. Only what is still visible is described here.
//
// Visible behavior:
//  - The candidate tokens come from getCorrectTokens(sInputResult) and are
//    printed for debugging.
//  - Returns null when there are no candidate tokens.
//  - With exactly one candidate, both slots of the result hold that candidate.
//  - Otherwise a loop starting at index 1 compares token lengths against
//    maxSequence and maxSequence2. A strictly longer token replaces
//    maxSequence. A token tying with maxSequence in length may be stored in
//    maxSequence2 depending on a probBetweenTowTokens comparison: ">" for
//    length-1 tokens and "<=" for longer tokens (the inline comments give the
//    reasoning). Ties with maxSequence2 are resolved with ">".
//  - When both results have equal length, a further step ("delete the
//    sub-word from a string") compares truncation probabilities; most of that
//    step is missing from the listing.
//  - Result: a 2-element array, [0] = maxSequence, [1] = maxSequence2.
1 publicString[] getMaxAndSecondMaxSequnce(String[] sInputResult){
2 List correctTokens
=getCorrectTokens(sInputResult);3 //TODO
4 System.out.println(correctTokens);
5 String[] maxAndSecondMaxSeq = new String[2];
6 if (correctTokens.size() == 0) return null;
7 else if (correctTokens.size() == 1){
8 maxAndSecondMaxSeq[0]=(0);
9 maxAndSecondMaxSeq[1]=(0);10 returnmaxAndSecondMaxSeq;11 }12
13 String maxSequence = (0);14 String maxSequence2 = (correctTokens.size()-1);15 String littleword = "";16 for (int i=1;i
18 if ((i).length() >maxSequence.length()){19 maxSequence =(i);20 } else if ((i).length() ==maxSequence.length()){21
22 //select the word with greater probability for single-word
23 if ((i).length()==1){24 if ((i))
>probBetweenTowTokens(maxSequence)) {25 maxSequence2 =(i);26 }27 }28 //select words with smaller probability for multi-word, because the smaller has more self information
29 else if ((i).length()>1){30 if ((i))
<=probBetweenTowTokens(maxSequence)) {31 maxSequence2 =(i);32 }33 }34
35 } else if ((i).length() >maxSequence2.length()){36 maxSequence2 =(i);37 } else if ((i).length() ==maxSequence2.length()){38 if ((i))
>probBetweenTowTokens(maxSequence2)){39 maxSequence2 =(i);40 }41 }42 }43 //TODO
44 System.out.println(maxSequence+" : "+maxSequence2);45 //delete the sub-word from a string
46 if (maxSequence2.length() ==maxSequence.length()){47 int maxseqvaluableTokens =maxSequence.length();48 int maxseq2valuableTokens =maxSequence2.length();49 float min_truncate_prob_a = 0;50 float min_truncate_prob_b = 0;51 String aword = "";52 String bword = "";53 for (int i=0;i=min_truncate_prob_a){57 min_truncate_prob_a =tokenprob ;58 aword =(i);59 }60 }61 else if ((!maxSequence2.(i)))
&&(i))){62 if (tokenprob >=min_truncate_prob_b){63 min_
truncate_prob_b
=tokenprob;64 bword =(i);65 }66 }67 }68 //TODO
69 System.out.println(aword+" VS "+bword);70 System.out.println(min_truncate_prob_a+" VS "+min_truncate_prob_b);71 if (aword.length()>0 && min_truncate_prob_a
82 }83
84 if (maxseqvaluableTokens
91 }92 maxAndSecondMaxSeq[0] =maxSequence;93 maxAndSecondMaxSeq[1] =maxSequence2;94
95 returnmaxAndSecondMaxSeq ;96 }
3.5 返回更正列表
// Checks one user-entered title and returns the list of suggested titles.
//
// NOTE(review): this listing is extraction-damaged and does not compile as
// shown. Display line numbers are fused into the code, call prefixes are
// missing ("Instance()", "CharArray()", "ains(...)", "String()"), and listing
// lines 11-15 and 65-67 are absent. The missing part includes the code that
// fills sInputResult and sets hasError, so where hasError comes from cannot
// be told from here.
//
// Visible behavior:
//  - The input is turned into a char array, and sInputResult is allocated
//    with one slot per character (the segmenter call is commented out).
//  - getMaxAndSecondMaxSequnce(sInputResult) supplies the two sequences used
//    for matching.
//  - When hasError != 0 and the sequence array has more than one element, a
//    loop visits candidate titles ("movie"). The match calls are garbled;
//    presumably movie.contains(...) - confirm. Titles matching both sequences
//    are collected in crtTempList and other matches go to correctedList; a
//    non-empty crtTempList then replaces correctedList entirely.
//  - One of several console messages is printed, depending on hasError and on
//    the length of the sequence array.
//  - ICTCLAS2011.ICTCLAS_Exit() is called before correctedList is returned.
//
// NOTE(review): getMaxAndSecondMaxSequnce has a "return null" path, while
// ".length" is read from its result below with no null check visible -
// possible NullPointerException; confirm against the complete source.
1 public ListproofreadAndSuggest(String sInput){
2 //List correctTokens = new ArrayList();
3 List correctedList = new ArrayList();
4 List crtTempList = new ArrayList();5
6 //TODO
7 Calendar startProcess =Instance();8 char[] str2char =CharArray();9 String[] sInputResult = new String[str2char.length];//cwp.wordSegmentate(sInput);
10 for (int t=0;t
16 String[] MaxAndSecondMaxSequnce =getMaxAndSecondMaxSequnce(sInputResult);17
18 //display errors and suggest correct movie name19 //System.out.println("hasError="+hasError);
20 if (hasError !=0){21 if (MaxAndSecondMaxSequnce.length>1){22 String maxSequence =
MaxAndSecondMaxSequnce[0];23 String maxSequence2 = MaxAndSecondMaxSequnce[1];24 for (int j=0;j
26 String movie =(j);27
28
29 //System.out.println("maxseq is "+maxSequence+", maxseq2 is "+maxSequence2);30
31 //select movie
32 if (maxSequence2.equals("")){33 ains(maxSequence)) correctedList.add(movie);34 }35 else{36 if
(ains(maxSequence) &&ains(maxSequence2)){37 //correctedList.clear();
38 crtTempList.add(movie);39 //correctedList.add(movie);40 //break;
41 }42 //else if (ains(maxSequence) || ains(maxSequence2)) correctedList.add(movie);
43 else ains(maxSequence)) correctedList.add(movie);44 }45
46 }47
48 if (crtTempList.size()>0){49 correctedList.clear();50 correctedList.addAll(crtTempList);51 }52
53 //TODO
54 if (hasError ==1) System.out.println("No spellig error,Sorry for having no this movie,do you want to get
:"+String()+" ?");55 //TODO
56 else System.out.println("Spellig error,do you want to get :"+String()+" ?");57 } //TODO
58 else System.out.println("there are spellig errors, no anyone correct token in your spelled words,so I can't guess what you want, please check it again");59
60 } //TODO
61 else System.out.println("No spelling error");62
63 //TODO
64 Calendar endProcess =Instance();65 long elapsetime = (TimeInMillis()-
ICTCLAS2011.ICTCLAS_Exit();68
69 returncorrectedList ;70 }
3.6 显示校对结果
1 public static voidmain(String[] args) {2
3 String argu1 = ""; //movies name file
4 String argu2 = ""; //words after segmenting name of all movies
5
6 SimpleDateFormat sdf=new SimpleDateFormat("HH:mm:ss");
7 String startInitTime = sdf.format(newjava.util.Date());
8 System.out.println(startInitTime+" ---start initializing work---");
9 ChineseWordProofread cwp =
newChineseWordProofread(argu1,argu2);10
11 String endInitTime = sdf.format(newjava.util.Date());12 System.out.println(endInitTime+" ---end initializing work---");13
14 Scanner scanner = newScanner(System.in);15 while(true){16 System.out.print("请输⼊影⽚名:");17
18 String input =();19
20 if (input.equals("EXIT")) break;21
22 cwp.proofreadAndSuggest(input);23
24 }25 scanner.close();26 }
在我的机器上实验结果如下:
最后要说的是我用的语料库没有做太多处理,所以最后出来的结果中有很多都是正确的,比如非诚勿扰会有《非诚勿扰十二月合集》等,这些只要在影片语料库上处理下即可;
还有就是该模型不适合大规模在线数据,比如说搜索引擎中的自动校正或者叫智能提示;即使在影视剧、动漫、综艺等影片的自动检测错误和更正上,本模型也还有很多可以提升的地方。若您不吝惜键盘,请敲上你的想法,让我知道,让我们开源、开放、开心,最后源码在(原文链接缺失)
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论