Created
March 11, 2014 13:17
-
-
Save NotBadPad/9485407 to your computer and use it in GitHub Desktop.
简单的中文最大前缀匹配,目前仅能切分连续中文句子。可以将标点、数字、英文、特殊字符考虑进来进行处理,就能处理一般的文本了。 经测试800w+字数,22M的文件分词需要900ms,可能因为处理方式和文件内容比较简单吧。 有空实现个中文trie树试试,看看咋样。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.gj.split; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.ArrayList; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Set; | |
/** | |
* 简单的中文最大前缀匹配,仅能切分连续中文句子 | |
* @author guojing | |
* | |
* 2013-8-27 | |
*/ | |
/**
 * Dictionary-based forward maximum matching word segmenter.
 *
 * <p>At each position the segmenter takes the longest candidate allowed by the
 * window size, shrinks it one character at a time until the candidate is found
 * in the dictionary, and falls back to emitting a single character when nothing
 * matches. Lengths are counted in Java {@code char}s.
 *
 * <p>The dictionary is held in a static set shared by all callers; this class
 * performs no synchronization of its own.
 *
 * <p>Dictionary file layout (one word per line), for example:
 * <pre>
 * 我们
 * 我的
 * 朋友
 * 朋友们
 * 人们
 * 高兴
 * </pre>
 */
public class MMWordMatch {

    /** Window size used by {@link #splitWords(String)}. */
    private static final int DEFAULT_MAX_WORD_LENGTH = 4;

    /** Charset used by {@link #loadWords(String)}. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Loaded dictionary. Created eagerly so that {@link #splitWords(String)} is
     * safe to call even when no dictionary has been loaded yet.
     */
    private static final Set<String> words = new HashSet<>();

    /**
     * Loads a UTF-8 encoded dictionary file, adding its words to the dictionary.
     *
     * @param path dictionary file path; {@code null} or empty is a no-op
     */
    public static void loadWords(String path) {
        loadWords(path, DEFAULT_CHARSET);
    }

    /**
     * Loads a dictionary file in the given charset, adding its words to the
     * dictionary. Lines are trimmed, blank lines are skipped, and a byte order
     * mark (U+FEFF) at the very start of the file is dropped.
     *
     * <p>Failures are reported on {@code System.err} and are not propagated;
     * words read before the failure stay loaded.
     *
     * @param path        dictionary file path; {@code null} or empty is a no-op
     * @param charsetName name of the file's charset, e.g. {@code "UTF-8"} or
     *                    {@code "GBK"}; must not be {@code null}
     */
    public static void loadWords(String path, String charsetName) {
        if (null == path || path.equals("")) {
            return;
        }
        // Both streams are declared as resources so the file handle is released
        // even when the charset name is rejected or reading fails midway.
        try (FileInputStream in = new FileInputStream(new File(path));
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Editors on Windows commonly prepend a BOM, which would
                    // otherwise become part of the first word.
                    if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                String word = line.trim();
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("词典文件未找到!");
        } catch (IOException e) {
            System.err.println("IO读取错误!");
        }
    }

    /**
     * Segments a sentence using a window of at most four characters.
     *
     * @param document sentence to segment; {@code null} is treated as empty
     * @return the tokens in order of appearance; never {@code null}
     */
    public static List<String> splitWords(String document) {
        return splitWords(document, DEFAULT_MAX_WORD_LENGTH);
    }

    /**
     * Segments a sentence using forward maximum matching.
     *
     * @param document      sentence to segment; {@code null} is treated as empty
     * @param maxWordLength longest candidate tried at each position; at least 1
     * @return the tokens in order of appearance; never {@code null}
     * @throws IllegalArgumentException if {@code maxWordLength} is less than 1
     */
    public static List<String> splitWords(String document, int maxWordLength) {
        if (maxWordLength < 1) {
            throw new IllegalArgumentException("maxWordLength must be >= 1: " + maxWordLength);
        }
        List<String> result = new ArrayList<>();
        if (document == null) {
            return result;
        }
        int length = document.length();
        // Walk the text with a cursor instead of re-slicing the remainder on
        // every token, which would copy the tail of the document each time.
        int start = 0;
        while (start < length) {
            int end = Math.min(start + maxWordLength, length);
            // Shrink the candidate until it is a dictionary word or one char is left.
            while (end - start > 1 && !words.contains(document.substring(start, end))) {
                end--;
            }
            // Exactly one token per position: either a dictionary word or the
            // single-character fallback, never both.
            result.add(document.substring(start, end));
            start = end;
        }
        return result;
    }

    public static void main(String[] args) {
        loadWords("E:\\test\\dict.txt");
        List<String> result = splitWords("我的朋友们很高兴");
        for (String str : result) {
            System.out.print(str + "|");
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment