NotBadPad · March 11, 2014 13:17
diff --git a/MMWordMatch b/MMWordMatch
 package com.gj.split;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

 /**
  * Simple forward maximum-matching (longest-prefix-first) word segmenter for
  * Chinese text. It only segments sentences made of contiguous Chinese
  * characters.
  *
  * @author guojing
  *
  * 2013-8-27
  */
 public class MMWordMatch {
 	/** Longest candidate, in chars, that is looked up in the dictionary. */
 	private static final int MAX_WORD_LENGTH = 4;
 
 	/**
 	 * Dictionary words. Created eagerly so that {@link #splitWords(String)}
 	 * can be called even when no dictionary has been loaded yet.
 	 */
 	private static final Set<String> words = new HashSet<String>();
 
 	/**
 	 * Loads a dictionary file (one word per line) into the shared word set.
 	 * Words added by earlier calls are kept. A null or empty path is ignored.
 	 * The file is decoded with the platform default charset. Failures are
 	 * reported on standard error and are not propagated to the caller.
 	 *
 	 * @param path path of the dictionary file
 	 */
 	public static void loadWords(String path) {
 		if (null == path || path.equals("")) {
 			return;
 		}
 
 		BufferedReader reader = null;
 		try {
 			reader = new BufferedReader(
 					new InputStreamReader(new FileInputStream(new File(path))));
 			String word;
 			while ((word = reader.readLine()) != null) {
 				words.add(word);
 			}
 		} catch (FileNotFoundException e) {
 			System.err.println("词典文件未找到！");
 		} catch (IOException e) {
 			System.err.println("IO读取错误！");
 		} finally {
 			// Always release the file handle, also when reading failed half way.
 			if (reader != null) {
 				try {
 					reader.close();
 				} catch (IOException ignored) {
 					// Nothing useful can be done if close() itself fails.
 				}
 			}
 		}
 	}
 
 	/**
 	 * Splits a sentence into words using forward maximum matching: at each
 	 * position the longest prefix (at most {@value #MAX_WORD_LENGTH} chars)
 	 * found in the dictionary is emitted; when no prefix matches, the single
 	 * char at that position is emitted on its own.
 	 *
 	 * @param document sentence to split; null is treated like an empty string
 	 * @return the tokens in order of appearance, never null
 	 */
 	public static List<String> splitWords(String document) {
 		List<String> result = new ArrayList<String>();
 		if (document == null) {
 			return result;
 		}
 
 		final int total = document.length();
 		int start = 0;
 		while (start < total) {
 			int end = Math.min(start + MAX_WORD_LENGTH, total);
 			// Shrink the candidate from the right until it is a dictionary word
 			// or only one char is left. A single char is always accepted, so it
 			// is added exactly once whether or not the dictionary contains it.
 			while (end - start > 1 && !words.contains(document.substring(start, end))) {
 				end--;
 			}
 			result.add(document.substring(start, end));
 			start = end;
 		}
 
 		return result;
 	}
 
 	/**
 	 * Demo entry point: loads a dictionary and prints the segmentation of a
 	 * sample sentence, each token followed by '|'.
 	 *
 	 * @param args optional; args[0] overrides the default dictionary path
 	 */
 	public static void main(String[] args) {
 		String dictPath = (args != null && args.length > 0) ? args[0] : "E:\\test\\dict.txt";
 		loadWords(dictPath);
 		List<String> result = splitWords("我的朋友们很高兴");
 		for (String str : result) {
 			System.out.print(str + "|");
 		}
 	}
 
 	/*
 	 * Dictionary file layout (one word per line), for example:
 	 *   我们
 	 *   我的
 	 *   朋友
 	 *   朋友们
 	 *   人们
 	 *   高兴
 	 */
 }
	package com.gj.split;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;

	/**
	 * Simple forward maximum-matching (longest-prefix-first) word segmenter for
	 * Chinese text. It only segments sentences made of contiguous Chinese
	 * characters.
	 *
	 * @author guojing
	 *
	 * 2013-8-27
	 */
	public class MMWordMatch {
		/** Longest candidate, in chars, that is looked up in the dictionary. */
		private static final int MAX_WORD_LENGTH = 4;

		/**
		 * Dictionary words. Created eagerly so that {@link #splitWords(String)}
		 * can be called even when no dictionary has been loaded yet.
		 */
		private static final Set<String> words = new HashSet<String>();

		/**
		 * Loads a dictionary file (one word per line) into the shared word set.
		 * Words added by earlier calls are kept. A null or empty path is ignored.
		 * The file is decoded with the platform default charset. Failures are
		 * reported on standard error and are not propagated to the caller.
		 *
		 * @param path path of the dictionary file
		 */
		public static void loadWords(String path) {
			if (null == path || path.equals("")) {
				return;
			}

			BufferedReader reader = null;
			try {
				reader = new BufferedReader(
						new InputStreamReader(new FileInputStream(new File(path))));
				String word;
				while ((word = reader.readLine()) != null) {
					words.add(word);
				}
			} catch (FileNotFoundException e) {
				System.err.println("词典文件未找到！");
			} catch (IOException e) {
				System.err.println("IO读取错误！");
			} finally {
				// Always release the file handle, also when reading failed half way.
				if (reader != null) {
					try {
						reader.close();
					} catch (IOException ignored) {
						// Nothing useful can be done if close() itself fails.
					}
				}
			}
		}

		/**
		 * Splits a sentence into words using forward maximum matching: at each
		 * position the longest prefix (at most {@value #MAX_WORD_LENGTH} chars)
		 * found in the dictionary is emitted; when no prefix matches, the single
		 * char at that position is emitted on its own.
		 *
		 * @param document sentence to split; null is treated like an empty string
		 * @return the tokens in order of appearance, never null
		 */
		public static List<String> splitWords(String document) {
			List<String> result = new ArrayList<String>();
			if (document == null) {
				return result;
			}

			final int total = document.length();
			int start = 0;
			while (start < total) {
				int end = Math.min(start + MAX_WORD_LENGTH, total);
				// Shrink the candidate from the right until it is a dictionary word
				// or only one char is left. A single char is always accepted, so it
				// is added exactly once whether or not the dictionary contains it.
				while (end - start > 1 && !words.contains(document.substring(start, end))) {
					end--;
				}
				result.add(document.substring(start, end));
				start = end;
			}

			return result;
		}

		/**
		 * Demo entry point: loads a dictionary and prints the segmentation of a
		 * sample sentence, each token followed by '|'.
		 *
		 * @param args optional; args[0] overrides the default dictionary path
		 */
		public static void main(String[] args) {
			String dictPath = (args != null && args.length > 0) ? args[0] : "E:\\test\\dict.txt";
			loadWords(dictPath);
			List<String> result = splitWords("我的朋友们很高兴");
			for (String str : result) {
				System.out.print(str + "|");
			}
		}

		/*
		 * Dictionary file layout (one word per line), for example:
		 *   我们
		 *   我的
		 *   朋友
		 *   朋友们
		 *   人们
		 *   高兴
		 */
	}