gtke · June 1, 2017 13:30 · salyouna · Jun 1, 2017
diff --git a/Lucene b/Lucene
 import org.apache.commons.digester3.*;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
 import org.xml.sax.SAXException;
 import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.index.MergePolicy;

 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Scanner;


 /**
  * Indexes a directory of <code>doc</code> XML files with Lucene and then runs
  * one interactive query against the resulting index.
  *
  * <p>The input and index locations come from {@link #dataDir} and
  * {@link #indexDir}; command line arguments are not used. Each XML file is
  * parsed with Commons Digester into a {@link TextDoc}, which is passed to
  * {@link #indexDoc(TextDoc)} and stored as one Lucene document.
  *
  * <p>All state lives in unsynchronized static fields, so this class is meant
  * to be driven from a single thread.
  */
 public class PathTest
 {
 	/** Analyzer used for indexing and for query parsing. */
 	public static StandardAnalyzer analyzer;
 	/** Writer for the index; assigned by {@link #parseAndIndex()} and closed before it returns. */
 	public static IndexWriter writer;
 	/** Searcher created by the most recent {@link #search(String, String)} call. */
 	public static IndexSearcher searcher;
 	/** Reader backing {@link #searcher}; closed before <code>search</code> returns. */
 	public static IndexReader reader;
 	/** Directory in which the Lucene index is stored. */
 	public static String indexDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/test";
 	/** Directory whose direct children are scanned for <code>.xml</code> input files. */
 	public static String dataDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/medplus";
 	/** Query parser created by the most recent {@link #search(String, String)} call. */
 	public static QueryParser parser;
 	/** Query produced by the most recent {@link #search(String, String)} call. */
 	public static Query query;
 	/** Lucene directory opened on {@link #indexDir}. */
 	public static Directory   dir;
 	/** Writer configuration created by {@link #parseAndIndex()}. */
 	public static IndexWriterConfig config;
 	/** Canonical path of the file currently being parsed; stored as the <code>uri</code> field. */
 	public static String currentFile;

 	/**
 	 * Builds the index and then runs a single query read from standard input.
 	 *
 	 * @param args command line arguments (ignored)
 	 * @throws Exception if parsing, indexing or searching fails
 	 */
 	public static void main(String[] args) throws Exception
 	{
 		parseAndIndex();

 		//Searching
 		System.out.println("search:  ");
 		// Deliberately not closed: closing the Scanner would also close System.in.
 		Scanner input = new Scanner(System.in);
 		if (!input.hasNextLine()) {
 			System.out.println("No query given.");
 			return;
 		}
 		String s = input.nextLine();
 		if (s.trim().length() == 0) {
 			// A blank line carries no search terms, so skip the search.
 			System.out.println("No query given.");
 			return;
 		}
 		// The first argument is the index location, not the data directory.
 		search(indexDir, s);
 	}

 	/**
 	 * Opens the index at {@link #indexDir}, parses every XML file found in
 	 * {@link #dataDir} and adds one Lucene document per parsed <code>doc</code>
 	 * element, then prints the elapsed time.
 	 *
 	 * <p>The writer is closed in a <code>finally</code> block so that it is not
 	 * left open when parsing or indexing fails.
 	 *
 	 * @throws SAXException if Digester reports a parse error for an input file
 	 * @throws Exception if the index cannot be opened or written, or the data
 	 *         directory cannot be listed
 	 */
 	public static void parseAndIndex() throws SAXException, Exception{
 		dir = FSDirectory.open(new File(indexDir));

 		analyzer = new StandardAnalyzer(Version.LUCENE_43);
 		config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
 		writer = new IndexWriter(dir, config);
 		long startTime = System.currentTimeMillis();

 		try {
 			parseXML(dataDir, new TextFilesFilter(), createDigester());
 		} finally {
 			writer.close();
 		}
 		long stopTime = System.currentTimeMillis();
 		System.out.println("It took " + (stopTime - startTime)/1000 + " seconds to parse and index the documents");
 	}

 	/**
 	 * Creates the Digester that maps a <code>doc</code> element onto a
 	 * {@link TextDoc} and hands the finished object to {@link #indexDoc(TextDoc)}.
 	 *
 	 * @return a non-validating Digester with all rules registered
 	 */
 	private static Digester createDigester() {
 		Digester digester = new Digester();
 		digester.setValidating(false);
 		// Two objects are created per <doc>: a PathTest that receives the
 		// indexDoc call, and a TextDoc that collects the element content.
 		// NOTE(review): the trailing slash in "doc/" is presumably ignored by
 		// Digester's rule matching, making it equivalent to "doc" - confirm.
 		digester.addObjectCreate("doc", PathTest.class );
 		digester.addObjectCreate("doc/", TextDoc.class );
 		digester.addCallMethod("doc/title",       "setTitle", 0);
 		digester.addCallMethod("doc/sections/section/title",   "addsectionTitle", 0);
 		digester.addCallMethod("doc/sections/section/text",       "addText", 0);
 		//indexing
 		digester.addSetNext("doc",               "indexDoc" );
 		return digester;
 	}

 	/**
 	 * Parses every readable, non-hidden regular file directly inside
 	 * <code>dataDir</code> that the filter accepts. Before each parse the file's
 	 * canonical path is stored in {@link #currentFile}.
 	 *
 	 * @param dataDir directory to scan (not recursive)
 	 * @param filter selects the files to parse; <code>null</code> accepts all
 	 * @param digester configured parser that is run on each selected file
 	 * @throws SAXException if Digester reports a parse error
 	 * @throws Exception if the directory cannot be listed or a file cannot be read
 	 */
 	public static void parseXML(String dataDir, FileFilter filter, Digester digester) throws Exception, SAXException{
 		File [] files = new File(dataDir).listFiles();
 		if (files == null) {
 			// listFiles() yields null when the path is not a directory or an I/O error occurs.
 			throw new IOException("Cannot list data directory: " + dataDir);
 		}
 		for(File f : files){
 			boolean readableFile = !f.isDirectory() && !f.isHidden() && f.exists() && f.canRead();
 			if(readableFile && (filter == null || filter.accept(f))){
 				currentFile = f.getCanonicalPath();
 				// The parse result is not needed; indexing happens through the Digester rules.
 				digester.parse(f);
 			}
 		}
 	}

 	/** Accepts files whose name ends in <code>.xml</code>, ignoring case. */
 	private static class TextFilesFilter implements FileFilter{
 		public boolean accept(File path){
 			return path.getName().toLowerCase().endsWith(".xml");
 		}
 	}

 	/**
 	 * Adds one parsed document to the index through {@link #writer}. The
 	 * <code>uri</code> field is taken from {@link #currentFile}.
 	 *
 	 * @param textDoc parsed document content
 	 * @throws Exception if the document cannot be added to the index
 	 */
 	@SuppressWarnings("deprecation")
 	public static void indexDoc(TextDoc textDoc) throws Exception{
 		// A <doc> without a <title> leaves the title null, and Field rejects
 		// null values; index an empty title instead of aborting the whole run.
 		String title = textDoc.getTitle() == null ? "" : textDoc.getTitle();

 		Document doc = new Document();
 		doc.add(new Field("pageTitle", title, Field.Store.YES, Index.ANALYZED));
 		doc.add(new Field("sectionTitle", textDoc.getSectionArray(), Field.Store.YES, Index.ANALYZED));
 		doc.add(new Field("content", textDoc.getText(), Field.Store.YES, Index.ANALYZED));
 		doc.add(new Field("uri", currentFile, Field.Store.YES, Index.NOT_ANALYZED_NO_NORMS));
 		System.out.println("Indexing " + currentFile);
 		writer.addDocument(doc);
 		System.out.println(" number of docs indexed: " + writer.numDocs());
 	}

 	/**
 	 * Prints the title, section titles and text of a parsed document.
 	 *
 	 * @param textDoc document to print
 	 */
 	public static void printAll(TextDoc textDoc){
 		System.out.println("Page Title: " + textDoc.getTitle());
 		System.out.println("Section Title: " + textDoc.getSectionArray());
 		System.out.println("Content: " + textDoc.getText());
 	}

 	/**
 	 * Runs a query against the <code>content</code> field and prints the page
 	 * title and uri of up to ten hits.
 	 *
 	 * <p>When {@link #dir} has already been opened it is reused and
 	 * <code>indexDir</code> is ignored; otherwise the index is opened from
 	 * <code>indexDir</code>. A missing {@link #analyzer} is created on demand,
 	 * so this method also works without a preceding {@link #parseAndIndex()}.
 	 *
 	 * @param indexDir index location, used only when {@link #dir} is not yet open
 	 * @param s query string in the classic query parser syntax
 	 * @throws IOException if the index cannot be opened or read
 	 * @throws ParseException if the query string cannot be parsed
 	 */
 	public static void search(String indexDir, String s) throws IOException, ParseException{
 		System.out.println("Trying to search: ");
 		if (dir == null) {
 			dir = FSDirectory.open(new File(indexDir));
 		}
 		if (analyzer == null) {
 			analyzer = new StandardAnalyzer(Version.LUCENE_43);
 		}
 		System.out.println(dir);
 		reader = DirectoryReader.open(dir);
 		try {
 			searcher = new IndexSearcher(reader);
 			parser = new QueryParser(Version.LUCENE_43,"content",analyzer);
 			query = parser.parse(s);
 			System.out.println("query: " +query);
 			TopDocs hits = searcher.search(query, 10);
 			System.out.println("Found: " + hits.totalHits);

 			for(ScoreDoc scoreDoc : hits.scoreDocs){
 				Document doc = searcher.doc(scoreDoc.doc);
 				System.out.println( doc.get("pageTitle"));
 				System.out.println(doc.get("uri"));
 			}
 		} finally {
 			// Release the reader even when parsing or searching fails.
 			reader.close();
 		}
 	}

 	/**
 	 * Content collected from one <code>doc</code> element: its title plus the
 	 * titles and texts of its sections, in the order they were added.
 	 */
 	public static class TextDoc{
 		/** Document title; <code>null</code> until {@link #setTitle(String)} is called. */
 		public String title;
 		/** Section texts in the order they were added. */
 		public List<String> textArr = new ArrayList<String>();
 		/** Section titles in the order they were added. */
 		public List<String> sectionTitleArray = new ArrayList<String>();

 		/** Sets the document title. */
 		public void setTitle(String title)
 		{
 			this.title = title;
 		}
 		/** Returns the document title, or <code>null</code> if none was set. */
 		public String getTitle(){
 			return this.title;
 		}
 		/** Appends the text of one section. */
 		public void addText(String text){
 			textArr.add(text);
 		}
 		/** Returns all section texts in <code>List.toString()</code> form, e.g. <code>[a, b]</code>. */
 		public String getText(){
 			return this.textArr.toString();
 		}
 		/** Appends the title of one section. */
 		public void addsectionTitle(String title){
 			this.sectionTitleArray.add(title);
 		}
 		/** Returns all section titles in <code>List.toString()</code> form, e.g. <code>[a, b]</code>. */
 		public String getSectionArray(){
 			return this.sectionTitleArray.toString();
 		}
 	}
 }
	import org.apache.commons.digester3.*;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.Field.Index;
	import org.apache.lucene.document.FieldType;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.queryparser.classic.ParseException;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.util.Version;
	import org.xml.sax.SAXException;
	import org.apache.lucene.index.LogMergePolicy;
	import org.apache.lucene.index.MergePolicy;

	import java.io.File;
	import java.io.FileFilter;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.Scanner;


	/**
	* Parses the contents of doc XML file. The name of the file to
	* parse must be specified as the first command line argument.
	*/
	public class PathTest
	{
	public static StandardAnalyzer analyzer;
	public static IndexWriter writer;
	public static IndexSearcher searcher;
	public static IndexReader reader;
	public static String indexDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/test";
	public static String dataDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/medplus";
	public static QueryParser parser;
	public static Query query;
	public static Directory dir;
	public static IndexWriterConfig config;
	public static String currentFile;
	/**
	* Prints the contact information to standard output.
	*
	* @param contact the <code>Contact</code> to print out
	*/

	/**
	* Configures Digester rules and actions, parses the XML file specified
	* as the first argument.
	*
	* @param args command line arguments
	* @throws Exception
	*/
	public static void main(String[] args) throws Exception
	{
	parseAndIndex();

	//Searching
	System.out.println("search: ");
	Scanner input = new Scanner(System.in);
	String s = input.nextLine();
	search(dataDir, s);
	}

	public static void parseAndIndex() throws SAXException, Exception{
	Digester digester = new Digester();
	dir = FSDirectory.open(new File(indexDir));

	analyzer = new StandardAnalyzer(Version.LUCENE_43);
	config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
	writer = new IndexWriter(dir, config);
	long startTime = System.currentTimeMillis();

	digester.setValidating(false);
	digester.addObjectCreate("doc", PathTest.class );
	digester.addObjectCreate("doc/", TextDoc.class );
	digester.addCallMethod("doc/title", "setTitle", 0);
	digester.addCallMethod("doc/sections/section/title", "addsectionTitle", 0);
	digester.addCallMethod("doc/sections/section/text", "addText", 0);
	//indexing
	digester.addSetNext("doc", "indexDoc" );
	// TextParser tp = (TextParser) digester.parse(new File(args[0]));
	parseXML(dataDir, new TextFilesFilter(), digester);
	writer.close();
	long stopTime = System.currentTimeMillis();
	System.out.println("It took " + (stopTime - startTime)/1000 + " seconds to parse and index the documents");

	}

	public static void parseXML(String dataDir, FileFilter filter, Digester digester) throws Exception, SAXException{
	PathTest tp;
	File [] files = new File(dataDir).listFiles();
	for(File f : files){
	if(!f.isDirectory()&&
	!f.isHidden()&&
	f.exists()&&
	f.canRead()&&
	(filter == null \|\| filter.accept(f))){
	currentFile = f.getCanonicalPath();
	tp = (PathTest) digester.parse(f);
	}
	}
	}

	private static class TextFilesFilter implements FileFilter{
	public boolean accept(File path){
	return path.getName().toLowerCase().endsWith(".xml");
	}
	}

	@SuppressWarnings("deprecation")
	public static void indexDoc(TextDoc textDoc) throws Exception{
	Document doc = new Document();
	doc.add(new Field("pageTitle",textDoc.getTitle(), Field.Store.YES, Index.ANALYZED));
	doc.add(new Field("sectionTitle", textDoc.getSectionArray(), Field.Store.YES, Index.ANALYZED));
	doc.add(new Field("content", textDoc.getText(), Field.Store.YES, Index.ANALYZED));
	doc.add(new Field("uri", currentFile, Field.Store.YES, Index.NOT_ANALYZED_NO_NORMS));
	System.out.println("Indexing " + currentFile);
	writer.addDocument(doc);
	System.out.println(" number of docs indexed: " + writer.numDocs());

	}

	public static void printAll(TextDoc textDoc){
	System.out.println("Page Title: " + textDoc.getTitle());
	System.out.println("Section Title: " + textDoc.getSectionArray());
	System.out.println("Content: " + textDoc.getText());
	}

	public static void search(String indexDir, String s) throws IOException, ParseException{
	System.out.println("Trying to search: ");
	System.out.println(dir);
	reader = DirectoryReader.open(dir);
	searcher = new IndexSearcher(reader);
	parser = new QueryParser(Version.LUCENE_43,"content",analyzer);
	query = parser.parse(s);
	System.out.println("query: " +query);
	TopDocs hits = searcher.search(query, 10);
	System.out.println("Found: " + hits.totalHits);

	for(ScoreDoc scoreDoc : hits.scoreDocs){
	Document doc = searcher.doc(scoreDoc.doc);
	System.out.println( doc.get("pageTitle"));
	System.out.println(doc.get("uri"));
	}
	reader.close();
	}

	public static class TextDoc{
	public String title;
	public List<String> textArr = new ArrayList<String>();
	public List<String> sectionTitleArray = new ArrayList<String>();
	public void setTitle(String title)
	{
	this.title = title;
	}
	public String getTitle(){
	return this.title;
	}
	public void addText(String text){
	textArr.add(text);
	}
	public String getText(){
	return this.textArr.toString();
	}
	public void addsectionTitle(String title){
	this.sectionTitleArray.add(title);
	}
	public String getSectionArray(){
	return this.sectionTitleArray.toString();
	}
	}
	}