Parse XML + Index + Search
import org.apache.commons.digester3.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
/**
 * Parses the XML files in {@code dataDir} with Commons Digester, indexes them
 * with Lucene, and then runs an interactive search against the resulting index.
 */
public class PathTest
{
    public static StandardAnalyzer analyzer;
    public static IndexWriter writer;
    public static IndexSearcher searcher;
    public static IndexReader reader;
    // Directory where the Lucene index is written.
    public static String indexDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/test";
    // Directory containing the XML files to parse and index.
    public static String dataDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/medplus";
    public static QueryParser parser;
    public static Query query;
    public static Directory dir;
    public static IndexWriterConfig config;
    // Canonical path of the file currently being parsed; stored on each indexed document.
    public static String currentFile;
    /**
     * Configures the Digester rules, parses and indexes the XML files, then
     * prompts for a query string and searches the index.
     *
     * @param args command line arguments (unused)
     * @throws Exception
     */
    public static void main(String[] args) throws Exception
    {
        parseAndIndex();
        // Searching: read a query from standard input and run it against the index.
        System.out.println("search: ");
        Scanner input = new Scanner(System.in);
        String s = input.nextLine();
        search(indexDir, s);
    }

    public static void parseAndIndex() throws SAXException, Exception{
        Digester digester = new Digester();
        dir = FSDirectory.open(new File(indexDir));
        analyzer = new StandardAnalyzer(Version.LUCENE_43);
        config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
        writer = new IndexWriter(dir, config);
        long startTime = System.currentTimeMillis();
        digester.setValidating(false);
        // For each <doc> element, push a PathTest and then a TextDoc onto the Digester stack.
        digester.addObjectCreate("doc", PathTest.class);
        digester.addObjectCreate("doc", TextDoc.class);
        // Populate the TextDoc (top of stack) from the character data of the matching elements.
        digester.addCallMethod("doc/title", "setTitle", 0);
        digester.addCallMethod("doc/sections/section/title", "addsectionTitle", 0);
        digester.addCallMethod("doc/sections/section/text", "addText", 0);
        // When </doc> is reached, pass the TextDoc to PathTest.indexDoc for indexing.
        digester.addSetNext("doc", "indexDoc");
        parseXML(dataDir, new TextFilesFilter(), digester);
        writer.close();
        long stopTime = System.currentTimeMillis();
        System.out.println("It took " + (stopTime - startTime)/1000 + " seconds to parse and index the documents");
    }
    public static void parseXML(String dataDir, FileFilter filter, Digester digester) throws Exception, SAXException{
        // Parse every readable, non-hidden file in dataDir that the filter accepts;
        // each parse triggers indexDoc via the Digester rules configured above.
        File[] files = new File(dataDir).listFiles();
        for(File f : files){
            if(!f.isDirectory() &&
               !f.isHidden() &&
               f.exists() &&
               f.canRead() &&
               (filter == null || filter.accept(f))){
                currentFile = f.getCanonicalPath();
                digester.parse(f);
            }
        }
    }

    // Accepts only files whose name ends with ".xml".
    private static class TextFilesFilter implements FileFilter{
        @Override
        public boolean accept(File path){
            return path.getName().toLowerCase().endsWith(".xml");
        }
    }
    @SuppressWarnings("deprecation")
    public static void indexDoc(TextDoc textDoc) throws Exception{
        // Build a Lucene document from the parsed TextDoc and add it to the index.
        Document doc = new Document();
        doc.add(new Field("pageTitle", textDoc.getTitle(), Field.Store.YES, Index.ANALYZED));
        doc.add(new Field("sectionTitle", textDoc.getSectionArray(), Field.Store.YES, Index.ANALYZED));
        doc.add(new Field("content", textDoc.getText(), Field.Store.YES, Index.ANALYZED));
        doc.add(new Field("uri", currentFile, Field.Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        System.out.println("Indexing " + currentFile);
        writer.addDocument(doc);
        System.out.println(" number of docs indexed: " + writer.numDocs());
    }

    public static void printAll(TextDoc textDoc){
        System.out.println("Page Title: " + textDoc.getTitle());
        System.out.println("Section Title: " + textDoc.getSectionArray());
        System.out.println("Content: " + textDoc.getText());
    }
    public static void search(String indexDir, String s) throws IOException, ParseException{
        System.out.println("Trying to search: ");
        System.out.println(dir);
        // Open a reader over the same directory the index was written to.
        reader = DirectoryReader.open(dir);
        searcher = new IndexSearcher(reader);
        parser = new QueryParser(Version.LUCENE_43, "content", analyzer);
        query = parser.parse(s);
        System.out.println("query: " + query);
        // Fetch the top 10 hits and print each document's stored title and file path.
        TopDocs hits = searcher.search(query, 10);
        System.out.println("Found: " + hits.totalHits);
        for(ScoreDoc scoreDoc : hits.scoreDocs){
            Document doc = searcher.doc(scoreDoc.doc);
            System.out.println(doc.get("pageTitle"));
            System.out.println(doc.get("uri"));
        }
        reader.close();
    }
    public static class TextDoc{
        public String title;
        public List<String> textArr = new ArrayList<String>();
        public List<String> sectionTitleArray = new ArrayList<String>();

        public void setTitle(String title)
        {
            this.title = title;
        }

        public String getTitle(){
            return this.title;
        }

        public void addText(String text){
            textArr.add(text);
        }

        public String getText(){
            return this.textArr.toString();
        }

        public void addsectionTitle(String title){
            this.sectionTitleArray.add(title);
        }

        public String getSectionArray(){
            return this.sectionTitleArray.toString();
        }
    }
}
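For reference, the Digester rules above imply roughly the following layout for each input XML file. This is only a sketch: the element names come from the addObjectCreate/addCallMethod patterns in parseAndIndex, while the text values are made up for illustration.

<doc>
  <title>Example page title</title>
  <sections>
    <section>
      <title>First section title</title>
      <text>Text of the first section.</text>
    </section>
    <section>
      <title>Second section title</title>
      <text>Text of the second section.</text>
    </section>
  </sections>
</doc>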
Hello, I have an XML file and I would like to index and parse it with Lucene. How do I proceed? This is an example of my XML file: