Skip to content

Instantly share code, notes, and snippets.

@gtke
Last active June 1, 2017 13:30
Show Gist options
  • Save gtke/5833889 to your computer and use it in GitHub Desktop.
Save gtke/5833889 to your computer and use it in GitHub Desktop.
Parse XML + Index + Search
import org.apache.commons.digester3.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.SAXException;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
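/**
* Parses every XML file found in the {@code dataDir} directory with Commons
* Digester, indexes the extracted fields with Lucene, and then runs a single
* interactive search against that index. Command line arguments are not used;
* the data and index locations are the hard-coded static fields below.
*/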
public class PathTest
{
// Shared static state. The static methods of this class hand the Lucene objects
// to one another through these fields instead of through parameters.

// Analyzer created in parseAndIndex(); also given to the QueryParser in search().
public static StandardAnalyzer analyzer;
// Index writer created and closed by parseAndIndex(); indexDoc() adds documents to it.
public static IndexWriter writer;
// Searcher built over `reader` inside search().
public static IndexSearcher searcher;
// Index reader opened and closed inside search().
public static IndexReader reader;
// File-system location of the Lucene index (hard-coded, machine specific).
public static String indexDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/test";
// Directory holding the XML files to parse (hard-coded, machine specific).
public static String dataDir = "C:/Users/tkeshelashvilg/workspace/LuceneDemo/medplus";
// Query parser created inside search().
public static QueryParser parser;
// Most recently parsed query; assigned inside search().
public static Query query;
// Lucene directory opened on indexDir by parseAndIndex() and read by search().
public static Directory dir;
// Writer configuration created in parseAndIndex().
public static IndexWriterConfig config;
// Canonical path of the file being parsed; set by parseXML() just before each
// parse and stored by indexDoc() as the "uri" field.
public static String currentFile;
/**
* Entry point: parses and indexes the XML files, then reads one query line
* from standard input and runs it against the index.
*
* @param args command line arguments (not used)
* @throws Exception if parsing, indexing or searching fails
*/
public static void main(String[] args) throws Exception
{
parseAndIndex();
//Searching
System.out.println("search: ");
Scanner input = new Scanner(System.in);
// Standard input may be closed or empty (e.g. a non-interactive run), in which
// case nextLine() would throw NoSuchElementException; an empty query would
// make the query parser fail. Treat both as "nothing to do".
String s = input.hasNextLine() ? input.nextLine().trim() : "";
if (s.isEmpty()) {
System.out.println("Nothing to search for.");
return;
}
// search() expects the index location, not the directory of source XML files.
search(indexDir, s);
}
/**
* Builds the Lucene index: opens the index directory and writer, configures
* the Digester rules, parses every XML file in {@code dataDir} (each parsed
* document is indexed through {@code indexDoc}), and reports the elapsed time.
*
* The writer is closed (committing the documents) when everything succeeds and
* rolled back when parsing or indexing fails, so it is never left open.
*
* @throws SAXException if an XML file cannot be parsed
* @throws Exception if the index cannot be opened or written
*/
public static void parseAndIndex() throws SAXException, Exception{
Digester digester = new Digester();
dir = FSDirectory.open(new File(indexDir));
analyzer = new StandardAnalyzer(Version.LUCENE_43);
config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
writer = new IndexWriter(dir, config);
long startTime = System.currentTimeMillis();
boolean succeeded = false;
try {
digester.setValidating(false);
// A PathTest instance is created first so that it sits below the TextDoc on
// the Digester stack and can receive the indexDoc call registered further down.
digester.addObjectCreate("doc", PathTest.class );
// NOTE(review): the trailing slash in "doc/" looks unintentional; Digester's
// default rule matching appears to treat it the same as "doc" - confirm.
digester.addObjectCreate("doc/", TextDoc.class );
// Parameter count 0: the element's body text is passed as the single argument.
digester.addCallMethod("doc/title", "setTitle", 0);
digester.addCallMethod("doc/sections/section/title", "addsectionTitle", 0);
digester.addCallMethod("doc/sections/section/text", "addText", 0);
//indexing
digester.addSetNext("doc", "indexDoc" );
parseXML(dataDir, new TextFilesFilter(), digester);
succeeded = true;
} finally {
// Never leave the writer open: close() commits on success, rollback()
// discards the partial work on failure. Both close the writer.
if (succeeded) {
writer.close();
} else {
writer.rollback();
}
}
long stopTime = System.currentTimeMillis();
System.out.println("It took " + (stopTime - startTime)/1000 + " seconds to parse and index the documents");
}
/**
* Runs the digester over every regular, non-hidden, readable file directly
* inside {@code dataDir} that the filter accepts. Sub-directories are skipped.
* The canonical path of each file is stored in {@code currentFile} before it is
* parsed.
*
* @param dataDir directory containing the files to parse
* @param filter decides which files are parsed; {@code null} accepts every file
* @param digester configured digester used to parse each file
* @throws IOException if {@code dataDir} does not exist, is not a directory, or cannot be listed
* @throws SAXException if a file is not well-formed XML
* @throws Exception if a Digester rule fails while processing a file
*/
public static void parseXML(String dataDir, FileFilter filter, Digester digester) throws Exception, SAXException{
File [] files = new File(dataDir).listFiles();
// listFiles() returns null (not an empty array) when the path is missing,
// is not a directory, or cannot be read.
if(files == null){
throw new IOException("Cannot list data directory: " + dataDir);
}
for(File f : files){
if(!f.isDirectory()&&
!f.isHidden()&&
f.exists()&&
f.canRead()&&
(filter == null || filter.accept(f))){
currentFile = f.getCanonicalPath();
// The parse result is not needed: indexing happens through the Digester rules.
digester.parse(f);
}
}
}
/**
* File filter that accepts files whose name ends in ".xml", ignoring case.
*/
private static class TextFilesFilter implements FileFilter{
/**
* @param path candidate file
* @return true when the lower-cased file name ends with ".xml"
*/
public boolean accept(File path){
String loweredName = path.getName().toLowerCase();
boolean isXml = loweredName.endsWith(".xml");
return isXml;
}
}
/**
* Adds one parsed document to the index with four stored fields: "pageTitle",
* "sectionTitle" and "content" (analyzed) plus "uri" (the path of the file
* being parsed, indexed as a single un-analyzed term).
*
* A missing title or missing current file path is indexed as the empty string,
* so a single incomplete document does not abort the whole indexing run.
*
* @param textDoc the parsed document to index
* @throws Exception if the document cannot be written to the index
*/
@SuppressWarnings("deprecation")
public static void indexDoc(TextDoc textDoc) throws Exception{
// The title stays null when the XML has no <title>, and currentFile is null
// when this method is called outside parseXML(); Field rejects null values.
String pageTitle = textDoc.getTitle() == null ? "" : textDoc.getTitle();
String uri = currentFile == null ? "" : currentFile;
Document doc = new Document();
doc.add(new Field("pageTitle", pageTitle, Field.Store.YES, Index.ANALYZED));
doc.add(new Field("sectionTitle", textDoc.getSectionArray(), Field.Store.YES, Index.ANALYZED));
doc.add(new Field("content", textDoc.getText(), Field.Store.YES, Index.ANALYZED));
doc.add(new Field("uri", uri, Field.Store.YES, Index.NOT_ANALYZED_NO_NORMS));
System.out.println("Indexing " + currentFile);
writer.addDocument(doc);
System.out.println(" number of docs indexed: " + writer.numDocs());
}
/**
* Debug helper: writes the title, the section titles and the text of a parsed
* document to standard output, one labelled line each.
*
* @param textDoc the parsed document to print
*/
public static void printAll(TextDoc textDoc){
String pageTitle = textDoc.getTitle();
String sectionTitles = textDoc.getSectionArray();
String body = textDoc.getText();
System.out.println("Page Title: " + pageTitle);
System.out.println("Section Title: " + sectionTitles);
System.out.println("Content: " + body);
}
/**
* Runs a query against the "content" field and prints the "pageTitle" and
* "uri" of up to ten hits.
*
* The directory and analyzer left in the static fields by
* {@code parseAndIndex()} are reused when present; otherwise the index at
* {@code indexDir} is opened and a default analyzer is created, so the method
* also works against an index built in an earlier run.
*
* @param indexDir location of the index, used only when no directory is open yet
* @param s query string in Lucene classic query syntax
* @throws IOException if the index cannot be opened or read
* @throws ParseException if {@code s} is not a valid query
*/
public static void search(String indexDir, String s) throws IOException, ParseException{
System.out.println("Trying to search: ");
if(dir == null){
dir = FSDirectory.open(new File(indexDir));
}
if(analyzer == null){
analyzer = new StandardAnalyzer(Version.LUCENE_43);
}
System.out.println(dir);
reader = DirectoryReader.open(dir);
try{
searcher = new IndexSearcher(reader);
parser = new QueryParser(Version.LUCENE_43,"content",analyzer);
query = parser.parse(s);
System.out.println("query: " +query);
TopDocs hits = searcher.search(query, 10);
System.out.println("Found: " + hits.totalHits);
for(ScoreDoc scoreDoc : hits.scoreDocs){
Document doc = searcher.doc(scoreDoc.doc);
System.out.println( doc.get("pageTitle"));
System.out.println(doc.get("uri"));
}
}finally{
// Close the reader even when parsing or searching throws.
reader.close();
}
}
/**
* Holder for the data extracted from one XML document: a page title, the
* section titles and the section texts. The methods are invoked by name from
* the Digester rules, so their names and String parameters must stay as they are.
*/
public static class TextDoc{
public List<String> sectionTitleArray = new ArrayList<String>();
public List<String> textArr = new ArrayList<String>();
public String title;

/** Stores the page title. */
public void setTitle(String title){
this.title = title;
}

/** @return the page title, or null when none was set */
public String getTitle(){
return title;
}

/** Appends the text of one section. */
public void addText(String text){
this.textArr.add(text);
}

/** @return all section texts in List.toString() form, e.g. "[a, b]" */
public String getText(){
List<String> texts = textArr;
return texts.toString();
}

/** Appends the title of one section. */
public void addsectionTitle(String title){
sectionTitleArray.add(title);
}

/** @return all section titles in List.toString() form, e.g. "[a, b]" */
public String getSectionArray(){
List<String> titles = sectionTitleArray;
return titles.toString();
}
}
}
@salyouna
Copy link

salyouna commented Jun 1, 2017

Hello, I have an XML file and I would like to parse and index it with Lucene. How should I proceed? This is an example of my XML file:

                               <num>number </num>
                              <title> title </title>
                              <querytime> time</querytime>
                              <querytweettime>number</querytweettime>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment