With Lucene, what would be the recommended approach for locating matches in search results?

More specifically, suppose the index documents have a field "fullText", which stores the plain-text content of some document. Furthermore, assume that for one of these documents the content is "The quick brown fox jumps over the lazy dog". Next, a search is performed for "fox dog". Obviously, the document would be a hit.

In this scenario, can Lucene be used to provide something like the matching regions for a found document? For this scenario I would like to produce something like:

[{match: "fox", startIndex: 16, length: 3},
 {match: "dog", startIndex: 40, length: 3}]

I suspect this could be implemented using what's provided in the org.apache.lucene.search.highlight package, but I'm not sure about the overall approach.
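To make the target output concrete, here is a plain-Java sketch (not Lucene; the `MatchRegions` class and `findMatches` method are my own invention for illustration). It naively lowercases the text and scans it for each whitespace-separated query term, producing the kind of match records described above. A real solution would get token offsets from the analyzer instead of raw string scanning:

```java
import java.util.ArrayList;
import java.util.List;

public class MatchRegions {

    // Value object mirroring the desired output shape: {match, startIndex, length}.
    public static class Match {
        public final String match;
        public final int startIndex;
        public final int length;

        Match(String match, int startIndex, int length) {
            this.match = match;
            this.startIndex = startIndex;
            this.length = length;
        }

        @Override
        public String toString() {
            return "{match: \"" + match + "\", startIndex: " + startIndex
                    + ", length: " + length + "}";
        }
    }

    // Naive, analyzer-free scan: for each query term, record every
    // occurrence of that term in the text.
    public static List<Match> findMatches(String text, String query) {
        List<Match> result = new ArrayList<Match>();
        String lower = text.toLowerCase();
        for (String term : query.toLowerCase().split("\\s+")) {
            int from = 0;
            int idx;
            while ((idx = lower.indexOf(term, from)) >= 0) {
                result.add(new Match(text.substring(idx, idx + term.length()),
                        idx, term.length()));
                from = idx + term.length();
            }
        }
        return result;
    }

    public static void main(String[] args) {
        String text = "The quick brown fox jumps over the lazy dog";
        for (Match m : findMatches(text, "fox dog")) {
            System.out.println(m);
        }
        // prints:
        // {match: "fox", startIndex: 16, length: 3}
        // {match: "dog", startIndex: 40, length: 3}
    }
}
```

Note that this ignores tokenization, stemming, and stop words, which is exactly what the analyzer-aware approaches in the answers handle.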

+2  A: 

Hit highlighting is a pretty common thing; check out this tutorial: http://www.cocooncenter.org/articles/lucene.html

Chii
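For context on what hit highlighting produces: the core idea is wrapping matched terms in markup. The sketch below is my own plain-Java illustration, not the actual org.apache.lucene.search.highlight API; the real Highlighter with its default SimpleHTMLFormatter does this using analyzer-produced token offsets rather than regex replacement:

```java
import java.util.regex.Pattern;

public class SimpleHighlighter {

    // Wraps every occurrence of each query term in <B>...</B>,
    // roughly the output of Lucene's Highlighter with its default
    // SimpleHTMLFormatter. Illustration only: the real Highlighter
    // re-tokenizes the stored text with an analyzer instead of
    // doing case-insensitive regex replacement.
    public static String highlight(String text, String query) {
        String result = text;
        for (String term : query.split("\\s+")) {
            // (?i) makes the match case-insensitive, approximating
            // the usual analyzer lowercasing.
            result = result.replaceAll("(?i)(" + Pattern.quote(term) + ")",
                    "<B>$1</B>");
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(
                highlight("The quick brown fox jumps over the lazy dog", "fox dog"));
        // prints: The quick brown <B>fox</B> jumps over the lazy <B>dog</B>
    }
}
```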
+1  A: 

TermFreqVector is what I used. Here is a working demo that prints both the term positions and the starting and ending character offsets of each term:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermFreqVector;
    import org.apache.lucene.index.TermPositionVector;
    import org.apache.lucene.index.TermVectorOffsetInfo;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public class Search {

        public static void main(String[] args) throws IOException, ParseException {
            Search s = new Search();
            s.doSearch(args[0], args[1]);
        }

        Search() {}

        public void doSearch(String db, String querystr) throws IOException, ParseException {

            // 1. Specify the analyzer for tokenizing text.
            //    The same analyzer should be used as was used for indexing.
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

            Directory index = FSDirectory.open(new File(db));

            // 2. Query
            Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(querystr);

            // 3. Search
            int hitsPerPage = 10;
            IndexSearcher searcher = new IndexSearcher(index, true);
            IndexReader reader = IndexReader.open(index, true);
            searcher.setDefaultFieldSortScoring(true, false);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. Display term positions and character offsets.
            //    Note: this requires the "contents" field to have been indexed
            //    with term vectors including positions and offsets
            //    (TermVector.WITH_POSITIONS_OFFSETS).
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {

                int docId = hits[i].doc;
                TermFreqVector tfvector = reader.getTermFreqVector(docId, "contents");
                TermPositionVector tpvector = (TermPositionVector) tfvector;
                // This part works only if there is one term in the query string;
                // otherwise you will have to iterate this section over the query terms.
                int termidx = tfvector.indexOf(querystr);
                int[] termposx = tpvector.getTermPositions(termidx);
                TermVectorOffsetInfo[] tvoffsetinfo = tpvector.getOffsets(termidx);

                for (int j = 0; j < termposx.length; j++) {
                    System.out.println("termpos : " + termposx[j]);
                }
                for (int j = 0; j < tvoffsetinfo.length; j++) {
                    int offsetStart = tvoffsetinfo[j].getStartOffset();
                    int offsetEnd = tvoffsetinfo[j].getEndOffset();
                    System.out.println("offsets : " + offsetStart + " " + offsetEnd);
                }

                // Print some info about where the hit was found.
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("path"));
            }

            // The searcher can only be closed when there is
            // no need to access the documents any more.
            searcher.close();
        }
    }
Allasso