Lucene 索引:加权操作、对日期和数字进行索引、IndexReader 的设计
package com.dhb.index; import java.io.File; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.junit.Before; import org.junit.Test; public class IndexUtil { private String[] ids = {"1","2","3","4","5","6"}; private String[] emails = {"aa@csdn.org","bb@csdn.org","cc@sina.org","dd@sina.org", "ee@qq.com","ff@qq.com"}; private String[] contents = {"Welcome to my office ,I like surfing internet.", "hello boys like haha", "hello girls we like each other.", "I like football,you like too.", "I like basketball very much, how about you?", "bye-bye see you I don't like."}; private int[] attachment ={2,3,1,4,5,5}; private String[] names = {"Victor","Nancy","Kitty","Cindy","Tom","Tony"}; private Map<String, Float> scores = new HashMap<String, Float>(); private Date[] dates = null; private static IndexReader reader = null; private Directory directory = null; @Before public void IndexUtilBefore() { try { setDates(); scores.put("qq.com", 2.0f); scores.put("sina.org", 1.5f); directory = FSDirectory.open(new File("D:/luceneData/index02")); 
reader = IndexReader.open(directory, false); } catch (IOException e) { e.printStackTrace(); } } public IndexSearcher getSearcher() { try { if(reader==null) { reader = IndexReader.open(directory); } else { IndexReader tr = IndexReader.openIfChanged(reader); if(tr!=null) { reader.close(); //关闭原来的reader reader = tr; } } return new IndexSearcher(reader); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } private void setDates() { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); dates = new Date[ids.length]; try { dates[0] = sdf.parse("2010-02-19"); dates[1] = sdf.parse("2012-01-11"); dates[2] = sdf.parse("2011-09-19"); dates[3] = sdf.parse("2010-12-22"); dates[4] = sdf.parse("2012-01-01"); dates[5] = sdf.parse("2011-05-19"); } catch (ParseException e) { e.printStackTrace(); } } @Test public void index() { IndexWriter writer = null; IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)); try { writer = new IndexWriter(directory, iwc); //清空所有索引 writer.deleteAll(); Document doc = null; for (int i = 0;i < ids.length; i++) { /** * Field.Store.YES或者NO(存储域选项) * 1、设置为YES表示把这个域中的内容完全存储到文件中,方便进行文本的还原 * 2、设置为NO表示把这个域中的内容不存储到文件中,但是可以被索引,此时内容无法还原(doc.get) */ /** * 使用Field.Index.*来进行操作 * Index.ANALYZED:进行分词和索引,适用于标题和内容 * Index.NOT_ANALYZED:进行索引,但不进行分词,如身份证号码,姓名,ID等,适用于精确搜索 * Index.ANALYZED_NO_NORMS进行分词但是不存储norms信息,这个norms中包含了创建索引的时间和权值等信息 * Index.NOT_ANALYZED_NO_NORMS即不进行分词也不存储norms信息 * Index.NO不进行索引 */ /** * NOT_ANALYZED_NO_NORMS YES 标示符(主键,文件名),电话号码,身份证号,姓名,日期 * ANALYZED YES 文档标题和摘要 * ANALYZED NO 文档正文 * NO YES 文档类型,数据库主键(不进行索引) * NOT_ANALYZED NO 隐藏关键字 */ doc = new Document(); doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED)); doc.add(new 
Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS)); /** * 加权操作 */ String et = emails[i].substring(emails[i].indexOf("@")+1); System.out.println(et); if(scores.containsKey(et)) { doc.setBoost(scores.get(et)); } else { doc.setBoost(0.5f); } //对数字的操作,存储数字 doc.add(new NumericField("attachment", Field.Store.YES, true).setIntValue(attachment[i])); //对日期的操作,存储日期 doc.add(new NumericField("dates", Field.Store.YES, true).setLongValue(dates[i].getTime())); writer.addDocument(doc); } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if(writer!=null) try { writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } @Test public void query() { try { IndexReader reader = IndexReader.open(directory); //通过reader可以有效地获取文档的数量 System.out.println("numDocs:"+reader.numDocs()); System.out.println("maxDocs:"+reader.maxDoc()); //通过reader可以有效地获取删除的文档的数量 System.out.println("numDeletedDocs:"+reader.numDeletedDocs()); reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Test /** * 删除 */ public void delete() { IndexWriter writer = null; try { writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35))); //参数是一个选项,可以是一个query;也可以是一个term,term是一个精确查找的值 //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,是可以恢复的 writer.deleteDocuments(new Term("id", "1")); writer.commit(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { /*if(writer!=null) try { writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } */ } } @Test public void delete2() { try { reader.deleteDocuments(new Term("id", "1")); 
reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Test /** * 强制删除 * 在lucene3.5之前都是使用optimize()进行处理,但是这个操作消耗资源,已经被弃用 */ public void forceDelete() { //删除优化,删除回收站文件 IndexWriter writer = null; try { writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35))); writer.forceMergeDeletes(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if(writer!=null) try { writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } @Test /** * 恢复删除 */ public void unDelete() { //使用indexReader进行恢复 try { IndexReader reader = IndexReader.open(directory, false); //恢复时必须把IndexReader的只读(readOnly)设置为false reader.undeleteAll(); reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Test public void search() { try { IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); TermQuery query = new TermQuery(new Term("content", "like")); TopDocs tds = searcher.search(query, 10); for(ScoreDoc sd : tds.scoreDocs) { Document d = searcher.doc(sd.doc); System.out.println("("+sd.doc+") "+"--权值:"+d.getBoost()+"--分数:"+sd.score+ d.get("name")+"["+d.get("email")+"] "+d.get("id")+",附件:" +d.get("attachment")+",日期:"+d.get("dates")); } reader.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Test public void search2() { try { //IndexReader reader = IndexReader.open(directory); //IndexSearcher searcher = new IndexSearcher(reader); //方式二: IndexSearcher searcher = getSearcher(); TermQuery query = new TermQuery(new Term("content", "like")); TopDocs tds = 
searcher.search(query, 10); for(ScoreDoc sd : tds.scoreDocs) { Document d = searcher.doc(sd.doc); System.out.println("("+sd.doc+") "+"--权值:"+d.getBoost()+"--分数:"+sd.score+ d.get("name")+"["+d.get("email")+"] "+d.get("id")+",附件:" +d.get("attachment")+",日期:"+d.get("dates")); } searcher.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } @Test public void search3() { for (int i = 0; i < 5; i++) { search2(); System.out.println("------------------"); try { Thread.sleep(10000); } catch (InterruptedException e) { e.printStackTrace(); } } } }
本文由用户 jopen 自行上传分享,仅供网友学习交流。所有权归原作者,若您的权利被侵害,请联系管理员。
转载本站原创文章,请注明出处,并保留原始链接、图片水印。
本站是一个以用户分享为主的开源技术平台,欢迎各类分享!