I've read some documents and build a lucene index which looks like
Documents:
id 1
keyword foo bar
keyword john
id 2
keyword foo
id 3
keyword john doe
keyword bar foo
keyword what the hell
I want to query lucene in a way, where I can combine single term and phrases.
Let's say my query is
foo bar
should give back the doc ids 1, 2 and 3
The query
"foo bar"
should give back the doc ids 1
The query
john
should give back the doc ids 1 and 3
The query
john "foo bar"
should give back the doc ids 1
My implementation in java is not working. Also reading tons of documents didn't help.
When I query my index with
"foo bar"
I get 0 hits
When I query my index with
foo "john doe"
I get back the doc ids 1, 2 and 3 (i would expect only doc id 3 since the query is meant as foo AND "john doe") The problem is, that "john doe" gives back 0 hits but foo gives back 3 hits.
My goal is to combine single term and phrase terms. What am I doing wrong? I've also played around with the analyzers with no luck.
My implementation looks like this:
import ...
public class Indexer
{
private static final Logger LOG = LoggerFactory.getLogger(Indexer.class);
private final File indexDir;
private IndexWriter writer;
public Indexer(File indexDir)
{
this.indexDir = indexDir;
this.writer = null;
}
private IndexWriter createIndexWriter()
{
try
{
Directory dir = FSDirectory.open(indexDir);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwc.setRAMBufferSizeMB(256.0);
IndexWriter idx = new IndexWriter(dir, iwc);
idx.deleteAll();
return idx;
} catch (IOException e)
{
throw new RuntimeException(String.format("Could create indexer on directory [%s]", indexDir.getAbsolutePath()), e);
}
}
public void index(TestCaseDescription desc)
{
if (writer == null)
writer = createIndexWriter();
Document doc = new Document();
addPathToDoc(desc, doc);
addLastModifiedToDoc(desc, doc);
addIdToDoc(desc, doc);
for (String keyword : desc.getKeywords())
addKeywordToDoc(doc, keyword);
updateIndex(doc, desc);
}
private void addIdToDoc(TestCaseDescription desc, Document doc)
{
Field idField = new Field(LuceneConstants.FIELD_ID, desc.getId(), Field.Store.YES, Field.Index.ANALYZED);
idField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(idField);
}
private void addKeywordToDoc(Document doc, String keyword)
{
Field keywordField = new Field(LuceneConstants.FIELD_KEYWORDS, keyword, Field.Store.YES, Field.Index.ANALYZED);
keywordField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(keywordField);
}
private void addLastModifiedToDoc(TestCaseDescription desc, Document doc)
{
NumericField modifiedField = new NumericField(LuceneConstants.FIELD_LAST_MODIFIED);
modifiedField.setLongValue(desc.getLastModified());
doc.add(modifiedField);
}
private void addPathToDoc(TestCaseDescription desc, Document doc)
{
Field pathField = new Field(LuceneConstants.FIELD_PATH, desc.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(pathField);
}
private void updateIndex(Document doc, TestCaseDescription desc)
{
try
{
if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
{
// New index, so we just add the document (no old document can be there):
LOG.debug(String.format("Adding testcase [%s] (%s)", desc.getId(), desc.getPath()));
writer.addDocument(doc);
} else
{
// Existing index (an old copy of this document may have been indexed) so
// we use updateDocument instead to replace the old one matching the exact
// path, if present:
LOG.debug(String.format("Updating testcase [%s] (%s)", desc.getId(), desc.getPath()));
writer.updateDocument(new Term(LuceneConstants.FIELD_PATH, desc.getPath()), doc);
}
} catch (IOException e)
{
throw new RuntimeException(String.format("Could not create or update index for testcase [%s] (%s)", desc.getId(),
desc.getPath()), e);
}
}
public void store()
{
try
{
writer.close();
} catch (IOException e)
{
throw new RuntimeException(String.format("Could not write index [%s]", writer.getDirectory().toString()));
}
writer = null;
}
}
import ...
public class Searcher
{
private static final Logger LOG = LoggerFactory.getLogger(Searcher.class);
private final Analyzer analyzer;
private final QueryParser parser;
private final File indexDir;
public Searcher(File indexDir)
{
this.indexDir = indexDir;
analyzer = new StandardAnalyzer(Version.LUCENE_34);
parser = new QueryParser(Version.LUCENE_34, LuceneConstants.FIELD_KEYWORDS, analyzer);
parser.setAllowLeadingWildcard(true);
}
public List<String> search(String searchString)
{
List<String> testCaseIds = new ArrayList<String>();
try
{
IndexSearcher searcher = getIndexSearcher(indexDir);
Query query = parser.parse(searchString);
LOG.info("Searching for: " + query.toString(parser.getField()));
AllDocCollector results = new AllDocCollector();
searcher.search(query, results);
LOG.info("Found [{}] hit", results.getHits().size());
for (ScoreDoc scoreDoc : results.getHits())
{
Document doc = searcher.doc(scoreDoc.doc);
String id = doc.get(LuceneConstants.FIELD_ID);
testCaseIds.add(id);
}
searcher.close();
return testCaseIds;
} catch (Exception e)
{
throw new RuntimeException(String.format("Could not search index [%s]", indexDir.getAbsolutePath()), e);
}
}
private IndexSearcher getIndexSearcher(File indexDir)
{
try
{
FSDirectory dir = FSDirectory.open(indexDir);
return new IndexSearcher(dir);
} catch (IOException e)
{
LOG.error(String.format("Could not open index directory [%s]", indexDir.getAbsolutePath()), e);
throw new RuntimeException(e);
}
}
}
Why are you using DOCS_ONLY?! If you only index docids, then you only have a basic inverted index with term->document mappings, but no proximity information. So thats why your phrase queries don't work.