How can I implement tf-idf and cosine similarity in Lucene? I'm using Lucene 4.2. The program that I've created does not use tf-idf and cosine similarity; it only uses TopScoreDocCollector.
import com.mysql.jdbc.Statement;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriter;
import java.sql.DriverManager;
import java.sql.Connection;
import java.sql.ResultSet;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
/**
 * Console demo that indexes the rows of the {@code ayat} MySQL table into an
 * in-memory Lucene index and then runs one keyword search against the
 * {@code TEXT_INDO} field, printing the best hits.
 *
 * <p>The search keyword is taken from the first command-line argument when one
 * is given; otherwise it is read from standard input.
 */
public class IndexMysqlDBStemming {

    /** Maximum number of hits collected and printed. */
    private static final int HITS_PER_PAGE = 10;

    /**
     * Builds the index, obtains the keyword, searches and prints the results.
     *
     * @param args optional; {@code args[0]} is used as the search keyword
     * @throws Exception if the database, the index or the query parser fails
     */
    public static void main(String[] args) throws Exception {
        IndonesianAnalyzer analyzer = new IndonesianAnalyzer(Version.LUCENE_42);
        //StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
        Directory INDEX_DIR = new RAMDirectory();

        // 1. Create Index From Database
        buildIndex(INDEX_DIR, analyzer);

        // 2. Query
        String querystr = readQueryString(args);
        if (querystr == null || querystr.trim().length() == 0) {
            // End of input or a blank line: nothing sensible to search for.
            System.err.println("No search keyword was given.");
            return;
        }
        echoParsedQuery(querystr, analyzer);
        Query q = new QueryParser(Version.LUCENE_42, "TEXT_INDO", analyzer).parse(querystr);

        // 3. Search and 4. Display results
        searchAndPrint(INDEX_DIR, q);
    }

    /**
     * Reads every row of the {@code ayat} table and writes it to the index,
     * keyed by {@code NO_INDEX_AYAT}. All JDBC resources and the index writer
     * are closed even when indexing fails part-way.
     */
    private static void buildIndex(Directory indexDir, IndonesianAnalyzer analyzer) throws Exception {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        Connection connection = DriverManager.getConnection("jdbc:mysql://localhost/db_haiquran", "root", "");
        try {
            // Fully qualified because the file also imports com.mysql.jdbc.Statement.
            java.sql.Statement statement = connection.createStatement();
            try {
                ResultSet result = statement.executeQuery("SELECT * FROM ayat");
                try {
                    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, analyzer);
                    IndexWriter writer = new IndexWriter(indexDir, config);
                    try {
                        while (result.next()) {
                            String ayatId = result.getString("NO_INDEX_AYAT");
                            Document document = new Document();
                            document.add(new Field("NO_INDEX_AYAT", ayatId, Field.Store.YES, Field.Index.NOT_ANALYZED));
                            document.add(new Field("NO_SURAT", result.getString("NO_SURAT"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            document.add(new Field("NO_AYAT", result.getString("NO_AYAT"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            // Only the Indonesian translation is analyzed, so it is the searchable text.
                            document.add(new Field("TEXT_INDO", result.getString("TEXT_INDO"), Field.Store.YES, Field.Index.ANALYZED));
                            document.add(new Field("TEXT_ARAB", result.getString("TEXT_ARAB"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            // updateDocument replaces any document already indexed under the same id.
                            writer.updateDocument(new Term("NO_INDEX_AYAT", ayatId), document);
                        }
                    } finally {
                        writer.close();
                    }
                } finally {
                    result.close();
                }
            } finally {
                statement.close();
            }
        } finally {
            connection.close();
        }
    }

    /**
     * Returns the search keyword: {@code args[0]} when present, otherwise one
     * line read from standard input (which may be {@code null} at end of input).
     */
    private static String readQueryString(String[] args) throws Exception {
        if (args.length > 0) {
            // Keyword supplied on the command line: do not block waiting for stdin.
            return args[0];
        }
        System.out.println("Enter your search keyword in here : ");
        // System.in is deliberately left open; it belongs to the JVM, not to this method.
        BufferedReader bufferRead = new BufferedReader(new InputStreamReader(System.in));
        return bufferRead.readLine();
    }

    /**
     * Prints the keyword as parsed with the default field {@code "result"}.
     * This is diagnostic output only; a parse failure is reported on stderr
     * instead of being silently dropped.
     */
    private static void echoParsedQuery(String querystr, IndonesianAnalyzer analyzer) {
        QueryParser parser = new QueryParser(Version.LUCENE_42, "result", analyzer);
        try {
            System.out.println(parser.parse(querystr)+"\n"); //amenit
            System.out.println();
        } catch (ParseException ex) {
            System.err.println("Could not parse query '" + querystr + "': " + ex.getMessage());
        }
    }

    /**
     * Runs the query against the index and prints up to {@link #HITS_PER_PAGE}
     * hits, one per line. The reader is closed even when searching fails.
     */
    private static void searchAndPrint(Directory indexDir, Query q) throws Exception {
        IndexReader reader = DirectoryReader.open(indexDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(HITS_PER_PAGE, true);
            searcher.search(q, collector);
            // Each ScoreDoc also exposes the hit's relevance score in its `score` field.
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("Found : " + hits.length + " hits.");
            System.out.println("No" + " ID " + "\t" + " Surat " + "\t" + " No Ayat " + "\t" + " Terjemahan Ayat " + "\t" + " Teks Arab ");
            for (int i = 0; i < hits.length; i++) {
                Document d = searcher.doc(hits[i].doc);
                System.out.println((i+1) + ". " + d.get("NO_INDEX_AYAT") + "\t" + d.get("NO_SURAT") + "\t" + d.get("NO_AYAT")+
                        "\t" + d.get("TEXT_INDO") + "\t" + d.get("TEXT_ARAB"));
            }
        } finally {
            reader.close();
        }
    }
}
How can I display the results of the calculation using tf-idf and cosine similarity?
Unless there is something I'm missing, you're already done. Well done!
The similarity algorithm used by default is DefaultSimilarity, but most of the documentation (and logic) is found in its base class, TFIDFSimilarity.
And TFIDFSimilarity is indeed an implementation of a TF-IDF and Cosine similarity scoring model.