Previous implementation: I applied Elasticsearch, but accuracy was very low, because users enter free-form text in which paraphrases such as "I need" and "want to" should be treated as equivalent.
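As a rough illustration of why surface matching fails here, a minimal sketch comparing two paraphrased words with pre-trained GloVe vectors (it assumes a dict like the embeddings_index built in the script below is already loaded; the helper name cosine is only for this example):

import numpy as np

def cosine(a, b):
    # cosine similarity between two word vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# A keyword engine sees "need" and "want" as unrelated strings,
# but their GloVe vectors are typically close in embedding space.
print(cosine(embeddings_index['need'], embeddings_index['want']))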
Dataset information: each row of the dataset contains a Text (a paragraph) and a Label (the page number it belongs to). The dataset is small; I have only 500 rows.
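DataGenerator.get_data (imported in the script below) is not shown; a hedged sketch of the structures it is assumed to return, based on how the script uses them:

# Hypothetical example of the data returned by get_data():
texts = ["First paragraph of the policy document ...", "Another paragraph ..."]  # one string per row
labels_index = {"page_1": 0, "page_2": 1}  # label name -> numeric id
labels = [0, 1]                            # numeric label per row, aligned with texts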
I then applied the NLP model shown in the code below.
Result: accuracy on the test (validation) data is 23%, but on the training data it is 91%.
import time
from time import strftime
import numpy as np
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.layers import Dense, Input, LSTM, ActivityRegularization
from keras.layers import Embedding, Dropout,Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import to_categorical
import os
import pickle
from DataGenerator import *
BASE_DIR = ''
GLOVE_DIR = 'D:/Dataset/glove.6B' # BASE_DIR + '/glove.6B/'
MAX_SEQUENCE_LENGTH = 50
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector
np.random.seed(1337) # for reproducibility
print('Indexing word vectors.')
t_start = time.time()
embeddings_index = {}
if os.path.exists('pickle/glove.pickle'):
    print('Pickle found..')
    with open('pickle/glove.pickle', 'rb') as handle:
        embeddings_index = pickle.load(handle)
else:
    print('Pickle not found...')
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    with open('pickle/glove.pickle', 'wb') as handle:
        pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Found %s word vectors.' % len(embeddings_index))
# second, prepare text samples and their labels
print('Processing text dataset')
texts = [] # list of text samples
labels = [] # list of label ids
labels_index = {} # dictionary mapping label name to numeric id
(texts, labels, labels_index) = get_data('D:/PolicyDocument/')
print('Found %s texts.' % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
print('Preparing embedding matrix:', embedding_matrix.shape)
for word, i in word_index.items():
    if i > num_words:
        continue  # indices beyond MAX_NB_WORDS would overflow the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            mask_zero=True,
                            trainable=False)
print('Training model.')
csv_file = "logs/training_log_" + strftime("%Y-%m-%d %H-%M", time.localtime()) + ".csv"
model_file = "models/Model_" + strftime("%Y-%m-%d %H-%M", time.localtime()) + ".mdl"
print("Model file:" + model_file)
csv_logger = CSVLogger(csv_file)
# train a two-layer LSTM classifier on top of the frozen embeddings
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
num_lstm = np.random.randint(175, 275)
rate_drop_dense = 0.15 + np.random.rand() * 0.25
x = LSTM(num_lstm, return_sequences=True, kernel_regularizer=l2(0.001))(embedded_sequences)
x = Dropout(0.5)(x)
x = LSTM(64)(x)
x = Dropout(0.25)(x)
x = ActivityRegularization(l1=0.01, l2=0.001)(x)
preds = Dense(len(labels_index), activation='softmax')(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model_checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=0, save_best_only=True,
                                   save_weights_only=False, mode='auto')
model.fit(x_train, y_train,
          batch_size=1,
          epochs=600,
          validation_data=(x_val, y_val), callbacks=[csv_logger, model_checkpoint])
score = model.evaluate(x_val, y_val, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
t_end = time.time()
total = t_end - t_start
ret_str = "Time needed(s): " + str(total)
print(ret_str)
Dropout and batch normalization are very effective with feed-forward NNs, but they can cause problems with RNNs (there are many papers published on this topic).
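If you do want dropout inside the recurrent layers, a minimal sketch using the LSTM layer's built-in dropout/recurrent_dropout arguments rather than separate Dropout layers (the unit counts and rates here are placeholders, not tuned values):

from keras.layers import LSTM

# dropout is applied to the layer inputs, recurrent_dropout to the recurrent state
x = LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)
x = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(x)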
The best way to make your RNN model generalize better is to increase the dataset size. In your case (an LSTM with about 200 cells), you probably want on the order of 100,000 or more labeled samples to train on.
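As a rough sanity check on capacity versus data, the standard LSTM parameter count is 4 * units * (input_dim + units + 1); with the sizes used above:

# first LSTM layer: 300-dim GloVe inputs, ~200 units
units, input_dim = 200, 300
print(4 * units * (input_dim + units + 1))  # 400800 weights in a single layer, versus only 500 labeled rows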