Spell correction using Keras RNN in python

I am doing spell correction using an RNN. Below is the code I am using:

from _future_ import print_function, division, unicode_literals import os import errno from collections import Counter from hashlib import sha256 import re import json import itertools import logging import requests import numpy as np import pandas as pd from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand from numpy import zeros as np_zeros # pylint:disable=no-name-in-module from keras.models import Sequential, load_model from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent from keras.callbacks import Callback # Set a logger for the module LOGGER = logging.getLogger(__name__) # Every log will use the module name LOGGER.addHandler(logging.StreamHandler()) LOGGER.setLevel(logging.DEBUG) random_seed(123) # Reproducibility class Configuration(object): """Dump stuff here""" CONFIG = Configuration() #pylint:disable=attribute-defined-outside-init # Parameters for the model: CONFIG.input_layers = 2 CONFIG.output_layers = 2 CONFIG.amount_of_dropout = 0.2 CONFIG.hidden_size = 500 CONFIG.initialization = "he_normal" # : Gaussian initialization scaled by fan-in (He et al., 2014) CONFIG.number_of_chars = 100 CONFIG.max_input_len = 20 CONFIG.inverted = True # parameters for the training: CONFIG.batch_size = 100 # As the model changes in size, play with the batch size to best fit the process in memory CONFIG.epochs = 500 # due to mini-epochs. CONFIG.steps_per_epoch = 1000 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K. 
CONFIG.validation_steps = 10 CONFIG.number_of_iterations = 10 dataset=pd.read_csv("input_spell.csv") input_data=dataset['input'].tolist() input_data1=str(input_data) output_data=dataset['output'].tolist() output_data1=str(output_data) chars=list("abcdefghijklmnopqrstuvwxyz") MIN_INPUT_LEN = 1 AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .") class CharacterTable(object): """ Given a set of characters: + Encode them to a one hot integer representation + Decode the one hot integer representation to their character output + Decode a vector of probabilities to their character output """ def __init__(self, chars): self.chars = sorted(set(chars)) self.char_indices = dict((c, i) for i, c in enumerate(self.chars)) self.indices_char = dict((i, c) for i, c in enumerate(self.chars)) @property def size(self): """The number of chars""" return len(self.chars) def encode(self, C, maxlen): """Encode as one-hot""" X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member for i, c in enumerate(C): X[i, self.char_indices[c]] = 1 return X def decode(self, X, calc_argmax=True): """Decode from one-hot""" if calc_argmax: X = X.argmax(axis=-1) return ''.join(self.indices_char[x] for x in X if x) def _vectorize(questions, answers, ctable): """Vectorize the data as numpy arrays""" len_of_questions = len(questions) X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int) print("inputchars") for i in range(len(questions)): print(i) sentence = questions.pop() print(sentence) for j, c in enumerate(sentence): print(j) print(c) try: X[i, j, ctable.char_indices[c]] = 1 except KeyError: pass # Padding y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=int) print("outputchars") for i in range(len(answers)): print(i) sentence = answers.pop() print(sentence) for j, c in enumerate(sentence): try: y[i, j, ctable.char_indices[c]] = 1 except KeyError: pass # Padding return 
X, y def vectorize(questions, answers, chars=None): """Vectorize the questions and expected answers""" print('Vectorization...') chars = chars or CHARS ctable = CharacterTable(chars) print("inputdata before _vec") print(questions) X, y = _vectorize(questions, answers, ctable) # Explicitly set apart 10% for validation data that we never train over #print("input after _vec") #print(X) #print("output after _vec") #print(y) print(X.shape) print(y.shape) return X, y, CONFIG.max_input_len, ctable def generate_model(output_len, chars=None): """Generate the model""" print('Build model...') chars = chars or CHARS model = Sequential() # "Encode" the input sequence using an RNN, producing an output of hidden_size # note: in a situation where your input sequences have a variable length, # use input_shape=(None, nb_feature). for layer_number in range(CONFIG.input_layers): model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization, return_sequences=layer_number + 1 < CONFIG.input_layers)) model.add(Dropout(CONFIG.amount_of_dropout)) # For the decoder's input, we repeat the encoded input for each time step model.add(RepeatVector(output_len)) # The decoder RNN could be multiple layers stacked or a single layer for _ in range(CONFIG.output_layers): model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization)) model.add(Dropout(CONFIG.amount_of_dropout)) # For each of step of the output sequence, decide which character should be chosen model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization))) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) return model def iterate_training(model, X_train, y_train, X_val, y_val, ctable): """Iterative Training""" # Train the model each generation and show predictions against the validation dataset for iteration in range(1, 
CONFIG.number_of_iterations): #print() #print('-' * 50) #print('Iteration', iteration) model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs, validation_data=(X_val, y_val)) #print_random_predictions(model, ctable, X_val, y_val) def print_random_predictions(model, ctable, X_val, y_val): """Select 10 samples from the validation set at random so we can visualize errors""" print() for _ in range(10): #ind = random_randint(0, len(X_val)) #rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] # pylint:disable=no-member rowX, rowy = X_val, y_val preds = model.predict_classes(rowX, verbose=0) print("preds") print(preds) q = ctable.decode(rowX[0]) print("q-value") print(q) correct = ctable.decode(rowy[0]) print("correct") print(correct) guess = ctable.decode(preds[0], calc_argmax=False) print("predicted") print(guess) return guess X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars) print ("y_maxlen, chars", y_maxlen, "".join(chars)) model = generate_model(y_maxlen, chars) iterate_training(model, X_train, y_train, X_train, y_train, ctable) for inp in X_train: inputarray = ctable.decode(inp) print(inputarray) prediction=model.predict_classes(X_train, verbose=0) for p in prediction: guess = ctable.decode(p, calc_argmax=False) print(guess)

Below is the content of file input_spell.csv

input output sol solid kt kit whl wheel abr abrasive unv universal pp pipe plt plate accum accumulator

I have taken code from deepspell

Below is my prediction result for training set

ccumultorrrrrrrrrr plteeeeeeeeeeeeeeee pipeeellllllllllllll universllllllllllll brsiveeellllllllll wheellllllllllllllll kitteeeeeeeeeeeellll solidddddddddddddddd

Input and output vector is of size

Input:(8, 20, 26)
output:(8, 20, 26)

So every prediction I get has length 20.

I have a very basic understanding of RNN and LSTM

Updating
When I print model.summary(), I get:

model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
lstm_1 (LSTM)                (None, None, 500)         1054000
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 500)         0
_________________________________________________________________
lstm_2 (LSTM)                (None, 500)               2002000
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 500)           0
_________________________________________________________________
lstm_3 (LSTM)                (None, 20, 500)           2002000
_________________________________________________________________
dropout_3 (Dropout)          (None, 20, 500)           0
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 500)           2002000
_________________________________________________________________
dropout_4 (Dropout)          (None, 20, 500)           0
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 26)            13026
_________________________________________________________________
activation_1 (Activation)    (None, 20, 26)            0
=================================================================
Total params: 7,073,026
Trainable params: 7,073,026
Non-trainable params: 0
_________________________________________________________________

Can anyone tell me where I have gone wrong?

Have you tried to visualize the model by printing model.summary() to try and spot the error?

– Mohamed Elzarei
Mar 19 '18 at 19:22

model.summary()

@MohamedElzare I have edited and included model.summary() in question

– Ranjana Girish
Mar 20 '18 at 9:24

model.summary()

1 Answer
1

I suggest double-checking the max_input_len, output_len, and y_maxlen variables. Does your model train successfully? What is its validation error?
If it seems to be training successfully, then I suspect a problem with the shapes of the vectorized data in the generalization step.

max_input_len

output_len

y_maxlen

Thanks for contributing an answer to Stack Overflow!

But avoid …

To learn more, see our tips on writing great answers.

Required, but never shown

By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.

搜尋此網誌

Dfyjkt