# Movie Term Chatbot
#
# Trains a bag-of-words classifier that maps a line of dialogue to the
# character most likely to say it, then replies with a random line spoken
# by that character.
import random

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer

batch_size = 32
max_words = 1000
epochs = 2

# movie_lines.txt (Cornell Movie-Dialogs Corpus) uses "+++$+++" as a field
# separator: the last field is the utterance, the second-to-last the name of
# the speaking character. The corpus file is ISO-8859-1 encoded, not UTF-8.
tempdata = open("movie_lines.txt", encoding="iso-8859-1").readlines()
x_train_temp = [line.split("+++$+++")[-1].strip() for line in tempdata[:1000]]
y_train_temp = [line.split("+++$+++")[-2].strip() for line in tempdata[:1000]]

# Categories: one class per character name
categories = list(set(y_train_temp))
print("Categories: " + str(categories))
y_train = np.array([categories.index(label) for label in y_train_temp])
num_classes = len(categories)
print(num_classes, 'classes')

# Storing original text and labels for generating replies later
x_train_org = x_train_temp[:]
y_train_org = y_train[:]

# Stemming: collapse inflected variants onto a single vocabulary entry
stemmer = PorterStemmer()

def fixWord(word):
    """Stem a word and strip periods so spelling variants share one index."""
    return stemmer.stem(word).replace(".", "")

# Vocabulary: every unique stemmed word gets an integer index
allwords = ' '.join(x_train_temp).lower().split(' ')
uniquewords = list(set(fixWord(word) for word in allwords))
word_index = {word: i for i, word in enumerate(uniquewords)}

def toSequence(text):
    """Convert a sentence to a list of word indices, skipping unknown words."""
    return [word_index[fixWord(w)] for w in text.lower().split(' ')
            if fixWord(w) in word_index]

x_train = [toSequence(sentence) for sentence in x_train_temp]

# Make binary bag-of-words vectors; indices >= max_words are silently dropped
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
y_train = keras.utils.to_categorical(y_train, num_classes)

# Model: a simple MLP over the binary bag-of-words input
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

def getCategory(inputString):
    """Predict which character would say the input string."""
    vector = tokenizer.sequences_to_matrix([toSequence(inputString)],
                                           mode='binary')
    return np.argmax(model.predict(vector))

def getRandomTextFromIndex(aIndex):
    """Return a random training line spoken by the given character."""
    res = -1
    while res != aIndex:
        aNumber = random.randrange(len(y_train_org))
        res = y_train_org[aNumber]
    return x_train_org[aNumber]

# Chat loop: empty input ends the session
s = " "
while s:
    category = getCategory(s)
    text = getRandomTextFromIndex(category)
    print("Chatbot: " + text)
    s = input("Human: ")