How do I submit my own text to the input of a neural network?

I have a neural network that classifies a movie review as good or bad. How do I feed my own text to the network's input to check that it works? The network itself:

 # Merge the four labelled review frames into one frame with a fresh 0..n-1 index.
 data = pd.concat([positive_train_data,negative_train_data,positive_test_data,negative_test_data],ignore_index = True)
    # NOTE(review): ignore_index=True above already renumbers the rows, so this
    # reset looks redundant -- confirm before removing.
    data.reset_index(drop=True,inplace=True)
    # Features are the raw review texts, targets are the sentiment labels.
    x = data.Text
    y = data.Sentiment

    # Re-split the merged data 50/50 into train and test parts
    # (presumably scikit-learn's train_test_split -- the import is not shown here).
    x_train, x_test, y_train1, y_test = train_test_split(x, y, test_size = 0.50, random_state = 2000)
    # Report the class balance of the training part (label 0 is printed as negative, 1 as positive).
    print( "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                                 (len(x_train[y_train1 == 0]) / (len(x_train)*1.))*100,
                                                                                (len(x_train[y_train1 == 1]) / (len(x_train)*1.))*100))

    # Same class-balance report for the test part.
    print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                                 (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                                (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

    # TF-IDF over unigrams and bigrams, capped at 10000 features; the vectorizer
    # is fitted on the training texts only and then reused for both parts.
    tvec1 = TfidfVectorizer(max_features=10000,ngram_range=(1, 2),min_df=3,use_idf=1,smooth_idf=1,sublinear_tf=1,stop_words = 'english')
    tvec1.fit(x_train)
    x_train_tfidf = tvec1.transform(x_train)
    print(x_test.shape)
    # NOTE(review): only the test matrix is converted with .toarray(); the
    # training matrix is passed to model.fit exactly as transform() returned it.
    # Confirm that this asymmetry is intended.
    x_test_tfidf = tvec1.transform(x_test).toarray()
# Feed-forward binary classifier: 100 -> dropout -> 50 -> 1 sigmoid output.
model = Sequential()
# input_dim equals max_features=10000 of the vectorizer above.
model.add(Dense(100, activation='relu', input_dim=10000))
model.add(Dropout(0.25))
model.add(Dense(50,activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
optimiz = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = optimiz ,metrics = ['accuracy'])
# Train for 5 epochs, using the held-out half as validation data.
hist  = model.fit(x_train_tfidf,y_train1,validation_data = (x_test_tfidf,y_test ),epochs = 5,batch_size = 64)
Author: MaxU, 2018-04-16

1 answer

You can use Keras for text preprocessing:

from pathlib import Path
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras import optimizers

def get_data(path):
    """Load the train and test review sets stored as TSV files under *path*.

    Every file matching ``train_???.tsv*`` (respectively ``test_???.tsv*``)
    directly inside *path* is read as a tab-separated table whose first
    column is the index, and the tables are stacked into one frame with a
    fresh 0..n-1 index.

    Args:
        path: Directory containing the TSV files (``str`` or ``Path``).

    Returns:
        ``((train_text, train_sentiment), (test_text, test_sentiment))``
        where each element is the ``Text`` / ``Sentiment`` column of the
        corresponding stacked frame.

    Raises:
        ValueError: Raised by ``pd.concat`` when no file matches a pattern.
        KeyError: If a ``Text`` or ``Sentiment`` column is missing.
    """
    root = Path(path)

    def _load(pattern):
        # Path.glob yields matches in filesystem-dependent order; sorting
        # makes the stacked row order reproducible across runs and machines.
        frames = [pd.read_csv(f, sep='\t', index_col=0)
                  for f in sorted(root.glob(pattern))]
        return pd.concat(frames, ignore_index=True)

    train = _load('train_???.tsv*')
    test = _load('test_???.tsv*')
    return ((train['Text'], train['Sentiment']),
            (test['Text'], test['Sentiment']))

# read data
(X_train, y_train), (X_test, y_test) = get_data(r'D:\download\NLP')

# Number of words kept by the tokenizer. It is also the number of columns
# produced by texts_to_matrix and therefore the width of the model's input
# layer, so both places must use the same value.
NUM_WORDS = 10000

# build vocabulary
# Fit on the training texts only: fitting on the test texts as well would let
# held-out data influence the vocabulary and the IDF statistics, which makes
# the validation score optimistic.
t = Tokenizer(num_words=NUM_WORDS)
t.fit_on_texts(X_train)


# build a model
# Feed-forward binary classifier: 100 -> dropout -> 50 -> 1 sigmoid output.
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=NUM_WORDS))
model.add(Dropout(0.25))
model.add(Dense(50,activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
optimiz = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = optimiz ,metrics = ['accuracy'])

# convert texts to TF-IDF matrices
# Both splits are encoded with the same fitted tokenizer, so a given column
# refers to the same word in X_train and X_test.
X_train = t.texts_to_matrix(X_train, mode='tfidf')
X_test = t.texts_to_matrix(X_test, mode='tfidf')

# fit the model
hist = model.fit(X_train,y_train,validation_data = (X_test,y_test),epochs = 5,batch_size = 64)

Output:

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 12s 467us/step - loss: 0.4621 - acc: 0.7892 - val_loss: 0.3244 - val_acc: 0.8743
Epoch 2/5
25000/25000 [==============================] - 11s 434us/step - loss: 0.2319 - acc: 0.9153 - val_loss: 0.3020 - val_acc: 0.8821
Epoch 3/5
25000/25000 [==============================] - 11s 434us/step - loss: 0.1519 - acc: 0.9483 - val_loss: 0.3126 - val_acc: 0.8803
Epoch 4/5
25000/25000 [==============================] - 11s 435us/step - loss: 0.1023 - acc: 0.9674 - val_loss: 0.3463 - val_acc: 0.8776
Epoch 5/5
25000/25000 [==============================] - 11s 435us/step - loss: 0.0685 - acc: 0.9794 - val_loss: 0.3812 - val_acc: 0.8748

We check the model on our own texts:

# predict our own texts
texts = ['what the hell is going on?', 'that was amazing', 'that was awful']
# Reuse the tokenizer exactly as it was fitted for training -- do NOT call
# t.fit_on_texts() again here. Refitting updates the word counts and document
# frequencies and rebuilds the word -> index mapping, so the matrix columns
# may no longer line up with the features the model was trained on.
X = t.texts_to_matrix(texts, mode='tfidf')

# One sigmoid score per input text.
model.predict(X)

Output:

Out[5]:
array([[0.47051138],
       [0.36511892],
       [0.24146256]], dtype=float32)
 4
Author: MaxU, 2018-04-18 23:00:06