How do I submit my own text to the input of a neural network?
There is a neural network that recognizes a good review of a movie or a bad one. How do I submit my own text to the network input for health check? The network itself:
data = pd.concat([positive_train_data,negative_train_data,positive_test_data,negative_test_data],ignore_index = True)
data.reset_index(drop=True,inplace=True)
x = data.Text
y = data.Sentiment
x_train, x_test, y_train1, y_test = train_test_split(x, y, test_size = 0.50, random_state = 2000)
print( "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
(len(x_train[y_train1 == 0]) / (len(x_train)*1.))*100,
(len(x_train[y_train1 == 1]) / (len(x_train)*1.))*100))
print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
(len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
(len(x_test[y_test == 1]) / (len(x_test)*1.))*100))
tvec1 = TfidfVectorizer(max_features=10000,ngram_range=(1, 2),min_df=3,use_idf=1,smooth_idf=1,sublinear_tf=1,stop_words = 'english')
tvec1.fit(x_train)
x_train_tfidf = tvec1.transform(x_train)
print(x_test.shape)
x_test_tfidf = tvec1.transform(x_test).toarray()
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=10000))
model.add(Dropout(0.25))
model.add(Dense(50,activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
optimiz = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = optimiz ,metrics = ['accuracy'])
hist = model.fit(x_train_tfidf,y_train1,validation_data = (x_test_tfidf,y_test ),epochs = 5,batch_size = 64)
2
1 answers
You can use Keras for text preprocessing:
from pathlib import Path
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras import optimizers
def get_data(path):
p = Path(path)
train = pd.concat([pd.read_csv(f, sep='\t', index_col=0)
for f in p.glob('train_???.tsv*')],
ignore_index=True)
test = pd.concat([pd.read_csv(f, sep='\t', index_col=0)
for f in p.glob('test_???.tsv*')],
ignore_index=True)
return ((train['Text'], train['Sentiment']),
(test['Text'], test['Sentiment']))
# read data
(X_train, y_train), (X_test, y_test) = get_data(r'D:\download\NLP')
# build vocabulary
t = Tokenizer(num_words=10000)
t.fit_on_texts(X_train)
t.fit_on_texts(X_test)
# build a model
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=10000))
model.add(Dropout(0.25))
model.add(Dense(50,activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
optimiz = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'binary_crossentropy',optimizer = optimiz ,metrics = ['accuracy'])
# convert texts to TF-IDF matrices
X_train = t.texts_to_matrix(X_train, mode='tfidf')
X_test = t.texts_to_matrix(X_test, mode='tfidf')
# fit the model
hist = model.fit(X_train,y_train,validation_data = (X_test,y_test),epochs = 5,batch_size = 64)
Conclusion:
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 12s 467us/step - loss: 0.4621 - acc: 0.7892 - val_loss: 0.3244 - val_acc: 0.8743
Epoch 2/5
25000/25000 [==============================] - 11s 434us/step - loss: 0.2319 - acc: 0.9153 - val_loss: 0.3020 - val_acc: 0.8821
Epoch 3/5
25000/25000 [==============================] - 11s 434us/step - loss: 0.1519 - acc: 0.9483 - val_loss: 0.3126 - val_acc: 0.8803
Epoch 4/5
25000/25000 [==============================] - 11s 435us/step - loss: 0.1023 - acc: 0.9674 - val_loss: 0.3463 - val_acc: 0.8776
Epoch 5/5
25000/25000 [==============================] - 11s 435us/step - loss: 0.0685 - acc: 0.9794 - val_loss: 0.3812 - val_acc: 0.8748
We check the model on our own texts:
# predict our own texts
texts = ['what the hell is going on?', 'that was amazing', 'that was awful']
t.fit_on_texts(texts)
X = t.texts_to_matrix(texts, mode='tfidf')
model.predict(X)
Conclusion:
Out[5]:
array([[0.47051138],
[0.36511892],
[0.24146256]], dtype=float32)
4
Author: MaxU, 2018-04-18 23:00:06