Sentiment Analysis

[ ]:
!pip install tkseem
!pip install tnkeeh
[ ]:
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/sentiment_analysis/sentiment/data.txt
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/sentiment_analysis/sentiment/labels.txt

Imports

[3]:
import numpy as np
import tkseem as tk
import tnkeeh as tn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Embedding, Dense, Dropout, Bidirectional

Process data

[4]:
# clean the raw text; the log below shows diacritics and tatweel being removed
# (note: `execluded_chars` is tnkeeh's own spelling of the keyword argument)
tn.clean_data(file_path = 'sentiment/data.txt', save_path = 'sentiment/cleaned_data.txt',
              remove_diacritics = True, execluded_chars = ['!', '.', '?'])
tn.split_classification_data('sentiment/cleaned_data.txt', 'sentiment/labels.txt')
train_data, test_data, train_lbls, test_lbls = tn.read_data(mode = 1)
Remove diacritics
Remove Tatweel
Saving to sentiment/cleaned_data.txt
Split data
Save to data
Read data  ['test_data.txt', 'test_lbls.txt', 'train_data.txt', 'train_lbls.txt']
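
A quick sanity check before going further (a minimal sketch, assuming tnkeeh's read_data returns plain Python lists of strings):

[ ]:
# inspect one training example and the overall class balance
print(train_data[0], train_lbls[0])
print('train size:', len(train_data), 'test size:', len(test_data))
print('label counts:', {lbl: train_lbls.count(lbl) for lbl in set(train_lbls)})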
[5]:
# longest training sample, measured in characters; used later as the padding length
max_length = max(len(data) for data in train_data)

Tokenize

[6]:
tokenizer = tk.SentencePieceTokenizer()
tokenizer.train('data/train_data.txt')
Training SentencePiece ...
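
To see what the trained tokenizer learned, we can segment one sentence into subwords (a quick check using tkseem's tokenize method):

[ ]:
# inspect the learned subword segmentation and the vocabulary size
print(tokenizer.tokenize(train_data[0]))
print('vocab size:', tokenizer.vocab_size)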

Tokenize data

[7]:
def preprocess(tokenizer, data, labels):
    # encode sentences into padded id sequences and labels into an integer array
    X = tokenizer.encode_sentences(data)
    y = np.array([int(lbl) for lbl in labels])
    return X, y
[8]:
# process training data
X_train, y_train = preprocess(tokenizer, train_data, train_lbls)

# process test data
X_test, y_test = preprocess(tokenizer, test_data, test_lbls)
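
It is worth confirming the array shapes before building the model; encode_sentences pads to the longest sentence in each split, so the train and test widths may differ (a sketch assuming it returns NumPy arrays):

[ ]:
# shapes: (num_samples, padded_length) for X, (num_samples,) for y
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_test:', X_test.shape, 'y_test:', y_test.shape)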

Model

[9]:
model = Sequential()
model.add(Embedding(tokenizer.vocab_size, 32))  # map subword ids to 32-d vectors
model.add(Bidirectional(GRU(units = 32)))       # read the sequence in both directions
model.add(Dense(32, activation = 'tanh'))
model.add(Dropout(0.3))                         # regularize the dense features
model.add(Dense(1, activation = 'sigmoid'))     # binary sentiment probability
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
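
Since the Embedding layer has no fixed input length, the model accepts sequences of any padded width. A sketch for inspecting the layer stack before training (building with a variable-length shape lets Keras report parameter counts):

[ ]:
# build with a variable-length input so summary() can run before fit()
model.build(input_shape = (None, None))
model.summary()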

Train

[10]:
history = model.fit(X_train, y_train, epochs = 12, validation_split = 0.1, batch_size = 128, shuffle = True)
Epoch 1/12
6/6 [==============================] - 3s 445ms/step - loss: 0.6936 - accuracy: 0.4986 - val_loss: 0.6990 - val_accuracy: 0.3625
Epoch 2/12
6/6 [==============================] - 2s 324ms/step - loss: 0.6883 - accuracy: 0.5097 - val_loss: 0.6986 - val_accuracy: 0.3625
Epoch 3/12
6/6 [==============================] - 1s 193ms/step - loss: 0.6827 - accuracy: 0.6139 - val_loss: 0.6890 - val_accuracy: 0.5875
Epoch 4/12
6/6 [==============================] - 2s 254ms/step - loss: 0.6706 - accuracy: 0.8222 - val_loss: 0.6814 - val_accuracy: 0.6625
Epoch 5/12
6/6 [==============================] - 1s 238ms/step - loss: 0.6473 - accuracy: 0.8861 - val_loss: 0.6730 - val_accuracy: 0.6875
Epoch 6/12
6/6 [==============================] - 1s 214ms/step - loss: 0.6117 - accuracy: 0.9014 - val_loss: 0.6543 - val_accuracy: 0.7125
Epoch 7/12
6/6 [==============================] - 2s 266ms/step - loss: 0.5536 - accuracy: 0.9167 - val_loss: 0.6210 - val_accuracy: 0.7500
Epoch 8/12
6/6 [==============================] - 1s 237ms/step - loss: 0.4579 - accuracy: 0.9347 - val_loss: 0.5906 - val_accuracy: 0.7500
Epoch 9/12
6/6 [==============================] - 1s 197ms/step - loss: 0.3353 - accuracy: 0.9500 - val_loss: 0.5605 - val_accuracy: 0.7375
Epoch 10/12
6/6 [==============================] - 1s 219ms/step - loss: 0.2050 - accuracy: 0.9639 - val_loss: 0.5069 - val_accuracy: 0.7625
Epoch 11/12
6/6 [==============================] - 1s 216ms/step - loss: 0.1315 - accuracy: 0.9694 - val_loss: 0.5215 - val_accuracy: 0.7250
Epoch 12/12
6/6 [==============================] - 1s 166ms/step - loss: 0.1063 - accuracy: 0.9625 - val_loss: 0.5699 - val_accuracy: 0.7125
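
The returned history object records per-epoch metrics, so we can pull out the best validation epoch (a small convenience check, not part of the original run):

[ ]:
# report the epoch with the highest validation accuracy
best = int(np.argmax(history.history['val_accuracy']))
print(f"best epoch: {best + 1}, val_accuracy: {history.history['val_accuracy'][best]:.4f}")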

Test

[11]:
def classify(sentence):
    # keep the batch dimension: encode_sentences returns an array of shape (1, max_length)
    sequence = tokenizer.encode_sentences([sentence], out_length = max_length)
    pred = model.predict(sequence)[0][0]
    print(pred)
[12]:
classify("سيئة جدا جدا")
classify("رائعة جدا")
0.06951779
0.89656436
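
Beyond spot checks, the whole held-out split can be scored directly (a minimal sketch using Keras's evaluate; accuracy uses the default 0.5 threshold):

[ ]:
# evaluate on the held-out test split
loss, acc = model.evaluate(X_test, y_test, verbose = 0)
print(f'test loss: {loss:.4f}, test accuracy: {acc:.4f}')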