Poetry Classification

[1]:
!pip install tkseem
!pip install tnkeeh
[ ]:
!mkdir -p meters
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/meter_classification/meters/data.txt -O meters/data.txt
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/meter_classification/meters/labels.txt -O meters/labels.txt

Imports

[3]:
import tensorflow as tf
import tkseem as tk
import tnkeeh as tn
import numpy as np
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

Process data

[4]:
tn.clean_data(file_path = 'meters/data.txt', save_path = 'meters/cleaned_data.txt', remove_diacritics=True,
      execluded_chars=['!', '.', '?', '#'])
tn.split_classification_data('meters/cleaned_data.txt', 'meters/labels.txt')
train_data, test_data, train_lbls, test_lbls = tn.read_data(mode = 1)
Remove diacritics
Remove Tatweel
Saving to meters/cleaned_data.txt
Split data
Save to data
Read data  ['test_data.txt', 'test_lbls.txt', 'train_data.txt', 'train_lbls.txt']
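
The split files land in a local data/ directory, which is where the tokenizer reads its training text from in the next step. As an optional sanity check, a quick peek at the first lines:

[ ]:
!head -2 data/train_data.txt
!head -2 data/train_lbls.txt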

Tokenization

[5]:
tokenizer = tk.CharacterTokenizer()
tokenizer.train('data/train_data.txt')
Training CharacterTokenizer ...
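
Before encoding anything, it helps to check what the tokenizer learned. A minimal sanity check, assuming tkseem's usual tokenize / vocab_size interface (vocab_size also sizes the embedding layer below):

[ ]:
print('vocab size:', tokenizer.vocab_size)
print(tokenizer.tokenize('أنا النبي لا كذب'))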

Tokenize data

[6]:
def preprocess(tokenizer, data, labels):
    # encode each sentence to integer ids; encode_sentences pads the whole
    # batch to its longest sentence
    X = tokenizer.encode_sentences(data)
    y = np.array([int(lbl) for lbl in labels])
    return X, y
[7]:
# process training data
X_train, y_train = preprocess(tokenizer, train_data, train_lbls)

# process test data
X_test, y_test = preprocess(tokenizer, test_data, test_lbls)
[8]:
# every row of X_train is padded to the same width, so this is the model's fixed input length
max_length = max(len(sent) for sent in X_train)
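
A quick shape check. Note that encode_sentences pads each call independently, so X_test can come out with a different width than X_train; it is re-encoded to max_length before evaluation below:

[ ]:
print('train:', np.shape(X_train), 'test:', np.shape(X_test), 'max_length:', max_length)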

Model

[9]:
model = Sequential()
model.add(Input((max_length,)))
model.add(Embedding(tokenizer.vocab_size, 256))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(14, activation = 'softmax'))
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
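
The network stacks three bidirectional GRUs over character embeddings and ends in a softmax over the 14 classical meters; model.summary() prints the layer shapes and parameter count:

[ ]:
model.summary()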
[10]:
model.fit(X_train, y_train, validation_split = 0.1, epochs = 10, batch_size= 256, shuffle = True)
Epoch 1/10
133/133 [==============================] - 465s 3s/step - loss: 2.3899 - accuracy: 0.1572 - val_loss: 1.9431 - val_accuracy: 0.2902
Epoch 2/10
133/133 [==============================] - 452s 3s/step - loss: 1.8384 - accuracy: 0.3214 - val_loss: 1.6722 - val_accuracy: 0.3905
Epoch 3/10
133/133 [==============================] - 436s 3s/step - loss: 1.5614 - accuracy: 0.4314 - val_loss: 1.5018 - val_accuracy: 0.4581
Epoch 4/10
133/133 [==============================] - 381s 3s/step - loss: 1.1860 - accuracy: 0.5879 - val_loss: 0.8718 - val_accuracy: 0.7109
Epoch 5/10
133/133 [==============================] - 370s 3s/step - loss: 0.7501 - accuracy: 0.7595 - val_loss: 0.5991 - val_accuracy: 0.8085
Epoch 6/10
133/133 [==============================] - 360s 3s/step - loss: 0.5233 - accuracy: 0.8410 - val_loss: 0.5352 - val_accuracy: 0.8332
Epoch 7/10
133/133 [==============================] - 361s 3s/step - loss: 0.4070 - accuracy: 0.8807 - val_loss: 0.4281 - val_accuracy: 0.8708
Epoch 8/10
133/133 [==============================] - 355s 3s/step - loss: 0.3229 - accuracy: 0.9074 - val_loss: 0.3947 - val_accuracy: 0.8841
Epoch 9/10
133/133 [==============================] - 356s 3s/step - loss: 0.2724 - accuracy: 0.9241 - val_loss: 0.3725 - val_accuracy: 0.8926
Epoch 10/10
133/133 [==============================] - 355s 3s/step - loss: 0.2301 - accuracy: 0.9352 - val_loss: 0.3540 - val_accuracy: 0.8989
[10]:
<tensorflow.python.keras.callbacks.History at 0x7ff3a7692160>
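
The run above only tracks a validation split carved out of the training data; the held-out test files are never scored. A minimal evaluation sketch: the test sentences are re-encoded with out_length = max_length first, since encode_sentences pads each batch only to its own longest sentence while the model expects a fixed input width:

[ ]:
X_test_padded = tokenizer.encode_sentences(test_data, out_length = max_length)
model.evaluate(X_test_padded, y_test, batch_size = 256)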

Test

[11]:
# the 14 classical Arabic meters, indexed by the integer labels in labels.txt
label2name = ['السريع', 'الكامل', 'المتقارب', 'المتدارك', 'المنسرح', 'المديد',
              'المجتث', 'الرمل', 'البسيط', 'الخفيف', 'الطويل', 'الوافر', 'الهزج', 'الرجز']
[14]:
def classify(sentence):
    # pad the single sentence to the fixed length the model was trained on
    sequence = tokenizer.encode_sentences([sentence], out_length = max_length)
    pred = model.predict(sequence)[0]
    print(label2name[int(np.argmax(pred))], np.max(pred))
[15]:
classify("ما تردون على هذا المحب # دائبا يشكو إليكم في الكتب")
classify("ولد الهدى فالكائنات ضياء # وفم الزمان تبسم وسناء")
classify(" لك يا منازل في القلوب منازل # أقفرت أنت وهن منك أواهل")
classify("ومن لم يمت بالسيف مات بغيره # تعددت الأسباب والموت واحد")
classify("أنا النبي لا كذب # أنا ابن عبد المطلب")
classify("هذه دراهم اقفرت # أم ربور محتها الدهور")
classify("هزجنا في بواديكم # فأجزلتم عطايانا")
classify("بحر سريع ماله ساحل # مستفعلن مستفعلن فاعلن")
classify("مَا مَضَى فَاتَ وَالْمُؤَمَّلُ غَيْبٌ # وَلَكَ السَّاعَةُ الَّتِيْ أَنْتَ فِيْهَا")
classify("يا ليلُ الصبّ متى غدهُ # أقيامُ الساعة موعدهُ")
الرمل 0.9957462
الكامل 0.98703927
الكامل 0.9792284
الطويل 0.99692947
الهزج 0.94578993
المديد 0.3755584
الهزج 0.981885
الرجز 0.8000305
المتدارك 0.7176092
المتدارك 0.99850094
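
To reuse the classifier without retraining, both the network and the tokenizer vocabulary have to be persisted. A sketch; the Keras call is standard, while save_model is assumed to be tkseem's tokenizer-serialization method:

[ ]:
model.save('meter_model.h5')        # Keras model: architecture + weights
tokenizer.save_model('vocab.pl')    # tkseem vocabulary (assumed API)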