[ ]:
# modified version from https://www.tensorflow.org/tutorials/text/nmt_with_attention

Arabic-to-English translation with a sequence-to-sequence GRU encoder-decoder and Bahdanau attention, using tkseem for tokenization and tnkeeh for cleaning and splitting the parallel data.

[4]:
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/ar_data.txt
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/en_data.txt
--2020-08-28 14:49:14--  https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/ar_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3705050 (3.5M) [text/plain]
Saving to: ‘ar_data.txt’

ar_data.txt         100%[===================>]   3.53M   719KB/s    in 5.1s

2020-08-28 14:49:21 (708 KB/s) - ‘ar_data.txt’ saved [3705050/3705050]

--2020-08-28 14:49:21--  https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/en_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2510593 (2.4M) [text/plain]
Saving to: ‘en_data.txt’

en_data.txt         100%[===================>]   2.39M   588KB/s    in 4.2s

2020-08-28 14:49:26 (588 KB/s) - ‘en_data.txt’ saved [2510593/2510593]

[ ]:
!pip install tkseem
!pip install tnkeeh
[1]:
import re
import nltk
import time
import numpy as np
import tkseem as tk
import tnkeeh as tn
import tensorflow as tf
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

Data Preprocessing

[5]:
tn.clean_data('ar_data.txt','ar_clean_data.txt', remove_diacritics=True)
tn.clean_data('en_data.txt','en_clean_data.txt')

tn.split_parallel_data('ar_clean_data.txt', 'en_clean_data.txt', split_ratio=0.3)
train_inp_text, train_tar_text, test_inp_text, test_tar_text = tn.read_data(mode = 2)
Remove diacritics
Remove Tatweel
Saving to ar_clean_data.txt
Remove Tatweel
Saving to en_clean_data.txt
Split data
Save to data
Read data  ['ar_data.txt', 'en_data.txt', 'test_inp_data.txt', 'test_tar_data.txt', 'train_inp_data.txt', 'train_tar_data.txt']
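
Before tokenizing, it helps to eyeball one cleaned source/target pair. A minimal check (assuming, as the call above implies, that tn.read_data returns lists of sentence strings):

[ ]:
# quick sanity check: first cleaned Arabic sentence, its English reference, and the split sizes
print(train_inp_text[0])
print(train_tar_text[0])
print(len(train_inp_text), 'training pairs |', len(test_inp_text), 'test pairs')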

Tokenization

[6]:
ar_tokenizer = tk.SentencePieceTokenizer(special_tokens=['<s>', '</s>'])
ar_tokenizer.train('data/train_inp_data.txt')

en_tokenizer = tk.SentencePieceTokenizer(special_tokens=['<s>', '</s>'])
en_tokenizer.train('data/train_tar_data.txt')

train_inp_data = ar_tokenizer.encode_sentences(train_inp_text, boundries = ('<s>', '</s>'))
train_tar_data = en_tokenizer.encode_sentences(train_tar_text, boundries = ('<s>', '</s>'))
Training SentencePiece ...
Training SentencePiece ...
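
A quick round trip through the trained tokenizers catches vocabulary problems early. This sketch only relies on tokenizer methods that appear elsewhere in this notebook (encode_sentences, id_to_token, detokenize, vocab_size):

[ ]:
# encode the first training sentence, map the ids back to tokens, then detokenize
ids = ar_tokenizer.encode_sentences([train_inp_text[0]], boundries=('<s>', '</s>'))[0]
tokens = [ar_tokenizer.id_to_token(int(i)) for i in ids]
print(tokens)
print(ar_tokenizer.detokenize(' '.join(tokens)))
print('ar vocab size:', ar_tokenizer.vocab_size, '| en vocab size:', en_tokenizer.vocab_size)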

Create Dataset

[7]:
BATCH_SIZE = 64
BUFFER_SIZE = len(train_inp_data)

dataset = tf.data.Dataset.from_tensor_slices((train_inp_data, train_tar_data)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
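
Pulling a single batch confirms the padded shapes the model will see, i.e. (BATCH_SIZE, max sequence length) for both source and target:

[ ]:
# peek at one (source, target) batch from the tf.data pipeline
example_inp_batch, example_tar_batch = next(iter(dataset))
print(example_inp_batch.shape, example_tar_batch.shape)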

Encoder, Attention, Decoder, and Loss

[8]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights



def get_loss_object():
    return  tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    # mask out padding positions (token id 1) so they do not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 1))
    loss_ = get_loss_object()(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)
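
For reference, the BahdanauAttention layer above implements additive attention: for the decoder hidden state $h_t$ (the query) and encoder outputs $\bar{h}_s$ (the values),

$$\mathrm{score}(h_t, \bar{h}_s) = v^\top \tanh(W_1 h_t + W_2 \bar{h}_s), \qquad \alpha_{ts} = \mathrm{softmax}_s\big(\mathrm{score}(h_t, \bar{h}_s)\big), \qquad c_t = \sum_s \alpha_{ts}\, \bar{h}_s,$$

where $W_1$, $W_2$ and $v$ correspond to self.W1, self.W2 and self.V, the softmax runs over the source time axis (axis=1), and the context vector $c_t$ is concatenated with the embedded decoder input before the GRU step.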

Initialize models

[9]:
units = 1024
embedding_dim = 256
max_length_inp = train_inp_data.shape[1]
max_length_tar = train_tar_data.shape[1]
steps_per_epoch = len(train_inp_data)//BATCH_SIZE
vocab_inp_size = ar_tokenizer.vocab_size
vocab_tar_size = en_tokenizer.vocab_size

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
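
A one-batch forward pass is a cheap way to confirm that the encoder, attention, and decoder shapes line up before committing to a long training run. This is only a sketch: the random decoder input simply stands in for a batch of start tokens.

[ ]:
# sanity-check shapes on a single batch
example_inp_batch, example_tar_batch = next(iter(dataset))

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_inp_batch, sample_hidden)
print('encoder output:', sample_output.shape)    # (BATCH_SIZE, max_length_inp, units)
print('encoder hidden:', sample_hidden.shape)    # (BATCH_SIZE, units)

sample_dec_input = tf.random.uniform((BATCH_SIZE, 1), maxval=vocab_tar_size, dtype=tf.int32)
sample_logits, _, sample_attention = decoder(sample_dec_input, sample_hidden, sample_output)
print('decoder logits:', sample_logits.shape)          # (BATCH_SIZE, vocab_tar_size)
print('attention weights:', sample_attention.shape)    # (BATCH_SIZE, max_length_inp, 1)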

Training Procedure

[10]:
@tf.function
def train_step(inp, targ, enc_hidden, encoder, decoder, optimizer, en_tokenizer):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([en_tokenizer.token_to_id('<s>')] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

def train(epochs = 10, verbose = 0 ):
    optimizer = tf.keras.optimizers.Adam()

    for epoch in range(epochs):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden, encoder, decoder, optimizer, en_tokenizer)
            total_loss += batch_loss

            if batch % 100 == 0 and verbose:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                           batch,
                                                           batch_loss.numpy()))

        if verbose:
            print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                              total_loss / steps_per_epoch))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Start training

[11]:
train(epochs = 10, verbose = 1)
Epoch 1 Batch 0 Loss 8.7736
Epoch 1 Batch 100 Loss 2.1184
Epoch 1 Batch 200 Loss 1.7768
Epoch 1 Batch 300 Loss 1.7248
Epoch 1 Batch 400 Loss 1.6401
Epoch 1 Loss 2.0055
Time taken for 1 epoch 1444.5116345882416 sec

Epoch 2 Batch 0 Loss 1.6100
Epoch 2 Batch 100 Loss 1.5598
Epoch 2 Batch 200 Loss 1.5922
Epoch 2 Batch 300 Loss 1.5228
Epoch 2 Batch 400 Loss 1.4033
Epoch 2 Loss 1.5530
Time taken for 1 epoch 1424.0314059257507 sec

Epoch 3 Batch 0 Loss 1.2111
Epoch 3 Batch 100 Loss 1.4820
Epoch 3 Batch 200 Loss 1.3912
Epoch 3 Batch 300 Loss 1.4882
Epoch 3 Batch 400 Loss 1.2942
Epoch 3 Loss 1.3888
Time taken for 1 epoch 1441.213187456131 sec

Epoch 4 Batch 0 Loss 1.2663
Epoch 4 Batch 100 Loss 1.3889
Epoch 4 Batch 200 Loss 1.1667
Epoch 4 Batch 300 Loss 1.2853
Epoch 4 Batch 400 Loss 1.2746
Epoch 4 Loss 1.2559
Time taken for 1 epoch 1422.2563009262085 sec

Epoch 5 Batch 0 Loss 1.1258
Epoch 5 Batch 100 Loss 1.1021
Epoch 5 Batch 200 Loss 1.1365
Epoch 5 Batch 300 Loss 1.1450
Epoch 5 Batch 400 Loss 1.3664
Epoch 5 Loss 1.1176
Time taken for 1 epoch 1378.149689912796 sec

Epoch 6 Batch 0 Loss 0.9396
Epoch 6 Batch 100 Loss 1.0216
Epoch 6 Batch 200 Loss 1.1066
Epoch 6 Batch 300 Loss 1.0084
Epoch 6 Batch 400 Loss 1.1767
Epoch 6 Loss 0.9732
Time taken for 1 epoch 1328.8411734104156 sec

Epoch 7 Batch 0 Loss 0.9608
Epoch 7 Batch 100 Loss 0.8912
Epoch 7 Batch 200 Loss 0.8274
Epoch 7 Batch 300 Loss 0.8302
Epoch 7 Batch 400 Loss 0.7896
Epoch 7 Loss 0.8303
Time taken for 1 epoch 1294.177453994751 sec

Epoch 8 Batch 0 Loss 0.6882
Epoch 8 Batch 100 Loss 0.6465
Epoch 8 Batch 200 Loss 0.7108
Epoch 8 Batch 300 Loss 0.7176
Epoch 8 Batch 400 Loss 0.7323
Epoch 8 Loss 0.7000
Time taken for 1 epoch 1367.661788702011 sec

Epoch 9 Batch 0 Loss 0.5313
Epoch 9 Batch 100 Loss 0.4794
Epoch 9 Batch 200 Loss 0.6126
Epoch 9 Batch 300 Loss 0.6033
Epoch 9 Batch 400 Loss 0.5891
Epoch 9 Loss 0.5853
Time taken for 1 epoch 1372.0978388786316 sec

Epoch 10 Batch 0 Loss 0.5009
Epoch 10 Batch 100 Loss 0.5200
Epoch 10 Batch 200 Loss 0.4687
Epoch 10 Batch 300 Loss 0.4556
Epoch 10 Batch 400 Loss 0.4321
Epoch 10 Loss 0.4802
Time taken for 1 epoch 1334.807544708252 sec
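
The trained weights live only in memory, so rerunning the notebook means retraining from scratch. A minimal sketch of persisting them with tf.train.Checkpoint (the ./training_checkpoints path is just an example):

[ ]:
# save the encoder/decoder weights so evaluation can be rerun without retraining
checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
checkpoint.save(file_prefix='./training_checkpoints/ckpt')

# to restore later:
# checkpoint.restore(tf.train.latest_checkpoint('./training_checkpoints'))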

Test

[12]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_tar, max_length_inp))

    inputs = ar_tokenizer.encode_sentences([sentence], boundries = ('<s>', '</s>'),
                                  out_length = max_length_inp)
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.token_to_id('<s>')], 0)

    for t in range(max_length_tar):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += en_tokenizer.id_to_token(predicted_id) + ' '

        if en_tokenizer.id_to_token(predicted_id) == '</s>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

def translate(sentences, translations, verbose = 1):
    inputs = sentences
    outputs = []

    for i, sentence in enumerate(sentences):
        result, sentence = evaluate(sentence)
        # the decoder produces English tokens, so detokenize with the English tokenizer
        result = en_tokenizer.detokenize(result)
        result = result.replace('<s>', '').replace('</s>', '')
        result = re.sub(' +', ' ', result)
        outputs.append(result)
        if verbose:
            print('inpt: %s' % (sentence))
            print('pred: {}'.format(result))
            print('true: {}'.format(translations[i]))
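
matplotlib and ticker are imported above and evaluate fills attention_plot, but the attention weights are never visualized. The sketch below plots them as a heatmap; it assumes evaluate is adapted to also return attention_plot, and that the source and predicted sentences are split into tokens for the axis labels.

[ ]:
def plot_attention(attention, input_tokens, predicted_tokens):
    # heatmap of attention weights: rows = predicted target tokens, columns = source tokens
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    ax.set_xticklabels([''] + input_tokens, rotation=90)
    ax.set_yticklabels([''] + predicted_tokens)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

# usage sketch (hypothetical): trim the matrix to the actual sequence lengths before plotting
# plot_attention(attention_plot[:len(predicted_tokens), :len(input_tokens)],
#                input_tokens, predicted_tokens)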
[13]:
translate(test_inp_text[:50], test_tar_text[:50], verbose = 1)
inpt:  حسنا هناك بنك لك
pred:  Well there ' s the name for you
true: Well there ' s a bank for you
inpt:  ماذا حدث يا أبي
pred:  What happened Dad
true: What happened Father
inpt:  حسنا لقد مرت أربع سنوات تقريبا
pred:  Well I ' ll be years since
true: Well it ' s almost four years now
inpt:  هذا صحيح أليس كذلك ما
pred:  That ' s right isn ' t it
true: That ' s right ain ' t it Ma
inpt:  أربع سنوات أربع سنوات 5 يونيو بنسلفانيا
pred:  Four months years of the floor
true: Four years Four years 5th June Pa
inpt:  لم أستطع مواكبة المدفوعات
pred:  I couldn ' t steal up for the prisoner ' s jewels
true: I couldn ' t keep up the payments
inpt:  تتذكره
pred:  You remember him
true: You remember him
inpt:  راندي دنلاب
pred:  The Potem oin the toxic ms
true: Randy Dunlap
inpt:  لحاء الشجر
pred:  l journey less
true: Bark
inpt:  لذلك دخلت وطلب مني أن أجلس
pred:  So he died and keep me to stop
true: So I dropped in and he asked me to sit down
inpt:  جورج هل تعرف ماذا كان يرتدي
pred:  George do you know what was
true: George do you know what he was wearing
inpt:  كيمونو
pred:  Kim was
true: A kimono
inpt:  لا
pred:  No
true: No
inpt:  بلى
pred:  Yeah
true: Yeah
inpt:  أوه الآن اللحاء
pred:  Oh now ' s silly
true: Oh now Bark
inpt:  يجب أن يكون ثوب خلع الملابس
pred:  The ve got a big room
true: It must have been a dressing gown
inpt:  أنا أعرف ثوب خلع الملابس عندما أراه
pred:  I know the whole bedroom
true: I know a dressing gown when I see it
inpt:  كان كيمونو جورج
pred:  He was amazing mom
true: It was a kimono George
inpt:  هل لفساتين الملابس الزهور على م
pred:  Are you a rooster with the prisoner glasses
true: Do dressing gowns have flowers on ' em
inpt:  أوه اللحاء
pred:  Oh Merna
true: Oh Bark
inpt:  لا مانع من ذلك يا أبي
pred:  Don ' t mind who is my father
true: Never mind that Father
inpt:  ماذا قال
pred:  What did he say
true: What did he say
inpt:  أوه لقد كان لطيفا بما فيه الكفاية
pred:  Oh he ' s my mom enough
true: Oh he was nice enough
inpt:  أوه الآن اللحاء
pred:  Oh now ' s silly
true: Oh now Bark
inpt:  نعم لقد فعل
pred:  Yes he ' s done
true: Yeah he did
inpt:  كم من الوقت أعطاك يا أبي
pred:  How long time is still Dad
true: How much time did he give you Father
inpt:  ستة أشهر
pred:  Nine months
true: Six months
inpt:  يا حسنا إذن لا يوجد اندفاع فوري
pred:  Oh Well uh we ' s no coffin
true: Oh Oh well then there ' s no immediate rush
inpt:  متى تصل الشهور الستة
pred:  When did you a pot s I inquire
true: When are the six months up
inpt:  الثلاثاء
pred:  Paper
true: Tuesday
inpt:  لكن ولكن لماذا لم تخبرنا عاجلا
pred:  But but why didn ' t you stop them
true: But but why didn ' t you tell us sooner
inpt:  الثلاثاء
pred:  Paper
true: Tuesday
inpt:  لا يعطينا الكثير من الوقت أليس كذلك
pred:  Don ' t give them a lot of time is it
true: Doesn ' t give us much time does it
inpt:  أو
pred:  Or
true: Or
inpt:  هذا صحيح
pred:  That ' s right
true: That ' s right
inpt:  بالطبع
pred:  Of course
true: Oh sure
inpt:  الذي أعطاك هذا اللباس جيش الخلاص
pred:  The d put this place is a few weeks
true: Who gave you that dress the Salvation Army
inpt:  و
pred:  And
true: And uh
inpt:  بلى
pred:  Yeah
true: Yeah
inpt:  حسنا لا أستطيع فعل ذلك بمفردي
pred:  Well I can ' t do it on your own
true: Well I can ' t do it alone
inpt:  لا إنها لم ترسل لنا البرتقالي
pred:  No she ' s not your delicate ed and the Israelites
true: No she ' s never even sent us an orange
inpt:  نعم ولكن ماذا عن هارفي
pred:  Yes but what brings about
true: Yes but what about Harvey
inpt:  أوه نحن لا نريد أن نسأل هارفي
pred:  Oh we don ' t want to remember my mom
true: Oh we wouldn ' t want to ask Harvey
inpt:  أوه لا لن نسأل هارفي
pred:  Oh no I wouldn ' t die
true: Oh no we wouldn ' t ask Harvey
inpt:  لا طلبنا من هارفي الزواج من نيلي
pred:  Don ' t you caught my mom did you Rachel
true: No we asked Harvey to marry Nellie
inpt:  لا يمكننا أن نتوقع من الرجل أن يفعل أكثر من ذلك
pred:  We can ' t we ' ve got a man can do that
true: We can ' t expect the guy to do more than that
inpt:  روبرت توقف عن الحديث بهذه الطريقة
pred:  Elizabeth stop talking to the way
true: Robert stop talking that way
inpt:  قصها يا روبرت
pred:  A spear it Robert
true: Cut it out Robert
inpt:  ليس لدي مجال لكلا منكما
pred:  I haven ' t the whole world I ' re in
true: I haven ' t room for both of you
inpt:  لا يوجد سوى أريكة صغيرة في غرفة المعيشة
pred:  There ' s nothing for a big tree in the street
true: There ' s only a small couch in the living room