Torch taking 4X the time of Keras

We have run an LSTM model for binary classification that has multiple sequential inputs whose outputs are concatenated before the classification head. Below is the PyTorch model.

import pandas as pd
import numpy as np
import time
import pickle
import feather
from itertools import combinations, groupby
from collections import Counter

#Tokenize the data manually
from nltk.tokenize import word_tokenize

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torchvision import models
from torchsummary import summary

events = ['eve_seq_page_action','eve_seq_page_type','eve_seq_store_name','eve_seq_sub_page_type']
ts = ['ts_seq_page_action','ts_seq_page_type','ts_seq_store_name','ts_seq_sub_page_type']

def_val = 1137 + 1
max_len = 150
pod_flag = 0

#LSTM Model defined

class LSTM_model(nn.Module):
    def __init__(self,vocab_size,hidden_size,add_feats,embed_size):
        super(LSTM_model, self).__init__()
        #Model Layers
        self.embedding0 = nn.Embedding(vocab_size+1,embed_size,padding_idx=def_val)
        self.embedding1 = nn.Embedding(vocab_size+1,embed_size,padding_idx=def_val)
        self.embedding2 = nn.Embedding(vocab_size+1,embed_size,padding_idx=def_val)
        self.embedding3 = nn.Embedding(vocab_size+1,embed_size,padding_idx=def_val)
        self.lstm0 = nn.LSTM(embed_size+add_feats, hidden_size,batch_first=True)#input size is embed_size+add_feats to accommodate the additional feature, i.e. ts
        self.lstm1 = nn.LSTM(embed_size+add_feats, hidden_size,batch_first=True)
        self.lstm2 = nn.LSTM(embed_size+add_feats, hidden_size,batch_first=True)
        self.lstm3 = nn.LSTM(embed_size+add_feats, hidden_size,batch_first=True)
        self.fc1 = nn.Linear(hidden_size*4,64)
        self.fc2 = nn.Linear(64,32)
        self.fc3 = nn.Linear(32,1)
        self.drop = nn.Dropout(0.3)
        #Activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,a,b,c,d,ts0,ts1,ts2,ts3):
        #Look up the embedding for each sequential input
        x0 = self.embedding0(a.long())
        x1 = self.embedding1(b.long())
        x2 = self.embedding2(c.long())
        x3 = self.embedding3(d.long())
        #Concatenate the numerical time-spent feature with each embedded sequence
        co0 = torch.cat([x0,ts0.unsqueeze(2)],2)
        co1 = torch.cat([x1,ts1.unsqueeze(2)],2)
        co2 = torch.cat([x2,ts2.unsqueeze(2)],2)
        co3 = torch.cat([x3,ts3.unsqueeze(2)],2)
        #Run 4 separate LSTMs
        x0 , (h0,c0) = self.lstm0(co0)
        x1 , (h1,c1) = self.lstm1(co1)
        x2 , (h2,c2) = self.lstm2(co2)
        x3 , (h3,c3) = self.lstm3(co3)
        #Take the last hidden state from each LSTM
        h0 = h0[-1,:,:]
        h1 = h1[-1,:,:]
        h2 = h2[-1,:,:]
        h3 = h3[-1,:,:]
        #Concatenate the 4 hidden states
        f = torch.cat([h0,h1,h2,h3],1)
        x = self.relu(self.fc1(f))
        x = self.drop(x)
        x = self.relu(self.fc2(x))
        x = self.drop(x)
        x = self.sigmoid(self.fc3(x))
        
        return x    

#Model
model = LSTM_model(vocab_size=def_val,hidden_size=32,add_feats=1,embed_size=50)
#Loss
loss_fn = nn.BCELoss()
#Optimizer
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

import torch.utils.data as data_utils
#Dummy dataset: 8 zero-filled sequence tensors plus random 0/1 targets (matching the Keras example below)
dataset = data_utils.TensorDataset(
    torch.zeros(100000,150), torch.zeros(100000,150),
    torch.zeros(100000,150), torch.zeros(100000,150),
    torch.zeros(100000,150), torch.zeros(100000,150),
    torch.zeros(100000,150), torch.zeros(100000,150),
    torch.randint(0,2,(100000,1)).float())

loader = torch.utils.data.DataLoader(
    dataset,
    num_workers=0,
    batch_size=256,
#     shuffle = True
)

%%time
# torch.set_num_threads(8)
n_epochs = 1 # or whatever
losses = []

for epoch in range(n_epochs):
    model = model.train()
    for batch_idx, (a,b,c,d,ts0,ts1,ts2,ts3,y) in enumerate(loader):
        t0 = time.time()

        print(a.size(),b.size())

        #Forward pass (calling the module directly instead of model.forward())
        outputs = model(a,b,c,d,ts0,ts1,ts2,ts3)

        loss = loss_fn(outputs,y)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()
        
        
#         del [a,b,c,d,ts0,ts1,ts2,ts3]
#         gc.collect()

        
        if batch_idx%100 == 0:
            losses.append(loss.item())  #store a float rather than the graph-holding tensor
            print("epoch {}.\ttimespent {}.\tloss : {}".format(epoch,time.time() - t0,loss.item()))

It takes almost 4 minutes to run 1 epoch on this sample dataset. The current implementation runs on a CPU.
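
Before comparing the two frameworks, it can help to check how many CPU threads PyTorch is actually using and to profile where a single forward/backward pass spends its time. Below is a minimal sketch, assuming the model, loss_fn and loader defined above (torch.profiler requires PyTorch 1.8+):

#Profile one batch on the CPU
from torch.profiler import profile, ProfilerActivity

print("CPU threads available to PyTorch:", torch.get_num_threads())

#Grab a single batch and run one forward/backward pass under the profiler
a,b,c,d,ts0,ts1,ts2,ts3,y = next(iter(loader))
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    out = model(a,b,c,d,ts0,ts1,ts2,ts3)
    loss_fn(out, y).backward()

#Show which operators dominate the CPU time
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))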

However, the same model with an equal number of parameters, implemented in Keras, runs in 1 minute for a single epoch (a quick parameter-count check is sketched after the Keras fit call below).

Below is the Keras Implementation:

###Keras Experiment###

import pandas as pd
import numpy as np
import pickle
import feather
from itertools import combinations, groupby
from collections import Counter

#Tokenize the data manually
from nltk.tokenize import word_tokenize

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Dot, Reshape, Concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
# from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

def RNN():
    #Input for the Sequential Data
    input0 = Input(name='input0', shape=[max_len])
    input1 = Input(name='input1', shape=[max_len])
    input2 = Input(name='input2', shape=[max_len])
    input3 = Input(name='input3', shape=[max_len])
    #Input Profiles for the Timespent on each page
    input_ts0 = Input(name='input01', shape=[max_len,1])
    input_ts1 = Input(name='input02', shape=[max_len,1])
    input_ts2 = Input(name='input03', shape=[max_len,1])
    input_ts3 = Input(name='input04', shape=[max_len,1])
    #Embedding Layer
    embed0 = Embedding(def_val+1, 50, input_length=max_len)(input0)
    embed1 = Embedding(def_val+1, 50, input_length=max_len)(input1)
    embed2 = Embedding(def_val+1, 50, input_length=max_len)(input2)
    embed3 = Embedding(def_val+1, 50, input_length=max_len)(input3)
    ##concatenate the embedding and time spent on each page
    ts_eve_concat0 = Concatenate(name='Concatenated_eve_ts0')([embed0,input_ts0])
    ts_eve_concat1 = Concatenate(name='Concatenated_eve_ts1')([embed1,input_ts1])
    ts_eve_concat2 = Concatenate(name='Concatenated_eve_ts2')([embed2,input_ts2])
    ts_eve_concat3 = Concatenate(name='Concatenated_eve_ts3')([embed3,input_ts3])
    #LSTM on all the individual layers
    lstm0 = LSTM(32)(ts_eve_concat0)
    lstm1 = LSTM(32)(ts_eve_concat1)
    lstm2 = LSTM(32)(ts_eve_concat2)
    lstm3 = LSTM(32)(ts_eve_concat3)
    ##Concatenate all the LSTM Layers
    concat_lstm = Concatenate(name='Concatenated_lstm')([lstm0,lstm1,lstm2,lstm3])
    layer = Dense(64,name='FC1')(concat_lstm)
    layer = Activation('relu')(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(32,name='FC2',activation='relu')(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=[input0,input1,input2,input3,
                         input_ts0,input_ts1,input_ts2,input_ts3],outputs=layer)
    return model

model = RNN()
model.summary()

%%time
model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit([np.zeros([100000,150]),np.zeros([100000,150]),np.zeros([100000,150]),np.zeros([100000,150]),\
          np.zeros([100000,150]),np.zeros([100000,150]),np.zeros([100000,150]),np.zeros([100000,150])], np.random.randint(2, size=(100000, 1)), 
          epochs = 1 ,batch_size= 256, verbose = True)
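
The comparison assumes the two networks have an equal number of parameters; a quick way to verify that is to count trainable parameters on both sides. A minimal sketch, where torch_model and keras_model are hypothetical handles for the two instances (the post reuses the name model for each framework):

#PyTorch: total number of trainable parameter elements (torch_model is the LSTM_model instance)
n_torch = sum(p.numel() for p in torch_model.parameters() if p.requires_grad)
print("PyTorch trainable parameters:", n_torch)

#Keras: built-in counter on the functional model (keras_model is the compiled RNN() model)
print("Keras trainable parameters:", keras_model.count_params())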

Any reason why this would be the case?

python

keras

pytorch

lstm
