In [1]:
import numpy as np
import pandas as pd
from time import time, ctime
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
In [35]:
!ls ./Boltzmann_Machines/ml-100k/
README	   u.data   u.item	  u1.base  u2.test  u4.base  u5.test  ub.base
allbut.pl  u.genre  u.occupation  u1.test  u3.base  u4.test  ua.base  ub.test
mku.sh	   u.info   u.user	  u2.base  u3.test  u5.base  ua.test
In [37]:
movies = pd.read_csv('./Boltzmann_Machines/ml-1m/movies.dat',
                 sep='::', header = None, 
                 engine='python', encoding='latin-1',
                    names=['MovieID, Title','Genres'])
print movies.shape
movies.head()
(3883, 2)
Out[37]:
MovieID, Title Genres
1 Toy Story (1995) Animation|Children's|Comedy
2 Jumanji (1995) Adventure|Children's|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama
5 Father of the Bride Part II (1995) Comedy

UserID::Gender::Age::Occupation::Zip-code

In [22]:
users = pd.read_csv('./Boltzmann_Machines/ml-1m/users.dat',
                    sep='::', header = None, 
                 engine='python', encoding='latin-1',
                   names=['UserID','Gender','Age','Occupation','Zip'])
print users.shape
users.head()
(6040, 5)
Out[22]:
UserID Gender Age Occupation Zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

UserID::MovieID::Rating::Timestamp

In [24]:
ratings = pd.read_csv('./Boltzmann_Machines/ml-1m/ratings.dat',
                      sep='::', header = None, 
                 engine='python', encoding='latin-1',
                    names=['UserID','MovieID','Rating','Timestamp'])
print ratings.shape
ratings.head()
(1000209, 4)
Out[24]:
UserID MovieID Rating Timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [12]:
from datetime import datetime
In [28]:
%%time
ratings['Time']=ratings['Timestamp'].apply(datetime.fromtimestamp)
CPU times: user 1.15 s, sys: 488 ms, total: 1.64 s
Wall time: 1.63 s
In [31]:
datetime.fromtimestamp(int("874965758")).strftime('%Y-%m-%d %H:%M:%S')
Out[31]:
'1997-09-22 18:02:38'
In [30]:
ratings.tail()
Out[30]:
UserID MovieID Rating Timestamp Time
1000204 6040 1091 1 956716541 2000-04-25 22:35:41
1000205 6040 1094 5 956704887 2000-04-25 19:21:27
1000206 6040 562 5 956704746 2000-04-25 19:19:06
1000207 6040 1096 4 956715648 2000-04-25 22:20:48
1000208 6040 1097 4 956715569 2000-04-25 22:19:29

training, test sets

  • from 100k dataset
In [112]:
training_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.base',
                           delimiter='\t', names=['UserID','MovieID','Rating','Timestamp'])
print training_set.shape
training_set.head()
(80000, 4)
Out[112]:
UserID MovieID Rating Timestamp
0 1 1 5 874965758
1 1 2 3 876893171
2 1 3 4 878542960
3 1 4 3 876893119
4 1 5 3 889751712
In [113]:
t_df=training_set.copy()
training_set = np.array(training_set, dtype='int')
training_set[:2]
Out[113]:
array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171]])
In [114]:
test_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.test',
                           delimiter='\t', names=['UserID','MovieID','Rating','Timestamp'])
print test_set.shape
test_set.head()
(20000, 4)
Out[114]:
UserID MovieID Rating Timestamp
0 1 6 5 887431973
1 1 10 3 875693118
2 1 12 5 878542960
3 1 14 5 874965706
4 1 17 3 875073198
In [115]:
e_df=test_set.copy()
test_set = np.array(test_set, dtype='int')
test_set[:2]
Out[115]:
array([[        1,         6,         5, 887431973],
       [        1,        10,         3, 875693118]])
In [116]:
nb_users  = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_users, nb_movies
Out[116]:
(943, 1682)
In [117]:
# C=lambda X: (len(t_df[X].unique()), len(e_df[X].unique()))
# CC=lambda X: (t_df[X].max(), e_df[X].max())
# C('UserID')
In [118]:
len(set(np.concatenate((test_set[:,0], training_set[:,0])))), len(set(np.concatenate((test_set[:,1], training_set[:,1]))))
Out[118]:
(943, 1682)
In [149]:
pd.DataFrame(test_set)[pd.DataFrame(test_set).sum(1)==0].shape
Out[149]:
(484, 1682)
In [119]:
def convert(data):
    new_data=[]
    for id_users in range(1, nb_users+1):
        id_movies  = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies -1] = id_ratings
        new_data.append(list(ratings))
    return new_data
In [120]:
%%time
training_set = convert(training_set)
test_set = convert(test_set)
CPU times: user 460 ms, sys: 12 ms, total: 472 ms
Wall time: 473 ms
In [121]:
print pd.DataFrame(training_set).shape
len(training_set), len(training_set[0])
(943, 1682)
Out[121]:
(943, 1682)
In [108]:
pd.DataFrame(training_set).head()
Out[108]:
0 1 2 3 4 5 6 7 8 9 ... 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
0 5.0 3.0 4.0 3.0 3.0 0.0 4.0 1.0 5.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1682 columns

use Pandas

In [109]:
t_df.tail()
Out[109]:
UserID MovieID Rating Timestamp
79995 943 1067 2 875501756
79996 943 1074 4 888640250
79997 943 1188 3 888640250
79998 943 1228 3 888640275
79999 943 1330 3 888692465
In [95]:
tmp = df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']),list(x['Rating'])))
In [96]:
tmp.head()
Out[96]:
UserID
1    [(1, 5), (2, 3), (3, 4), (4, 3), (5, 3), (7, 4...
2    [(1, 4), (10, 2), (14, 4), (25, 4), (100, 5), ...
3    [(181, 4), (258, 2), (260, 4), (268, 3), (271,...
4    [(11, 4), (210, 3), (258, 5), (271, 4), (300, ...
5    [(21, 3), (25, 3), (29, 4), (50, 4), (63, 1), ...
dtype: object
In [97]:
tmp = tmp.apply(lambda x: {i[0]:i[1] for i in x})
In [98]:
tmp.head()
Out[98]:
UserID
1    {1: 5, 2: 3, 3: 4, 4: 3, 5: 3, 7: 4, 8: 1, 9: ...
2    {1: 4, 258: 3, 10: 2, 269: 4, 14: 4, 272: 5, 2...
3    {258: 2, 260: 4, 268: 3, 271: 3, 288: 2, 302: ...
4    {258: 5, 359: 5, 324: 5, 358: 2, 327: 5, 328: ...
5    {21: 3, 25: 3, 29: 4, 50: 4, 63: 1, 66: 1, 70:...
dtype: object
In [104]:
pd.DataFrame(tmp.tolist()).head()
Out[104]:
1 2 3 4 5 6 7 8 9 10 ... 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682
0 5.0 3.0 4.0 3.0 3.0 NaN 4.0 1.0 5.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 4.0 NaN NaN NaN NaN NaN NaN NaN NaN 2.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 1650 columns

In [151]:
from collections import defaultdict
In [164]:
d = defaultdict(int)
for i in range(1, 1683): d[i]
In [173]:
# def helper(x):
#     d = defaultdict(int)
#     for i in range(1, 1683): d[i]

#     d[x['MovieID']]=x['Rating']
#     return d
In [91]:
%%time
df=t_df.copy()
# df.columns=['user','movie','rating','time']
tmp = df.groupby('UserID').apply(lambda x: zip(list(x['MovieID']),list(x['Rating'])))
tmp = tmp.apply(lambda x: {i[0]:i[1] for i in x})

mapped = pd.DataFrame(tmp.tolist(),columns=range(1,nb_movies+1)).fillna(0)
mapped.shape
CPU times: user 808 ms, sys: 24 ms, total: 832 ms
Wall time: 826 ms
In [92]:
mapped.head()
Out[92]:
1 2 3 4 5 6 7 8 9 10 ... 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682
0 5.0 3.0 4.0 3.0 3.0 0.0 4.0 1.0 5.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1682 columns

In [47]:
mapped.values.shape
Out[47]:
(943, 1682)
In [51]:
type(mapped.values)
Out[51]:
numpy.ndarray

two methods yield the same result

In [54]:
np.array_equal(mapped.values, np.array(training_set))
Out[54]:
True
In [56]:
mapped.values.tolist()==training_set
Out[56]:
True

pytorch

  • tensors
In [179]:
len(training_set), len(training_set[40]), len(test_set), len(test_set[9])
Out[179]:
(943, 1682, 943, 1682)
In [181]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

converting the ratings into binary ratings 1 and 0

In [182]:
type(training_set)
Out[182]:
torch.FloatTensor
In [64]:
training_set
Out[64]:
    5     3     4  ...      0     0     0
    4     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    5     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     5     0  ...      0     0     0
[torch.FloatTensor of size 943x1682]
In [63]:
training_set==0
Out[63]:
    0     0     0  ...      1     1     1
    0     1     1  ...      1     1     1
    1     1     1  ...      1     1     1
       ...          ⋱          ...       
    0     1     1  ...      1     1     1
    1     1     1  ...      1     1     1
    1     0     1  ...      1     1     1
[torch.ByteTensor of size 943x1682]
In [69]:
training_set[training_set==0]=-1
In [70]:
training_set
Out[70]:
    5     3     4  ...     -1    -1    -1
    4    -1    -1  ...     -1    -1    -1
   -1    -1    -1  ...     -1    -1    -1
       ...          ⋱          ...       
    5    -1    -1  ...     -1    -1    -1
   -1    -1    -1  ...     -1    -1    -1
   -1     5    -1  ...     -1    -1    -1
[torch.FloatTensor of size 943x1682]
In [343]:
training_set[training_set==0]=-1
training_set[training_set==1]=0
training_set[training_set==2]=0
training_set[training_set>=3]=1
In [344]:
test_set[test_set==0]=-1
test_set[test_set==1]=0
test_set[test_set==2]=0
test_set[test_set>=3]=1

torch

In [184]:
torch.randn(2,4)
Out[184]:
-0.2062  0.5657 -1.2096 -0.1144
 0.1077 -0.9330 -0.3702  0.6609
[torch.FloatTensor of size 2x4]
In [185]:
torch.randn(1,7)
Out[185]:
 1.1248 -0.4083 -0.0417 -0.9836  0.9033 -1.0733  0.7768
[torch.FloatTensor of size 1x7]
In [183]:
torch.randn(2,4)
Out[183]:
 0.9825  0.6290  0.1357 -0.6860
-0.6105 -0.4217  0.4372  1.5690
[torch.FloatTensor of size 2x4]
In [195]:
x = torch.IntTensor([[8,2,3,5],[6,2,7,9]])
x
Out[195]:
 8  2  3  5
 6  2  7  9
[torch.IntTensor of size 2x4]
In [198]:
y = torch.IntTensor([[66,22,11],[77,33,99],[5,5,11],[13,31,2]])
y
Out[198]:
 66  22  11
 77  33  99
  5   5  11
 13  31   2
[torch.IntTensor of size 4x3]
In [199]:
torch.mm(x,y)
Out[199]:
 762  412  329
 702  512  359
[torch.IntTensor of size 2x3]
In [203]:
x
Out[203]:
 8  2  3  5
 6  2  7  9
[torch.IntTensor of size 2x4]
In [201]:
x.t()
Out[201]:
 8  6
 2  2
 3  7
 5  9
[torch.IntTensor of size 4x2]
In [202]:
torch.mm(x, x.t())
Out[202]:
 102  118
 118  170
[torch.IntTensor of size 2x2]
In [204]:
torch.mm(x.t(),x)
Out[204]:
 100   28   66   94
  28    8   20   28
  66   20   58   78
  94   28   78  106
[torch.IntTensor of size 4x4]

class

W: weights, probabilities of visible nodes given the hidden nodes

  • nv: No. of visible nodes
  • nh: No. of hidden nodes

a, b: bias

  • a: the probabilities of the hidden nodes given the visible nodes, p(h) given v
  • b: the probabilities of the visible nodes give the hidden nodes
In [224]:
q=torch.randn(1,4)
print type(q)
q
<class 'torch.FloatTensor'>
Out[224]:
 1.7958 -0.0375 -0.2739 -0.2362
[torch.FloatTensor of size 1x4]
In [234]:
q.expand_as(torch.rand(5,4))
Out[234]:
 1.7958 -0.0375 -0.2739 -0.2362
 1.7958 -0.0375 -0.2739 -0.2362
 1.7958 -0.0375 -0.2739 -0.2362
 1.7958 -0.0375 -0.2739 -0.2362
 1.7958 -0.0375 -0.2739 -0.2362
[torch.FloatTensor of size 5x4]
In [246]:
aaa = torch.randn(3,4)
aaa
Out[246]:
 0.4573 -0.2673 -1.1019  0.2852
-0.3401  1.1289 -0.7650 -0.7398
-0.9357  0.9382  0.9801 -2.5940
[torch.FloatTensor of size 3x4]
In [247]:
torch.sigmoid(aaa)
Out[247]:
 0.6124  0.4336  0.2494  0.5708
 0.4158  0.7556  0.3176  0.3230
 0.2818  0.7187  0.7271  0.0695
[torch.FloatTensor of size 3x4]
In [265]:
aa=torch.rand(2,6)
aa
Out[265]:
 0.4599  0.2952  0.7769  0.3524  0.2644  0.8499
 0.1967  0.6823  0.8011  0.5248  0.3940  0.0863
[torch.FloatTensor of size 2x6]
In [279]:
# torch.bernoulli?
torch.bernoulli(aa)
Out[279]:
 1  0  1  0  1  1
 0  1  1  1  0  0
[torch.FloatTensor of size 2x6]
In [285]:
cc = torch.rand(2,4)
cc
Out[285]:
 0.6078  0.2293  0.0606  0.4252
 0.5368  0.6259  0.8366  0.5853
[torch.FloatTensor of size 2x4]
In [287]:
torch.sum(cc)
Out[287]:
3.907507613301277
In [288]:
torch.sum(cc, 0)
Out[288]:
 1.1446
 0.8552
 0.8972
 1.0105
[torch.FloatTensor of size 4]
In [289]:
torch.sum(cc, 1)
Out[289]:
 1.3229
 2.5846
[torch.FloatTensor of size 2]
In [360]:
class RBM():
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh) # one bias for each hidden node, with nh hidden nodes
        self.b = torch.randn(1, nv) # one bias for each visible node
    
    # sampling the hidden nodes according to the probabilities p(h) given v
    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    def train(self, v0, vk, ph0, phk):
        self.W += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)
        
    def __repr__(self):
        string= "RBM object\nnv: {}\nnh: {}".format(nv, nh)
        return string
In [362]:
nv=len(training_set[0])
nh=100
print nv, nh
batch_size=64
1682 100
In [363]:
rbm = RBM(nv, nh)
nb_epoch=12
rbm
Out[363]:
RBM object
nv: 1682
nh: 100
In [364]:
%%time
for epoch in range(1, nb_epoch+1):
    train_loss = 0
    s = 0.
    
    for id_user in range(0, nb_users - batch_size, batch_size):
        vk = training_set[id_user:id_user + batch_size]
        v0 = training_set[id_user:id_user + batch_size]
        ph0, _ = rbm.sample_h(v0)
        
        for k in range(10):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[v0<0] = v0[v0<0]
            
        phk, _ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(v0[v0>0] - vk[v0>0]))
        s += 1.
        
    print 'epoch: '+ str(epoch) + ' loss: '+ str(train_loss/s)
epoch: 1 loss: 0.193339535587
epoch: 2 loss: 0.154834918378
epoch: 3 loss: 0.15258418814
epoch: 4 loss: 0.151876177942
epoch: 5 loss: 0.152534301497
epoch: 6 loss: 0.151909197357
epoch: 7 loss: 0.153353382272
epoch: 8 loss: 0.150898950975
epoch: 9 loss: 0.150772481119
epoch: 10 loss: 0.150665683832
epoch: 11 loss: 0.151008741316
epoch: 12 loss: 0.151289355973
CPU times: user 27.7 s, sys: 276 ms, total: 27.9 s
Wall time: 14 s
In [365]:
training_set
Out[365]:
    1     1     1  ...     -1    -1    -1
    1    -1    -1  ...     -1    -1    -1
   -1    -1    -1  ...     -1    -1    -1
       ...          ⋱          ...       
    1    -1    -1  ...     -1    -1    -1
   -1    -1    -1  ...     -1    -1    -1
   -1     1    -1  ...     -1    -1    -1
[torch.FloatTensor of size 943x1682]

test

In [366]:
# Testing the RBM
test_loss = 0
s = 0.
for id_user in range(nb_users):
    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]
    if len(vt[vt>=0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0]))
        s += 1.
print('test loss: '+str(test_loss/s))
test loss: 0.247651876439