from helper import *
# ! pip install pandas nltk gensim pyldavis
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
An LDA model boils down to four quantities: the article-topic distribution (`article_topic`, how much of each topic a document contains), the topic-term distribution (`topic_term`, which words define each topic), and the Dirichlet priors `alpha` (on topics per document) and `eta` (on terms per topic).
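These quantities map directly onto gensim's `LdaModel` API: `alpha` and `eta` are keyword arguments (or can be learned with `'auto'`), the article-topic distribution comes from `get_document_topics`, and the topic-term matrix from `get_topics`. A minimal sketch, assuming the `doc_term_matrix` and `dictionary` built further down in this notebook:
from gensim.models import LdaModel
# Sketch only: `doc_term_matrix` and `dictionary` are created in the cells below.
model = LdaModel(doc_term_matrix, num_topics=10, id2word=dictionary,
                 alpha='auto', eta='auto')                      # Dirichlet priors
article_topic = model.get_document_topics(doc_term_matrix[0])   # per-document topic mixture
topic_term = model.get_topics()                                 # (num_topics x num_terms) probabilities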
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
lemmatize = WordNetLemmatizer()

def cleaning(article):
    # Lowercase and drop stopwords, strip punctuation, then lemmatize each token.
    one = " ".join([i for i in article.lower().split() if i not in stop_words])
    two = "".join(i for i in one if i not in punctuation)
    three = " ".join(lemmatize.lemmatize(i) for i in two.split())
    return three
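As a quick sanity check, `cleaning` can be run on a single sentence before applying it to the whole corpus (the sentence below is made up for illustration):
# Stopwords and punctuation are dropped, plurals are lemmatized.
print(cleaning('The movies were surprisingly engaging, with cats chasing dogs!'))
# roughly: 'movie surprisingly engaging cat chasing dog'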
Data preparation couldn't be simpler: all you need is a list of documents.
The shorter each document is, the less time the topic model takes to train.
import pandas as pd
df = pd.read_table('plot.tok.gt9.5000', names=['text'])
df.info()
df.head(3)
# Clean every plot summary
text = df['text'].apply(cleaning)
text_list = [i.split() for i in text]
len(text_list)
text_list[0]
from time import time
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
filename='running.log',filemode='w')
# Importing Gensim
import gensim
from gensim import corpora
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(text_list)
dictionary.save('dictionary.dict')
print(dictionary)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_list]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)
print(len(doc_term_matrix))
print(doc_term_matrix[100])
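Each row of the document-term matrix is a bag-of-words: a list of `(token_id, count)` pairs. To make a row human-readable, the ids can be mapped back through the dictionary (a small sketch, using the same example row as above):
# Translate (token_id, count) pairs back into (word, count) pairs.
readable = [(dictionary[token_id], count) for token_id, count in doc_term_matrix[100]]
print(readable)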
start = time()
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)
print('used: {:.2f}s'.format(time() - start))
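With `passes=50` this step can take a while. If it is too slow, gensim also ships a parallel implementation, `LdaMulticore`; a minimal sketch, assuming a machine with a few spare cores (the worker count here is arbitrary):
from gensim.models import LdaMulticore
# Same corpus and dictionary, but training is spread over several worker processes.
fast_model = LdaMulticore(doc_term_matrix, num_topics=10, id2word=dictionary,
                          passes=50, workers=3)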
print(ldamodel.print_topics(num_topics=2, num_words=4))
for i in ldamodel.print_topics():
    for j in i:
        print(j)
ldamodel.save('topic.model')
from gensim.models import LdaModel
loading = LdaModel.load('topic.model')
print(loading.print_topics(num_topics=2, num_words=4))
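`print_topics` returns formatted strings; to work with the topics programmatically as `(word, probability)` pairs instead, `show_topics` with `formatted=False` can be used (a small sketch):
# Each topic comes back as (topic_id, [(word, probability), ...]).
for topic_id, terms in loading.show_topics(num_topics=2, num_words=4, formatted=False):
    print(topic_id, terms)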
def pre_new(doc):
    # Clean a new document and convert it to the bag-of-words format the model expects.
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two
pre_new('new article that to be classified by trained model!')
belong = loading[(pre_new('new article that to be classified by trained model!'))]
belong
new = pd.DataFrame(belong,columns=['id','prob']).sort_values('prob',ascending=False)
new['topic'] = new['id'].apply(loading.print_topic)
new
new['topic']
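The same steps can be wrapped into a small helper that returns the single most likely topic for a new article (a sketch; `classify_new` is a name introduced here, not part of the original notebook):
def classify_new(doc, model=loading):
    # Score the cleaned document and keep the (topic_id, probability) pair with the highest probability.
    scores = model[pre_new(doc)]
    return max(scores, key=lambda pair: pair[1])

classify_new('new article that to be classified by trained model!')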
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')
data = pyLDAvis.gensim.prepare(lda, c, d)
data
pyLDAvis.save_html(data,'vis.html')
# %%HTML
# <iframe width="100%" height="500" src="http://www.jishichao.com/vis"></iframe>