In [1]:
import os
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models
from nltk.corpus import stopwords

from string import punctuation

**gensim** - библиотека обработки естественного языка предназначения для «Тематического моделирования»
https://pythonru.com/biblioteki/gensim
https://radimrehurek.com/gensim/auto_examples/index.html
- *corpora* - предоставляет методы для преобразование текста в векторное представление

**nltk** - библиотека Python для обработки естественного языка
https://www.nltk.org/

**string.punctuation** возвращает все наборы знаков препинания
https://www.geeksforgeeks.org/string-punctuation-in-python/





Чтение датасета

In [4]:
dataset_tweets = pd.read_csv('data_elonmusk.csv', encoding='latin1')

dataset_tweets

Unnamed: 0,row ID,Tweet,Time,Retweet from,User
0,Row0,@MeltingIce Assuming max acceleration of 2 to ...,2017-09-29 17:39:19,,elonmusk
1,Row1,RT @SpaceX: BFR is capable of transporting sat...,2017-09-29 10:44:54,SpaceX,elonmusk
2,Row2,@bigajm Yup :),2017-09-29 10:39:57,,elonmusk
3,Row3,Part 2 https://t.co/8Fvu57muhM,2017-09-29 09:56:12,,elonmusk
4,Row4,Fly to most places on Earth in under 30 mins a...,2017-09-29 09:19:21,,elonmusk
...,...,...,...,...,...
3213,Row3213,"@YOUSRC Amos's article was fair, but his edito...",2012-11-20 08:52:03,,elonmusk
3214,Row3214,These articles in Space News describe why Aria...,2012-11-20 08:38:31,,elonmusk
3215,Row3215,Was misquoted by BBC as saying Europe's rocket...,2012-11-20 08:30:44,,elonmusk
3216,Row3216,Just returned from a trip to London and Oxford...,2012-11-19 08:59:46,,elonmusk


Препроцессинг

In [None]:
#Формируем корпус из твитов. Готовим данные в корректном формате для передачи их в качестве входных параметров модели LDA

corpus=[]
for i in range(len(dataset_tweets['Tweet'])):
        corpus.append(dataset_tweets['Tweet'][i])
corpus

In [6]:
#Загружаем стоп-слова
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
list1 = ['RT','rt']
stoplist = stopwords.words('english') + list(punctuation) + list1

#удаляем стоп-слова, знаке препинания и другие не влияющие на смысл слова
texts = [[word for word in str(document).lower().split() if word not in stoplist] for document in corpus]

#Преобразуем данные в словарь, представляющий собой коллекцию уникальных токенов
dictionary = corpora.Dictionary(texts)

texts

**dictionary.doc2bow** подсчитывает число вхождений и генерирует целочисленный идентификатор для каждого слова. Результат возвращается в виде разреженного вектора.


In [17]:
#Создаем корпус Bag of Words
corpus = [dictionary.doc2bow(text) for text in texts]

#Применяем tfidf модель для удаления часто встречающих, но не влияющих на основую суть текста, слова
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf

<gensim.interfaces.TransformedCorpus at 0x798a44dd69e0>

In [18]:
#Обучаем LDA модель
total_topics = 5

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=total_topics)
corpus_lda = lda[corpus_tfidf]



In [20]:
#Вывод результатов

lda.show_topics(total_topics,20)

[(0,
  '0.004*"tesla" + 0.003*"much" + 0.003*"one" + 0.003*"space" + 0.003*"find" + 0.002*"like" + 0.002*"would" + 0.002*"ny" + 0.002*"also" + 0.002*"earth" + 0.002*"definitely" + 0.002*"good" + 0.002*"station" + 0.002*"rear" + 0.002*"model" + 0.002*"cool" + 0.002*"work" + 0.002*"people" + 0.002*"coming" + 0.002*"car"'),
 (1,
  '0.008*"rocket" + 0.007*"launch" + 0.006*"good" + 0.004*"falcon" + 0.004*"9" + 0.003*"thanks" + 0.003*"flight" + 0.003*"@spacex:" + 0.003*"landing" + 0.003*":)" + 0.003*"dragon" + 0.003*"space" + 0.003*"@spacex" + 0.002*"like" + 0.002*"make" + 0.002*"air" + 0.002*"spacex" + 0.002*"tesla" + 0.002*"almost" + 0.002*"orbit."'),
 (2,
  '0.008*"tesla" + 0.006*"falcon" + 0.006*"new" + 0.004*"9" + 0.004*"great" + 0.004*"launch" + 0.004*"first" + 0.004*"@spacex:" + 0.003*":)" + 0.003*"rocket" + 0.003*"next" + 0.003*"long" + 0.003*"@teslamotors" + 0.003*"time" + 0.003*"i\'m" + 0.003*"#dragon" + 0.002*"coming" + 0.002*"@spacex" + 0.002*"energy" + 0.002*"dragon"'),
 (3,
  '

In [21]:
data_lda = {i: dict(lda.show_topic(i,25)) for i in range(total_topics)}
data_lda

{0: {'tesla': 0.0042598224,
  'much': 0.003341169,
  'one': 0.0030169426,
  'space': 0.00267355,
  'find': 0.0025259103,
  'like': 0.0024547256,
  'would': 0.002137063,
  'ny': 0.0020987038,
  'also': 0.002088472,
  'earth': 0.002056803,
  'definitely': 0.0020399364,
  'good': 0.001992751,
  'station': 0.0019830107,
  'rear': 0.0019546645,
  'model': 0.0018646172,
  'cool': 0.0018309772,
  'work': 0.0018161389,
  'people': 0.0018152193,
  'coming': 0.0018091308,
  'car': 0.0016998297,
  'auto': 0.0016837557,
  'dragon': 0.0016465181,
  'satellite': 0.001544126,
  'falcon': 0.0015021424,
  'pretty': 0.0014818053},
 1: {'rocket': 0.007827858,
  'launch': 0.0066516064,
  'good': 0.0055094245,
  'falcon': 0.004381834,
  '9': 0.0038422334,
  'thanks': 0.0034551115,
  'flight': 0.0033586791,
  '@spacex:': 0.0032148494,
  'landing': 0.003037769,
  ':)': 0.0028968768,
  'dragon': 0.0028606334,
  'space': 0.0025814734,
  '@spacex': 0.0025108503,
  'like': 0.002399503,
  'make': 0.0021933797,
  

In [22]:
df_lda = pd.DataFrame(data_lda)
df_lda = df_lda.fillna(0).T

df_lda

Unnamed: 0,tesla,much,one,space,find,like,would,ny,also,earth,...,best,high,change,live,read,cars,climate,made,public,x
0,0.00426,0.003341,0.003017,0.002674,0.002526,0.002455,0.002137,0.002099,0.002088,0.002057,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001897,0.0,0.001651,0.002581,0.0,0.0024,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.008465,0.0,0.0,0.002351,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.008973,0.0,0.0,0.0,0.0,0.005688,0.003735,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.015479,0.0,0.002387,0.0,0.0,0.003311,0.0,0.0,0.0,0.0,...,0.004371,0.002436,0.002334,0.002318,0.002287,0.00226,0.00226,0.002247,0.002081,0.001973


In [24]:
! pip install pyLDAvis
! pip install "pandas<2.0.0"

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.0.0 (from pyLDAvis)
  Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting tzdata>=2022.1 (from pandas>=2.0.0->pyLDAvis)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m 

**pyLDAvis** визуализирует результаты модели LDA

https://pyldavis.readthedocs.io/en/latest/modules/API.html

In [27]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

panel = gensimvis.prepare(lda, corpus_lda, dictionary, mds='tsne')
panel

  and should_run_async(code)


Методы улучшения:
- Сделать более качественную предобработку текста (удалить ненужные символы, упоминания)
- Создать биграммы, триграммы
- Выделить ключевые слова (с помощью методов автоматического веделения терминов ATE)
- Настроить гиперпараметры

