
text_lab2
.pdf16 |
20 |
|
|
be |
|
lizard |
|
naturalist |
hystaspes |
armageddon |
||
17 |
used |
|
|
or |
|
uu |
armageddon |
|
financi |
|
obviou |
|
18 |
50 |
|
|
as |
|
financi |
|
grin |
|
uu |
|
aladdin |
19 |
com |
|
|
on |
|
hystaspes |
|
nth |
|
chb |
|
grin |
20 |
jesus |
|
|
but |
|
somat |
|
aladdin |
|
somat |
|
nth |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Со стеммингом
| № | Count: без стоп-слов | Count: со стоп-словами | TF: без стоп-слов | TF: со стоп-словами | TF-IDF: без стоп-слов | TF-IDF: со стоп-словами |
|---|---|---|---|---|---|---|
| 1 | thi | the | thi | tempera | thi | tempera |
| 2 | wa | of | wa | thrush | wa | thrush |
| 3 | use | to | use | nitya | ha | nitya |
| 4 | ha | and | ha | align | use | align |
| 5 | 00 | in | like | indians | know | indians |
| 6 | ani | is | ani | honesti | like | infanticid |
| 7 | like | that | know | infanticid | ani | televis |
| 8 | peopl | it | new | eyebal | doe | honesti |
| 9 | new | for | doe | televis | new | eyebal |
| 10 | edu | you | edu | yogurt | just | yogurt |
| 11 | hi | are | just | gold | offer | gold |
| 12 | know | not | peopl | wis | good | test |
| 13 | doe | have | sale | test | say | analog |
| 14 | good | thi | pleas | analog | peopl | authur |
| 15 | just | be | offer | authur | hi | ncr |
| 16 | time | with | think | nth | veri | wis |
| 17 | onli | or | hi | ncr | make | nth |
| 18 | say | as | good | hill | includ | delusion |
| 19 | think | do | time | delusion | sale | anoth |
| 20 | make | on | includ | norman | think | hill |
Для класса 0 (misc.forsale):
|
|
|
Без стемминга |
|
|
||
|
Count |
|
TF |
TF-IDF |
|||
№ |
Без стоп- |
С стоп- |
Без стоп- |
|
С стоп- |
Без стоп- |
С стоп- |
|
слов |
словами |
слов |
|
словами |
слов |
словами |
1 |
00 |
the |
sincer |
|
tone |
sincer |
epistl |
2 |
new |
for |
netcom |
|
epistl |
netcom |
tone |
3 |
sale |
and |
olympu |
|
adjust |
olympu |
adjust |
4 |
50 |
to |
00 |
|
trip |
somat |
trip |
5 |
10 |
of |
somat |
|
harass |
hystaspes |
harass |
6 |
dos |
in |
chb |
|
noon |
smartdrug |
shaw |
7 |
offer |
00 |
discoveri |
|
impact |
chb |
00 |
8 |
shipping |
it |
ramada |
|
ima |
ramada |
impact |
9 |
20 |
is |
hystaspes |
|
obviou |
kuhn |
noon |
10 |
price |
you |
smartdrug |
|
word |
discoveri |
obviou |
11 |
25 |
with |
uucp |
|
youth |
difficulti |
youth |
12 |
15 |
or |
kuhn |
|
forest |
uucp |
word |
13 |
condition |
have |
linux |
|
shaw |
linux |
forest |
14 |
good |
are |
difficulti |
|
grin |
alejandro |
nordenberg |
21
15 |
used |
|
|
all |
|
depriv |
|
agreement |
00 |
|
ima |
|
16 |
like |
|
|
if |
|
alejandro |
|
trade |
|
attent |
|
motorcycl |
17 |
edu |
|
|
new |
10 |
|
macintosh |
10 |
|
soar |
||
18 |
asking |
|
|
this |
|
attent |
|
motorcycl |
|
lizard |
|
macintosh |
19 |
|
|
that |
|
firmli |
|
accton |
temporari |
|
pollut |
||
20 |
interested |
|
|
sale |
|
lawrenc |
|
allergic |
|
bogu |
|
grin |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Со стеммингом |
|
|
|
|
||
|
Count |
|
|
|
TF |
|
TF-IDF |
|||||
№ |
Без стоп- |
|
С стоп- |
|
Без стоп- |
|
С стоп- |
|
Без стоп- |
С стоп- |
||
|
слов |
|
словами |
|
слов |
|
словами |
|
слов |
словами |
||
1 |
00 |
|
|
the |
|
sale |
|
tempera |
|
offer |
eyebal |
|
2 |
new |
|
for |
|
offer |
|
eyebal |
|
thi |
tempera |
||
3 |
thi |
|
and |
|
thi |
|
align |
|
sale |
align |
||
4 |
sale |
|
to |
|
use |
|
thrush |
|
use |
thrush |
||
5 |
offer |
|
of |
|
new |
|
honesti |
|
new |
honesti |
||
6 |
use |
|
in |
|
includ |
|
nitya |
|
sell |
00 |
||
7 |
includ |
|
00 |
|
00 |
|
infanticid |
|
includ |
infanticid |
||
8 |
50 |
|
|
it |
|
sell |
|
indians |
|
ship |
sharon |
|
9 |
10 |
|
|
is |
|
pleas |
|
nth |
|
price |
nl |
|
10 |
price |
|
you |
|
ship |
|
wis |
|
condit |
nitya |
||
11 |
ship |
|
with |
|
price |
|
yogurt |
|
ha |
nth |
||
12 |
20 |
|
|
or |
|
condit |
|
gold |
|
pleas |
yogurt |
|
13 |
pleas |
|
have |
|
|
sharon |
|
game |
wis |
|||
14 |
sell |
|
do |
|
ask |
|
hill |
|
look |
gold |
||
15 |
game |
|
are |
|
drive |
|
nl |
|
drive |
indians |
||
16 |
25 |
|
|
all |
|
|
analog |
|
00 |
|
sirri |
|
17 |
condit |
|
if |
|
ha |
|
test |
|
like |
mp |
||
18 |
15 |
|
|
new |
|
like |
|
lynch |
|
card |
smartwrit |
|
19 |
good |
|
thi |
|
look |
|
mp |
|
ask |
hormon |
||
20 |
ha |
|
will |
|
edu |
|
ahead |
|
manual |
lynch |
Для класса 1 (sci.med):
|
|
|
Без стемминга |
|
|
||
|
Count |
|
TF |
TF-IDF |
|||
№ |
Без стоп- |
С стоп- |
Без стоп- |
|
С стоп- |
Без стоп- |
С стоп- |
|
слов |
словами |
слов |
|
словами |
слов |
словами |
1 |
edu |
the |
definit |
|
tone |
jhu |
tone |
2 |
don |
of |
jhu |
|
trip |
difficulti |
noon |
3 |
people |
to |
difficulti |
|
noon |
kuhn |
trip |
4 |
health |
and |
kuhn |
|
ima |
pip |
ima |
5 |
use |
in |
pip |
|
adjust |
invalid |
impact |
6 |
medical |
is |
invalid |
|
impact |
definit |
adjust |
7 |
like |
that |
thank |
|
harass |
fist |
tomographi |
8 |
know |
it |
mouth |
|
tomographi |
threw |
harass |
9 |
com |
for |
threw |
|
youth |
sponsorship |
youth |
10 |
time |
you |
fist |
|
epistl |
mouth |
trade |
11 |
just |
this |
deem |
|
trade |
deem |
armageddon |
12 |
patients |
are |
apocalypt |
|
armageddon |
apocalypt |
agreement |
13 |
new |
be |
sponsorship |
|
agreement |
speak |
naturalist |
22
14 |
think |
|
with |
|
predispos |
|
naturalist |
|
farrar |
|
epistl |
|||
15 |
disease |
|
not |
|
|
farrar |
|
|
word |
|
supposedli |
|
miner |
|
16 |
good |
|
have |
|
|
firmli |
|
|
forest |
|
music |
|
aladdin |
|
17 |
msg |
|
or |
|
|
speak |
|
|
nth |
|
thank |
|
forest |
|
18 |
food |
|
on |
|
|
bro |
|
aladdin |
|
|
dat |
|
word |
|
19 |
years |
|
as |
|
huxley |
|
|
miner |
|
huxley |
|
nth |
||
20 |
doctor |
|
but |
|
|
music |
|
|
bet |
|
|
desk |
|
bet |
|
|
|
|
|
|
|
|
|
|
|
|
|||
|
|
|
|
|
|
Со стеммингом
| № | Count: без стоп-слов | Count: со стоп-словами | TF: без стоп-слов | TF: со стоп-словами | TF-IDF: без стоп-слов | TF-IDF: со стоп-словами |
|---|---|---|---|---|---|---|
| 1 | thi | the | thi | tempera | thi | tempera |
| 2 | wa | of | wa | thrush | wa | nitya |
| 3 | use | to | ha | nitya | ha | thrush |
| 4 | ha | and | ani | indians | know | indians |
| 5 | edu | in | know | align | ani | infanticid |
| 6 | ani | is | use | infanticid | use | align |
| 7 | medic | it | edu | honesti | doe | televis |
| 8 | like | that | like | televis | like | honesti |
| 9 | patient | for | doe | yogurt | just | yogurt |
| 10 | time | be | peopl | eyebal | peopl | authur |
| 11 | peopl | you | think | authur | think | test |
| 12 | health | thi | time | test | diseas | analog |
| 13 | know | are | just | analog | pitt | ncr |
| 14 | year | have | effect | gold | veri | gold |
| 15 | diseas | with | msg | ncr | gordon | eyebal |
| 16 | food | not | food | wis | patient | mistaken |
| 17 | com | or | gordon | delusion | soon | delusion |
| 18 | caus | on | diseas | norman | intellect | anoth |
| 19 | doe | as | bank | anoth | surrend | wis |
| 20 | think | do | caus | mistaken | geb | norman |
Для класса 2 (talk.religion.misc):
|
|
|
Без стемминга |
|
|
||
|
Count |
|
TF |
TF-IDF |
|||
|
Без стоп- |
С стоп- |
Без стоп- |
|
С стоп- |
Без |
С стоп- |
№ |
|
стоп- |
|||||
|
слов |
словами |
слов |
|
словами |
слов |
словами |
|
|
|
|
|
|
|
|
1 |
god |
the |
financi |
|
tone |
financi |
tone |
2 |
people |
of |
pip |
|
trip |
pip |
noon |
3 |
jesus |
to |
inner |
|
noon |
inner |
trip |
4 |
don |
and |
definit |
|
adjust |
invalid |
tomographi |
5 |
bible |
that |
invalid |
|
tomographi |
jhu |
adjust |
6 |
just |
is |
thank |
|
ima |
definit |
ima |
7 |
christian |
in |
jhu |
|
harass |
kuhn |
youth |
8 |
think |
you |
bundl |
|
youth |
bundl |
harass |
9 |
know |
it |
kuhn |
|
impact |
thank |
impact |
10 |
say |
not |
sized |
|
naturalist |
deem |
naturalist |
11 |
does |
for |
deem |
|
agreement |
ct |
formul |
23

12 |
did |
|
|
as |
|
firmli |
|
trade |
|
audibl |
|
|
fauci |
13 |
good |
|
|
this |
|
ct |
armageddon |
astonish |
armageddon |
||||
14 |
like |
|
|
are |
|
audibl |
|
epistl |
buoyanc |
|
|
trade |
|
15 |
life |
|
|
be |
|
buoyanc |
|
aladdin |
|
firmli |
|
agreement |
|
16 |
way |
|
|
have |
|
astonish |
|
forest |
|
threw |
|
|
aladdin |
17 |
believe |
|
|
with |
|
warranti |
|
word |
|
sized |
|
|
toxin |
18 |
said |
|
|
was |
|
sin |
|
bet |
|
sin |
|
|
wealth |
19 |
point |
|
|
he |
|
threw |
|
formul |
kremlin |
|
|
forest |
|
20 |
time |
|
|
they |
|
prize |
|
wealth |
|
john |
|
|
bet |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Со стеммингом
| № | Count: без стоп-слов | Count: со стоп-словами | TF: без стоп-слов | TF: со стоп-словами | TF-IDF: без стоп-слов | TF-IDF: со стоп-словами |
|---|---|---|---|---|---|---|
| 1 | thi | the | thi | tempera | thi | tempera |
| 2 | wa | of | wa | thrush | wa | nitya |
| 3 | god | to | god | nitya | christian | thrush |
| 4 | hi | and | christian | align | god | televis |
| 5 | christian | that | hi | televis | say | align |
| 6 | peopl | is | peopl | indians | hi | indians |
| 7 | jesu | in | say | honesti | did | yogurt |
| 8 | say | you | jesu | yogurt | know | honesti |
| 9 | ha | it | think | infanticid | peopl | infanticid |
| 10 | doe | not | did | ncr | jesu | ncr |
| 11 | think | be | know | authur | just | authur |
| 12 | did | for | just | analog | doe | got |
| 13 | know | are | doe | test | think | camera |
| 14 | believ | thi | ha | eyebal | ha | franc |
| 15 | moral | as | like | gold | like | analog |
| 16 | bibl | have | believ | delusion | believ | test |
| 17 | just | do | ani | anoth | point | anoth |
| 18 | like | with | moral | wis | use | delusion |
| 19 | use | wa | use | bloodlet | good | vol |
| 20 | onli | he | onli | vol | ani | tesla |
Из полученных таблиц видно, что при взвешивании терминов с помощью Count без отсечения стоп-слов наиболее часто встречающиеся слова – предлоги, союзы, частицы, местоимения. Отсечение стоп-слов позволяет определить в качестве наиболее распространенных более содержательные по смыслу слова.
При отсутствии стемминга взвешивание Count с отсечением стоп-слов позволяет получить лучшие результаты, чем TF и TF-IDF с отсечением стоп-слов или без него.
При использовании стемминга и отсечения стоп-слов каждое из взвешиваний Count, TF и TF-IDF позволяет получить достаточно хороший результат.
7. Реализация модели Наивного Байесовского классификатора при помощи конвейера Pipeline и выявление на основе показателей качества (значения полноты, точности, F1-меры и аккуратности), какая предварительная обработка данных обеспечит наилучшие результаты классификации.
24

Исследуемые характеристики:
− Наличие или отсутствие стемминга
− Отсечение или сохранение стоп-слов
− Количество информативных терминов (max_features)
− Взвешивание: Count, TF, TF-IDF
1)Взвешивание Count при отсутствии стемминга и отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target) prediction = text_clf.predict(twenty_test.data)
print('Accuracy score: ', accuracy_score(prediction, twenty_test.target)) print(classification_report(twenty_test.target, prediction))
Accuracy score: 0.90549662487946
              precision    recall  f1-score   support

           0       0.99      0.91      0.94       390
           1       0.89      0.91      0.90       396
           2       0.82      0.90      0.86       251

    accuracy                           0.91      1037
   macro avg       0.90      0.90      0.90      1037
weighted avg       0.91      0.91      0.91      1037
2) Взвешивание Count при стемминге и отсутствии отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.9122468659594986
              precision    recall  f1-score   support

           0       0.99      0.92      0.95       390
           1       0.89      0.92      0.91       396
           2       0.84      0.88      0.86       251

    accuracy                           0.91      1037
   macro avg       0.91      0.91      0.91      1037
weighted avg       0.92      0.91      0.91      1037
Так как значения метрик были выше при использовании стемминга, далее будем рассматривать модель со стеммингом.
3) Взвешивание Count при стемминге и отсечении стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000,
25

stop_words = 'english')),
('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.9170684667309547
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       390
           1       0.90      0.93      0.91       396
           2       0.87      0.88      0.88       251

    accuracy                           0.92      1037
   macro avg       0.91      0.91      0.91      1037
weighted avg       0.92      0.92      0.92      1037
Так как значения метрик были выше при использовании отсечения стоп-слов, далее будем рассматривать модель с отсечением стоп-слов.
4)Взвешивание Count при стемминге и отсечении стоп-слов; число информативных признаков 5000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 5000, stop_words = 'english')),
('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.9122468659594986
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       390
           1       0.89      0.92      0.91       396
           2       0.87      0.88      0.87       251

    accuracy                           0.91      1037
   macro avg       0.91      0.91      0.91      1037
weighted avg       0.91      0.91      0.91      1037
5)Взвешивание Count при стемминге и отсечении стоп-слов; число информативных признаков 15000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 15000, stop_words = 'english')),
('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target))
print(classification_report(stem_test.target, prediction)) Accuracy score: 0.9180327868852459
precision recall f1-score support
26

           0       0.97      0.93      0.95       390
           1       0.89      0.93      0.91       396
           2       0.88      0.88      0.88       251

    accuracy                           0.92      1037
   macro avg       0.91      0.91      0.91      1037
weighted avg       0.92      0.92      0.92      1037
6)Взвешивание Count при стемминге и отсечении стоп-слов; число информативных признаков 20000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 20000, stop_words = 'english')),
('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.9199614271938283
              precision    recall  f1-score   support

           0       0.98      0.93      0.95       390
           1       0.89      0.94      0.91       396
           2       0.88      0.88      0.88       251

    accuracy                           0.92      1037
   macro avg       0.92      0.91      0.92      1037
weighted avg       0.92      0.92      0.92      1037
Так как значения метрик были выше при использовании числа информативных признаков 20000, наилучшая модель с взвешиванием Count - при стемминге и отсечении стоп-слов с числом информативных признаков 20000.
7) Взвешивание TF-IDF при отсутствии стемминга и отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target) prediction = text_clf.predict(twenty_test.data)
print('Accuracy score: ', accuracy_score(prediction, twenty_test.target)) print(classification_report(twenty_test.target, prediction))
Accuracy score: 0.8187078109932497
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       390
           1       0.69      0.97      0.81       396
           2       0.95      0.44      0.60       251

    accuracy                           0.82      1037
27

   macro avg       0.87      0.77      0.78      1037
weighted avg       0.86      0.82      0.81      1037
8) Взвешивание TF-IDF при стемминге и отсутствии отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.819672131147541
              precision    recall  f1-score   support

           0       0.99      0.91      0.95       390
           1       0.69      0.98      0.81       396
           2       0.95      0.42      0.58       251

    accuracy                           0.82      1037
   macro avg       0.88      0.77      0.78      1037
weighted avg       0.86      0.82      0.81      1037
Так как значения метрик были выше при использовании стемминга, далее будем рассматривать модель со стеммингом.
9) Взвешивание TF-IDF при стемминге и отсечении стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000, stop_words = 'english')),
('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.8727097396335584
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       390
           1       0.79      0.96      0.86       396
           2       0.95      0.63      0.76       251

    accuracy                           0.87      1037
   macro avg       0.89      0.84      0.86      1037
weighted avg       0.89      0.87      0.87      1037
Так как значения метрик были выше при использовании отсечения стоп-слов, далее будем рассматривать модель с отсечением стоп-слов.
28

10)Взвешивание TF-IDF при стемминге и отсечении стоп-слов; число информативных признаков 5000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 5000, stop_words = 'english')),
('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.8833172613307618
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       390
           1       0.81      0.94      0.87       396
           2       0.95      0.69      0.80       251

    accuracy                           0.88      1037
   macro avg       0.90      0.86      0.87      1037
weighted avg       0.89      0.88      0.88      1037
11)Взвешивание TF-IDF при стемминге и отсечении стоп-слов; число информативных признаков 15000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 15000, stop_words = 'english')),
('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.8717454194792671
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       390
           1       0.78      0.97      0.86       396
           2       0.96      0.61      0.75       251

    accuracy                           0.87      1037
   macro avg       0.90      0.84      0.85      1037
weighted avg       0.89      0.87      0.87      1037
12)Взвешивание TF-IDF при стемминге и отсечении стоп-слов; число информативных признаков 20000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 20000, stop_words = 'english')),
('tfidf', TfidfTransformer(use_idf = True)), ('clf', MultinomialNB ()),])
29

text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target)) print(classification_report(stem_test.target, prediction))
Accuracy score: 0.8688524590163934
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       390
           1       0.77      0.97      0.86       396
           2       0.96      0.59      0.73       251

    accuracy                           0.87      1037
   macro avg       0.90      0.84      0.85      1037
weighted avg       0.89      0.87      0.86      1037
Так как значения метрик были выше при использовании числа информативных признаков 5000, наилучшая модель с взвешиванием TF-IDF - при стемминге и отсечении стоп-слов с числом информативных признаков 5000.
13) Взвешивание TF при отсутствии стемминга и отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('tf', TfidfTransformer(use_idf = False)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target) prediction = text_clf.predict(twenty_test.data)
print('Accuracy score: ', accuracy_score(prediction, twenty_test.target)) print(classification_report(twenty_test.target, prediction))
Accuracy score: 0.712632594021215
              precision    recall  f1-score   support

           0       0.98      0.87      0.92       390
           1       0.57      0.99      0.73       396
           2       1.00      0.04      0.08       251

    accuracy                           0.71      1037
   macro avg       0.85      0.63      0.57      1037
weighted avg       0.83      0.71      0.64      1037
14) Взвешивание TF при стемминге и отсутствии отсечения стоп-слов; число информативных признаков 10000:
text_clf = Pipeline([('vect', CountVectorizer(max_features= 10000)), ('tf', TfidfTransformer(use_idf = False)), ('clf', MultinomialNB ()),])
text_clf = text_clf.fit(stem_train.data_stem, stem_train.target) prediction = text_clf.predict(stem_test.data_stem)
print('Accuracy score: ', accuracy_score(prediction, stem_test.target))
30