text_lab2.pdf
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf05.sum(axis=0))))
def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
# (The original had both statements fused on one line, which is a SyntaxError.)
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('sincer', 34.54646229673236), ('netcom', 28.077512091566557), ('olympu', 27.6220068617385), ('somat', 19.683849213268186), ('hystaspes_', 17.947106624726292), ('smartdrug', 17.807524062154958), ('chb', 17.466174374675173), ('ramada', 17.231561177853152), ('kuhn', 16.362970323046493), ('discoveri', 16.21071663713713), ('difficulti', 15.453276489726257), ('uucp', 15.255323869019449), ('linux', 15.089472689813165), ('alejandro', 15.062320833913612), ('00', 14.82971578635708), ('attent', 12.458055352101512), ('10', 12.371532745257124), ('lizard', 11.97963864235259), ('', 10.66834389031414), ('bogu', 10.488474646840226)]
Вывод для класса 1 при отсечении стоп-слов (TF-IDF):
train_data_tfidf06 = tfidf.transform(train_data06)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf06.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('jhu', 23.092821894269477), ('difficulti', 20.197092979122758), ('kuhn', 19.356281029226984), ('pip', 19.027666528183083), ('invalid', 17.6868260941888), ('definit', 17.280919323911885), ('fist', 15.291576935732573), ('threw', 15.024744303080045), ('sponsorship', 14.669940470329403), ('mouth', 13.952556008148695), ('deem', 13.56887593393774), ('apocalypt', 13.464924000560059), ('speak', 13.146288502979447), ('farrar', 13.132513027424977), ('supposedli', 13.00502890097202), ('music', 12.54453937826036), ('thank', 12.25625888741643), ('dat', 12.196856540591606), ('huxley', 12.14259453179389), ('desk', 12.08895194966892)]
Вывод для класса 2 при отсечении стоп-слов (TF-IDF):
train_data_tfidf07 = tfidf.transform(train_data07)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf07.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('financi', 23.184811350596423), ('pip', 17.063300675140464), ('inner', 14.64452075646358), ('invalid', 14.404957069911507), ('jhu', 12.233946376737656), ('definit', 10.355299086888017), ('kuhn', 10.021111440685743), ('bundl', 9.857105000423246), ('thank', 9.691442384555897), ('deem', 9.36018785626855), ('ct', 8.73899504098878), ('audibl', 8.558498990464523), ('astonish', 7.922011312364973), ('buoyanc', 7.8867422611415305), ('firmli', 7.492502093472581), ('threw', 7.44336427233471), ('sized', 7.133404082434186), ('sin', 6.926598257452059), ('kremlin', 6.606023300797752), ('john', 6.282523102619925)]
Вывод для всей выборки при стемминге (TF-IDF):
11
# Fit a TF-IDF transformer (idf weighting enabled) on train_data_stem, then
# apply it to the same matrix. The original fused both statements on one line.
tfidf = TfidfTransformer(use_idf=True).fit(train_data_stem)
train_data_tfidf_stem = tfidf.transform(train_data_stem)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tempera', 157.79077188103213), ('thrush', 100.54611617981654), ('nitya', 96.37542952691709), ('align', 86.03873447487518), ('indians', 79.02337798446179), ('infanticid', 73.35370972445031), ('televis', 69.29079256033657), ('honesti', 67.50217104571504), ('eyebal', 62.187454742857405), ('yogurt', 60.030882671777086), ('gold', 41.652825619004545), ('test', 41.330882236154565), ('analog', 40.7325328622444), ('authur', 40.59652077080643), ('ncr', 39.282120778346865), ('wis', 38.75720305808977), ('nth', 37.11687450264329), ('delusion', 35.089057012458646), ('anoth', 32.51051518572325), ('hill', 32.50072703858639)]
Вывод для класса 0 при стемминге (TF-IDF):
train_data_tfidf_stem01 = tfidf.transform(train_data_stem01)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem01.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('eyebal', 36.354530600690474), ('tempera', 32.18255201830497), ('align', 25.509134102049355), ('thrush', 24.052903285410206), ('honesti', 18.144083561394954), ('00', 17.84610110065537), ('infanticid', 17.7669380047153), ('sharon', 17.698630630799315), ('nl', 17.141969733679776), ('nitya', 16.93568280287904), ('nth', 16.554376405683428), ('yogurt', 16.47997584693423), ('wis', 15.110185003208079), ('gold', 14.775121966332373), ('indians', 14.375132496303328), ('sirri', 13.670828710724956), ('mp', 13.510601943112349), ('smartwrit', 13.284301444974792), ('hormon', 12.888871463365266), ('lynch', 12.269116489967567)]
Вывод для класса 1 при стемминге (TF-IDF):
train_data_tfidf_stem02 = tfidf.transform(train_data_stem02)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem02.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tempera', 73.19785320181145), ('nitya', 47.894015002175934), ('thrush', 46.72623639639594), ('indians', 42.058831895867094), ('infanticid', 39.21671119663035), ('align', 37.099079684379184), ('televis', 32.92962955950493), ('honesti', 31.595252991003772), ('yogurt', 21.65357298190406), ('authur', 20.06554088557756), ('test', 19.207134738137114), ('analog', 17.989122824343898),
12
('ncr', 17.534951889388697), ('gold', 16.826066041620084), ('eyebal', 16.725968637581865), ('mistaken', 15.931423595352783), ('delusion', 15.832190268159595), ('anoth', 15.210556211911053), ('wis', 15.036587365870055), ('norman', 14.803083907323126)]
Вывод для класса 2 при стемминге (TF-IDF):
train_data_tfidf_stem03 = tfidf.transform(train_data_stem03)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem03.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tempera', 52.41036666091578), ('nitya', 31.5457317218622), ('thrush', 29.76697649801028), ('televis', 28.442502261872875), ('align', 23.430520688446705), ('indians', 22.58941359229138), ('yogurt', 21.897333842938895), ('honesti', 17.76283449331637), ('infanticid', 16.370060523104573), ('ncr', 14.768409319406455), ('authur', 14.030398235121133), ('got', 13.119913791383484), ('camera', 12.560723988055729), ('franc', 12.426242320756325), ('analog', 12.085934748812717), ('test', 11.880798613011809), ('anoth', 11.606394740614364), ('delusion', 11.324976575959763), ('vol', 10.664728033702119), ('tesla', 10.650853317974791)]
Вывод для всей выборки при отсечении стоп-слов и стемминге (TF-IDF):
train_data_tfidf_stem04 = tfidf.transform(train_data_stem04)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem04.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('thi', 129.32800919419773), ('wa', 76.97728386002947), ('ha', 53.69445658099784), ('use', 53.51224701143201), ('know', 43.32344772442169), ('like', 42.195836240882116), ('ani', 39.70519916099226), ('doe', 38.21698090733035), ('new', 35.93673401728441), ('just', 35.86760778006493), ('offer', 33.36406397481835), ('good', 32.747739647032375), ('say', 30.273363219549942), ('peopl', 30.205409363344643), ('hi', 29.733312069224763), ('veri', 29.326246895236878), ('make', 29.146452922239888), ('includ', 28.938623574701445), ('sale', 28.28901917050075), ('think', 27.98031918427612)]
Вывод для класса 0 при отсечении стоп-слов и стемминге (TF-IDF):
train_data_tfidf_stem05 = tfidf.transform(train_data_stem05)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem05.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
13
[('offer', 31.300454250625975), ('thi', 30.71294422915274), ('sale', 27.88415855254791), ('use', 26.19807462984792), ('new', 25.12228543439097), ('sell', 24.447046378171603), ('includ', 23.128480232840484), ('ship', 21.646752580610237), ('price', 20.066339982514304), ('condit', 17.982579325953647), ('ha', 16.277814070979513), ('pleas', 16.275703057169903), ('game', 15.77903267968755), ('look', 15.548768824682524), ('drive', 15.353737243885698), ('00', 15.151775703440967), ('like', 14.728363624494344), ('card', 13.724999418206192), ('ask', 13.488615400356453), ('manual', 12.707447575244652)]
Вывод для класса 1 при отсечении стоп-слов и стемминге (TF-IDF):
train_data_tfidf_stem06 = tfidf.transform(train_data_stem06)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem06.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('thi', 60.31506016065788), ('wa', 36.56870904199805), ('ha', 26.8877664273872), ('know', 23.287296067425668), ('ani', 21.266027064071608), ('use', 19.13538740405915), ('doe', 18.897663436655616), ('like', 17.903345304925175), ('just', 15.018185272421794), ('peopl', 14.586979652841226), ('think', 14.257205933281572), ('diseas', 14.082694098863271), ('pitt', 13.84867776057893), ('veri', 13.825231559868936), ('gordon', 13.784368382302274), ('patient', 13.763866772938552), ('soon', 13.194187022117266), ('intellect', 13.181920085996099), ('surrend', 13.121708345783816), ('geb', 13.047250414750424)]
Вывод для класса 2 при отсечении стоп-слов и стемминге (TF-IDF):
train_data_tfidf_stem07 = tfidf.transform(train_data_stem07)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tfidf_stem07.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('thi', 38.30000480438703), ('wa', 29.230707163320695), ('christian', 18.278353325066014), ('god', 18.1522753870565), ('say', 15.13522226440012), ('hi', 14.319276389373092), ('did', 12.83520308410148), ('know', 12.72928745538582), ('peopl', 12.715392601434948), ('jesu', 12.166534186801478), ('just', 11.962814393957474), ('doe', 11.693381611698983), ('think', 11.137948615830004), ('ha', 10.52887608263115), ('like', 9.564127311462622), ('believ', 8.826101977033428), ('point', 8.337902894547822), ('use', 8.178784977524954), ('good', 8.079523992870506), ('ani', 7.999518371547461)]
# Fit a transformer with idf weighting disabled (use_idf=False) on train_data,
# then apply it to the same matrix. The original fused both statements on one
# line, which is a SyntaxError.
tf = TfidfTransformer(use_idf=False).fit(train_data)
train_data_tf = tf.transform(train_data)
Вывод для всей выборки (TF):
14
# Pair every feature name with the sum of its column (axis=0) in train_data_tf.
# NOTE(review): the output recorded below starts with 'tone', 'trip', 'noon',
# while the Count table for the same data starts with 'the', 'of', 'to'. The
# labels look misaligned with the columns — confirm that vect is the vectorizer
# that actually produced train_data.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tone', 410.6466942091253), ('trip', 251.7142371449254), ('noon', 224.7563651678267), ('adjust', 213.50667797912118), ('ima', 171.8087951834235), ('harass', 154.16287320333123), ('epistl', 150.42899781013273), ('impact', 144.3601581889401), ('tomographi', 129.90078758447194), ('youth', 107.74646955575315), ('word', 76.73952240379423), ('forest', 74.76904182863568), ('trade', 73.98236585775292), ('agreement', 72.37175564019299), ('obviou', 70.35049990487211), ('naturalist', 65.67375220517485), ('armageddon', 63.48328583893221), ('grin', 57.567519020817166), ('nth', 54.056582370621975), ('aladdin', 50.53911703806337)]
Вывод для класса 0 (TF):
train_data_tf01 = tf.transform(train_data01)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf01.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tone', 96.58390854005752), ('epistl', 95.3261336507378), ('adjust', 72.56618853288931), ('trip', 68.6076816082005), ('harass', 47.77980505637252), ('noon', 46.35046197506701), ('impact', 39.032646000694484), ('ima', 35.699082341690136), ('obviou', 35.138785865799754), ('word', 33.91689638398363), ('youth', 32.98610807152292), ('forest', 31.33059776539425), ('shaw', 24.781641581778313), ('grin', 22.48655967632584), ('agreement', 21.895673322279677), ('trade', 20.766371909080622), ('macintosh', 20.657689863631422), ('motorcycl', 20.374454329778718), ('accton', 20.011925026927194), ('allergic', 19.119756075822647)]
Вывод для класса 1 (TF):
train_data_tf02 = tf.transform(train_data02)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf02.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tone', 188.0767804748418), ('trip', 115.21215212812078), ('noon', 110.67112192210904), ('ima', 89.98315588904559), ('adjust', 89.28622378616487), ('impact', 76.12173829264304), ('harass', 69.91959963457383), ('tomographi', 62.83599770461837), ('youth', 38.86540409957052), ('epistl', 36.81753725915403), ('trade', 33.88799150799945), ('armageddon', 31.780279373041203), ('agreement', 31.084212644712675), ('naturalist', 29.849046600200147), ('word', 27.92515164881687), ('forest', 27.465964484283415), ('nth', 24.889911285715417), ('aladdin', 23.762950253559957), ('miner', 23.631055152617392), ('bet', 22.74486509005074)]
15
Вывод для класса 2 (TF):
train_data_tf03 = tf.transform(train_data03)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf03.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tone', 125.98600519422573), ('trip', 67.89440340860428), ('noon', 67.73478127065086), ('adjust', 51.6542656600671), ('tomographi', 50.60526399005471), ('ima', 46.12655695268774), ('harass', 36.463468512385006), ('youth', 35.89495738465961), ('impact', 29.205773895602697), ('naturalist', 22.2608827293163), ('agreement', 19.39186967320067), ('trade', 19.328002440672854), ('armageddon', 19.290882428807176), ('epistl', 18.28532690024091), ('aladdin', 16.472927010282717), ('forest', 15.972479578958051), ('word', 14.897474370993647), ('bet', 14.011322719216535), ('formul', 13.872733047811504), ('wealth', 13.807224650294263)]
Вывод для всей выборки при отсечении стоп-слов (TF):
train_data_tf04 = tf.transform(train_data04)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf04.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('definit', 42.19836218494229), ('kuhn', 41.94033888116838), ('netcom', 38.49189557572986), ('jhu', 38.43222500691624), ('invalid', 36.441798363953794), ('pip', 36.20820897355665), ('difficulti', 34.694153609728126), ('sincer', 33.10807332706017), ('firmli', 32.20601101295828), ('thank', 28.946781478743482), ('deem', 26.955539997275125), ('threw', 26.0987325817474), ('olympu', 25.200411794386508), ('uucp', 25.15020882674478), ('00', 22.847118231150784), ('lizard', 22.24617480778731), ('uu', 21.080078313079557), ('financi', 21.00989280692224), ('hystaspes_', 20.947258397082738), ('somat', 20.37436327922064)]
Вывод для класса 0 при отсечении стоп-слов (TF):
train_data_tf05 = tf.transform(train_data05)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf05.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('sincer', 32.65483761325764), ('netcom', 26.531791875665423), ('olympu', 24.091705286864663), ('00', 22.292416466161598), ('somat', 20.22797546329431), ('chb', 17.557342544980045), ('discoveri', 16.378840818878814), ('ramada', 15.73734249317387), ('hystaspes_', 15.590855972723451), ('smartdrug', 15.570422707355517), ('uucp', 15.358457141843216), ('kuhn', 15.092333169147988), ('linux', 14.857875711021709), ('difficulti', 14.641828043137563), ('depriv', 14.286874745376672), ('alejandro', 13.878085620513344), ('10', 12.252398950200005),
16
('attent', 11.955520632053954), ('firmli', 11.362005986278653), ('lawrenc', 11.27035006975232)]
Вывод для класса 1 при отсечении стоп-слов (TF):
train_data_tf06 = tf.transform(train_data06)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf06.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('definit', 21.589878916084416), ('jhu', 21.04119998231627), ('difficulti', 19.297138878101315), ('kuhn', 17.73731563284006), ('pip', 17.405873900275285), ('invalid', 15.294268932063357), ('thank', 14.6495413220078), ('mouth', 14.026245731941556), ('threw', 13.778853949223942), ('fist', 13.314589187179175), ('deem', 12.997218833062192), ('apocalypt', 12.908770782465169), ('sponsorship', 12.801233869447662), ('predispos', 12.652476415348302), ('farrar', 12.59139169317015), ('firmli', 12.457034152057558), ('speak', 12.119701910783679), ('bro', 12.043005411898942), ('huxley', 12.043005411898942), ('music', 12.043005411898942)]
Вывод для класса 2 при отсечении стоп-слов (TF):
train_data_tf07 = tf.transform(train_data07)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf07.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('financi', 20.434219703353282), ('pip', 15.544106122265065), ('inner', 13.646184382428856), ('definit', 12.819186474289085), ('invalid', 12.485953360589166), ('thank', 11.524269384086313), ('jhu', 11.05933486881957), ('bundl', 9.897003235170516), ('kuhn', 9.110690079180278), ('sized', 8.816862412082582), ('deem', 8.790042992168944), ('firmli', 8.386970874622094), ('ct', 8.334783193358227), ('audibl', 7.864683989743237), ('buoyanc', 7.60326957536585), ('astonish', 7.558513996512791), ('warranti', 7.2329456679772575), ('sin', 6.865274433532172), ('threw', 6.785575289449005), ('prize', 6.488367064561185)]
# Fit a transformer with idf weighting disabled (use_idf=False) on
# train_data_stem, then apply it to the same matrix. The original fused both
# statements on one line, which is a SyntaxError.
tf = TfidfTransformer(use_idf=False).fit(train_data_stem)
train_data_tf_stem = tf.transform(train_data_stem)
Вывод для всей выборки при стемминге (TF):
# Pair every feature name with the sum of its column (axis=0) in train_data_tf_stem.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
17
[('tempera', 401.81008124918867), ('thrush', 246.43616002561686), ('nitya', 220.04264038039668), ('align', 208.92142892485631), ('indians', 171.16656294896478), ('honesti', 150.94494561031672), ('infanticid', 147.44374510244367), ('eyebal', 146.91482648292387), ('televis', 127.33554116137952), ('yogurt', 105.57423200403775), ('gold', 79.221066957697), ('wis', 75.18014089604114), ('test', 72.51020616551847), ('analog', 72.47888994113973), ('authur', 69.19216169860725), ('nth', 68.8353980802429), ('ncr', 66.23585307278694), ('hill', 56.35525307163919), ('delusion', 55.063274509705), ('norman', 53.153453200297555)]
Вывод для класса 0 при стемминге (TF):
train_data_tf_stem01 = tf.transform(train_data_stem01)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem01.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tempera', 94.04935491072541), ('eyebal', 92.96362352155147), ('align', 70.79112133765337), ('thrush', 66.92086119240213), ('honesti', 46.73861863339445), ('nitya', 45.20242858920657), ('infanticid', 39.78767019234924), ('indians', 35.220078052626214), ('nth', 34.332996471572976), ('wis', 33.205533346101404), ('yogurt', 32.2425480897156), ('gold', 31.14662753051662), ('sharon', 24.510514206247038), ('hill', 21.988642203421733), ('nl', 21.720431196979277), ('analog', 21.553662886201675), ('test', 20.28116495452412), ('lynch', 20.21743632025493), ('mp', 19.880550608315847), ('ahead', 19.315159932279066)]
Вывод для класса 1 при стемминге (TF):
train_data_tf_stem02 = tf.transform(train_data_stem02)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem02.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('tempera', 184.1512866766698), ('thrush', 112.87914637294261), ('nitya', 108.444862358752), ('indians', 89.89711469136624), ('align', 87.3596328802749), ('infanticid', 77.22934287223913), ('honesti', 68.46334225840928), ('televis', 61.52419000124554), ('yogurt', 38.0706252510112), ('eyebal', 35.98989783048909), ('authur', 34.32259962899431), ('test', 33.07011122143357), ('analog', 31.14306618264141), ('gold', 30.872613979039862), ('ncr', 29.897706867671953), ('wis', 27.37113007333662), ('delusion', 24.46925300850209), ('norman', 24.413594141530663), ('anoth', 23.21997127767892), ('mistaken', 23.12020789565872)]
Вывод для класса 2 при стемминге (TF):
train_data_tf_stem03 = tf.transform(train_data_stem03)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem03.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
18
[('tempera', 123.60943966179349), ('thrush', 66.63615246027227), ('nitya', 66.39534943243824), ('align', 50.77067470692797), ('televis', 49.63967938013265), ('indians', 46.04937020497246), ('honesti', 35.74298471851306), ('yogurt', 35.26105866331091), ('infanticid', 30.426732037855363), ('ncr', 22.75197862101842), ('authur', 22.2099437447239), ('analog', 19.78216087229663), ('test', 19.158929989560644), ('eyebal', 17.96130513088313), ('gold', 17.20182544814053), ('delusion', 16.466740215999895), ('anoth', 16.15824068007798), ('wis', 14.603477476603075), ('bloodlet', 13.774900026852691), ('vol', 13.695637430060259)]
Вывод для всей выборки при отсечении стоп-слов и стемминге (TF):
train_data_tf_stem04 = tf.transform(train_data_stem04)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem04.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('thi', 119.68411086611744), ('wa', 67.8130647845095), ('use', 53.0711584791655), ('ha', 51.626233026528816), ('like', 41.828280956114455), ('ani', 41.39152359405588), ('know', 37.9050419504865), ('new', 36.165670658450054), ('doe', 33.61151211578544), ('edu', 33.01669813235763), ('just', 33.004399253703504), ('peopl', 32.72368676176806), ('sale', 32.05565812608838), ('pleas', 31.290983063620995), ('offer', 30.698152509386688), ('think', 30.54565526782595), ('hi', 30.38787760294542), ('good', 30.26122588815749), ('time', 28.452666865132706), ('includ', 28.415345939750672)]
Вывод для класса 0 при отсечении стоп-слов и стемминге (TF):
train_data_tf_stem05 = tf.transform(train_data_stem05)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem05.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('sale', 31.591562426243993), ('offer', 28.805502550285986), ('thi', 28.28394413892743), ('use', 25.857801323838395), ('new', 25.0919622282373), ('includ', 22.68719596527167), ('00', 21.959919582798438), ('sell', 21.361979341324773), ('pleas', 20.843770936340054), ('ship', 19.873997035254465), ('price', 17.411116674387507), ('condit', 17.043775025500484), ('mail', 16.239637850800797), ('ask', 15.690863391115833), ('drive', 15.587622555928782), ('email', 15.549580299988085), ('ha', 15.531062831057048), ('like', 14.588858566159105), ('look', 14.151025390415866), ('edu', 14.086486221599179)]
Вывод для класса 1 при отсечении стоп-слов и стемминге (TF):
train_data_tf_stem06 = tf.transform(train_data_stem06)
# Feature names zipped with the per-column sums (axis=0) of the matrix.
x = list(
    zip(
        vect.get_feature_names(),
        np.ravel(train_data_tf_stem06.sum(axis=0)),
    )
)


def SortbyTF(inputStr):
    """Sort key: the second element of a (term, weight) pair."""
    weight = inputStr[1]
    return weight


# In-place descending sort by weight.
x.sort(reverse=True, key=SortbyTF)
19
# Show the first 20 entries of x (x was sorted in descending order just before).
print (x[:20])
[('thi', 55.971573297030275), ('wa', 32.31353830566072), ('ha', 25.95505265472963), ('ani', 22.33128544375682), ('know', 20.450948164730875), ('use', 19.056635939142186), ('edu', 18.263792739201925), ('like', 17.787831589506578), ('doe', 16.61238212946485), ('peopl', 15.890083890356072), ('think', 15.625262532822678), ('time', 14.685409516342581), ('just', 13.844498079975486), ('effect', 13.09896125699551), ('msg', 12.864695193945531), ('food', 12.85761355532246), ('gordon', 12.661294467723826), ('diseas', 12.49352264416372), ('bank', 12.40319784529103), ('caus', 12.312875825395574)]
Вывод для класса 2 при отсечении стоп-слов и стемминге (TF):
train_data_tf_stem07 = tf.transform(train_data_stem07)
# Pair every feature name with the sum of its column (axis=0) in the matrix.
# NOTE(review): assumes vect's vocabulary matches these columns — confirm.
x = list(zip(vect.get_feature_names(), np.ravel(train_data_tf_stem07.sum(axis=0))))


def SortbyTF(inputStr):
    """Return the second element (the summed weight) of a (term, weight) pair."""
    return inputStr[1]


# Largest weights first, then show the first 20 (term, weight) pairs.
x.sort(key=SortbyTF, reverse=True)
print(x[:20])
[('thi', 35.42859343015953), ('wa', 25.68486517557042), ('god', 18.952767252233688), ('christian', 17.988216233103145), ('hi', 14.550540659362783), ('peopl', 13.782319501874834), ('say', 13.270867799637418), ('jesu', 12.283127952715487), ('think', 12.093941266157671), ('did', 11.92943972135626), ('know', 11.153315193164616), ('just', 11.057138983847045), ('doe', 10.325487270114326), ('ha', 10.140117540742123), ('like', 9.451590800448765), ('believ', 9.016125161400216), ('ani', 8.342630762560646), ('moral', 8.242981256858124), ('use', 8.156721216184955), ('onli', 7.7708407333348095)]
6. Результаты пункта 5 в виде таблиц с наиболее частотными терминами обучающей выборки и каждого класса по отдельности.
Для всей выборки:
Без стемминга

| № | Count, без стоп-слов | Count, с стоп-словами | TF, без стоп-слов | TF, с стоп-словами | TF-IDF, без стоп-слов | TF-IDF, с стоп-словами |
|---|---|---|---|---|---|---|
| 1 | 00 | the | definit | tone | kuhn | tone |
| 2 | people | of | kuhn | trip | jhu | trip |
| 3 | new | to | netcom | noon | invalid | noon |
| 4 | edu | and | jhu | adjust | netcom | adjust |
| 5 | don | in | invalid | ima | pip | ima |
| 6 | like | is | pip | harass | difficulti | impact |
| 7 | good | that | difficulti | epistl | sincer | tomographi |
| 8 | just | it | sincer | impact | definit | harass |
| 9 | know | for | firmli | tomographi | olympu | epistl |
| 10 | 10 | you | thank | youth | firmli | youth |
| 11 | use | this | deem | word | threw | trade |
| 12 | god | are | threw | forest | deem | agreement |
| 13 | time | with | olympu | trade | uucp | forest |
| 14 | think | not | uucp | agreement | thank | word |
| 15 | does | have | 00 | obviou | lizard | naturalist |
20