python文本聚类分析_python机器学习kmeans算法——对文本进行聚类分析
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : kmeans.py
# @Author: 田智凱
# @Date : 2020/3/19
# @Desc : k-means machine-learning algorithm; cluster analysis of scientific and technological achievement projects
from __future__ import print_function

import time
from collections import Counter

import matplotlib.pyplot as plt
import pymssql
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer


# Read the documents from the SQL Server database
def get_dbdata():
    """Load the keyword text of every row of ``julei_test`` from SQL Server.

    Runs ``select guanjianci from julei_test`` against the ``test`` database
    on 127.0.0.1 and collects the first column of each row. The loaded list
    is printed before it is returned.

    Returns:
        list: one entry per row, in the order the cursor yields them.
    """
    # NOTE(review): the connection settings (including the password
    # placeholder) are hard-coded; adjust them for your environment.
    conn_read = pymssql.connect("127.0.0.1", "sa", "###", "test", charset="GBK")
    try:
        cursor = conn_read.cursor()
        try:
            cursor.execute("select guanjianci from julei_test")
            dataset = [row[0] for row in cursor]
        finally:
            # Close the cursor even when the query fails.
            cursor.close()
    finally:
        # Close the connection even when the query fails.
        conn_read.close()
    print(dataset)
    return dataset


def transform(dataset, n_features=1000):
    """Turn the documents into a TF-IDF matrix.

    Args:
        dataset: iterable of text documents.
        n_features: value passed to the vectorizer as ``max_features``.

    Returns:
        tuple: ``(X, vectorizer)`` where ``X`` is the result of
        ``vectorizer.fit_transform(dataset)`` and ``vectorizer`` is the
        fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_df=0.5, max_features=n_features, min_df=2, use_idf=True
    )
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer


def train(X, vectorizer, true_k=10, minibatch=False, showLable=False):
    """Fit k-means on ``X`` and report the resulting clusters.

    Args:
        X: feature matrix as produced by ``transform``.
        vectorizer: the fitted vectorizer that produced ``X``; only used to
            look up term names when ``showLable`` is true.
        true_k: number of clusters.
        minibatch: train ``MiniBatchKMeans`` (sampled batches) instead of
            ``KMeans`` (full data).
        showLable: when true, print the ten highest-weighted terms of every
            cluster centre.

    Returns:
        The negated ``km.score(X)``; callers use it as an error measure.
    """
    # Train k-means either on sampled mini-batches or on the full data.
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)

    if showLable:
        print("Top terms per cluster:")
        # Column indices of each centre, sorted by descending weight.
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, "get_feature_names_out"):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()
        print(vectorizer.get_stop_words())
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                # The leading space keeps consecutive terms apart.
                print(' %s' % terms[ind], end='')
            print()

    result = list(km.predict(X))
    print('Cluster distribution:')
    # Counter tallies the labels in one pass and keeps first-seen key order.
    print(dict(Counter(result)))
    return -km.score(X)


# Choose the number of clusters k
def k_determin(k_min=3, k_max=200):
    """Sweep the number of clusters and plot the error to help choose k.

    Loads the documents, vectorizes them with ``n_features=500`` and trains
    one model per candidate k, printing and plotting the per-document score
    returned by ``train``.

    Args:
        k_min: first number of clusters to try.
        k_max: exclusive upper bound of the sweep; adjust it to the amount
            of data available.
    """
    dataset = get_dbdata()
    print("%d documents" % len(dataset))
    X, vectorizer = transform(dataset, n_features=500)

    # k-means cannot fit more clusters than there are documents, so the
    # sweep stops at the dataset size when that is below k_max.
    k_stop = min(k_max, len(dataset) + 1)

    true_ks = []
    scores = []
    for k in range(k_min, k_stop, 1):
        score = train(X, vectorizer, true_k=k) / len(dataset)
        print(k, score)
        true_ks.append(k)
        scores.append(score)

    plt.figure(figsize=(8, 4))
    plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
    # The x values are the cluster counts that were tried.
    plt.xlabel("n_clusters")
    plt.ylabel("error")
    plt.legend()
    plt.show()


def main(true_k=25):
    """Print the clustering result for the chosen number of clusters.

    Args:
        true_k: number of clusters to train with.
    """
    dataset = get_dbdata()
    X, vectorizer = transform(dataset, n_features=500)
    score = train(X, vectorizer, true_k=true_k, showLable=True) / len(dataset)
    print(score)


if __name__ == '__main__':
    start = time.time()
    # k_determin()  # run this first to decide on a value for k
    main()
    end = time.time()
    print('程序運行時間', end - start)
總結
以上是生活随笔為你收集整理的python文本聚类分析_python机器学习kmeans算法——对文本进行聚类分析的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: winform mysql 工具类_C#
- 下一篇: Java单例模式的七种写法