|
2威望
本帖最后由 waston_yy 于 2023-3-21 13:45 编辑
代码如下:- import sys,os
- from sklearn.cluster import KMeans
- from sklearn import metrics
- from sklearn.decomposition import PCA
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import math
- from scipy import spatial
- def getkeywords_kmeans(data,topK):
- words = data["word"] # 词汇
- vecs = data.iloc[:, 1:] # 向量表示
- i=2
- kmeans = KMeans(n_clusters=i).fit(vecs)
- labels = kmeans.labels_ # 类别结果标签
- labels = pd.DataFrame(labels, columns=['label'])
- new_df = pd.concat([labels, vecs], axis=1)
- df_count_type = new_df.groupby('label').size() # 各类别统计个数
- # print df_count_type
- vec_center = kmeans.cluster_centers_ # 聚类中心
- #vec_words = np.array(vecs) # 候选关键词向量,dataFrame转array
- quantity = pd.Series(kmeans.labels_).value_counts()
- for num in range(len(quantity)) : #0~1个簇类
- vec_center = vec_center[num]
- length = 200
- distances = []
- res0Series = pd.Series(kmeans.labels_)
- res0 = res0Series[res0Series.values == num]
- res = vecs.iloc[res0.index]
- #res = res.reset_index(drop=True)#71个词
- words_num=len(res)
- res_word=np.array(res)
- for word in range(words_num):#0~171个词
- res = res_word[word]
- dis = 0 # 向量距离
- for index in range(length):#0~200维
- #计算每一个簇中每一个词在每一维与聚类中心的距离(欧氏距离)
- dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
- dis_ture=np.sqrt(dis)
- distances.append(dis_ture)
- distances = pd.DataFrame(distances, columns=['dis'])
- result = pd.concat([words, labels, distances], axis=1) # 拼接词语与其对应中心点的距离
- result = result.sort_values(by="dis", ascending=True) # 按照距离大小进行升序排序
- def main():
- # 读取数据集
- dataFile = 'D:/STUDY/1BIYESHEJI/keyword_extraction-master/data/data1.csv'
- articleData = pd.read_csv(dataFile,encoding='utf-8')
- ids, titles, keys = [], [], []
- rootdir = "D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/vecs" # 词向量文件根目录
- fileList = os.listdir(rootdir) #列出文件夹下所有的目录与文件
- # 遍历文件
- for i in range(len(fileList)):
- filename = fileList[i]
- path = os.path.join(rootdir,filename)
- if os.path.isfile(path):
- data = pd.read_csv(path, encoding='utf-8') # 读取词向量文件数据
- #print(data)
- artile_keys = getkeywords_kmeans(data,10) # 聚类算法得到当前文件的关键词
- print(artile_keys)
- # 根据文件名获得文章id以及标题
- (shortname, extension) = os.path.splitext(filename) # 得到文件名和文件扩展名
- t = shortname.split("_")
- article_id = int(t[len(t)-1]) # 获得文章id
- artile_tit = articleData[articleData.id==article_id]['title'] # 获得文章标题
- artile_tit = list(artile_tit)[0] # series转成字符串
- ids.append(article_id)
- titles.append(artile_tit)
- keys.append(artile_keys)
- # 所有结果写入文件
- result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
- result = result.sort_values(by="id",ascending=True) # 排序
- result.to_csv("D:/STUDY/1BIYESHEJI/keyword_extraction-master/result/news1_keys_word2vec1.csv", encoding="gbk",index=False)
- if __name__ == '__main__':
- main()
复制代码 错误显示如下
- Traceback (most recent call last):
- File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 81, in <module>
- main()
- File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 64, in main
- artile_keys = getkeywords_kmeans(data,10) # 聚类算法得到当前文件的关键词
- File "D:/STUDY/1BIYESHEJI/keyword_extraction-master/wiki_zh_word2vec-master/kmeans.py", line 41, in getkeywords_kmeans
- dis += (vec_center[index] - res[index]) * (vec_center[index] - res[index])
- IndexError: invalid index to scalar variable.
复制代码 还请大佬们指点
|
|