# 基金的申购费和认购费有哪些不同?
import faiss
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Group the rows of fund_data.csv by the bag-of-words vectors of their
# 'Content' column and print the documents assigned to each cluster.

# Load the data.
data = pd.read_csv('fund_data.csv')

# Preprocessing: lower-case the text, then strip every character that is
# neither a word character nor whitespace.
data['Content'] = data['Content'].str.lower()
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently strip nothing.
data['Content'] = data['Content'].str.replace(r'[^\w\s]', '', regex=True)

# Build the text vectors: one row of term counts per document.
# NOTE(review): CountVectorizer's default tokenizer relies on word boundaries;
# if 'Content' holds unsegmented Chinese text a custom tokenizer is probably
# needed -- confirm against the data.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Content'])
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Compute the pairwise cosine-similarity matrix of the documents.
similarity_matrix = cosine_similarity(X)

# Cluster the document vectors with faiss k-means.
# faiss only accepts dense float32 input, so convert once and reuse the array.
vectors = np.ascontiguousarray(X.astype(np.float32).toarray())
k = 10  # number of clusters
k = min(k, vectors.shape[0])  # k-means cannot train more centroids than rows
kmeans = faiss.Kmeans(vectors.shape[1], k)
kmeans.train(vectors)
# kmeans.index is a flat L2 index over the k centroids; searching it for the
# single nearest centroid yields each document's cluster id in I[:, 0].
index = kmeans.index
D, I = index.search(vectors, 1)

# Print the clustering result.
labels = I[:, 0]
for i in range(k):
    # A boolean mask selects rows by position, independent of the index labels.
    cluster = data['Content'][labels == i].tolist()
    print(f"Cluster {i+1}:")
    print(cluster)
    print()
# - 上一篇:基金华夏红利今日净值