## 1. PCA + K-means Clustering: project the data onto its first two principal components with PCA, then cluster the projected points with K-means.
```python
from flask import Flask, request, jsonify
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)

def pca_kmeans_analysis(df):
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df)
    # Reduce to two dimensions with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    # Cluster the projected points with KMeans
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
    kmeans.fit(X_pca)
    # One colour per cluster
    colors = ['r', 'g', 'b']
    # Plot the result on a single axes
    fig, ax = plt.subplots(figsize=(15, 5))
    if kmeans.n_clusters <= len(colors):
        # Draw each cluster
        for i in range(kmeans.n_clusters):
            cluster_members = X_pca[kmeans.labels_ == i]
            ax.scatter(cluster_members[:, 0], cluster_members[:, 1],
                       c=colors[i], label=f'Cluster {i + 1}')
        # Draw the centroids
        ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                   s=300, c='k', marker='x', label='Centroids')
        # Title and legend
        ax.set_title('PCA and KMeans Clustering')
        ax.legend()
    else:
        print("Number of clusters exceeds the number of predefined colors.")
    return fig, X_pca, kmeans

@app.route('/pca_kmeans', methods=['POST'])
def pca_kmeans():
    try:
        # The request body must contain the dataset URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})
        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')
        # Load the dataset
        df = pd.read_csv(file_url)
        # Run the PCA + KMeans analysis
        fig, X_pca, kmeans = pca_kmeans_analysis(df)
        # Save the figure, then close it to release resources
        fig.savefig(result_img_path)
        plt.close(fig)
        # Save the PCA coordinates and cluster labels to CSV
        # (reusing the already-projected X_pca; the original transformed the
        # raw DataFrame, which skipped the scaling step the PCA was fit on)
        pca_df = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
        pca_df['Cluster'] = kmeans.labels_
        pca_df.to_csv(result_file_path, index=False)
        # Report success; the client already knows the output paths it supplied
        return jsonify({
            'code': 200,
            'msg': 'PCA and KMeans analysis completed',
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
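
For reference, a minimal sketch of how a client could call this endpoint once the service is running. The dataset URL and output paths below are illustrative assumptions, not values the service provides:

```python
import requests

# Hypothetical input URL and output paths -- adjust to your environment
payload = {
    'file_url': 'https://example.com/data/metrics.csv',
    'result_file_path': '/tmp/pca_kmeans_result.csv',
    'result_img_path': '/tmp/pca_kmeans_plot.png',
}
resp = requests.post('http://localhost:10003/pca_kmeans', json=payload)
print(resp.json())  # expected: {'code': 200, 'msg': 'PCA and KMeans analysis completed'}
```

The remaining endpoints (`/autoencoder`, `/gmm_clustering`, `/iforestation`, `/vae`) accept the same three fields.
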
## 2. AutoEncoder: an unsupervised neural network model that performs feature learning by learning an effective representation of the input data.
```python
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model

def train_autoencoder(X_scaled):
    input_dim = X_scaled.shape[1]  # input dimensionality
    encoding_dim = 8               # size of the encoded representation
    # Input layer
    input_layer = Input(shape=(input_dim,))
    # Encoder
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    # Decoder
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    # Assemble and compile the autoencoder
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    # Train the autoencoder to reconstruct its own input
    autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=256, shuffle=True)
    # Encoder submodel, exposing the compressed 8-dimensional features
    encoder = Model(input_layer, encoded)
    X_encoded = encoder.predict(X_scaled)
    # Reconstruct from the original input: the full autoencoder expects
    # the input space, not the encoded space
    X_decoded = autoencoder.predict(X_scaled)
    # Per-sample reconstruction error
    reconstruction_error = np.mean(np.square(X_scaled - X_decoded), axis=1)
    return reconstruction_error, autoencoder

@app.route('/autoencoder', methods=['POST'])
def autoencoder_anomaly_detection():
    try:
        # The request body must contain the dataset URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})
        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')
        # Load the dataset
        df = pd.read_csv(file_url)
        # Scale the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)
        # Train the autoencoder and compute reconstruction errors
        reconstruction_error, autoencoder = train_autoencoder(X_scaled)
        # Plot the distribution of reconstruction errors
        plt.hist(reconstruction_error, bins=50, color='blue', alpha=0.7)
        plt.xlabel('Reconstruction Error')
        plt.ylabel('Frequency')
        plt.title('Reconstruction Error Distribution')
        plt.savefig(result_img_path)
        plt.close()  # close the figure to release resources
        # Save the reconstruction errors to CSV
        reconstruction_error_df = pd.DataFrame(reconstruction_error,
                                               columns=['Reconstruction_Error'])
        reconstruction_error_df.to_csv(result_file_path, index=False)
        # Flag points whose error exceeds mean + 2 * std as anomalies
        mean = np.mean(reconstruction_error)
        std = np.std(reconstruction_error)
        threshold = mean + 2 * std
        outliers = reconstruction_error > threshold
        abnormal_indices = np.where(outliers)[0]
        # Return the indices of the anomalous points (cast to plain int so
        # jsonify can serialize them)
        return jsonify({
            'code': 200,
            'msg': 'Autoencoder training and anomaly detection completed',
            'abnormalIndices': [int(i) for i in abnormal_indices]
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
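
The cutoff used above is the usual two-sigma rule on the per-sample reconstruction error. With $d$ features, the error for sample $x_i$ and the threshold $\tau$ are:

$$
e_i = \frac{1}{d} \sum_{j=1}^{d} \left( x_{ij} - \hat{x}_{ij} \right)^2,
\qquad
\tau = \mu_e + 2\sigma_e,
$$

where $\mu_e$ and $\sigma_e$ are the mean and standard deviation of the errors; samples with $e_i > \tau$ are reported as anomalies.
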
## 3. Gaussian Mixture Model (GMM): assumes the data is generated as a mixture of several Gaussian distributions and uses the EM algorithm to estimate each Gaussian's parameters, thereby clustering the data.
```python
from sklearn.mixture import GaussianMixture

def perform_gmm_clustering(X_scaled):
    # Reduce to two dimensions with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    # Fit a two-component GMM
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_pca)
    # Assign each point to its most likely component
    predicted_labels = gmm.predict(X_pca)
    # Plot the clusters and the component means
    plt.figure(figsize=(15, 5))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=predicted_labels,
                          cmap=plt.cm.viridis)
    centroids = plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1],
                            s=300, c='k', marker='x')
    plt.title('Gaussian Mixture Model Clustering')
    # A list passed as `label` is not a valid legend spec; build the cluster
    # legend from the scatter's colour mapping instead
    handles, _ = scatter.legend_elements()
    plt.legend(handles + [centroids],
               [f'Cluster {i + 1}' for i in range(len(handles))] + ['Centroids'])
    return gmm, X_pca, predicted_labels, plt

@app.route('/gmm_clustering', methods=['POST'])
def gmm_clustering():
    try:
        # The request body must contain the dataset URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})
        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')
        # Load the dataset
        df = pd.read_csv(file_url)
        # Scale the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)
        # Run the GMM clustering analysis
        gmm, X_pca, predicted_labels, plt_object = perform_gmm_clustering(X_scaled)
        # Save the figure, then close it to release resources
        plt_object.savefig(result_img_path)
        plt_object.close()
        # Save the PCA coordinates and cluster labels to CSV
        gmm_df = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
        gmm_df['Cluster'] = predicted_labels
        gmm_df.to_csv(result_file_path, index=False)
        # Report success; the client already knows the output paths it supplied
        return jsonify({
            'code': 200,
            'msg': 'GMM clustering completed',
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
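
For reference, the GMM models the projected points as draws from a mixture of $K$ Gaussian components whose weights, means, and covariances are estimated by EM (here $K = 2$):

$$
p(x) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\!\left(x \mid \mu_k, \Sigma_k\right),
\qquad
\sum_{k=1}^{K} \pi_k = 1,
$$

and `gmm.predict` assigns each point to the component with the highest posterior probability.
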
## 4. Isolation Forest: an anomaly detection algorithm that "isolates" outliers by randomly selecting features and split values.
```python
from sklearn.ensemble import IsolationForest

def perform_iforestation(X_scaled):
    # Reduce to two dimensions with PCA (for plotting only)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    # Fit the Isolation Forest on the full feature space
    iforest = IsolationForest(n_estimators=100, contamination=0.1,
                              random_state=42)
    iforest.fit(X_scaled)
    # Anomaly scores: lower (negative) values are more anomalous
    scores = iforest.decision_function(X_scaled)
    # Use the model's own decision, not a median cut (a median cut would
    # always flag half the data): predict() returns -1 for anomalies and
    # 1 for normal points; map that to 1 = anomaly, 0 = normal
    predicted_labels = (iforest.predict(X_scaled) == -1).astype(int)
    # Plot normal points and anomalies in separate colours
    plt.figure(figsize=(15, 5))
    normal = predicted_labels == 0
    plt.scatter(X_pca[normal, 0], X_pca[normal, 1], c='green', label='Normal')
    plt.scatter(X_pca[~normal, 0], X_pca[~normal, 1], c='red', label='Anomaly')
    plt.title('Isolation Forest Anomaly Detection')
    plt.legend()
    return X_pca, scores, plt, predicted_labels

@app.route('/iforestation', methods=['POST'])
def iforestation():
    try:
        # The request body must contain the dataset URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})
        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')
        # Load the dataset
        df = pd.read_csv(file_url)
        # Scale the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)
        # Run the Isolation Forest analysis
        X_pca, scores, plt_object, predicted_labels = perform_iforestation(X_scaled)
        # Save the figure, then close it to release resources
        plt_object.savefig(result_img_path)
        plt_object.close()
        # Save the scores and labels to CSV
        iforest_df = pd.DataFrame({
            'PCA1': X_pca[:, 0],
            'PCA2': X_pca[:, 1],
            'Anomaly_Score': scores,
            'Anomaly_Label': predicted_labels
        })
        iforest_df.to_csv(result_file_path, index=False)
        # Report success; the client already knows the output paths it supplied
        return jsonify({
            'code': 200,
            'msg': 'Isolation Forest analysis completed',
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
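
A side note on the parameters above: `contamination=0.1` asks scikit-learn to place the decision threshold so that roughly 10% of the training points are flagged, and `decision_function` returns scores relative to that threshold, with negative values corresponding to `predict() == -1`. A minimal self-contained sketch on synthetic data, with all values illustrative:

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = np.vstack([
    rng.normal(0, 1, (95, 2)),    # a tight cluster of inliers
    rng.uniform(-6, 6, (5, 2)),   # a few scattered outliers
])
iforest = IsolationForest(contamination=0.1, random_state=42).fit(X)
labels = iforest.predict(X)            # 1 = normal, -1 = anomaly
scores = iforest.decision_function(X)  # negative <=> flagged
print((labels == -1).sum())            # roughly 10 of 100 points flagged
print(np.array_equal(labels == -1, scores < 0))  # the two views agree: True
```
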
## 5. Variational AutoEncoder (VAE): a generative model that performs feature learning by learning a probabilistic representation of the input data.
```python
from keras.layers import Lambda
from keras import backend as K

def build_vae_model(input_dim, intermediate_dim):
    # Encoder: map the input to the mean and log-variance of the latent code
    input_layer = Input(shape=(input_dim,), name='encoder_input')
    x = Dense(intermediate_dim, activation='relu')(input_layer)
    z_mean = Dense(input_dim, name='z_mean')(x)
    z_log_var = Dense(input_dim, name='z_log_var')(x)

    # Reparameterization trick: z = mean + std * epsilon
    def sampling(args):
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

    z = Lambda(sampling, output_shape=(input_dim,), name='z')([z_mean, z_log_var])
    encoder = Model(input_layer, [z_mean, z_log_var, z], name='encoder')

    # Decoder: linear output, since the standardized inputs are unbounded
    # (a sigmoid output with binary cross-entropy assumes data in [0, 1])
    decoder_input = Input(shape=(input_dim,), name='decoder_input')
    x = Dense(intermediate_dim, activation='relu')(decoder_input)
    x = Dense(input_dim)(x)
    decoder = Model(decoder_input, x, name='decoder')

    # End-to-end VAE
    z_mean, z_log_var, z = encoder(input_layer)
    x_decoded = decoder(z)
    vae = Model(input_layer, [x_decoded, z_mean, z_log_var])

    # Loss = reconstruction error (MSE) + KL divergence to the unit Gaussian
    reconstruction_loss = K.mean(K.square(input_layer - x_decoded), axis=-1)
    reconstruction_loss *= input_dim
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    vae.compile(optimizer='adam')
    return vae, encoder, decoder

@app.route('/vae', methods=['POST'])
def vae_anomaly_detection():
    try:
        # The request body must contain the dataset URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})
        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')
        # Load and scale the dataset
        df = pd.read_csv(file_url)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)
        # Build and train the VAE (the loss is attached via add_loss,
        # so fit() needs no targets)
        input_dim = X_scaled.shape[1]
        intermediate_dim = 64
        vae, _, _ = build_vae_model(input_dim, intermediate_dim)
        vae.fit(X_scaled, epochs=50, batch_size=128, shuffle=True)
        # The first model output is the reconstruction
        X_decoded = vae.predict(X_scaled)[0]
        # Per-sample reconstruction error
        reconstruction_error = np.mean(np.square(X_scaled - X_decoded), axis=1)
        # Plot the distribution of reconstruction errors
        plt.figure(figsize=(10, 6))
        plt.hist(reconstruction_error, bins=50, color='blue', alpha=0.7)
        plt.xlabel('Reconstruction Error')
        plt.ylabel('Frequency')
        plt.title('Reconstruction Error Distribution')
        plt.savefig(result_img_path)
        plt.close()
        # Save the reconstruction errors to CSV
        reconstruction_error_df = pd.DataFrame(
            reconstruction_error, columns=['Reconstruction_Error'])
        reconstruction_error_df.to_csv(result_file_path, index=False)
        # Flag points whose error exceeds mean + 2 * std as anomalies
        mean = np.mean(reconstruction_error)
        std = np.std(reconstruction_error)
        threshold = mean + 2 * std
        outliers = reconstruction_error > threshold
        return jsonify({
            'code': 200,
            'msg': 'VAE training and anomaly detection completed',
            'abnormalIndices': [int(i) for i in np.where(outliers)[0]]
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
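
The loss built in `build_vae_model` is the negative evidence lower bound: a reconstruction term plus the closed-form KL divergence between the learned Gaussian posterior $\mathcal{N}(\mu, \sigma^2)$ and the standard normal prior, which is exactly what the `kl_loss` lines compute:

$$
\mathcal{L} = \sum_{j=1}^{d} \left( x_j - \hat{x}_j \right)^2
\;-\; \frac{1}{2} \sum_{j=1}^{d} \left( 1 + \log \sigma_j^2 - \mu_j^2 - \sigma_j^2 \right),
$$

with $\log \sigma_j^2$ corresponding to `z_log_var` in the code.
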
```python
if __name__ == '__main__':
    app.run(debug=True, port=10003, host='0.0.0.0')
```