## 1. PCA + K-means Clustering

```python
from flask import Flask, request, jsonify

import matplotlib
matplotlib.use('Agg')  # non-interactive backend, required when plotting inside a web server
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)


def pca_kmeans_analysis(df):
    # Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # K-means clustering
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
    kmeans.fit(X_pca)

    # One color per cluster
    colors = ['r', 'g', 'b']

    # Plot the result in a single subplot
    fig, ax = plt.subplots(figsize=(15, 5))
    if kmeans.n_clusters <= len(colors):
        # Plot the members of each cluster
        for i in range(kmeans.n_clusters):
            cluster_members = X_pca[kmeans.labels_ == i]
            ax.scatter(cluster_members[:, 0], cluster_members[:, 1],
                       c=colors[i], label=f'Cluster {i + 1}')
        # Plot the centroids
        ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                   s=300, c='k', marker='x', label='Centroids')
        # Title and legend
        ax.set_title('PCA and KMeans Clustering')
        ax.legend()
    else:
        print("The number of clusters exceeds the number of predefined colors.")

    return fig, pca, kmeans, X_pca


@app.route('/pca_kmeans', methods=['POST'])
def pca_kmeans():
    try:
        # Make sure the request body contains a file URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')

        # Load the dataset
        df = pd.read_csv(file_url)

        # Run the PCA + K-means analysis
        fig, pca, kmeans, X_pca = pca_kmeans_analysis(df)

        # Save the figure, then close it to free resources
        fig.savefig(result_img_path)
        plt.close(fig)

        # Save the PCA coordinates and cluster labels to CSV.
        # Note: use X_pca (computed from the *scaled* data) rather than
        # pca.transform(df), which would apply PCA to unscaled data.
        pca_df = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
        pca_df['Cluster'] = kmeans.labels_
        pca_df.to_csv(result_file_path, index=False)

        # Report success; the result CSV and plot were written to the requested paths
        return jsonify({
            'code': 200,
            'msg': 'PCA and KMeans analysis completed'
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```

## 2. AutoEncoder

An unsupervised neural network model that performs feature learning by learning an efficient representation of the input data.

```python
from keras.layers import Dense, Input
from keras.models import Model


def train_autoencoder(X_scaled):
    input_dim = X_scaled.shape[1]  # input dimension
    encoding_dim = 8               # encoding dimension

    # Input layer
    input_layer = Input(shape=(input_dim,))
    # Encoder layer
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    # Decoder layer; 'linear' rather than 'sigmoid' because the
    # standardized inputs are not restricted to [0, 1]
    decoded = Dense(input_dim, activation='linear')(encoded)

    # Build and compile the autoencoder
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    # Train the autoencoder to reconstruct its own input
    autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=256, shuffle=True)

    # Encoder model, useful for extracting the learned 8-dimensional features
    encoder = Model(input_layer, encoded)

    # Reconstruct the data. The reconstruction comes from the full autoencoder
    # applied to the original input, not to the encoded features, whose
    # dimensionality would not match the autoencoder's input layer.
    X_decoded = autoencoder.predict(X_scaled)

    # Per-sample reconstruction error
    reconstruction_error = np.mean(np.square(X_scaled - X_decoded), axis=1)
    return reconstruction_error, autoencoder


@app.route('/autoencoder', methods=['POST'])
def autoencoder_anomaly_detection():
    try:
        # Make sure the request body contains a file URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')

        # Load the dataset
        df = pd.read_csv(file_url)

        # Feature scaling
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)

        # Train the autoencoder and get the reconstruction error
        reconstruction_error, autoencoder = train_autoencoder(X_scaled)

        # Plot the distribution of the reconstruction error
        plt.hist(reconstruction_error, bins=50, color='blue', alpha=0.7)
        plt.xlabel('Reconstruction Error')
        plt.ylabel('Frequency')
        plt.title('Reconstruction Error Distribution')
        plt.savefig(result_img_path)
        plt.close()  # close the figure to free resources

        # Save the reconstruction error to CSV
        reconstruction_error_df = pd.DataFrame(reconstruction_error,
                                               columns=['Reconstruction_Error'])
        reconstruction_error_df.to_csv(result_file_path, index=False)

        # Anomaly detection based on the reconstruction error: points more
        # than two standard deviations above the mean are flagged
        mean = np.mean(reconstruction_error)
        std = np.std(reconstruction_error)
        threshold = mean + 2 * std
        outliers = reconstruction_error > threshold

        # Indices of the anomalous points (plain ints, so jsonify can serialize them)
        abnormal_indices = np.where(outliers)[0]

        return jsonify({
            'code': 200,
            'msg': 'Autoencoder training and anomaly detection completed',
            'abnormalIndices': abnormal_indices.tolist()
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
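Every route in this service accepts the same JSON body. As a quick illustration of how a client would call the two endpoints defined so far, here is a minimal sketch using the `requests` library; the host, CSV path, and result paths are placeholders for illustration, not part of the service itself.

```python
import requests

# Hypothetical paths for illustration; replace with real locations.
payload = {
    'file_url': 'data/sensor_readings.csv',          # CSV with numeric columns only
    'result_file_path': 'results/reconstruction_error.csv',
    'result_img_path': 'results/reconstruction_error.png'
}

# Port 10003 matches the app.run() call at the bottom of this script
resp = requests.post('http://127.0.0.1:10003/autoencoder', json=payload)
print(resp.json())  # e.g. {'code': 200, 'msg': '...', 'abnormalIndices': [...]}
```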
## 3. Gaussian Mixture Model (GMM)

Assumes the data is generated by a mixture of several Gaussian distributions and uses the EM algorithm to estimate the parameters of each Gaussian, thereby clustering the data.

```python
from sklearn.mixture import GaussianMixture


def perform_gmm_clustering(X_scaled):
    # Dimensionality reduction with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # GMM clustering
    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(X_pca)

    # Predict a cluster label for each data point
    predicted_labels = gmm.predict(X_pca)

    # Plot the result, one scatter call per cluster so the legend is correct
    fig = plt.figure(figsize=(15, 5))
    for i in range(gmm.n_components):
        members = X_pca[predicted_labels == i]
        plt.scatter(members[:, 0], members[:, 1], label=f'Cluster {i + 1}')
    plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1], s=300, c='k',
                marker='x', label='Centroids')
    plt.title('Gaussian Mixture Model Clustering')
    plt.legend()

    return gmm, pca, fig


@app.route('/gmm_clustering', methods=['POST'])
def gmm_clustering():
    try:
        # Make sure the request body contains a file URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')

        # Load the dataset
        df = pd.read_csv(file_url)

        # Feature scaling
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)

        # Run the GMM clustering analysis
        gmm, pca, fig = perform_gmm_clustering(X_scaled)

        # Save the figure, then close it to free resources
        fig.savefig(result_img_path)
        plt.close(fig)

        # Save the GMM results to CSV
        X_pca = pca.transform(X_scaled)
        gmm_df = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
        gmm_df['Cluster'] = gmm.predict(X_pca)
        gmm_df.to_csv(result_file_path, index=False)

        return jsonify({
            'code': 200,
            'msg': 'GMM clustering completed',
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```

## 4. Isolation Forest

An anomaly detection algorithm that "isolates" anomalies by randomly selecting features and split values.

```python
from sklearn.ensemble import IsolationForest


def perform_iforestation(X_scaled):
    # PCA is used here for visualization only;
    # the forest itself is fit on the full feature space
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Isolation Forest anomaly detection
    iforest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
    iforest.fit(X_scaled)

    # predict() returns 1 for normal points and -1 for anomalies, honoring the
    # contamination rate; convert to 0 = normal, 1 = anomaly
    predicted_labels = (iforest.predict(X_scaled) == -1).astype(int)

    # Plot normal and anomalous points in different colors
    fig = plt.figure(figsize=(15, 5))
    normal = X_pca[predicted_labels == 0]
    anomaly = X_pca[predicted_labels == 1]
    plt.scatter(normal[:, 0], normal[:, 1], c='green', label='Normal')
    plt.scatter(anomaly[:, 0], anomaly[:, 1], c='red', label='Anomaly')
    plt.title('Isolation Forest Anomaly Detection')
    plt.legend()

    return pca, iforest, fig, predicted_labels


@app.route('/iforestation', methods=['POST'])
def iforestation():
    try:
        # Make sure the request body contains a file URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')

        # Load the dataset
        df = pd.read_csv(file_url)

        # Feature scaling
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)

        # Run the Isolation Forest analysis
        pca, iforest, fig, predicted_labels = perform_iforestation(X_scaled)

        # Save the figure, then close it to free resources
        fig.savefig(result_img_path)
        plt.close(fig)

        # Save the Isolation Forest results to CSV
        X_pca = pca.transform(X_scaled)
        iforest_df = pd.DataFrame({
            'PCA1': X_pca[:, 0],
            'PCA2': X_pca[:, 1],
            'Anomaly_Score': iforest.decision_function(X_scaled),
            'Anomaly_Label': predicted_labels
        })
        iforest_df.to_csv(result_file_path, index=False)

        return jsonify({
            'code': 200,
            'msg': 'Isolation Forest analysis completed',
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
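A note on the labeling step above: scikit-learn keeps `decision_function` and `predict` consistent, so the points `predict` flags as anomalies (label -1) are exactly those whose score falls below zero, and `contamination` controls where that zero line sits. A self-contained sketch on synthetic data (all names are local to the example):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, size=(200, 2)),     # tight cluster of inliers
               rng.uniform(-6, 6, size=(20, 2))])   # scattered outliers

iforest = IsolationForest(n_estimators=100, contamination=0.1,
                          random_state=42).fit(X)
scores = iforest.decision_function(X)   # higher = more normal
labels = iforest.predict(X)             # 1 = normal, -1 = anomaly

# predict() flags exactly the points whose shifted score falls below zero
assert np.array_equal(labels == -1, scores < 0)
print(f'{(labels == -1).sum()} of {len(X)} points flagged as anomalies')
```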
## 5. Variational AutoEncoder (VAE)

A generative model that performs feature learning by learning a probabilistic representation of the input data.

```python
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K


def build_vae_model(input_dim, intermediate_dim):
    # Encoder: input -> hidden -> (z_mean, z_log_var)
    input_layer = Input(shape=(input_dim,), name='encoder_input')
    x = Dense(intermediate_dim, activation='relu')(input_layer)
    z_mean = Dense(input_dim, name='z_mean')(x)
    z_log_var = Dense(input_dim, name='z_log_var')(x)

    # Reparameterization trick: z = mean + sigma * epsilon
    def sampling(args):
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

    z = Lambda(sampling, output_shape=(input_dim,), name='z')([z_mean, z_log_var])
    encoder = Model(input_layer, [z_mean, z_log_var, z], name='encoder')

    # Decoder: z -> hidden -> reconstruction. A linear output is used because
    # the standardized inputs are not restricted to [0, 1].
    decoder_input = Input(shape=(input_dim,), name='decoder_input')
    x = Dense(intermediate_dim, activation='relu')(decoder_input)
    x = Dense(input_dim, activation='linear')(x)
    decoder = Model(decoder_input, x, name='decoder')

    # End-to-end VAE
    z_mean, z_log_var, z = encoder(input_layer)
    x_decoded = decoder(z)
    vae = Model(input_layer, [x_decoded, z_mean, z_log_var])

    # Reconstruction loss: squared error summed over features
    # (binary cross-entropy would assume inputs in [0, 1])
    reconstruction_loss = K.sum(K.square(input_layer - x_decoded), axis=-1)

    # KL divergence between the approximate posterior and the standard normal prior
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5

    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    vae.compile(optimizer='adam')
    return vae, encoder, decoder


@app.route('/vae', methods=['POST'])
def vae_anomaly_detection():
    try:
        # Make sure the request body contains a file URL
        data = request.json
        if 'file_url' not in data:
            return jsonify({'code': 400, 'msg': 'No file URL provided'})

        file_url = data.get('file_url')
        result_file_path = data.get('result_file_path')
        result_img_path = data.get('result_img_path')

        # Load and scale the dataset
        df = pd.read_csv(file_url)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df)

        input_dim = X_scaled.shape[1]
        intermediate_dim = 64

        # Build and train the VAE; the loss was attached via add_loss,
        # so no targets are passed to fit()
        vae, _, _ = build_vae_model(input_dim, intermediate_dim)
        vae.fit(X_scaled, epochs=50, batch_size=128, shuffle=True)

        # The model outputs [x_decoded, z_mean, z_log_var]; take the reconstruction
        X_decoded = vae.predict(X_scaled)[0]
        reconstruction_error = np.mean(np.square(X_scaled - X_decoded), axis=1)

        # Plot the distribution of the reconstruction error
        plt.figure(figsize=(10, 6))
        plt.hist(reconstruction_error, bins=50, color='blue', alpha=0.7)
        plt.xlabel('Reconstruction Error')
        plt.ylabel('Frequency')
        plt.title('Reconstruction Error Distribution')
        plt.savefig(result_img_path)
        plt.close()  # close the figure to free resources

        # Save the reconstruction error to CSV
        reconstruction_error_df = pd.DataFrame(reconstruction_error,
                                               columns=['Reconstruction_Error'])
        reconstruction_error_df.to_csv(result_file_path, index=False)

        # Flag points more than two standard deviations above the mean error
        mean = np.mean(reconstruction_error)
        std = np.std(reconstruction_error)
        threshold = mean + 2 * std
        outliers = reconstruction_error > threshold

        return jsonify({
            'code': 200,
            'msg': 'VAE training and anomaly detection completed',
            'abnormalIndices': np.where(outliers)[0].tolist()
        })
    except Exception as e:
        return jsonify({
            'code': 500,
            'msg': str(e)
        })
```
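The `kl_loss` lines in `build_vae_model` implement the closed-form KL divergence between the learned diagonal-Gaussian posterior and the standard normal prior, with `z_log_var` standing in for $\log\sigma^2$; written out, the expression the code computes per sample is:

$$
\mathrm{KL}\bigl(\mathcal{N}(\mu,\sigma^{2})\,\|\,\mathcal{N}(0,I)\bigr)
= -\frac{1}{2}\sum_{j=1}^{d}\left(1+\log\sigma_{j}^{2}-\mu_{j}^{2}-\sigma_{j}^{2}\right)
$$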
Finally, start the service:

```python
if __name__ == '__main__':
    app.run(debug=True, port=10003)
```
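For a quick smoke test without a live server, Flask's built-in test client can drive any of the five routes. The CSV and result paths below are placeholders, and the input file must contain only numeric columns, since every route passes the raw DataFrame straight to `StandardScaler`.

```python
# Minimal smoke test using Flask's test client (paths are placeholders)
with app.test_client() as client:
    resp = client.post('/gmm_clustering', json={
        'file_url': 'data/metrics.csv',
        'result_file_path': 'results/gmm_clusters.csv',
        'result_img_path': 'results/gmm_clusters.png'
    })
    print(resp.get_json())  # expected: {'code': 200, 'msg': 'GMM clustering completed'}
```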