# Degradation assessment experiments.  Section 1 (KMeans clustering) is active;
# alternative methods (statistics, trend, time-series, FFT, moving average) are
# kept commented out below.
## 1. Feature-value clustering analysis: assess clustering quality and possible degradation.
# Silhouette coefficient lies in [-1, 1]; values near 1 indicate good clustering.
# The elbow method varies the number of clusters, computes the within-cluster sum of
# squares (WCSS) for each k, and picks the point after which WCSS declines much more
# slowly -- that point is a suitable cluster count.
# The Calinski-Harabasz index measures between-cluster separation vs. within-cluster
# compactness; larger CH means better clustering.
# The Davies-Bouldin index measures within-cluster similarity vs. between-cluster
# dissimilarity; smaller DB means better clustering.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt


def compute_elbow_curve(features, k_range=range(1, 11)):
    """Fit KMeans for each k in *k_range* and return the WCSS (inertia) per k."""
    wcss = []
    for k in k_range:
        model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10,
                       random_state=0)
        model.fit(features)
        wcss.append(model.inertia_)
    return wcss


def evaluate_clustering(features, labels):
    """Return (silhouette, Davies-Bouldin, Calinski-Harabasz) for *labels* on *features*."""
    return (
        silhouette_score(features, labels),
        davies_bouldin_score(features, labels),
        calinski_harabasz_score(features, labels),
    )


def main():
    # Read the feature-value file.
    df = pd.read_csv(r'D:\验收材料\空工大-装备性能退化评估和故障预测健康管理软件\里程碑最终算法\03特征提取\源代码\特征值文件-统计.csv')

    # Apply KMeans on the raw feature matrix only.
    # BUGFIX: the original appended the 'cluster' column to df and then refit
    # and scored on that same frame, so the labels leaked in as a feature.
    features = df.copy()
    kmeans = KMeans(n_clusters=2, random_state=0).fit(features)
    df['cluster'] = kmeans.labels_

    # Elbow method over 1..10 clusters.  If WCSS falls sharply and then barely
    # changes past some k, extra clusters add little -- the model "degenerates".
    wcss = compute_elbow_curve(features)
    plt.plot(range(1, 11), wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.savefig('kmeans.jpg')
    plt.show()

    # Quality metrics for the 2-cluster model actually applied above.
    # BUGFIX: the original scored the last elbow-loop fit (k=10) instead of the
    # k=2 model whose labels were stored in df['cluster'].
    silhouette_avg, db_index, ch_index = evaluate_clustering(features, kmeans.labels_)

    # Collect the metrics and save them to CSV.
    results_df = pd.DataFrame({
        'Metric': ['Silhouette Coefficient', 'Davies-Bouldin Index',
                   'Calinski-Harabasz Index'],
        'Value': [silhouette_avg, db_index, ch_index],
    })
    results_df.to_csv('kmeans_results.csv', index=False)


if __name__ == '__main__':
    main()
- # ## 2.基于统计的方法来评估退化:计算DataFrame中每一列的均值和标准差。并绘制每一列特征值的直方图
- # import numpy as np
- # import pandas as pd
- # import matplotlib
- # matplotlib.use('Qt5Agg')
- # import matplotlib.pyplot as plt
- #
- # def statistical_degradation(df):
- # means = df.mean()
- # std_devs = df.std()
- # return means, std_devs
- # def save_to_csv(means, std_devs, file_path):
- # # 创建一个包含均值和标准差的DataFrame
- # stats_df = pd.DataFrame({'Mean': means, 'Standard Deviation': std_devs})
- # # 将DataFrame存储到CSV文件
- # stats_df.to_csv(file_path, index=False)
- #
- # def plot_feature_distribution(df, means, std_devs):
- # fig, axes = plt.subplots(nrows=len(df.columns), ncols=1, figsize=(10, 5 * len(df.columns)))
- # for i, (column, ax) in enumerate(zip(df.columns, axes)):
- # df[column].plot(kind='hist', ax=ax, bins=20, alpha=0.5)
- # ax.axvline(means[i], color='r', linestyle='--', label=f'Mean: {means[i]:.2f}')
- # ax.axvline(means[i] - std_devs[i], color='g', linestyle='-', label=f'-1 Std Dev')
- # ax.axvline(means[i] + std_devs[i], color='g', linestyle='-')
- # ax.set_title(f'Distribution of {column}')
- # ax.legend(loc='upper left')
- # if i != len(df.columns) - 1:
- # ax.xaxis.set_visible(False) # 隐藏除底部子图外的所有x轴标签
- # if i == 0:
- # ax.set_ylabel('Frequency') # 只在第一个子图上显示y轴标签
- # else:
- # ax.yaxis.set_visible(False) # 隐藏除左侧子图外的所有y轴标签
- # fig.tight_layout()
- # plt.savefig('distribution.jpg')
- # plt.show()
- # # 读取CSV文件
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\03特征提取\源代码\特征值文件-频域.csv')
- # # 计算统计数据
- # means, std_devs = statistical_degradation(df)
- # save_to_csv(means, std_devs, 'statistics.csv')
- # # 绘制特征值分布图
- # plot_feature_distribution(df, means, std_devs)
- # ## 3.基于趋势分析法:分析特征值随时间的趋势,如果斜率显著不为零,则可能表明退化。
- # from sklearn.linear_model import LinearRegression
- # import numpy as np
- # import pandas as pd
- # import matplotlib
- # matplotlib.use('Qt5Agg')
- # import matplotlib.pyplot as plt
- #
- # # 趋势分析函数,适用于多列特征值
- # def trend_analysis(df, some_threshold):
- # results_list = []
- # for column in df.columns:
- # X = np.arange(len(df[column])).reshape(-1, 1)
- # y = df[column].values
- # model = LinearRegression().fit(X, y)
- # slope = model.coef_[0]
- # # print(f"Slope for {column}:", slope)
- # results_list.append({'Feature': column, 'Slope': slope, 'Significant': abs(slope) > some_threshold})
- # results_df = pd.DataFrame(results_list)
- # return results_df
- #
- # # 读取CSV文件
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\03特征提取\源代码\特征值文件-频域.csv')
- #
- # # 设置斜率显著性的阈值
- # some_threshold = 0.01
- #
- # # 进行趋势分析并获取结果
- # slopes = trend_analysis(df, some_threshold)
- #
- # # 保存结果到CSV文件
- # slopes.to_csv('trend_analysis_results.csv', index=False)
- #
- # # 筛选出显著的特征
- # significant_columns = slopes[slopes['Significant'] == True]['Feature']
- # num_features = len(significant_columns)
- #
- # # 动态设置图形的高度,每个子图的高度为4英寸
- # plt.figure(figsize=(15, 4 * num_features)) # 总宽度15英寸,高度根据特征数量自适应
- # for i, column in enumerate(significant_columns):
- # plt.subplot(num_features, 1, i+1) # 创建子图
- # plt.scatter(range(len(df)), df[column], label='Data')
- # significant_slope = slopes[slopes['Feature'] == column]['Slope'].values[0]
- # plt.plot(range(len(df)), significant_slope * np.arange(len(df)) + df[column].iloc[0],
- # color='red', label=f'Trend line with slope {significant_slope:.4f}')
- # plt.xlabel('Time')
- # plt.ylabel(column)
- # plt.title(f'Trend Analysis for {column}')
- # plt.legend()
- #
- # plt.tight_layout() # 调整子图布局以避免重叠
- # plt.savefig('trend_analysis.jpg')
- # plt.show()
- # # ## 4.基于时间序列分析的方法:识别特征值的周期性变化或异常模式。
- # import numpy as np
- # import pandas as pd
- # import matplotlib
- # matplotlib.use('Qt5Agg')
- # import matplotlib.pyplot as plt
- # from statsmodels.tsa.arima.model import ARIMA
- # import warnings
- # warnings.filterwarnings('ignore', category=FutureWarning)
- #
- # # 多变量时间序列退化评估函数
- # def time_series_degradation_multicolumn(df):
- # aic_values = pd.Series()
- # for column in df.columns:
- # data = df[column].values
- # model = ARIMA(data, order=(1, 1, 1))
- # results = model.fit()
- # aic_value = results.aic
- # print(f"AIC for {column}:", aic_value)
- # aic_values[column] = aic_value
- # return aic_values
- #
- # # 读取CSV文件
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\03特征提取\源代码\特征值文件-频域.csv')
- #
- # # 进行多变量时间序列退化评估
- # aic_values = time_series_degradation_multicolumn(df)
- #
- # # 将AIC值保存到CSV文件
- # aic_values.to_csv('aic_values.csv', index=True)
- #
- # # 选择AIC值最高的前N个特征
- # N = 10
- # top_features = aic_values.sort_values(ascending=False).index[:N]
- #
- # # 设置图形和子图的布局
- # num_features = len(top_features)
- # fig, axes = plt.subplots(num_features, 1, figsize=(10, 4 * num_features), sharex=True)
- #
- # for i, column in enumerate(top_features):
- # df[column].plot(ax=axes[i], label=column)
- # axes[i].set_title(f'Time Series Plot for {column}')
- # axes[i].set_xlabel('Time')
- # axes[i].set_ylabel(column)
- # axes[i].legend()
- #
- # # 如果只有一个特征,axes可能不是数组,需要检查并相应地调整
- # if num_features == 1:
- # axes.legend()
- #
- # plt.tight_layout()
- # plt.savefig('time_series.jpg')
- # plt.show()
- # #5.频域分析:通过傅里叶变换分析信号的频率成分,识别异常频率成分可能表明的退化。
- #
- #
- # import numpy as np
- # import pandas as pd
- # import matplotlib
- # matplotlib.use('Qt5Agg')
- # import matplotlib.pyplot as plt
- #
- # # 假设df是包含多列时间序列特征的DataFrame
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\03特征提取\源代码\特征值文件-频域.csv')
- #
- # # 进行傅里叶变换并分析每列特征
- # n_columns = df.shape[1]
- # fig, axes = plt.subplots(n_columns, 1, figsize=(10, 4 * n_columns))
- #
- # fft_results = []
- # for i, column in enumerate(df.columns):
- # # 对每列特征进行FFT
- # data = df[column].values
- # fft = np.fft.fft(data)
- # frequencies = np.fft.fftfreq(len(data), d=1)
- #
- # # 找到峰值频率及其幅度
- # peak_frequency_index = np.argmax(np.abs(fft))
- # peak_frequency = frequencies[peak_frequency_index]
- # peak_amplitude = np.abs(fft[peak_frequency_index])
- #
- # # 绘制频谱图
- # axes[i].plot(frequencies, np.abs(fft))
- # axes[i].set_title(f'Frequency Spectrum of {column}')
- # axes[i].set_xlabel('Frequency (Hz)')
- # axes[i].set_ylabel('Amplitude')
- # axes[i].grid(True)
- #
- # # 存储FFT结果
- # fft_results.append({
- # 'Feature': column,
- # 'Peak Frequency (Hz)': peak_frequency,
- # 'Peak Amplitude': peak_amplitude
- # })
- #
- # # 调整子图布局以避免重叠
- # plt.tight_layout()
- # plt.subplots_adjust(hspace=0.5)
- #
- # # 保存图形
- # plt.savefig('fft_spectrum.jpg')
- # plt.show()
- #
- # # 将FFT结果保存到CSV
- # fft_df = pd.DataFrame(fft_results)
- # fft_df.to_csv('fft_degradation_parameters.csv', index=False)
- # ## 6.移动平均和标准差来评估退化
- # import matplotlib
- # matplotlib.use('Qt5Agg')
- # import matplotlib.pyplot as plt
- # import numpy as np
- # import pandas as pd
- #
- # # 计算移动平均和标准差,用于退化评估
- # def simple_degradation_assessment(data, window_size=5):
- # moving_average = data.rolling(window=window_size).mean()
- # std_deviation = data.rolling(window=window_size).std()
- # return moving_average, std_deviation
- # # 计算描述性统计量
- # def descriptive_statistics(data):
- # return {
- # 'Mean': data.mean(),
- # 'Min': data.min(),
- # 'Max': data.max(),
- # 'Std': data.std()
- # }
- # # 读取CSV文件
- # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\03特征提取\源代码\特征值文件-频域.csv')
- # # 对每一列进行退化评估
- # degradation_stats = {column: descriptive_statistics(df[column]) for column in df.columns}
- #
- # # 将描述性统计量转换为DataFrame并保存到CSV
- # degradation_df = pd.DataFrame(degradation_stats).T
- # degradation_df.to_csv('degradation_statistics.csv', index=False)
- # # 设置图形的大小和布局
- # num_features = len(df.columns)
- # fig, axes = plt.subplots(num_features, 1, figsize=(10, 4 * num_features))
- #
- # # 对每一列进行退化评估并绘制到子图上
- # feature_data = [] # 用于存储所有特征的移动平均和标准差数据
- # for i, column in enumerate(df.columns, 1):
- # ma, std = simple_degradation_assessment(df[column], window_size=5)
- # axes[i-1].plot(df[column], label='Original Data', color='blue')
- # axes[i-1].plot(ma, label='Moving Average', linestyle='--', color='red')
- # axes[i-1].plot(std, label='Standard Deviation', linestyle='-.', color='green')
- # axes[i-1].set_title(f'Degradation Assessment for {column}')
- # axes[i-1].set_xlabel('Time')
- # axes[i-1].set_ylabel('Value')
- # axes[i-1].legend()
- #
- # # 将结果收集到列表中
- # feature_data.append({
- # 'Feature': column,
- # 'Moving Average': ma.to_list(),
- # 'Standard Deviation': std.to_list()
- # })
- #
- # # 调整子图布局以避免重叠
- # plt.tight_layout()
- # plt.subplots_adjust(hspace=0.5) # 调整子图之间的垂直间距
- #
- # # 保存图形
- # plt.savefig('degradation_assessment.jpg')
- # plt.show()