特征提取.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. # 1.基本统计特征
  2. import pandas as pd
  3. from scipy.stats import skew, kurtosis
  4. import numpy as np
  5. def ae_feature(raw, fs):
  6. mean = np.mean(raw)
  7. std_dev = np.std(raw)
  8. max_value = np.max(raw)
  9. min_value = np.min(raw)
  10. variance = np.var(raw)
  11. skewness = skew(raw)
  12. kurt = kurtosis(raw)
  13. range=np.max(raw) - np.min(raw)
  14. rms = np.sum(raw ** 2) / len(raw)
  15. ae_feature = pd.DataFrame()
  16. ae_feature['mean'] = [mean]
  17. ae_feature['std_dev'] = [std_dev]
  18. ae_feature['max_value'] = [max_value]
  19. ae_feature['min_value'] = [min_value]
  20. ae_feature['variance'] = [variance]
  21. ae_feature['skewness'] = [skewness]
  22. ae_feature['kurt'] = [kurt]
  23. ae_feature['range'] = [range]
  24. ae_feature['rms'] = [rms]
  25. return ae_feature
  26. # 读取CSV文件
  27. df = pd.read_csv('扩充结果.csv')
  28. s=df.shape[1]
  29. p=df.shape[0]
  30. fs=300 # 设置提取特征值每组的数据量
  31. # 初始化一个空的DataFrame来保存所有特征
  32. all_features_df = pd.DataFrame()
  33. # 遍历每一列,按照每组300个数据计算基本统计特征
  34. for i in range(int(df.shape[1])):
  35. for j in range(0, len(df), fs):
  36. end_index = min(j + fs, len(df))
  37. data_test = np.array(df.iloc[j:end_index, i])
  38. # 计算时域特征
  39. sa=ae_feature(data_test,fs)
  40. all_features_df = pd.concat([all_features_df, sa], ignore_index=True)
  41. all_features_df.to_csv('特征值文件-统计.csv', index=False)
  42. # # 2.时间域特征
  43. # import numpy as np
  44. # import pandas as pd
  45. # import math
  46. # from scipy import fftpack
  47. #
  48. # def time_features(raw, fs):
  49. # n = len(raw)
  50. # fft_raw = abs(fftpack.fft(raw, n) * 2 / n)
  51. # main_freq_index = np.where(fft_raw[0:math.floor(n / 2)] == max(fft_raw[0:math.floor(n / 2)]))
  52. # f = np.linspace(0, fs, n) * fs / n
  53. # peak_freq = f[main_freq_index[0][0]] # 峰值频率,单位Hz
  54. #
  55. # xm = np.mean(raw) # 均值
  56. # xstd = np.std(raw) # 标准差
  57. # kur = ((np.sum((raw - xm) ** 4)) / len(raw)) / (xstd ** 4) # 峭度
  58. # cal_ave_amp = np.sum(np.abs(raw)) / n
  59. # power_2 = np.power(raw, 2)
  60. # sum_power = np.sum(power_2)
  61. # cal_rms = np.sqrt(sum_power / n)
  62. # cal_form = cal_rms / cal_ave_amp # 波峰因子,均方根/平均幅值
  63. # cal_peak = np.max(np.abs(raw)) # 峰值
  64. # cal_crest = cal_peak / cal_rms # 峰值因子
  65. #
  66. # feature_df = pd.DataFrame()
  67. # feature_df['peak_freq_khz'] = [peak_freq / 1000] # 转换为kHz
  68. # feature_df['均值v'] = [xm]
  69. # feature_df['标准差'] = [xstd]
  70. # feature_df['峭度'] = [kur]
  71. # feature_df['平均幅值'] = [cal_ave_amp]
  72. # feature_df['均方根'] = [cal_rms]
  73. # feature_df['波峰因子'] = [cal_form]
  74. # feature_df['峰值'] = [cal_peak]
  75. # feature_df['峰值因子'] = [cal_crest]
  76. #
  77. # return feature_df
  78. # # 读取CSV文件
  79. # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\02扩充\源代码\扩充后的数据-随机采样.csv')
  80. # s=df.shape[1]
  81. # p=df.shape[0]
  82. # fs=300 # 设置每组的数据量
  83. # # 初始化一个空的DataFrame来保存所有特征
  84. # all_features_df = pd.DataFrame()
  85. # # 遍历每一列,按照每组300个数据计算基本统计特征
  86. # for i in range(int(df.shape[1])):
  87. # for j in range(0, len(df), fs):
  88. # end_index = min(j + fs, len(df))
  89. # data_test = np.array(df.iloc[j:end_index, i])
  90. # # 计算时域特征
  91. # sa=time_features(data_test,fs)
  92. # all_features_df = pd.concat([all_features_df, sa], ignore_index=True)
  93. # all_features_df.to_csv('特征值文件-时域.csv', index=False)
  94. # # 3.频域特征
  95. # import pandas as pd
  96. # from scipy.fft import fft, fftfreq
  97. # from scipy.stats import skew, kurtosis
  98. # import numpy as np
  99. #
  100. # def fe_feature(signal, fs):
  101. # n = len(signal)
  102. # freqs = fftfreq(n, d=1/fs) # 生成频率序列
  103. # signal_fft = fft(signal) # 计算FFT
  104. # signal_fft_shifted = np.fft.fftshift(signal_fft) # 将FFT结果中心化
  105. #
  106. # # 计算频域特征
  107. # total_energy = np.sum(np.abs(signal_fft)**2)
  108. # total_power = total_energy / n
  109. # rms = np.sqrt(total_power)
  110. # freq_centroid = np.sum(np.abs(freqs) * np.abs(signal_fft_shifted)**2) / total_energy
  111. # freq_bandwidth = np.sum(np.abs(freqs[1:]**2 * signal_fft_shifted[1:]**2) - np.abs(freqs[:-1]**2 * signal_fft_shifted[:-1]**2)) / (2 * np.sum(np.abs(signal_fft_shifted)**2))
  112. # peak_freq = (np.argmax(np.abs(signal_fft_shifted)) + 1) * fs / n # 峰值频率
  113. # freq_skewness = skew(freqs[1:n//2] * (np.abs(signal_fft[1:n//2]) / np.max(np.abs(signal_fft[1:n//2]))))
  114. # freq_kurtosis = kurtosis(freqs[1:n//2] * (np.abs(signal_fft[1:n//2]) / np.max(np.abs(signal_fft[1:n//2]))))
  115. #
  116. # feature_df = pd.DataFrame({
  117. # '总能量': [total_energy],
  118. # '总功率': [total_power],
  119. # '均方根': [rms],
  120. # '频率中心': [freq_centroid],
  121. # '频率带宽': [freq_bandwidth],
  122. # '峰值频率': [peak_freq],
  123. # '频率偏度': [freq_skewness],
  124. # '频率峰度': [freq_kurtosis]
  125. # })
  126. #
  127. # return feature_df
  128. #
  129. # # 读取CSV文件
  130. # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\02扩充\源代码\扩充后的数据-随机采样.csv')
  131. #
  132. # fs = 300 # 设置采样频率,这里假设为300Hz,根据实际情况调整
  133. # all_features_df = pd.DataFrame()
  134. #
  135. # # 遍历每一列,按照每组300个数据计算频域特征
  136. # for i in range(int(df.shape[1])):
  137. # for j in range(0, len(df), fs):
  138. # end_index = min(j + fs, len(df))
  139. # data_test = df.iloc[j:end_index, i].to_numpy()
  140. # # 计算频域特征
  141. # fa = fe_feature(data_test, fs)
  142. # all_features_df = pd.concat([all_features_df, fa], ignore_index=True)
  143. #
  144. # # 保存为CSV文件
  145. # all_features_df.to_csv('特征值文件-频域.csv', index=False)
  146. #4.时频域特征
  147. # import pandas as pd
  148. # from scipy.stats import skew, kurtosis
  149. # import numpy as np
  150. # from scipy.fft import fft, fftfreq
  151. #
  152. # def tf_features(raw, fs):
  153. # # 计算快速傅里叶变换
  154. # fft_values = fft(raw)
  155. # fft_magnitude = np.abs(fft_values)
  156. #
  157. # # 计算频率向量
  158. # n = len(raw)
  159. # freq = fftfreq(n, 1 / fs)
  160. #
  161. # # 时域特征
  162. # mean = np.mean(raw)
  163. # std_dev = np.std(raw)
  164. # max_value = np.max(raw)
  165. # min_value = np.min(raw)
  166. # variance = np.var(raw)
  167. # skewness = skew(raw)
  168. # kurt = kurtosis(raw)
  169. #
  170. # # 频域特征
  171. # power_spectrum = np.square(fft_magnitude) / n
  172. # mean_freq = np.mean(freq[power_spectrum > 0])
  173. # std_freq = np.std(freq[power_spectrum > 0])
  174. # freq_skewness = skew(freq[power_spectrum > 0])
  175. # freq_kurt = kurtosis(freq[power_spectrum > 0])
  176. # peak_freq_index = np.argmax(power_spectrum)
  177. # peak_freq = freq[peak_freq_index]
  178. # bandwidth = np.max(freq[power_spectrum > 0]) - np.min(freq[power_spectrum > 0])
  179. # energy = np.sum(power_spectrum)
  180. #
  181. # tf_feature = pd.DataFrame()
  182. # tf_feature['mean'] = [mean]
  183. # tf_feature['std_dev'] = [std_dev]
  184. # tf_feature['max_value'] = [max_value]
  185. # tf_feature['min_value'] = [min_value]
  186. # tf_feature['variance'] = [variance]
  187. # tf_feature['skewness'] = [skewness]
  188. # tf_feature['kurt'] = [kurt]
  189. # tf_feature['mean_freq'] = [mean_freq]
  190. # tf_feature['std_freq'] = [std_freq]
  191. # tf_feature['freq_skewness'] = [freq_skewness]
  192. # tf_feature['freq_kurt'] = [freq_kurt]
  193. # tf_feature['peak_freq'] = [peak_freq]
  194. # tf_feature['bandwidth'] = [bandwidth]
  195. # tf_feature['energy'] = [energy]
  196. # return tf_feature
  197. #
  198. #
  199. # # 读取CSV文件
  200. # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\02扩充\源代码\扩充后的数据-随机采样.csv')
  201. # fs = 300 # 假设的采样频率,根据实际情况调整
  202. # # 初始化一个空的DataFrame来保存所有特征
  203. # all_features_df = pd.DataFrame()
  204. #
  205. # # 遍历每一列,按照每组300个数据计算频域特征
  206. # for i in range(int(df.shape[1])):
  207. # for j in range(0, len(df), fs):
  208. # end_index = min(j + fs, len(df))
  209. # data_test = df.iloc[j:end_index, i].to_numpy()
  210. # # 计算频域特征
  211. # fa = tf_features(data_test, fs)
  212. # all_features_df = pd.concat([all_features_df, fa], ignore_index=True)
  213. #
  214. # # 保存为CSV文件
  215. # all_features_df.to_csv('特征值文件-时频域.csv', index=False)
  216. #5.形态学特征提取
  217. # import pandas as pd
  218. # from scipy.signal import find_peaks
  219. # import numpy as np
  220. #
  221. # def morphological_features(raw):
  222. # nonzero_elements = np.count_nonzero(raw)
  223. # mean_slope = np.mean(np.diff(raw) / np.diff(np.arange(len(raw))))
  224. #
  225. # peaks, _ = find_peaks(raw)
  226. # num_peaks = len(peaks)
  227. # troughs, _ = find_peaks(-raw) # 寻找谷值
  228. # num_troughs = len(troughs)
  229. #
  230. # # 假设峰值和谷值的宽度为相邻峰值或谷值之间的距离
  231. # if num_peaks > 1:
  232. # peak_widths = np.diff(np.sort(peaks)[1:] - np.sort(peaks)[:-1])
  233. # else:
  234. # peak_widths = np.array([]) # 如果只有一个峰值,则没有宽度
  235. #
  236. # if num_troughs > 1:
  237. # trough_widths = np.diff(np.sort(troughs)[1:] - np.sort(troughs)[:-1])
  238. # else:
  239. # trough_widths = np.array([]) # 如果只有一个谷值,则没有宽度
  240. #
  241. # # 计算曲率和加速度变化需要一阶和二阶导数
  242. # first_derivative = np.diff(raw) / np.diff(np.arange(len(raw)))
  243. # second_derivative = np.diff(first_derivative) / np.diff(np.arange(len(raw) - 1))
  244. # curvature = np.sum(second_derivative ** 2) # 曲率的简单估计
  245. # acceleration_change = np.sum(np.abs(np.diff(first_derivative))) # 加速度变化的简单估计
  246. #
  247. # morph_feature = pd.DataFrame()
  248. # morph_feature['nonzero_elements'] = [nonzero_elements]
  249. # morph_feature['mean_slope'] = [mean_slope]
  250. # morph_feature['num_peaks'] = [num_peaks]
  251. # morph_feature['num_troughs'] = [num_troughs]
  252. # morph_feature['mean_peak_width'] = [np.mean(peak_widths)] if peak_widths.size else [0]
  253. # morph_feature['mean_trough_width'] = [np.mean(trough_widths)] if trough_widths.size else [0]
  254. # morph_feature['curvature'] = [curvature]
  255. # morph_feature['acceleration_change'] = [acceleration_change]
  256. # return morph_feature
  257. #
  258. # # 读取CSV文件
  259. # df = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\02扩充\源代码\扩充后的数据-随机采样.csv')
  260. # fs = 300 # 假设的采样频率,根据实际情况调整
  261. # # 初始化一个空的DataFrame来保存所有特征
  262. # all_features_df = pd.DataFrame()
  263. #
  264. # # 遍历每一列,按照每组300个数据计算频域特征
  265. # for i in range(int(df.shape[1])):
  266. # for j in range(0, len(df), fs):
  267. # end_index = min(j + fs, len(df))
  268. # data_test = df.iloc[j:end_index, i].to_numpy()
  269. # # 计算频域特征
  270. # fa = morphological_features(data_test)
  271. # all_features_df = pd.concat([all_features_df, fa], ignore_index=True)
  272. #
  273. # # 保存为CSV文件
  274. # all_features_df.to_csv('特征值文件-形态学.csv', index=False)