# 扩充.py — 数据扩充脚本(原粘贴混入的文件大小与行号信息已清除)
# #扩充1--有序采样:通过按顺序截取现有数据末尾的一部分样本并追加到数据集中来增加数据量。(注:原注释称"随机采样",但实际实现为有序尾部采样,不打乱顺序;如需对不平衡数据做过采样/欠采样,需另行实现。)
  2. import pandas as pd
  3. import numpy as np
  4. def extend_data_with_ordered_sampling(df, expansion_ratio=0.2):
  5. """
  6. 通过有序采样的方式扩充整个数据集,不包括时间列(如果存在)。
  7. :param df: 原始数据帧。
  8. :param expansion_ratio: 扩充数据的比例,即新数据占原始数据的比例。
  9. :return: 扩充后的数据帧。
  10. """
  11. # 检查是否存在 'Time' 列,如果存在,则删除
  12. if 'Time' in df.columns:
  13. df = df.drop(columns=['Time'])
  14. # 计算需要扩充的样本数量
  15. n_samples = int(len(df) * expansion_ratio)
  16. # 对剩余数据进行排序(假设df已经是按照时间排序的,如果没有排序需要添加排序逻辑)
  17. df_sorted = df.sort_index()
  18. # 按照顺序取出一部分数据作为扩充数据,不打乱顺序
  19. sampled_data = df_sorted.tail(n_samples).copy()
  20. # 合并原始数据帧与采样数据帧
  21. final_data = pd.concat([df, sampled_data], ignore_index=True)
  22. return final_data
  23. # 读取数据
  24. data_path = r'D:\验收材料\空工大-装备性能退化评估和故障预测健康管理软件\里程碑最终算法\01补全\源代码\补全后的数据.csv'
  25. data = pd.read_csv(data_path)
  26. expansion_ratio = 0.2 # 扩充数据的比例
  27. try:
  28. extended_data = extend_data_with_ordered_sampling(data, expansion_ratio)
  29. extended_data.to_csv('扩充数据-随机采样.csv', index=False)
  30. except Exception as e:
  31. print(f"发生错误: {e}")
# #扩充2--数据扰动:通过在原始数据上添加小的随机扰动来生成新的数据点。这种方法适用于数值型数据,可以帮助模型学习到更加泛化的特征表示。
# import numpy as np
# import pandas as pd
#
# def add_random_perturbation(series, sigma):
#     """
#     对数值型序列添加随机扰动。
#     """
#     return series + np.random.normal(0, sigma, size=len(series))
#
# def extend_data_with_perturbation(df, sigma, expansion_ratio):
#     """
#     对数据帧中的数值型列添加随机扰动,并扩充数据。
#     """
#     # 检查是否存在 'Time' 列,如果存在,则删除
#     if 'Time' in df.columns:
#         df = df.drop(columns=['Time'])
#
#     numerical_columns = df.select_dtypes(include=[np.number]).columns
#     extended_data = df.copy()
#
#     for col in numerical_columns:
#         extended_data[col] = add_random_perturbation(df[col], sigma)
#
#     # 计算扩充的数据量
#     n_samples = int(len(df) * expansion_ratio)
#
#     # 扩充数据
#     expanded_data = extended_data.iloc[-n_samples:].copy()
#
#     # 合并原始数据和扩充数据
#     final_data = pd.concat([df, expanded_data], ignore_index=True)
#
#     return final_data
#
# # 读取数据
# data_path = r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\01补全\源代码\补全后的数据-无time.csv'
# data = pd.read_csv(data_path)
#
# sigma = 0.05  # 扰动的标准差
# expansion_ratio = 0.2  # 扩充数据的比例
#
# try:
#     extended_data = extend_data_with_perturbation(data, sigma, expansion_ratio)
#     extended_data.to_csv('扩充后的数据-数据扰动.csv', index=False)
# except Exception as e:
#     print(f"发生错误: {e}")
# #扩充3--Wavelet变换:可以将信号分解成不同频率的子信号,然后可以通过对这些子信号进行处理来实现数据扩充。
# import numpy as np
# import pandas as pd
# import pywt
#
# def wavelet_transform(series, wavelet='db1', level=1):
#     """
#     对一维数值数据进行小波变换。
#     """
#     return pywt.wavedec(series, wavelet, level=level)
#
# def wavelet_reconstruct(coeffs, wavelet='db1'):
#     """
#     使用小波变换后的系数重构数据。
#     """
#     return pywt.waverec(coeffs, wavelet)
#
# def perturb_coeffs(coeffs, sigma):
#     """
#     对小波变换的系数进行扰动。
#     """
#     perturbed_coeffs = []
#     for coeff in coeffs:
#         # 对细节系数进行扰动,近似系数保持不变
#         if np.issubdtype(coeff.dtype, np.number):
#             perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
#         else:
#             perturbed_coeff = coeff
#         perturbed_coeffs.append(perturbed_coeff)
#     return perturbed_coeffs
#
# def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2):
#     # 检查是否存在 'Time' 列,如果存在,则删除
#     if 'Time' in df.columns:
#         df = df.drop(columns=['Time'])
#
#     numerical_columns = df.select_dtypes(include=[np.number]).columns
#     extended_data = df.copy()
#
#     for col in numerical_columns:
#         coeffs = wavelet_transform(df[col], wavelet, level)
#         perturbed_coeffs = perturb_coeffs(coeffs, sigma)
#         reconstructed_series = wavelet_reconstruct(perturbed_coeffs, wavelet)
#         extended_data[col] = reconstructed_series[:len(df[col])]  # 保持原数据长度
#
#     # 计算扩充的数据量
#     n_samples = int(len(df) * expansion_ratio)
#
#     # 扩充数据
#     expanded_data = extended_data.iloc[-n_samples:].copy()
#
#     # 合并原始数据和扩充数据
#     final_data = pd.concat([extended_data, expanded_data], ignore_index=True)
#
#     return final_data
#
# # 读取数据
# data_path = r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\01补全\源代码\补全后的数据-无time.csv'
# data = pd.read_csv(data_path)
#
# wavelet = 'db1'  # 选择小波基
# level = 1  # 分解层数
# sigma = 0.05  # 扰动的标准差
# expansion_ratio = 0.2  # 扩充数据的比例
#
# try:
#     extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio)
#     extended_data.to_csv('扩充后的数据-wavelet.csv', index=False)
# except Exception as e:
#     print(f"发生错误: {e}")
# #扩充四--小波系数扰动:对小波变换后的系数进行扰动,即在系数中加入随机噪声。这是通过在系数上加上标准差为sigma的高斯随机数实现的
# import numpy as np
# import pandas as pd
# import pywt
#
# def wavelet_transform(series, wavelet='db1', level=1):
#     return pywt.wavedec(series, wavelet, level=level)
#
# def wavelet_reconstruct(coeffs, wavelet='db1'):
#     return pywt.waverec(coeffs, wavelet)
#
# def perturb_coeffs(coeffs, sigma):
#     perturbed_coeffs = []
#     for coeff in coeffs:
#         if np.issubdtype(coeff.dtype, np.number):
#             perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
#         else:
#             perturbed_coeff = coeff
#         perturbed_coeffs.append(perturbed_coeff)
#     return perturbed_coeffs
#
# def enhance_or_reduce(coeffs, factor):
#     """
#     对小波变换后的高频系数进行增强或衰减。
#     """
#     enhanced_coeffs = []
#     for i, coeff in enumerate(coeffs):
#         # 细节系数从索引1开始,我们对其进行增强或衰减
#         if i > 0:
#             enhanced_coeffs.append(coeff * factor)
#         else:
#             enhanced_coeffs.append(coeff)
#     return enhanced_coeffs
#
# def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2):
#     # 检查是否存在 'Time' 列,如果存在,则删除
#     if 'Time' in df.columns:
#         df = df.drop(columns=['Time'])
#
#     numerical_columns = df.select_dtypes(include=[np.number]).columns
#     extended_data = df.copy()
#
#     for col in numerical_columns:
#         coeffs = wavelet_transform(df[col], wavelet, level)
#         perturbed_coeffs = perturb_coeffs(coeffs, sigma)
#         enhanced_coeffs = enhance_or_reduce(perturbed_coeffs, factor=1.1)  # 增强高频系数
#         reconstructed_series = wavelet_reconstruct(enhanced_coeffs, wavelet)
#         extended_data[col] = reconstructed_series
#
#     # 计算扩充的数据量
#     n_samples = int(len(df) * expansion_ratio)
#
#     # 扩充数据
#     expanded_data = extended_data.iloc[-n_samples:].copy()
#
#     # 合并原始数据和扩充数据
#     final_data = pd.concat([df, expanded_data], axis=0, ignore_index=True)
#
#     return final_data
#
#
# data = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\01补全\源代码\补全后的数据-无time.csv')
# wavelet = 'db1'  # 选择小波基
# level = 1  # 分解层数
# sigma = 0.05  # 扰动的标准差
# expansion_ratio = 0.2  # 扩充数据的比例
#
# try:
#     extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio)
#     extended_data.to_csv('扩充后的数据-Wavelet变换.csv', index=False)
# except Exception as e:
#     print(f"发生错误: {e}")
# #扩充5:小波线性插值
# import numpy as np
# import pandas as pd
# import pywt
#
# def wavelet_transform(series, wavelet='db1', level=1):
#     """
#     对一维数值数据进行小波变换。
#     """
#     return pywt.wavedec(series, wavelet, level=level)
#
# def wavelet_reconstruct(coeffs, wavelet='db1'):
#     """
#     使用小波变换后的系数重构数据。
#     """
#     return pywt.waverec(coeffs, wavelet)
#
# def perturb_coeffs(coeffs, sigma):
#     """
#     对小波变换的系数进行扰动。
#     """
#     perturbed_coeffs = []
#     for coeff in coeffs:
#         if np.issubdtype(coeff.dtype, np.number):
#             perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
#         else:
#             perturbed_coeff = coeff
#         perturbed_coeffs.append(perturbed_coeff)
#     return perturbed_coeffs
#
# def interpolate_coeffs(coeffs, new_length):
#     """
#     对小波变换的系数进行线性插值。
#     """
#     interpolated_coeffs = []
#     for coeff in coeffs:
#         if new_length:
#             coeff = np.interp(np.arange(new_length), np.arange(len(coeff)), coeff)
#         interpolated_coeffs.append(coeff)
#     return interpolated_coeffs
#
# def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2, new_length=None):
#     # 检查是否存在 'Time' 列,如果存在,则删除
#     if 'Time' in df.columns:
#         df = df.drop(columns=['Time'])
#
#     numerical_columns = df.select_dtypes(include=[np.number]).columns
#     extended_data = df.copy()
#
#     for col in numerical_columns:
#         coeffs = wavelet_transform(df[col], wavelet, level)
#         perturbed_coeffs = perturb_coeffs(coeffs, sigma)
#         if new_length is not None:
#             perturbed_coeffs = interpolate_coeffs(perturbed_coeffs, new_length)
#         reconstructed_series = wavelet_reconstruct(perturbed_coeffs, wavelet)
#         extended_data[col] = reconstructed_series[:len(df[col])]  # 确保数据长度一致
#
#     # 计算扩充的数据量
#     n_samples = int(len(df) * expansion_ratio)
#
#     # 扩充数据
#     expanded_data = extended_data.iloc[-n_samples:].copy()
#
#     # 合并原始数据和扩充数据
#     final_data = pd.concat([extended_data, expanded_data], ignore_index=True)
#
#     return final_data
#
# # 读取数据
# data = pd.read_csv(r'E:\BaiduSyncdisk\zhiguan\01最近做的\算法调试\自己调试--Y\里程碑最终算法\01补全\源代码\补全后的数据-无time.csv')
#
# wavelet = 'db1'  # 选择小波基
# level = 1  # 分解层数
# sigma = 0.05  # 扰动的标准差
# expansion_ratio = 0.2  # 扩充数据的比例
# new_length = None  # 设置新的数据长度,如果需要
#
# try:
#     extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio, new_length)
#     extended_data.to_csv('扩充后的数据-小波线性.csv', index=False)
# except Exception as e:
#     print(f"发生错误: {e}")