扩充.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
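# Augmentation 1 - sampling-based expansion: a simple, direct way to enlarge a
# dataset by re-using rows that already exist. For imbalanced datasets the same
# idea can be biased to balance classes (over-sample the minority class,
# under-sample the majority class).
# NOTE: despite the "random sampling" title, the implementation below is
# deterministic: it appends a copy of the last `expansion_ratio` fraction of
# rows in index order and performs no random draws or class balancing.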
  2. from flask import Flask, request, jsonify
  3. import pandas as pd
  4. import os
  5. import requests
# Flask application instance; the @app.route decorator below registers the
# request handler on it and the __main__ guard starts it.
app = Flask(__name__)
  7. def extend_data_with_ordered_sampling(df, expansion_ratio=0.2):
  8. """
  9. 通过有序采样的方式扩充整个数据集,不包括时间列(如果存在)。
  10. :param df: 原始数据帧。
  11. :param expansion_ratio: 扩充数据的比例,即新数据占原始数据的比例。
  12. :return: 扩充后的数据帧。
  13. """
  14. # 检查是否存在 'Time' 列,如果存在,则删除
  15. if 'Time' in df.columns:
  16. df = df.drop(columns=['Time'])
  17. # 计算需要扩充的样本数量
  18. n_samples = int(len(df) * expansion_ratio)
  19. # 对剩余数据进行排序(假设df已经是按照时间排序的,如果没有排序需要添加排序逻辑)
  20. df_sorted = df.sort_index()
  21. # 按照顺序取出一部分数据作为扩充数据,不打乱顺序
  22. sampled_data = df_sorted.tail(n_samples).copy()
  23. # 合并原始数据帧与采样数据帧
  24. final_data = pd.concat([df, sampled_data], ignore_index=True)
  25. return final_data
  26. @app.route('/random', methods=['POST'])
  27. def upload_file():
  28. try:
  29. # 检查请求体中是否包含文件地址
  30. data = request.json
  31. if 'file_url' not in data:
  32. return jsonify({'code': 400, 'msg': 'No file URL provided'})
  33. file_url = data['file_url']
  34. # 使用requests获取文件内容
  35. response = requests.get(file_url)
  36. if response.status_code != 200:
  37. return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})
  38. # 读取数据集
  39. df = pd.read_csv(pd.compat.StringIO(response.text), header=None)
  40. # 调用扩充数据函数
  41. expansion_ratio = 0.2 # 扩充数据的比例
  42. result_data = extend_data_with_ordered_sampling(df, expansion_ratio)
  43. # 将结果保存为CSV文件
  44. result_file_path = os.path.join('/tmp', '扩充数据-随机采样.csv')
  45. result_data.to_csv(result_file_path, index=False, header=False)
  46. # 构建返回数据,只返回文件路径
  47. return jsonify({
  48. 'code': 200,
  49. 'msg': '文件处理完成',
  50. 'file_path': result_file_path
  51. })
  52. except Exception as e:
  53. return jsonify({
  54. 'code': 500,
  55. 'msg': str(e)
  56. })
  57. if __name__ == '__main__':
  58. app.run(debug=True, port=8081, host='0.0.0.0')
  59. #扩充2--数据扰动:通过在原始数据上添加小的随机扰动来生成新的数据点。这种方法适用于数值型数据,可以帮助模型学习到更加泛化的特征表示。
  60. from flask import Flask, request, jsonify
  61. import pandas as pd
  62. import numpy as np
  63. import os
  64. import requests
# Flask application instance; the @app.route decorator below registers the
# request handler on it and the __main__ guard starts it.
app = Flask(__name__)
  66. def add_random_perturbation(series, sigma):
  67. """
  68. 对数值型序列添加随机扰动。
  69. """
  70. return series + np.random.normal(0, sigma, size=len(series))
  71. def extend_data_with_perturbation(df, sigma, expansion_ratio):
  72. """
  73. 对数据帧中的数值型列添加随机扰动,并扩充数据。
  74. """
  75. # 检查是否存在 'Time' 列,如果存在,则删除
  76. if 'Time' in df.columns:
  77. df = df.drop(columns=['Time'])
  78. numerical_columns = df.select_dtypes(include=[np.number]).columns
  79. extended_data = df.copy()
  80. for col in numerical_columns:
  81. extended_data[col] = add_random_perturbation(df[col], sigma)
  82. # 计扩充的数据量
  83. n_samples = int(len(df) * expansion_ratio)
  84. # 扩充数据
  85. expanded_data = extended_data.iloc[-n_samples:].copy()
  86. # 合并原始数据和扩充数据
  87. final_data = pd.concat([df, expanded_data], ignore_index=True)
  88. return final_data
  89. @app.route('/perturbation', methods=['POST'])
  90. def upload_file():
  91. try:
  92. # 检查请求体中是否包含文件地址
  93. data = request.json
  94. if 'file_url' not in data:
  95. return jsonify({'code': 400, 'msg': 'No file URL provided'})
  96. file_url = data['file_url']
  97. # 使用requests获取文件内容
  98. response = requests.get(file_url)
  99. if response.status_code != 200:
  100. return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})
  101. # 读取数据集
  102. df = pd.read_csv(pd.compat.StringIO(response.text), header=None)
  103. sigma = 0.05 # 扰动的标准差
  104. expansion_ratio = 0.2 # 扩充数据的比例
  105. extended_data = extend_data_with_perturbation(data, sigma, expansion_ratio)
  106. result_file_path = os.path.join('/tmp', '扩充数据-数据扰动.csv')
  107. extended_data.to_csv(result_file_path, index=False)
  108. # 构建返回数据,只返回文件路径
  109. return jsonify({
  110. 'code': 200,
  111. 'msg': '文件处理完成',
  112. 'file_path': result_file_path
  113. })
  114. except Exception as e:
  115. return jsonify({
  116. 'code': 500,
  117. 'msg': str(e)
  118. })
  119. if __name__ == '__main__':
  120. app.run(debug=True, port=8081, host='0.0.0.0')
  121. #扩充3--Wavelet变换:可以将信号分解成不同频率的子信号,然后可以通过对这些子信号进行处理来实现数据扩充。
  122. import numpy as np
  123. import pandas as pd
  124. import pywt
  125. from flask import Flask, request, jsonify
  126. import os
  127. import requests
# Flask application instance; the @app.route decorator below registers the
# request handler on it and the __main__ guard starts it.
app =Flask(__name__)
  129. def wavelet_transform(series, wavelet='db1', level=1):
  130. """
  131. 对一维数值数据进行小波变换。
  132. """
  133. return pywt.wavedec(series, wavelet, level=level)
  134. def wavelet_reconstruct(coeffs, wavelet='db1'):
  135. """
  136. 使用小波变换后的系数重构数据。
  137. """
  138. return pywt.waverec(coeffs, wavelet)
  139. def perturb_coeffs(coeffs, sigma):
  140. """
  141. 对小波变换的系数进行扰动。
  142. """
  143. perturbed_coeffs = []
  144. for coeff in coeffs:
  145. # 对细节系数进行扰动,近似系数保持不变
  146. if np.issubdtype(coeff.dtype, np.number):
  147. perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
  148. else:
  149. perturbed_coeff = coeff
  150. perturbed_coeffs.append(perturbed_coeff)
  151. return perturbed_coeffs
  152. def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2):
  153. # 检查是否存在 'Time' 列,如果存在,则删除
  154. if 'Time' in df.columns:
  155. df = df.drop(columns=['Time'])
  156. numerical_columns = df.select_dtypes(include=[np.number]).columns
  157. extended_data = df.copy()
  158. for col in numerical_columns:
  159. coeffs = wavelet_transform(df[col], wavelet, level)
  160. perturbed_coeffs = perturb_coeffs(coeffs, sigma)
  161. reconstructed_series = wavelet_reconstruct(perturbed_coeffs, wavelet)
  162. extended_data[col] = reconstructed_series[:len(df[col])] # 保持原数据长度
  163. # 计算扩充的数据量
  164. n_samples = int(len(df) * expansion_ratio)
  165. # 扩充数据
  166. expanded_data = extended_data.iloc[-n_samples:].copy()
  167. # 合并原始数据和扩充数据
  168. final_data = pd.concat([extended_data, expanded_data], ignore_index=True)
  169. return final_data
  170. @app.route('/Wavelet', methods=['POST'])
  171. def upload_file():
  172. try:
  173. # 检查请求体中是否包含文件地址
  174. data = request.json
  175. if 'file_url' not in data:
  176. return jsonify({'code': 400, 'msg': 'No file URL provided'})
  177. file_url = data['file_url']
  178. # 使用requests获取文件内容
  179. response = requests.get(file_url)
  180. if response.status_code != 200:
  181. return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})
  182. # 读取数据集
  183. df = pd.read_csv(pd.compat.StringIO(response.text), header=None)
  184. wavelet = 'db1' # 选择小波基
  185. level = 1 # 分解层数
  186. sigma = 0.05 # 扰动的标准差
  187. expansion_ratio = 0.2 # 扩充数据的比例
  188. extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio)
  189. result_file_path = os.path.join('/tmp', '扩充数据-wavelet.csv')
  190. extended_data.to_csv(result_file_path, index=False)
  191. # 构建返回数据,只返回文件路径
  192. return jsonify({
  193. 'code': 200,
  194. 'msg': '文件处理完成',
  195. 'file_path': result_file_path
  196. })
  197. except Exception as e:
  198. return jsonify({
  199. 'code': 500,
  200. 'msg': str(e)
  201. })
  202. if __name__ == '__main__':
  203. app.run(debug=True, port=8081, host='0.0.0.0')
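# Augmentation 4 - wavelet coefficient perturbation: random noise is added to
# the coefficients of the wavelet transform, implemented by adding Gaussian
# random numbers with standard deviation sigma to them. In this script the
# detail (high-frequency) coefficients are additionally scaled by a constant
# factor before the signal is reconstructed.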
  205. import numpy as np
  206. import pandas as pd
  207. import pywt
  208. from flask import Flask, request, jsonify
  209. import os
  210. import requests
# Flask application instance; the @app.route decorator below registers the
# request handler on it and the __main__ guard starts it.
app = Flask(__name__)
  212. def wavelet_transform(series, wavelet='db1', level=1):
  213. return pywt.wavedec(series, wavelet, level=level)
  214. def wavelet_reconstruct(coeffs, wavelet='db1'):
  215. return pywt.waverec(coeffs, wavelet)
  216. def perturb_coeffs(coeffs, sigma):
  217. perturbed_coeffs = []
  218. for coeff in coeffs:
  219. if np.issubdtype(coeff.dtype, np.number):
  220. perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
  221. else:
  222. perturbed_coeff = coeff
  223. perturbed_coeffs.append(perturbed_coeff)
  224. return perturbed_coeffs
  225. def enhance_or_reduce(coeffs, factor):
  226. """
  227. 对小波变换后的高频系数进行增强或衰减。
  228. """
  229. enhanced_coeffs = []
  230. for i, coeff in enumerate(coeffs):
  231. # 细节系数从索引1开始,我们对其进行增强或衰减
  232. if i > 0:
  233. enhanced_coeffs.append(coeff * factor)
  234. else:
  235. enhanced_coeffs.append(coeff)
  236. return enhanced_coeffs
  237. def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2):
  238. # 检查是否存在 'Time' 列,如果存在,则删除
  239. if 'Time' in df.columns:
  240. df = df.drop(columns=['Time'])
  241. numerical_columns = df.select_dtypes(include=[np.number]).columns
  242. extended_data = df.copy()
  243. for col in numerical_columns:
  244. coeffs = wavelet_transform(df[col], wavelet, level)
  245. perturbed_coeffs = perturb_coeffs(coeffs, sigma)
  246. enhanced_coeffs = enhance_or_reduce(perturbed_coeffs, factor=1.1) # 增强高频系数
  247. reconstructed_series = wavelet_reconstruct(enhanced_coeffs, wavelet)
  248. extended_data[col] = reconstructed_series
  249. # 计算扩充的数据量
  250. n_samples = int(len(df) * expansion_ratio)
  251. # 扩充数据
  252. expanded_data = extended_data.iloc[-n_samples:].copy()
  253. # 合并原始数据和扩充数据
  254. final_data = pd.concat([df, expanded_data], axis=0, ignore_index=True)
  255. return final_data
  256. @app.route('/coefficient', methods=['POST'])
  257. def upload_file():
  258. try:
  259. # 检查请求体中是否包含文件地址
  260. data = request.json
  261. if 'file_url' not in data:
  262. return jsonify({'code': 400, 'msg': 'No file URL provided'})
  263. file_url = data['file_url']
  264. # 使用requests获取文件内容
  265. response = requests.get(file_url)
  266. if response.status_code != 200:
  267. return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})
  268. # 读取数据集
  269. df = pd.read_csv(pd.compat.StringIO(response.text), header=None)
  270. wavelet = 'db1' # 选择小波基
  271. level = 1 # 分解层数
  272. sigma = 0.05 # 扰动的标准差
  273. expansion_ratio = 0.2 # 扩充数据的比例
  274. extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio)
  275. result_file_path = os.path.join('/tmp', '扩充数据-Wavelet变换.csv')
  276. extended_data.to_csv(result_file_path, index=False)
  277. # 构建返回数据,只返回文件路径
  278. return jsonify({
  279. 'code': 200,
  280. 'msg': '文件处理完成',
  281. 'file_path': result_file_path
  282. })
  283. except Exception as e:
  284. return jsonify({
  285. 'code': 500,
  286. 'msg': str(e)
  287. })
  288. if __name__ == '__main__':
  289. app.run(debug=True, port=8081, host='0.0.0.0')
  290. #扩充5:小波线性插值
  291. import numpy as np
  292. import pandas as pd
  293. import pywt
  294. from flask import Flask, request, jsonify
  295. import os
  296. import requests
# Flask application instance; the @app.route decorator below registers the
# request handler on it and the __main__ guard starts it.
app = Flask(__name__)
  298. def wavelet_transform(series, wavelet='db1', level=1):
  299. """
  300. 对一维数值数据进行小波变换。
  301. """
  302. return pywt.wavedec(series, wavelet, level=level)
  303. def wavelet_reconstruct(coeffs, wavelet='db1'):
  304. """
  305. 使用小波变换后的系数重构数据。
  306. """
  307. return pywt.waverec(coeffs, wavelet)
  308. def perturb_coeffs(coeffs, sigma):
  309. """
  310. 对小波变换的系数进行扰动。
  311. """
  312. perturbed_coeffs = []
  313. for coeff in coeffs:
  314. if np.issubdtype(coeff.dtype, np.number):
  315. perturbed_coeff = coeff + sigma * np.random.randn(*coeff.shape)
  316. else:
  317. perturbed_coeff = coeff
  318. perturbed_coeffs.append(perturbed_coeff)
  319. return perturbed_coeffs
  320. def interpolate_coeffs(coeffs, new_length):
  321. """
  322. 对小波变换的系数进行线性插值。
  323. """
  324. interpolated_coeffs = []
  325. for coeff in coeffs:
  326. if new_length:
  327. coeff = np.interp(np.arange(new_length), np.arange(len(coeff)), coeff)
  328. interpolated_coeffs.append(coeff)
  329. return interpolated_coeffs
  330. def extend_data_with_wavelet(df, wavelet='db1', level=1, sigma=0.05, expansion_ratio=0.2, new_length=None):
  331. # 检查是否存在 'Time' 列,如果存在,则删除
  332. if 'Time' in df.columns:
  333. df = df.drop(columns=['Time'])
  334. numerical_columns = df.select_dtypes(include=[np.number]).columns
  335. extended_data = df.copy()
  336. for col in numerical_columns:
  337. coeffs = wavelet_transform(df[col], wavelet, level)
  338. perturbed_coeffs = perturb_coeffs(coeffs, sigma)
  339. if new_length is not None:
  340. perturbed_coeffs = interpolate_coeffs(perturbed_coeffs, new_length)
  341. reconstructed_series = wavelet_reconstruct(perturbed_coeffs, wavelet)
  342. extended_data[col] = reconstructed_series[:len(df[col])] # 确保数据长度一致
  343. # 计算扩充的数据量
  344. n_samples = int(len(df) * expansion_ratio)
  345. # 扩充数据
  346. expanded_data = extended_data.iloc[-n_samples:].copy()
  347. # 合并原始数据和扩充数据
  348. final_data = pd.concat([extended_data, expanded_data], ignore_index=True)
  349. return final_data
  350. @app.route('/interpolation', methods=['POST'])
  351. def upload_file():
  352. try:
  353. # 检查请求体中是否包含文件地址
  354. data = request.json
  355. if 'file_url' not in data:
  356. return jsonify({'code': 400, 'msg': 'No file URL provided'})
  357. file_url = data['file_url']
  358. # 使用requests获取文件内容
  359. response = requests.get(file_url)
  360. if response.status_code != 200:
  361. return jsonify({'code': 500, 'msg': 'Failed to retrieve file from URL'})
  362. # 读取数据集
  363. df = pd.read_csv(pd.compat.StringIO(response.text), header=None)
  364. wavelet = 'db1'
  365. level = 1
  366. sigma = 0.05
  367. expansion_ratio = 0.2
  368. new_length = None
  369. extended_data = extend_data_with_wavelet(data, wavelet, level, sigma, expansion_ratio, new_length)
  370. result_file_path = os.path.join('/tmp', '扩充数据-小波线性.csv')
  371. extended_data.to_csv(result_file_path, index=False)
  372. # 构建返回数据,只返回文件路径
  373. return jsonify({
  374. 'code': 200,
  375. 'msg': '文件处理完成',
  376. 'file_path': result_file_path
  377. })
  378. except Exception as e:
  379. return jsonify({
  380. 'code': 500,
  381. 'msg': str(e)
  382. })
  383. if __name__ == '__main__':
  384. app.run(debug=True, port=8081, host='0.0.0.0')